# Match vz to rs through vz ids

In [1]:
import pandas as pd
from elasticsearch import Elasticsearch
from exploretools import Explorer

# Client is created with no arguments, so the library's default connection settings apply.
es = Elasticsearch()
# Explorer (imported from exploretools) is given the client; it provides unique_exact() used below.
ex = Explorer(es)

In [3]:
# Index / doc_type of the vz documents; passed to ex.unique_exact() below.
docs_vz = {'index': 'verejnezakazky_2018-02-13', 
           'doc_type': 'verejnazakazka'}
# Index / doc_type of the rs documents; unpacked as keyword arguments into
# es.search() and es.msearch() in the cells below.
docs_rs = {'index': 'hlidacsmluv_2018-02-18', 
           'doc_type': 'smlouva'}

### Get all vz ids

In [4]:
# Presumably the distinct values of `evidencniCisloZakazky` over the vz documents
# (the cell output reports unique entries). Later cells index and slice `ecz`
# by integer position, so it is expected to be an ordered sequence.
ecz = ex.unique_exact(docs_vz, 'evidencniCisloZakazky')
# vz_id = ex.unique_exact(docs_vz, 'id') # this is finding nothing (based on a few queries compared to ecz)

Went through 185790 entries.
Found 180171 unique entries.


### Explore matches manually

In [105]:
# NOTE(review): `idx` is first assigned in the next cell (idx=19333), so this cell
# raises NameError on a fresh top-to-bottom run. The saved output below was produced
# with a different `idx` value: with idx=19333 the same expression shows '366914'.
ecz[idx]

'60061212'

In [121]:
# Look up one vz id in the plain text of rs attachments, returning only the rs id.
idx = 19333
r = es.search(
    body={
        'query': {'term': {'prilohy.plainTextContent': ecz[idx]}},
        '_source': ['id'],
    },
    size=10,
    **docs_rs,
)
r

{'_shards': {'failed': 0, 'skipped': 0, 'successful': 5, 'total': 5},
 'hits': {'hits': [{'_id': 'pre382499364',
    '_index': 'hlidacsmluv_2018-02-18',
    '_score': 14.781748,
    '_source': {'id': 'pre382499364'},
    '_type': 'smlouva'}],
  'max_score': 14.781748,
  'total': 1},
 'timed_out': False,
 'took': 299}

In [122]:
# NOTE(review): json is imported here, mid-notebook; the "Match all" loop further
# down also relies on this import.
import json
# Same term query as the previous cell, but without the `_source` filter and
# without an explicit `size`, so complete hit documents are returned.
r=es.search(**docs_rs, body={'query': {'term': {'prilohy.plainTextContent': ecz[idx]}}})
# Write the first hit to a scratch file for manual inspection. The file is opened
# (and truncated) before the hit is indexed; `[0]` raises IndexError when the
# query has no hits. ensure_ascii=False writes non-ASCII characters unescaped.
with open('data/work/tmp.txt', 'w', encoding='utf8') as f:
    json.dump(r['hits']['hits'][0], f, ensure_ascii=False)
# Show the vz id that was searched for.
ecz[idx]

'366914'

### Match all

In [48]:
def msearch_body(field, vals, limit = None):
    """Build the body of a multi-search request made of one term query per value.

    For every value two entries are appended: an empty header dict (no
    per-query overrides are set here) followed by a search body that runs a
    ``term`` query for the value on ``field``, restricts ``_source`` to
    ``id`` and requests up to 1000 hits.

    Parameters
    ----------
    field : str
        Name of the document field the term queries run against.
    vals : iterable
        Values to search for; one query is emitted per value, in order.
    limit : int or None, optional
        Maximum number of queries to emit. ``None`` (default) emits a query
        for every value; ``0`` or a negative number yields an empty body.

    Returns
    -------
    list of dict
        Alternating header / search-body dicts, two entries per query.
    """
    body = []
    for n_queries, val in enumerate(vals):
        # Check *before* appending so that exactly `limit` queries are emitted;
        # checking afterwards with `>` let one extra query through.
        if limit is not None and n_queries >= limit:
            break
        body.append({})
        body.append({'query':   {'term': {field: val}},
                     '_source': ['id'],
                     'size':    1000})
    return body

def get_rs_ids(response):
    """Return the ``_source`` ``id`` of every hit in one search response.

    ``response`` is a single search result dict with a ``hits.hits`` list;
    the ids are returned as a list in the same order as the hits.
    """
    return [hit['_source']['id'] for hit in response['hits']['hits']]

In [72]:
# Query rs for every vz id in batches; `matches` maps vz id -> list of rs ids.
matches = {}
batch_size = 1000
for i in range(0, len(ecz), batch_size):
    # Slicing clamps at the end of the sequence, so the last batch may be shorter.
    batch = ecz[i:i + batch_size]

    # One multi-search request per batch; generous timeout for the large request.
    responses = es.msearch(
        **docs_rs,
        body=msearch_body('prilohy.plainTextContent', batch),
        request_timeout=600,
    )['responses']

    # Pair each vz id with the response at the same position in the batch.
    matches.update({vz_id: get_rs_ids(response)
                    for vz_id, response in zip(batch, responses)})

    # Checkpoint: rewrite the file with everything collected so far.
    with open('data/work/matches.json', 'w') as f:
        json.dump(matches, f)

    print(f"{i}/{len(ecz)}")

0/180171
1000/180171
2000/180171
3000/180171
4000/180171
5000/180171
6000/180171
7000/180171
8000/180171
9000/180171
10000/180171
11000/180171
12000/180171
13000/180171
14000/180171
15000/180171
16000/180171
17000/180171
18000/180171
19000/180171
20000/180171
21000/180171
22000/180171
23000/180171
24000/180171
25000/180171
26000/180171
27000/180171
28000/180171
29000/180171
30000/180171
31000/180171
32000/180171
33000/180171
34000/180171
35000/180171
36000/180171
37000/180171
38000/180171
39000/180171
40000/180171
41000/180171
42000/180171
43000/180171
44000/180171
45000/180171
46000/180171
47000/180171
48000/180171
49000/180171
50000/180171
51000/180171
52000/180171
53000/180171
54000/180171
55000/180171
56000/180171
57000/180171
58000/180171
59000/180171
60000/180171
61000/180171
62000/180171
63000/180171
64000/180171
65000/180171
66000/180171
67000/180171
68000/180171
69000/180171
70000/180171
71000/180171
72000/180171
73000/180171
74000/180171
75000/180171
76000/180171
77000/180171

### Check number of matches

In [83]:
# Series indexed by vz id; each value is the list of rs ids stored in `matches`.
sm = pd.Series(matches)
# Summary statistics of the number of matched rs ids per vz id.
sm.apply(len).describe()

count    180171.000000
mean          1.683306
std           8.328173
min           0.000000
25%           0.000000
50%           0.000000
75%           1.000000
max         753.000000
dtype: float64

In [84]:
# How many vz ids have each match count; first ten rows of the value_counts() result.
sm.apply(len).value_counts().head(10)

0    114006
1     23333
4     10152
2     10102
5      7022
3      4644
6      3313
7      1799
8      1069
9       713
dtype: int64

## Conclusion
All vz ids (`evidencniCisloZakazky`) were matched against all plaintext documents of all rs contracts (`prilohy.plainTextContent`).

### Quantitative
Total number of unique vz ids: 180 171

Unmatched (no contract found): 114 006

Matches exactly one contract: 23 333

### Qualitative
Procedure: 
1. Choose some random vz id which matches to a rs document
2. Print the document and look for the id
3. Look at the neighborhood of the detected id

Based on a few searches, it seems that there will be many false positives, especially for the short numerical vz ids, since the same number can have a different meaning in another context. Some true positives had words such as verejna zakazka, vestnik VZ, etc. around them. I suppose that for more precise matching, the co-occurrence of these words should be detected.