# Record linkage con Python Record Linkage Toolkit

In [None]:
import recordlinkage
import pandas as pd

### Record Linkage per valutare le prestazioni del sistema attraverso due dataset appositamente costruiti (dataset_l e dataset_r)

In [None]:
def getCompleteDataset(path):
    """Load a company CSV and normalise its text columns for record linkage.

    The file is read with the ``id`` column as index and rows without a
    ``name`` are dropped. In ``name``, every run of whitespace and every
    literal backslash-n / backslash-r sequence is replaced by a single
    space. The ``name``, ``industry``, ``country``, ``address``, ``sector``
    and ``ceo`` columns are then upper-cased.

    Args:
        path: Location of the CSV file; passed straight to
            ``pandas.read_csv``.

    Returns:
        pandas.DataFrame: The cleaned dataset, indexed by ``id``.
    """
    df = pd.read_csv(path, index_col='id')

    # Keep only rows that have a name. The explicit copy makes the frame
    # independent of the parsed one before columns are reassigned below.
    df = df[df['name'].notna()].copy()

    df['name'] = df['name'].replace(r'\s+|\\n|\\r', ' ', regex=True)
    df['name'] = df['name'].str.upper()

    # Upper-case the remaining text columns used for matching.
    # 'headquarters', 'founders' and 'area_served' are left exactly as read.
    for column in ('industry', 'country', 'address', 'sector', 'ceo'):
        df[column] = df[column].str.upper()

    return df


# Load and normalise the two purpose-built test datasets (left and right).
df_a = getCompleteDataset('datasets_test/dataset_l.csv')
df_b = getCompleteDataset('datasets_test/dataset_r.csv')

# Show both frames in the notebook output, left first.
for frame in (df_a, df_b):
    display(frame)

In [None]:
# Build the candidate record pairs between df_a and df_b, blocking on the
# 'name' column.
# NOTE(review): blocking presumably keeps only pairs whose 'name' values are
# identical in both frames — confirm against the recordlinkage docs.
indexer = recordlinkage.Index()
indexer.block('name')
candidate_links = indexer.index(df_a, df_b)

In [None]:
# Collect the candidate pairs, skipping pairs whose two ids are equal and
# pairs whose mirrored (swapped) form has already been kept.
set_no_copy = set()
for pair in candidate_links:
    first_id, second_id = pair
    if first_id == second_id:
        continue
    if (second_id, first_id) in set_no_copy:
        continue
    set_no_copy.add((first_id, second_id))

list_no_copy = list(set_no_copy)
# Split the pairs into two parallel lists, one per index level.
list_0 = [first_id for first_id, _ in list_no_copy]
list_1 = [second_id for _, second_id in list_no_copy]
multi_index = pd.MultiIndex.from_arrays([list_0, list_1])

In [None]:
# Configure the field-by-field comparison applied to every candidate pair.
compare = recordlinkage.Compare()

# Each call registers one string feature (left column, right column) scored
# with the 'jarowinkler' method and the given threshold.
compare.string('name', 'name', method='jarowinkler', threshold=0.7)
# compare.string('industry', 'industry', method='jarowinkler', threshold=0.85)
# missing_value=1 is passed for the two features below — presumably so that a
# missing value counts as agreement; confirm against the recordlinkage docs.
compare.string('country', 'country', method='jarowinkler', threshold=0.5, missing_value=1)
# NOTE(review): getCompleteDataset upper-cases 'country' but leaves
# 'headquarters' as read; if the similarity is case-sensitive this feature
# compares mixed-case text against upper-case text — confirm.
compare.string('headquarters', 'country', method='jarowinkler', threshold=0.5, missing_value=1)
# compare.string('country', 'headquarters', method='jarowinkler', threshold=0.5)
# compare.string('headquarters', 'headquarters', method='jarowinkler', threshold=0.5)


# compare.string('headquarters', 'headquarters', method='jarowinkler', threshold=0.85)
# compare.string('ceo', 'ceo', method='jarowinkler', threshold=0.85)
# compare.string('sector', 'sector', method='jarowinkler', threshold=0.85)

# The comparison vectors, computed for every candidate pair
compare_vectors = compare.compute(candidate_links, df_a, df_b)
# Keep only the rows whose pair is also present in multi_index.
compare_vectors = compare_vectors[compare_vectors.index.isin(multi_index)]

In [None]:
# Fit an ECMClassifier on the comparison vectors and predict the matching
# pairs in a single call; no labelled pairs are passed to it.
ecm = recordlinkage.ECMClassifier()
matches = ecm.fit_predict(compare_vectors)

In [None]:
# Separate each predicted pair into its left and right id.
matches_l = [left_id for left_id, _ in matches.values]
matches_r = [right_id for _, right_id in matches.values]

# Persist the predicted matches as a two-column CSV (idl, idr), without the
# DataFrame's own row index.
df_match_sistema = pd.DataFrame(
    {
        "idl": matches_l,
        "idr": matches_r,
    }
)
df_match_sistema.to_csv('datasets_test/match_sistema.csv', index=False)