# Machine learning gazetteer matching: main runs
This notebook generates our main results. In each run we draw one random train/test split and train a model on each combination of features using that same split, so that the feature combinations are directly comparable within a run.

In [1]:
import pandas as pd
import os
import random
from sklearn.ensemble import RandomForestClassifier

from gazmatch import matchutils as mu
from gazmatch import mlrunner as ml

### Preliminaries

In [2]:
### PATHS ###
data_dir = 'data'
path_geonames_csv = os.path.join(data_dir, 'geonames_ch_swisscoords.csv')
path_swissnames_csv = os.path.join(data_dir, 'swissnames_new_uuids.csv')
# with original UUIDs
path_ground_truth_list = os.path.join(data_dir, 'ground_truth_flat.pkl')
# modified record-unique IDs
path_ground_truth_dict_new_uuids = os.path.join(data_dir, 'ground_truth_new_uuids.pkl')
path_ground_truth_list_new_uuids = os.path.join(data_dir, 'ground_truth_flat_new_uuids.pkl')
# landcover data (directory is relative to data_dir)
# built with os.path.join rather than hard-coded backslashes, so the path
# also resolves on non-Windows systems
land_cover_dir = os.path.join('ArealStatistik', 'arealstatistik_nolc_2004', 'rawdata', 'stand_20130918_endstand')
land_cover_csv = 'AREA_NOLC04_27_130918.csv'

### RUN SETTINGS ###
# candidate selection: matches per train and test record
M_train = 30
M_test = 30
# train to test split (portion of ground truth records used for training)
train_portion = .75
# number of loops using these settings (e.g. 10)
num_loops = 2

## One-time data preparation
- Prepare the landcover data
- Read in the two gazetteers
- Read in ground truth data

In [3]:
# read the land cover table (semicolon-separated CSV below data_dir)
land_cover_path = os.path.join(data_dir, land_cover_dir, land_cover_csv)
df_areal = pd.read_csv(land_cover_path, sep=';', encoding='utf-8', low_memory=False)
print(df_areal.shape)
# build the nearest-neighbour search object once up front (slow!);
# it is handed to the feature computation in every run of the main loop
neighbour_obj = mu.prepare_neighbours(df_areal)

(4128498, 13)


In [4]:
# SwissNames3D, augmented with lat/lon (tab-separated)
swissnames = pd.read_csv(path_swissnames_csv, sep='\t', encoding='utf-8', low_memory=False)
print("We have %s records in SwissNames3D in %s columns." % swissnames.shape)

### OPTIONAL: prepare swissnames for spatial filtering to use a hard distance filter in candidate selection
#neighbour_sn = mu.prepare_neighbours_swissnames(swissnames)

# GeoNames, with all WGS84 lat/lon projected to swiss coordinates (tab-separated)
geonames = pd.read_csv(path_geonames_csv, sep='\t', encoding='utf-8', low_memory=False)
print("We have %s records in GeoNames in %s columns." % geonames.shape)

# ground truth, i.e. the positive matches for each geonames record
ground_truth_dict = mu.load_data(path_ground_truth_dict_new_uuids)
# sanity check - the count printed here should be 400
n_ground_truth_records = len(ground_truth_dict)
print("Our ground truth dict has %s GeoNames records." % (n_ground_truth_records,))

We have 313562 records in SwissNames3D in 21 columns.
We have 67796 records in GeoNames in 21 columns.
Our ground truth dict has 400 GeoNames records.


## Machine learning gazetteer matching

In [5]:
# function to avoid too much repetition below
def run_with_features(train, test, features_to_use, description, verbose=True,
                      n_estimators=200, random_state=None):
    """Train a random forest on the chosen feature columns and evaluate it.

    The classifier is fitted on ``train[features_to_use]`` against the
    ``match_bin`` column of ``train`` and then predicts for
    ``test[features_to_use]``. Predicted positives are turned into pairs,
    their target ids are mapped back with ``mu.new_uuid_to_old`` and the pairs
    are scored with ``mu.evaluate_results_deep``.

    NOTE: this function reads the notebook-level variable
    ``ground_truth_filtered`` (set in the main loop), so it must only be called
    after that variable has been assigned for the current train/test split.

    Parameters
    ----------
    train, test : DataFrames holding the feature columns; ``train`` must also
        contain ``match_bin``.
    features_to_use : list of column names used as model input.
    description : label printed in the results header.
    verbose : if True, print the results header; the flag is also forwarded
        to ``mu.evaluate_results_deep``.
    n_estimators : number of trees in the forest (default 200, as before).
    random_state : forwarded to ``RandomForestClassifier``; the default None
        keeps the previous unseeded behaviour, an int makes the fit repeatable.

    Returns
    -------
    (precision, recall, f1) as returned by ``mu.evaluate_results_deep``.
    """
    X = train[features_to_use]
    y = train['match_bin']
    rfc = RandomForestClassifier(n_estimators=n_estimators, random_state=random_state)
    rfc = rfc.fit(X, y)
    predictions = rfc.predict(test[features_to_use])
    # results
    if verbose:
        print("\n# %s, results:" %description)
    pairs = mu.get_positives_as_pairs(test, predictions)
    # the ground truth used for scoring carries the original UUIDs
    pairs_old = [(item[0], mu.new_uuid_to_old(item[1])) for item in pairs]
    # honour the caller's verbose flag (it was previously hard-coded to True here)
    precision,recall,f1 = mu.evaluate_results_deep(pairs_old, ground_truth_filtered, verbose=verbose)
    return precision,recall,f1

### Main loop

In [6]:
# One (precision, recall, f1, recall_upper_bound) tuple is appended per
# feature set and run, in the order of `feature_sets` below.
results = []

### FEATURE SETS ###
# columns that identify a pair or hold its label, i.e. never model input
id_label_cols = ['geonamesid', 'swissnamesid', 'match', 'match_bin']
# columns of the "str" feature set
str_feature_cols = ['distance', 'leven', 'leven_comma', 'leven_alt', 'leven_min', 'leven_dam_norm', 'jaro', 'jaro_winkler']
# the "str" columns that are not part of the "basic" set (distance, leven_min)
str_extra_cols = ['leven', 'leven_comma', 'leven_alt', 'leven_dam_norm', 'jaro', 'jaro_winkler']
# gn_10..gn_60, sn_10..sn_60, gn_mode_10..gn_mode_60, sn_mode_10..sn_mode_60
landcover_cols = ['%s_%s' % (prefix, step)
                  for prefix in ('gn', 'sn', 'gn_mode', 'sn_mode')
                  for step in range(10, 70, 10)]

# (description, explicit feature columns or None, columns to exclude or None)
# With an exclusion list, the features are all remaining columns of the
# training frame, in the frame's column order.
feature_sets = [
    # 1) basic
    ("basic", ['distance', 'leven_min'], None),
    # 2) str
    ("str", str_feature_cols, None),
    # 3) basic-type
    ("basic-type", None, id_label_cols + str_extra_cols + landcover_cols + ['elev_dist', 'lc_distance']),
    # 4) str-type
    ("str-type", None, id_label_cols + landcover_cols + ['elev_dist', 'lc_distance']),
    # 5) str-elev-lc
    ("str-elev-lc", str_feature_cols + landcover_cols + ['elev_dist', 'lc_distance'], None),
    # 6) str-type-lcd
    ("str-type-lcd", None, id_label_cols + landcover_cols + ['elev_dist']),
    # 7) all-min
    ("all-min", None, id_label_cols + str_extra_cols + landcover_cols),
    # 8) all
    ("all", None, id_label_cols),
]

for i in range(num_loops):
    print("\n### Run number %s ###" %(i+1))

    ### SPLIT GEONAMES IDS INTO TRAIN:TEST AND PREPARE POSITIVES ###
    N_train = int(len(ground_truth_dict)*train_portion)
    # random.sample() requires a sequence; passing a dict view raises
    # TypeError on Python >= 3.11, so sample from a list of the keys
    train_gn_records = random.sample(list(ground_truth_dict), N_train)
    # set for fast membership tests; the test list keeps the dict's key order
    train_gn_set = set(train_gn_records)
    test_gn_records = [k for k in ground_truth_dict if k not in train_gn_set]
    print("We selected %s geonames features to be in our training set." %len(train_gn_records))
    print("We have %s geonames features left for our test set." %len(test_gn_records))
    df_pos_train, df_pos_test = mu.prepare_positives(train_gn_records, test_gn_records, ground_truth_dict, verbose=True)

    ### TRAINING DATA PREPARATION ###
    geonames_train = geonames[geonames['geonameid'].isin(train_gn_records)]
    df_geonames_train = geonames_train.set_index('geonameid')
    # candidate selection
    candidate_selecter = ml.CandidateSelecter(gazetteer_source=df_geonames_train,
                                              gazetteer_target=swissnames)
    matches_train = candidate_selecter.get_candidates_geoleven(id_list=train_gn_records,
                                                               matches_per_record=M_train)
    ### OPTIONAL: candidate selection with a hard spatial filter (max_distance), to speed things up
    #candidate_selecter = ml.CandidateSelecter(gazetteer_source=df_geonames_train,
                                              #gazetteer_target=swissnames, neigh_obj=neighbour_sn)
    #matches_train = candidate_selecter.get_candidates_geoleven_prefilter(id_list=train_gn_records, matches_per_record=M_train,
                                                                         #max_distance=50000, filter_by_type=True)
    # turn matches dict into list of pos (if any) and neg matches
    pos_train, neg_train = mu.split_pos_neg(matches_train, ground_truth_dict)
    pos_train_uniques = list(set(pos_train))
    print("We had %s positive and %s negative pairs retained in candidate selection."
          %(len(pos_train_uniques), len(neg_train)))
    print("We kept %s out of a possible %s positive pairs, i.e. %.3f"
          %(len(pos_train_uniques), df_pos_train.shape[0], len(pos_train_uniques)/df_pos_train.shape[0]))
    # prepare negatives from the ones mined
    df_neg_train_all = pd.DataFrame.from_records(neg_train, columns=['geonamesid', 'swissnamesid', 'match'])
    df_neg_train = df_neg_train_all.drop_duplicates()
    # take all positives from ground truth (not just ones mined) and combine with negs
    # (pd.concat replaces DataFrame.append, which was removed in pandas 2.0)
    df_train = pd.concat([df_pos_train, df_neg_train])
    # actual ratio of neg to pos in final training set
    vc = df_train['match'].value_counts()
    print("The actual ratio of neg to pos is %.3f" %(vc[mu.NO_MATCH_STRING]/vc[mu.MATCH_STRING]))

    ### MATCHING FEATURES FOR TRAINING DATA ###
    swissnames_train = swissnames[swissnames['UUID'].isin(df_train['swissnamesid'].tolist())]
    df_swissnames_train = swissnames_train.set_index(['UUID'])
    # compute matching features
    fc = ml.FeatureComputer(matches=df_train, gazetteer_source=df_geonames_train,
                            gazetteer_target=df_swissnames_train, landcover_data=df_areal, neigh_obj=neighbour_obj)
    fc.compute_features_all(verbose=True)
    df_train_all = fc.df.copy()
    # NOTE(review): pd.factorize numbers labels in order of first appearance, so
    # the train and test encodings only agree if both frames start with the same
    # label (positives are concatenated first above) - confirm that
    # FeatureComputer keeps the row order and that every split has positives.
    df_train_all['match_bin'] = pd.factorize(df_train_all['match'])[0]

    ### TEST DATA PREPARATION ###
    geonames_test = geonames[geonames['geonameid'].isin(test_gn_records)]
    df_geonames_test = geonames_test.set_index('geonameid')
    # candidate selection: again here other options available e.g. a hard spatial filter
    candidate_selecter = ml.CandidateSelecter(gazetteer_source=df_geonames_test, gazetteer_target=swissnames)
    matches_test = candidate_selecter.get_candidates_geoleven(id_list=test_gn_records, matches_per_record=M_test)
    # turn matches dict into list of pos and neg matches
    pos_test, neg_test = mu.split_pos_neg(matches_test, ground_truth_dict)
    pos_test_uniques = list(set(pos_test))
    print("We had %s positive and %s negative pairs retained in candidate selection."
          %(len(pos_test_uniques), len(neg_test)))
    print("We kept %s out of a possible %s positive pairs, i.e. %.3f (upper bound on recall)."
          %(len(pos_test_uniques), df_pos_test.shape[0], len(pos_test_uniques)/df_pos_test.shape[0]))
    # recall upper bound proper, with original uuids
    recall_upper_bound = mu.calculate_recall_upper_bound(pos_test_uniques,df_pos_test)
    # prepare negatives as dataframe
    df_neg_test_all = pd.DataFrame.from_records(neg_test, columns=['geonamesid', 'swissnamesid', 'match'])
    df_neg_test = df_neg_test_all.drop_duplicates()
    # prepare the test matches using all (pos+neg) matches obtained from candidate selection
    df_test_pos = pd.DataFrame.from_records(pos_test_uniques, columns=['geonamesid', 'swissnamesid', 'match'])
    df_test = pd.concat([df_test_pos, df_neg_test])
    vc = df_test['match'].value_counts()
    print("Test set: the ratio of neg to pos is %.3f" %(vc[mu.NO_MATCH_STRING]/vc[mu.MATCH_STRING]))

    ### MATCHING FEATURES FOR TEST DATA ###
    swissnames_test = swissnames[swissnames['UUID'].isin(df_test['swissnamesid'].tolist())]
    df_swissnames_test = swissnames_test.set_index(['UUID'])
    fc = ml.FeatureComputer(matches=df_test, gazetteer_source=df_geonames_test,
                            gazetteer_target=df_swissnames_test, landcover_data=df_areal, neigh_obj=neighbour_obj)
    fc.compute_features_all(verbose=True)
    df_test_all = fc.df.copy()
    df_test_all['match_bin'] = pd.factorize(df_test_all['match'])[0]
    # make both frames carry the same columns: a column present in only one
    # of them is added to the other, filled with 0
    for colname in df_test_all.columns:
        if colname not in df_train_all.columns:
            df_train_all[colname] = 0
    for colname in df_train_all.columns:
        if colname not in df_test_all.columns:
            df_test_all[colname] = 0

    ### EVALUATION PRELIMINARIES ###
    # prepare and filter ground truth for new UUIDs...
    # (ground_truth_new_uuids_filtered is not used again in this cell)
    ground_truth_pairs_new_uuids = mu.load_data(path_ground_truth_list_new_uuids)
    ground_truth_new_uuids_filtered = mu.filter_ground_truth_for_test(ground_truth_pairs_new_uuids, test_gn_records)
    # ...and old UUIDs for our results; run_with_features reads
    # ground_truth_filtered as a global
    ground_truth_pairs = mu.load_data(path_ground_truth_list)
    ground_truth_filtered = mu.filter_ground_truth_for_test(ground_truth_pairs, test_gn_records)

    ### RANDOM FOREST ###
    # every feature set is trained and evaluated on the same split of this run
    for desc, cols_to_include, cols_to_exclude in feature_sets:
        if cols_to_include is not None:
            feature_columns = list(cols_to_include)
        else:
            feature_columns = [colname for colname in list(df_train_all.columns) if colname not in cols_to_exclude]
        # the column lists are only printed once, during the first run
        if i == 0:
            print("feature columns: %s" %feature_columns)
        p,r,f = run_with_features(df_train_all, df_test_all, feature_columns, desc)
        results.append((p,r,f,recall_upper_bound))


### Run number 1 ###
We selected 300 geonames features to be in our training set.
We have 100 geonames features left for our test set.
Training: We have 371 positive matches and 12 records with no match
Testing: We have 129 positive matches and 2 records with no match
Processing record 0...
Processing record 50...
Processing record 100...
Processing record 150...
Processing record 200...
Processing record 250...
We had 330 positive and 8647 negative pairs retained in candidate selection.
We kept 330 out of a possible 371 positive pairs, i.e. 0.889
The actual ratio of neg to pos is 23.302
Calculating point-to-point distance between matches...
Calculating Levenshtein distance between names...
Calculating Levenshtein distance between de-commaed names...
Calculating min Levenshtein distance on alternate names...
Getting the min Levenshtein distance overall...
Calculating the normalized Levenshtein-Damerau distance...
Calculating Jaro similarity...
Calculating Jaro-Winkler similarity...
Ge


# all, results:
  0.912 precision
  0.897 recall
  0.904 f1

### Run number 2 ###
We selected 300 geonames features to be in our training set.
We have 100 geonames features left for our test set.
Training: We have 381 positive matches and 8 records with no match
Testing: We have 119 positive matches and 6 records with no match
Processing record 0...
Processing record 50...
Processing record 100...
Processing record 150...
Processing record 200...
Processing record 250...
We had 335 positive and 8636 negative pairs retained in candidate selection.
We kept 335 out of a possible 381 positive pairs, i.e. 0.879
The actual ratio of neg to pos is 22.661
Calculating point-to-point distance between matches...
Calculating Levenshtein distance between names...
Calculating Levenshtein distance between de-commaed names...
Calculating min Levenshtein distance on alternate names...
Getting the min Levenshtein distance overall...
Calculating the normalized Levenshtein-Damerau distance...
Calculating 

### Results

In [7]:
# Results table: a header, a rule, then one line per entry of `results`
# (precision, recall, f1, recall upper bound) in the order they were appended.
table_lines = ["precision\trecall\t\tf1 \t\tmax_recall",
               "- - - - - - - - - - - - - - - - - - - - - - - - - - "]
table_lines.extend("%.4f   \t%.4f   \t%.4f   \t%.4f" %(p,r,f,rec_ub)
                   for p,r,f,rec_ub in results)
print("\n".join(table_lines))

precision	recall		f1 		max_recall
- - - - - - - - - - - - - - - - - - - - - - - - - - 
0.8673   	0.8448   	0.8559   	0.9397
0.9320   	0.8276   	0.8767   	0.9397
0.9189   	0.8793   	0.8987   	0.9397
0.9196   	0.8879   	0.9035   	0.9397
0.8718   	0.8793   	0.8755   	0.9397
0.9369   	0.8966   	0.9163   	0.9397
0.9450   	0.8879   	0.9156   	0.9397
0.9123   	0.8966   	0.9043   	0.9397
0.8091   	0.7807   	0.7946   	0.9211
0.8624   	0.8246   	0.8430   	0.9211
0.9510   	0.8509   	0.8981   	0.9211
0.9174   	0.8772   	0.8969   	0.9211
0.8739   	0.8509   	0.8622   	0.9211
0.9352   	0.8860   	0.9099   	0.9211
0.9505   	0.8421   	0.8930   	0.9211
0.9259   	0.8772   	0.9009   	0.9211
