# Machine learning gazetteer matching: learning curve
Notebook showing one run testing how performance changes when we increase the training set size (40 pairs at a time).

In [1]:
import pandas as pd
import numpy as np
import os
import random
from sklearn.ensemble import RandomForestClassifier
    
from gazmatch import matchutils as mu
from gazmatch import mlrunner as ml

## Preliminaries

In [2]:
### PATHS ###
data_dir = 'data'
path_geonames_csv = os.path.join(data_dir, 'geonames_ch_swisscoords.csv')
path_swissnames_csv = os.path.join(data_dir, 'swissnames_new_uuids.csv')
path_test_set_ids = os.path.join(data_dir, 'test_set_ids.pkl')
# with original UUIDs
path_ground_truth_list = os.path.join(data_dir, 'ground_truth_flat.pkl')
# modified record-unique IDs
path_ground_truth_dict_new_uuids = os.path.join(data_dir, 'ground_truth_new_uuids.pkl')
path_ground_truth_list_new_uuids = os.path.join(data_dir, 'ground_truth_flat_new_uuids.pkl')
# landcover data
land_cover_dir = 'ArealStatistik\\arealstatistik_nolc_2004\\rawdata\\stand_20130918_endstand'
land_cover_csv = 'AREA_NOLC04_27_130918.csv'

# candidate selection: matches per train and test record
M_train = 30
M_test = 30

### One-time data preparation
- Prepare the landcover data
- Read in the two gazetteers
- Read in ground truth data
- Do initial training set
- Prepare the test data since it's always the same

In [3]:
# land cover data
df_areal = pd.read_csv(os.path.join(data_dir, land_cover_dir, land_cover_csv), sep=';', encoding='utf-8', low_memory=False)
print(df_areal.shape)
# prepare for Nearest Neighbour searches (this takes a while!)
neighbour_obj = mu.prepare_neighbours(df_areal)

(4128498, 13)


In [4]:
# load SwissNames3D augmented with lat, lons
swissnames = pd.read_csv(path_swissnames_csv, sep='\t', encoding='utf-8', low_memory=False)
print("We have %s records in SwissNames3D in %s columns." %(swissnames.shape[0], swissnames.shape[1]))

# load GeoNames with all WGS84 lat, lons projected to swiss coordinates
geonames = pd.read_csv(path_geonames_csv, sep='\t', encoding='utf-8', low_memory=False)
print("We have %s records in GeoNames in %s columns." %(geonames.shape[0], geonames.shape[1]))

# ground truth (i.e. positive matches for each geonames record)
ground_truth_dict = mu.load_data(path_ground_truth_dict_new_uuids)
# sanity check - this should be 400
print("Our ground truth dict has %s GeoNames records." %len(ground_truth_dict))

# geonames IDs in our fixed, balanced test set
test_gn_ids = mu.load_data(path_test_set_ids)
# sanity check - this should be 80
print("Our fixed test set has %s GeoNames records." %len(test_gn_ids))

We have 313562 records in SwissNames3D in 21 columns.
We have 67796 records in GeoNames in 21 columns.
Our ground truth dict has 400 GeoNames records.
Our fixed test set has 80 GeoNames records.


In [5]:
# must take the test set ids out of consideration
potential_training_ids = set(ground_truth_dict.keys()) - test_gn_ids
print("We have %s GeoNames records to potentially use in our training data." %len(potential_training_ids))

We have 320 GeoNames records to potentially use in our training data.


### Test data preparation

In [6]:
### TEST DATA PREPARATION : ONCE outside the loop ###
df_pos_test = mu.prepare_positives_from_ids(test_gn_ids, ground_truth_dict, verbose=True)
# trim geonames
geonames_test = geonames[geonames['geonameid'].isin(test_gn_ids)]
df_geonames_test = geonames_test.set_index('geonameid')
# candidate selection
candidate_selecter = ml.CandidateSelecter(gazetteer_source=df_geonames_test, gazetteer_target=swissnames)
matches_test = candidate_selecter.get_candidates_geoleven(id_list=test_gn_ids, matches_per_record=M_test)
# turn matches dict into list of pos and neg matches
pos_test, neg_test = mu.split_pos_neg(matches_test, ground_truth_dict)
pos_test_uniques = list(set(pos_test))
print("We had %s positive and %s negative pairs retained in candidate selection." %(len(pos_test_uniques), len(neg_test)))
print("We kept %s out of a possible %s positive pairs, i.e. %.3f (upper bound on recall)." 
  %(len(pos_test_uniques), df_pos_test.shape[0], len(pos_test_uniques)/df_pos_test.shape[0]))
# recall upper bound proper, with original uuids
recall_upper_bound = mu.calculate_recall_upper_bound(pos_test_uniques,df_pos_test)
# prepare negatives as dataframe
df_neg_test_all = pd.DataFrame.from_records(neg_test, columns=['geonamesid', 'swissnamesid', 'match'])
df_neg_test = df_neg_test_all.drop_duplicates()
# prepare the test matches using all (pos+neg) matches obtained from candidate selection
df_test_pos = pd.DataFrame.from_records(pos_test_uniques, columns=['geonamesid', 'swissnamesid', 'match'])
df_test = df_test_pos.append(df_neg_test)
vc = df_test['match'].value_counts()
print("Test set: the ratio of neg to pos is %.3f" %(vc[mu.NO_MATCH_STRING]/vc[mu.MATCH_STRING]))

### MATCHING FEATURES FOR TEST DATA : also ONCE outside the loop ###
swissnames_test = swissnames[swissnames['UUID'].isin(df_test['swissnamesid'].tolist())]
df_swissnames_test = swissnames_test.set_index(['UUID'])
fc = ml.FeatureComputer(matches=df_test, gazetteer_source=df_geonames_test, 
                         gazetteer_target=df_swissnames_test, landcover_data=df_areal, neigh_obj=neighbour_obj)
fc.compute_features_all(verbose=True)
df_test_all = fc.df.copy()
df_test_all['match_bin'] = pd.factorize(df_test_all['match'])[0]

We have 106 positive matches and 1 records with no match
Processing record 0...
Processing record 50...
We had 92 positive and 2302 negative pairs retained in candidate selection.
We kept 92 out of a possible 106 positive pairs, i.e. 0.868 (upper bound on recall).
Test set: the ratio of neg to pos is 25.022
Calculating point-to-point distance between matches...
Calculating Levenshtein distance between names...
Calculating Levenshtein distance between de-commaed names...
Calculating min Levenshtein distance on alternate names...
Getting the min Levenshtein distance overall...
Calculating the normalized Levenshtein-Damerau distance...
Calculating Jaro similarity...
Calculating Jaro-Winkler similarity...
Getting the absolute elevation distance...
Getting the feature types from both gazetteers...
Getting the dummy variables for feature types...
Getting the landcover classes for source and target...
Getting the dummy variables for landcover classes...
Getting the 'mode' landcover classes fo

### Evaluation preliminaries

In [7]:
# prepare and filter ground truth for new UUIDs...
ground_truth_pairs_new_uuids = mu.load_data(path_ground_truth_list_new_uuids)
ground_truth_new_uuids_filtered = mu.filter_ground_truth_for_test(ground_truth_pairs_new_uuids, test_gn_ids)
# ...and original UUIDs for our results
ground_truth_pairs = mu.load_data(path_ground_truth_list)
ground_truth_filtered = mu.filter_ground_truth_for_test(ground_truth_pairs, test_gn_ids)

### Training data preparation

In [8]:
### PREPARE ALL POTENTIAL TRAINING DATA ###
df_pos_train = mu.prepare_positives_from_ids(potential_training_ids, ground_truth_dict, verbose=True)
# trim geonames to just stuff in training data
geonames_train = geonames[geonames['geonameid'].isin(potential_training_ids)]
df_geonames_train = geonames_train.set_index('geonameid')
candidate_selecter = ml.CandidateSelecter(gazetteer_source=df_geonames_train, gazetteer_target=swissnames)
matches_train = candidate_selecter.get_candidates_geoleven(id_list=potential_training_ids, matches_per_record=M_train)
# turn matches dict into list of pos (if any) and neg matches
pos_train, neg_train = mu.split_pos_neg(matches_train, ground_truth_dict)
pos_train_uniques = list(set(pos_train))
print("We had %s positive and %s negative pairs retained in candidate selection." %(len(pos_train_uniques), len(neg_train)))
print("We kept %s out of a possible %s positive pairs, i.e. %.3f" 
  %(len(pos_train_uniques), df_pos_train.shape[0], len(pos_train_uniques)/df_pos_train.shape[0]))
# prepare negatives from the ones mined
df_neg_train_all = pd.DataFrame.from_records(neg_train, columns=['geonamesid', 'swissnamesid', 'match'])
df_neg_train = df_neg_train_all.drop_duplicates()
# take all positives from ground truth (not just ones mined) and combine with negs
df_train = df_pos_train.append(df_neg_train)
# actual ratio of neg to pos in final training set
vc = df_train['match'].value_counts()
print("The actual ratio of neg to pos is %.3f" %(vc[mu.NO_MATCH_STRING]/vc[mu.MATCH_STRING]))

### MATCHING FEATURES FOR TRAINING DATA ###
# trim swissnames to just records that appear in training data
swissnames_train = swissnames[swissnames['UUID'].isin(df_train['swissnamesid'].tolist())]
df_swissnames_train = swissnames_train.set_index(['UUID'])
# compute matching features
fc = ml.FeatureComputer(matches=df_train, gazetteer_source=df_geonames_train, 
                        gazetteer_target=df_swissnames_train, landcover_data=df_areal, neigh_obj=neighbour_obj)
fc.compute_features_all(verbose=True)
df_train_full = fc.df.copy()
# make another column for integers instead of 'match/no_match'
df_train_full['match_bin'] = pd.factorize(df_train_full['match'])[0]

### MASSAGE DFs COLUMNS ACROSS TRAIN AND TEST ###
for colname in df_test_all.columns:
    if colname not in df_train_full.columns:
        df_train_full[colname] = 0
for colname in df_train_full.columns:
    if colname not in df_test_all.columns:
        df_test_all[colname] = 0

We have 394 positive matches and 13 records with no match
Processing record 0...
Processing record 50...
Processing record 100...
Processing record 150...
Processing record 200...
Processing record 250...
Processing record 300...
We had 351 positive and 9224 negative pairs retained in candidate selection.
We kept 351 out of a possible 394 positive pairs, i.e. 0.891
The actual ratio of neg to pos is 23.406
Calculating point-to-point distance between matches...
Calculating Levenshtein distance between names...
Calculating Levenshtein distance between de-commaed names...
Calculating min Levenshtein distance on alternate names...
Getting the min Levenshtein distance overall...
Calculating the normalized Levenshtein-Damerau distance...
Calculating Jaro similarity...
Calculating Jaro-Winkler similarity...
Getting the absolute elevation distance...
Getting the feature types from both gazetteers...
Getting the dummy variables for feature types...
Getting the landcover classes for source and ta

## Main loop

In [9]:
results = []
training_size_increment = 40
current_training_ids = set()

for i in range(8):
    print("\n### training set size %s ###" %(i+1))
    potential_extra_ids = potential_training_ids - current_training_ids
    print("We have %s potential ids from which we will sample an extra 40" %len(potential_extra_ids))
    extra_training_ids = set(random.sample(potential_extra_ids, training_size_increment))
    train_gn_ids = current_training_ids.union(extra_training_ids)
    print("We have %s training ids for this run" %len(train_gn_ids))
    
    ### SUBSET OUR TRAINING DATA ###
    df_train_all = df_train_full[df_train_full['geonamesid'].isin(train_gn_ids)].copy()
    print("We have %s pairs for training for this loop." %df_train_all.shape[0])

    ### RANDOM FOREST ###
    # 1) basic
    desc = "basic"
    feature_columns = ['distance', 'leven_min']
    X = df_train_all[feature_columns]
    y = df_train_all['match_bin']
    clf = RandomForestClassifier(n_estimators=200)
    clf = clf.fit(X, y)
    predictions = clf.predict(df_test_all[feature_columns])
    # results
    print("\n# %s, results:" %desc)
    pairs = mu.get_positives_as_pairs(df_test_all, predictions)
    pairs_old = [(item[0], mu.new_uuid_to_old(item[1])) for item in pairs]
    p,r,f1 = mu.evaluate_results_deep(pairs_old, ground_truth_filtered, verbose=True)
    results.append((p,r,f1,recall_upper_bound))

    # 2) basic-type
    desc = "basic-type"
    cols_to_exclude = ['geonamesid', 'swissnamesid', 'match', 'match_bin', 'leven', 'leven_comma', 'leven_alt', 
                       'leven_dam_norm', 'jaro', 'jaro_winkler', 'elev_dist', 
                       'gn_10', 'gn_20', 'gn_30', 'gn_40', 'gn_50', 'gn_60', 
                       'sn_10', 'sn_20', 'sn_30', 'sn_40', 'sn_50', 'sn_60', 
                       'gn_mode_10', 'gn_mode_20', 'gn_mode_30', 'gn_mode_40', 'gn_mode_50', 'gn_mode_60', 
                       'sn_mode_10', 'sn_mode_20', 'sn_mode_30', 'sn_mode_40', 'sn_mode_50', 'sn_mode_60', 
                       'lc_distance']
    feature_columns = [colname for colname in list(df_train_all.columns) if colname not in cols_to_exclude]
    X = df_train_all[feature_columns]
    y = df_train_all['match_bin']
    clf = RandomForestClassifier(n_estimators=200)
    clf = clf.fit(X, y)
    predictions = clf.predict(df_test_all[feature_columns])
    # results
    print("\n# %s, results:" %desc)
    pairs = mu.get_positives_as_pairs(df_test_all, predictions)
    pairs_old = [(item[0], mu.new_uuid_to_old(item[1])) for item in pairs]
    p,r,f1 = mu.evaluate_results_deep(pairs_old, ground_truth_filtered, verbose=True)
    results.append((p,r,f1,recall_upper_bound))

    # 3) str-type
    desc = "str-type"
    cols_to_exclude = ['geonamesid', 'swissnamesid', 'match', 'match_bin', 
                       'gn_10', 'gn_20', 'gn_30', 'gn_40', 'gn_50', 'gn_60', 
                       'sn_10', 'sn_20', 'sn_30', 'sn_40', 'sn_50', 'sn_60', 
                       'gn_mode_10', 'gn_mode_20', 'gn_mode_30', 'gn_mode_40', 'gn_mode_50', 'gn_mode_60', 
                       'sn_mode_10', 'sn_mode_20', 'sn_mode_30', 'sn_mode_40', 'sn_mode_50', 'sn_mode_60', 
                       'elev_dist', 'lc_distance']
    feature_columns = [colname for colname in list(df_train_all.columns) if colname not in cols_to_exclude]
    X = df_train_all[feature_columns]
    y = df_train_all['match_bin']
    clf = RandomForestClassifier(n_estimators=200)
    clf = clf.fit(X, y)
    predictions = clf.predict(df_test_all[feature_columns])
    # results
    print("\n# %s, results:" %desc)
    pairs = mu.get_positives_as_pairs(df_test_all, predictions)
    pairs_old = [(item[0], mu.new_uuid_to_old(item[1])) for item in pairs]
    p,r,f1 = mu.evaluate_results_deep(pairs_old, ground_truth_filtered, verbose=True)
    results.append((p,r,f1,recall_upper_bound))

    # 4) str-type-lcd
    desc = "str-type-lcd"
    cols_to_exclude = ['geonamesid', 'swissnamesid', 'match', 'match_bin', 
                       'gn_10', 'gn_20', 'gn_30', 'gn_40', 'gn_50', 'gn_60', 
                       'sn_10', 'sn_20', 'sn_30', 'sn_40', 'sn_50', 'sn_60', 
                       'gn_mode_10', 'gn_mode_20', 'gn_mode_30', 'gn_mode_40', 'gn_mode_50', 'gn_mode_60', 
                       'sn_mode_10', 'sn_mode_20', 'sn_mode_30', 'sn_mode_40', 'sn_mode_50', 'sn_mode_60',
                       'elev_dist']
    feature_columns = [colname for colname in list(df_train_all.columns) if colname not in cols_to_exclude]
    X = df_train_all[feature_columns]
    y = df_train_all['match_bin']
    clf = RandomForestClassifier(n_estimators=200)
    clf = clf.fit(X, y)
    predictions = clf.predict(df_test_all[feature_columns])
    # results
    print("\n# %s, results:" %desc)
    pairs = mu.get_positives_as_pairs(df_test_all, predictions)
    pairs_old = [(item[0], mu.new_uuid_to_old(item[1])) for item in pairs]
    p,r,f1 = mu.evaluate_results_deep(pairs_old, ground_truth_filtered, verbose=True)
    results.append((p,r,f1,recall_upper_bound))
    
    # finally prepare for next loop!
    current_training_ids = train_gn_ids.copy()


### training set size 1 ###
We have 320 potential ids from which we will sample an extra 40
We have 40 training ids for this run
We have 1204 pairs for training for this loop.

# basic, results:
  0.796 precision
  0.863 recall
  0.828 f1

# basic-type, results:
  0.844 precision
  0.853 recall
  0.848 f1

# str-type, results:
  0.808 precision
  0.842 recall
  0.825 f1

# str-type-lcd, results:
  0.810 precision
  0.853 recall
  0.831 f1

### training set size 2 ###
We have 280 potential ids from which we will sample an extra 40
We have 80 training ids for this run
We have 2403 pairs for training for this loop.

# basic, results:
  0.796 precision
  0.821 recall
  0.808 f1

# basic-type, results:
  0.890 precision
  0.853 recall
  0.871 f1

# str-type, results:
  0.832 precision
  0.884 recall
  0.857 f1

# str-type-lcd, results:
  0.856 precision
  0.874 recall
  0.865 f1

### training set size 3 ###
We have 240 potential ids from which we will sample an extra 40
We have 120 trainin

### Results

In [10]:
print("precision\trecall\t\tf1 \t\tmax_recall")
print("- - - - - - - - - - - - - - - - - - - - - - - - - - ")
for p,r,f1,rec_ub in results:
    print("%.4f   \t%.4f   \t%.4f   \t%.4f" %(p,r,f1,rec_ub))

precision	recall		f1 		max_recall
- - - - - - - - - - - - - - - - - - - - - - - - - - 
0.7961   	0.8632   	0.8283   	0.9263
0.8438   	0.8526   	0.8482   	0.9263
0.8081   	0.8421   	0.8247   	0.9263
0.8100   	0.8526   	0.8308   	0.9263
0.7959   	0.8211   	0.8083   	0.9263
0.8901   	0.8526   	0.8710   	0.9263
0.8317   	0.8842   	0.8571   	0.9263
0.8557   	0.8737   	0.8646   	0.9263
0.7980   	0.8316   	0.8144   	0.9263
0.9205   	0.8526   	0.8852   	0.9263
0.8557   	0.8737   	0.8646   	0.9263
0.8542   	0.8632   	0.8586   	0.9263
0.7843   	0.8421   	0.8122   	0.9263
0.9101   	0.8526   	0.8804   	0.9263
0.8557   	0.8737   	0.8646   	0.9263
0.8557   	0.8737   	0.8646   	0.9263
0.7547   	0.8421   	0.7960   	0.9263
0.9231   	0.8842   	0.9032   	0.9263
0.8571   	0.8842   	0.8705   	0.9263
0.9130   	0.8842   	0.8984   	0.9263
0.8000   	0.8421   	0.8205   	0.9263
0.8936   	0.8842   	0.8889   	0.9263
0.8737   	0.8737   	0.8737   	0.9263
0.8817   	0.8632   	0.8723   	0.9263
0.7857   	0.8105   	0.797