# Rule-based gazetteer matching: random-baseline

In [1]:
import pandas as pd
import os
    
# own file
from gazmatch import matchutils as mu

In [2]:
# relative directory from which the input files below are loaded
data_dir = 'data'

### Load data

In [3]:
# ground-truth matching pairs, stored as a pickle and read via the project helper
path_ground_truth = os.path.join(data_dir, 'ground_truth_flat.pkl')
ground_truth_flat = mu.load_data(path_ground_truth)
print("%s matching pairs" % len(ground_truth_flat))

468 matching pairs


In [4]:
# SwissNames3D records (tab-separated, UTF-8), augmented with lat/lon
path_swissnames_csv = os.path.join(data_dir, 'swissnames_new_uuids.csv')
swissnames = pd.read_csv(
    path_swissnames_csv,
    sep='\t',
    encoding='utf-8',
    low_memory=False,
)
swissnames.shape

(313562, 21)

In [5]:
# Geonames source records (tab-separated, UTF-8)
path_to_source_data = os.path.join(data_dir, 'source_data_augmented.csv')
df_source = pd.read_csv(
    path_to_source_data,
    sep='\t',
    encoding='utf-8',
)

## Rule-based matching

In [6]:
def get_exact_matches(token, grouped_df, df_cols):
    """Return the rows of the group keyed by ``token``.

    Args:
        token: group key to look up.
        grouped_df: pandas groupby object to query.
        df_cols: column labels for the empty frame returned on a miss.

    Returns:
        The DataFrame of the matching group, or an empty DataFrame with
        columns ``df_cols`` when ``token`` is not a group key.
    """
    try:
        hits = grouped_df.get_group(token)
    except KeyError:
        # unknown key: hand back an empty frame so callers can test ``.empty``
        hits = pd.DataFrame(columns=df_cols)
    return hits

def single_run(source_type_groups, target_name_groups, target_cols, print_type_results=False, print_overall_results=False, random_state=None):
    """Run one pass of the random exact-name baseline and evaluate it.

    For every source record, all target records whose name is exactly equal
    are looked up; one of them is drawn at random and reported as the match.
    Source records without any exact-name candidate are reported as
    non-matches (paired with ``mu.NO_MATCH_STRING``).

    Results are scored with ``mu.evaluate_results_list`` against the
    module-level ``ground_truth_flat``, once per feature type and once
    overall.

    Args:
        source_type_groups: iterable of ``(feature_type, DataFrame)`` pairs,
            e.g. a pandas groupby; each frame must have the columns
            ``name`` and ``geonameid``.
        target_name_groups: pandas groupby keyed by name; its groups must
            have a ``UUID_old`` column.
        target_cols: column labels used for the empty candidate frame when
            a name has no exact match.
        print_type_results: if true, print a header line and one
            tab-separated line of the three scores per feature type.
        print_overall_results: if true, print one tab-separated line of the
            three overall scores.
        random_state: optional seed or random generator for reproducible
            sampling. An ``int`` seeds one generator that is shared by all
            draws of this run; any other value is passed unchanged to
            ``DataFrame.sample``. ``None`` (default) leaves sampling
            unseeded.

    Returns:
        Tuple ``(matches_overall, neg_overall)``: the list of
        ``(geonameid, UUID_old)`` matches and the list of
        ``(geonameid, mu.NO_MATCH_STRING)`` non-matches over all types.
    """
    rng = random_state
    if isinstance(random_state, int):
        # Seed once: passing the same int to every sample() call would
        # restart the generator at each draw instead of advancing it.
        import numpy as np
        rng = np.random.RandomState(random_state)

    matches_overall = []
    neg_overall = []
    if print_type_results:
        print("\tprec\trecall\tf1")
    for ftype, group in source_type_groups:
        matches_type = []
        neg_type = []
        gn_ids_type = set()
        # only two columns are needed, so avoid building a Series per row
        for name, gn_id in zip(group['name'], group['geonameid']):
            gn_ids_type.add(gn_id)
            # 1. get any exact name matches as candidates
            df_exact = get_exact_matches(name, target_name_groups, target_cols)
            if df_exact.empty:
                no_match = (gn_id, mu.NO_MATCH_STRING)
                neg_type.append(no_match)
                neg_overall.append(no_match)
                continue
            # 2. take a random exact match as the match (sample defaults to n=1)
            df_match = df_exact.sample(random_state=rng)
            match = (gn_id, df_match.iloc[0]['UUID_old'])
            matches_type.append(match)
            matches_overall.append(match)
        # filter ground truth to ids of current feature type only
        ground_truth_type = [item for item in ground_truth_flat if item[0] in gn_ids_type]
        p_type, r_type, f1_type = mu.evaluate_results_list(matches_type, neg_type, ground_truth_type, verbose=False)
        if print_type_results:
            print("%s\t%.3f\t%.3f\t%.3f" % (ftype, p_type, r_type, f1_type))

    p, r, f1 = mu.evaluate_results_list(matches_overall, neg_overall, ground_truth_flat, verbose=False)
    if print_overall_results:
        print("%.3f\t%.3f\t%.3f" % (p, r, f1))
    return matches_overall, neg_overall

### Run once

In [7]:
# SwissNames3D records grouped by name. Group by the scalar column name
# (not a one-element list) so group keys stay plain strings: newer pandas
# versions use 1-tuple keys for list-based grouping, which breaks the
# scalar get_group(name) lookup and the per-type labels printed below.
sn_name_groups = swissnames.groupby('NAME')

# Geonames records grouped by feature code
type_groups = df_source.groupby('feature code')

# show performance by type
test_single = single_run(type_groups, sn_name_groups, swissnames.columns, print_type_results=True, print_overall_results=False)

	prec	recall	f1
GLCR	0.939	0.529	0.676
HLL	0.720	0.679	0.699
LK	0.913	0.792	0.848
MT	0.625	0.556	0.588
PASS	0.895	0.654	0.756
PK	0.804	0.771	0.787
STM	0.629	0.259	0.367
VAL	0.794	0.600	0.684


### Run multiple times
Because the match is drawn at random from the exact-name candidates, the results vary slightly from run to run.

In [8]:
# repeat the baseline 10 times, printing only the overall scores of each run
print("prec\trecall\tf1")
for _ in range(10):
    single_run(
        type_groups,
        sn_name_groups,
        swissnames.columns,
        print_type_results=False,
        print_overall_results=True,
    )

prec	recall	f1
0.808	0.583	0.677
0.796	0.575	0.667
0.811	0.585	0.680
0.796	0.575	0.667
0.772	0.558	0.648
0.805	0.581	0.675
0.784	0.566	0.658
0.799	0.577	0.670
0.790	0.571	0.663
0.808	0.583	0.677
