# Rule-based gazetteer matching: name-custom-threshold

In [1]:
import pandas as pd
import os
    
# own file
from gazmatch import matchutils as mu

In [2]:
# directory (relative to the notebook) that holds the input files loaded below
data_dir = "data"

### Load data

In [3]:
# Ground truth: the known-correct matching pairs, stored as a pickled file.
# NOTE(review): presumably mu.load_data unpickles the file - only point it at
# trusted data.
path_ground_truth = os.path.join(data_dir, 'ground_truth_flat.pkl')
ground_truth_flat = mu.load_data(path_ground_truth)
print("%s matching pairs" % (len(ground_truth_flat),))

468 matching pairs


In [4]:
# SwissNames3D records, augmented with lat/lon columns (tab-separated, UTF-8).
path_swissnames_csv = os.path.join(data_dir, 'swissnames_new_uuids.csv')
swissnames = pd.read_csv(
    path_swissnames_csv,
    sep='\t',
    encoding='utf-8',
    low_memory=False,
)
# last expression of the cell: displays (rows, columns) as the cell output
swissnames.shape

(313562, 21)

In [5]:
# Geonames records to be matched (tab-separated, UTF-8).
path_to_source_data = os.path.join(data_dir, 'source_data_augmented.csv')
df_source = pd.read_csv(
    path_to_source_data,
    sep='\t',
    encoding='utf-8',
)

## Rule-based matching

In [6]:
# settings and prep
# Per-type distance threshold, keyed by Geonames feature code: a SwissNames
# candidate counts as a match only if its distance to the Geonames record is
# strictly below this value.
# NOTE(review): the unit is whatever mu.calculate_distance_local returns -
# presumably kilometres; confirm against matchutils.
distance_threshold_dict = {
    'GLCR': 5.0,   # matched types are points in SwissNames
    'HLL': 5.0,    # matched types are points in SwissNames
    'LK': 15.0,    # matched type is polygon in SwissNames
    'MT': 5.0,     # matched types are points in SwissNames
    'PASS': 5.0,   # matched types are points in SwissNames
    'PK': 5.0,     # matched types are points in SwissNames
    'STM': 15.0,   # matched type is line in SwissNames
    'VAL': 15.0,   # matched type is polygon in SwissNames
}

# swissnames name groups
# Group by the scalar column label, not a one-element list: with a list key,
# pandas >= 2.0 yields 1-tuples such as ('GLCR',) when iterating the groupby,
# and newer versions also expect tuple keys in get_group(). Scalar grouping
# keeps plain string keys on every pandas version.
# NOTE(review): assumes mu.find_all_exact_name_matches looks groups up by the
# plain name string - confirm against matchutils.
sn_name_groups = swissnames.groupby('NAME')

# geonames type groups
# Iterating this must yield plain feature codes, because they are used as keys
# into distance_threshold_dict.
type_groups = df_source.groupby('feature code')

In [7]:
# Accumulators over all feature types:
#   matches_overall - (geonames id, SwissNames id) pairs accepted as matches
#   neg_overall     - (geonames id, no-match marker) pairs for unmatched records
#   results_type    - per feature type, the result of mu.evaluate_results_list
matches_overall = []
neg_overall = []
results_type = {}
for ftype, group in type_groups:
    matches_type = []
    neg_type = []
    gn_ids_type_set = set()
    for _, gn_record in group.iterrows():
        name = gn_record['name']
        alternatenames = str(gn_record['alternatenames'])
        gn_id = gn_record['geonameid']
        gn_ids_type_set.add(gn_id)

        # 1. get any exact name matches as candidates, including altnames and
        #    the comma-flip trick; work on a copy so that adding the distance
        #    column never touches the frame returned by the helper
        candidates = mu.find_all_exact_name_matches(
            name, alternatenames, sn_name_groups, swissnames
        ).copy()

        sn_ids_in_range = []
        if not candidates.empty:
            # 2. calculate distance to all exact matches
            gn_point = (gn_record['latitude'], gn_record['longitude'])
            candidates['distances'] = candidates.apply(
                lambda row: mu.calculate_distance_local(row, gn_point), axis=1
            )
            # 3. take matches within threshold distance as correct
            within = candidates['distances'] < distance_threshold_dict[ftype]
            sn_ids_in_range = candidates[within]['UUID_old'].tolist()

        if sn_ids_in_range:
            pairs = [(gn_id, sn_id) for sn_id in sn_ids_in_range]
            matches_type.extend(pairs)
            matches_overall.extend(pairs)
        else:
            # either no exact name match at all, or none below the threshold
            no_match = (gn_id, mu.NO_MATCH_STRING)
            neg_type.append(no_match)
            neg_overall.append(no_match)

    # filter ground truth to ids of current feature type only
    ground_truth_type = [
        pair for pair in ground_truth_flat if pair[0] in gn_ids_type_set
    ]
    results_type[ftype] = mu.evaluate_results_list(
        matches_type, neg_type, ground_truth_type, verbose=False
    )

# overall scores across all feature types
print("prec\trecall\tf1")
p, r, f1 = mu.evaluate_results_list(matches_overall, neg_overall, ground_truth_flat)

prec	recall	f1
0.843	0.861	0.852


### Results by type

In [8]:
# One tab-separated row per feature type: precision, recall and F1, each
# formatted to three decimals.
print("type\tprec\trecall\tf1")
for type_code, scores in results_type.items():
    row = "%s\t%.3f\t%.3f\t%.3f" % (type_code, scores[0], scores[1], scores[2])
    print(row)

type	prec	recall	f1
GLCR	1.000	0.966	0.982
HLL	0.881	0.981	0.929
LK	0.773	0.962	0.857
MT	0.755	0.889	0.816
PASS	0.962	0.962	0.962
PK	0.918	0.938	0.928
STM	0.672	0.529	0.592
VAL	0.750	0.800	0.774
