# Rule-based gazetteer matching: multi-threshold

In [1]:
import pandas as pd
import numpy as np
import os
    
# own file
from gazmatch import matchutils as mu

In [2]:
# root directory below which every input file read by this notebook is located
data_dir = 'data'

### Load/prepare data

In [3]:
# Ground truth: the reference matching pairs, stored as a pickle file
ground_truth_path = os.path.join(data_dir, 'ground_truth_flat.pkl')
ground_truth_flat = mu.load_data(ground_truth_path)
print("%s matching pairs" % len(ground_truth_flat))

468 matching pairs


In [4]:
# SwissNames3D gazetteer, augmented with lat/lon values (tab-separated CSV)
path_swissnames_csv = os.path.join(data_dir, 'swissnames_new_uuids.csv')
swissnames = pd.read_csv(
    path_swissnames_csv,
    sep='\t',
    encoding='utf-8',
    low_memory=False,
)
# last expression of the cell, so the notebook displays (rows, columns)
swissnames.shape

(313562, 21)

In [5]:
# GeoNames gazetteer whose WGS84 lat/lon values were projected to Swiss
# coordinates (tab-separated CSV)
path_geonames_csv = os.path.join(data_dir, 'geonames_ch_swisscoords.csv')
geonames = pd.read_csv(
    path_geonames_csv,
    sep='\t',
    encoding='utf-8',
    low_memory=False,
)
# .shape is a (rows, columns) tuple and fills both %s placeholders directly
print("We have %s features in GeoNames in %s columns." % geonames.shape)

We have 67796 features in GeoNames in 21 columns.


In [6]:
# prepare data for land cover
# GeoNames indexed by 'geonameid', so that landcover_distance() can look up a
# record's 'gn_E' / 'gn_N' coordinates with .loc[gn_id, ...]
geonames_lcd = geonames.set_index('geonameid')
# land cover data (semicolon-separated CSV)
# The sub-directory is assembled with os.path.join instead of hard-coded '\\'
# separators, so the path also resolves on non-Windows systems.
land_cover_dir = os.path.join(
    'ArealStatistik',
    'arealstatistik_nolc_2004',
    'rawdata',
    'stand_20130918_endstand',
)
land_cover_csv = 'AREA_NOLC04_27_130918.csv'
df_areal = pd.read_csv(os.path.join(data_dir, land_cover_dir, land_cover_csv), sep=';', encoding='utf-8', low_memory=False)
print(df_areal.shape)
# prepare for Nearest Neighbour searches (this takes a while!)
neighbour_obj = mu.prepare_neighbours(df_areal)

(4128498, 13)


In [7]:
# Source data: the GeoNames records that are to be matched (tab-separated CSV)
path_to_source_data = os.path.join(data_dir, 'source_data_augmented.csv')
df_source = pd.read_csv(
    path_to_source_data,
    sep='\t',
    encoding='utf-8',
)

## Rule-based matching

In [8]:
def _landcover_vector(easting, northing, df_landcover, neighbours,
                      n_neighbors, lc_column, lc_classes):
    """Count the land cover classes among the cells nearest to one point.

    Returns a NumPy array holding one count per entry of ``lc_classes``;
    classes that do not occur among the neighbours count as 0.
    """
    # kneighbors answers a batch of queries; we send one point and take its row
    nearest = neighbours.kneighbors(
        X=[[easting, northing]], n_neighbors=n_neighbors, return_distance=False)[0]
    # the returned positions are used as row positions into df_landcover
    class_counts = df_landcover.iloc[nearest, :][lc_column].value_counts().to_dict()
    return np.array([class_counts.get(lc_class, 0) for lc_class in lc_classes])


def landcover_distance(row, gn_id, df_source, df_landcover, neighbours,
                       n_neighbors=9, lc_column='LC09_6',
                       lc_classes=tuple(range(10, 61, 10))):
    """Return the land cover dissimilarity of a GeoNames and a SwissNames feature.

    For each of the two locations the ``n_neighbors`` nearest land cover cells
    are looked up and the occurrences of every class in ``lc_classes`` are
    counted. The result is the sum of the absolute differences between the two
    count vectors (Manhattan distance): 0 means identical class counts.

    Parameters
    ----------
    row : mapping / pandas.Series
        SwissNames record; its coordinates are read from ``row['E']`` and
        ``row['N']``.
    gn_id :
        Index label of the GeoNames record in ``df_source``.
    df_source : pandas.DataFrame
        GeoNames records indexed by id, with coordinate columns ``'gn_E'``
        and ``'gn_N'``.
    df_landcover : pandas.DataFrame
        Land cover cells; must contain the column ``lc_column``.
    neighbours :
        Nearest-neighbour searcher exposing
        ``kneighbors(X=..., n_neighbors=..., return_distance=False)``.
        NOTE(review): presumably fitted on the coordinates of ``df_landcover``
        in row order, since its results are used as row positions -- confirm
        against ``mu.prepare_neighbours``.
    n_neighbors : int, optional
        Number of land cover cells sampled around each location (default 9).
    lc_column : str, optional
        Land cover class column of ``df_landcover`` (default ``'LC09_6'``).
    lc_classes : sequence, optional
        Class codes that make up the count vector (default 10, 20, ..., 60).
        Codes outside this sequence are ignored.

    Returns
    -------
    numpy integer
        Sum of absolute count differences.
    """
    gn_vector = _landcover_vector(
        df_source.loc[gn_id, 'gn_E'], df_source.loc[gn_id, 'gn_N'],
        df_landcover, neighbours, n_neighbors, lc_column, lc_classes)
    sn_vector = _landcover_vector(
        row['E'], row['N'],
        df_landcover, neighbours, n_neighbors, lc_column, lc_classes)
    return np.absolute(gn_vector - sn_vector).sum()

def elevation_distance(row, gn_z):
    """Return the absolute elevation difference between two features.

    row  -- a row from swissnames; its elevation is read from row['Z']
    gn_z -- the geonames elevation to compare against
    """
    return abs(gn_z - row['Z'])

In [9]:
# settings and prep
# Maximum accepted geographical distance per GeoNames feature code. Point
# features get a tighter threshold than line/polygon features.
# NOTE(review): the unit is presumably km -- confirm against
# mu.calculate_distance_local
distance_threshold_dict = {
    'GLCR':5.0,  # matched types are points in SwissNames
    'HLL':5.0,  # matched types are points in SwissNames
    'LK':15.0,  # matched type is polygon in SwissNames
    'MT':5.0,  # matched types are points in SwissNames
    'PASS':5.0,  # matched types are points in SwissNames
    'PK':5.0,  # matched types are points in SwissNames
    'STM':15.0,  # matched type is line in SwissNames
    'VAL':15.0}  # matched type is polygon in SwissNames

# Both groupings use the bare column label rather than a one-element list:
# from pandas 2.0 on, grouping by a length-1 list yields 1-tuple group keys
# such as ('PK',), which would no longer match the string keys of
# distance_threshold_dict.

# swissnames name groups
sn_name_groups = swissnames.groupby('NAME')

# geonames type groups
type_groups = df_source.groupby('feature code')

In [10]:
def _find_matches(gn_record, ftype):
    """Return the SwissNames 'UUID_old' values matched to one GeoNames record.

    Candidates are the exact name matches of the record; they must then pass,
    in this order, a coarse land cover filter, a coarse elevation filter and
    the distance threshold of feature code ``ftype``. An empty list means the
    record has no match.
    """
    gn_id = gn_record['geonameid']
    gn_point = (gn_record['latitude'], gn_record['longitude'])

    ### NAME : EXACT MATCHES ###
    # work on a copy so that adding columns never touches the frame returned
    # by the helper
    candidates = mu.find_all_exact_name_matches(
        gn_record['name'], str(gn_record['alternatenames']),
        sn_name_groups, swissnames).copy()
    if candidates.empty:
        return []

    ### GEOGRAPHICAL DISTANCE ###
    # computed up front, applied last (type-specific threshold below)
    candidates['distances'] = candidates.apply(
        lambda r: mu.calculate_distance_local(r, gn_point), axis=1)

    ### LANDCOVER DISTANCE ###
    candidates['lcd'] = candidates.apply(
        lambda r: landcover_distance(r, gn_id, geonames_lcd, df_areal, neighbour_obj), axis=1)
    # very coarse filter
    candidates = candidates[candidates['lcd'] < 8.0].copy()
    if candidates.empty:
        return []

    ### ELEVATION DISTANCE ###
    candidates['elev_dist'] = candidates.apply(
        lambda r: elevation_distance(r, gn_record['dem']), axis=1)
    # very coarse filter
    candidates = candidates[candidates['elev_dist'] < 400]
    if candidates.empty:
        return []

    ### TYPE-SPECIFIC DISTANCE FILTER ###
    # take matches within threshold distance as correct
    candidates = candidates[candidates['distances'] < distance_threshold_dict[ftype]]
    return candidates['UUID_old'].tolist()


# (geonameid, SwissNames id) pairs, respectively (geonameid, NO_MATCH_STRING)
# entries for records without any match, collected over all feature types
matches_overall = []
neg_overall = []
# feature code -> result of mu.evaluate_results_list for that type
results_type = {}
for ftype, group in type_groups:
    matches_type = []
    neg_type = []
    gn_ids_type = set()
    for _, gn_record in group.iterrows():
        gn_id = gn_record['geonameid']
        gn_ids_type.add(gn_id)
        sn_ids = _find_matches(gn_record, ftype)
        if sn_ids:
            matches_type.extend((gn_id, sn_id) for sn_id in sn_ids)
        else:
            # no candidate survived all filters
            neg_type.append((gn_id, mu.NO_MATCH_STRING))
    matches_overall.extend(matches_type)
    neg_overall.extend(neg_type)

    # filter ground truth to ids of current feature type only
    ground_truth_type = [item for item in ground_truth_flat if item[0] in gn_ids_type]
    results_type[ftype] = mu.evaluate_results_list(matches_type, neg_type, ground_truth_type, verbose=False)

print("prec\trecall\tf1")
p, r, f1 = mu.evaluate_results_list(matches_overall, neg_overall, ground_truth_flat)

prec	recall	f1
0.914	0.677	0.778


### Results by type

In [11]:
# one tab-separated line per feature type: precision, recall and F1 are the
# first three entries of the stored evaluation result
row_template = "%s\t%.3f\t%.3f\t%.3f"
print("type\tprec\trecall\tf1")
for type_code, scores in results_type.items():
    print(row_template % (type_code, scores[0], scores[1], scores[2]))

type	prec	recall	f1
GLCR	1.000	0.851	0.919
HLL	0.879	0.962	0.919
LK	0.961	0.925	0.942
MT	0.837	0.800	0.818
PASS	0.959	0.904	0.931
PK	0.917	0.917	0.917
STM	0.529	0.106	0.176
VAL	1.000	0.156	0.269
