# Rule-based gazetteer matching: linear combination of name edit distance and geographic distance

In [1]:
import pandas as pd
import os
from leven import levenshtein
    
# own file
from gazmatch import matchutils as mu

In [2]:
# folder (relative to the current working directory) holding the input files loaded below
data_dir = 'data'

### Load data

In [3]:
# Ground truth pairs, stored as a pickle file.
# NOTE(review): presumably mu.load_data unpickles the file -- only point it at trusted data.
ground_truth_path = os.path.join(data_dir, 'ground_truth_flat.pkl')
ground_truth_flat = mu.load_data(ground_truth_path)
print("full: %s pairs" % (len(ground_truth_flat),))

full: 468 pairs


In [4]:
# SwissNames3D gazetteer (augmented with lat/lon), read from a tab-separated UTF-8 file
path_swissnames_csv = os.path.join(data_dir, 'swissnames_new_uuids.csv')
swissnames = pd.read_csv(
    path_swissnames_csv,
    sep='\t',
    encoding='utf-8',
    low_memory=False,
)
# last expression of the cell: displays (rows, columns)
swissnames.shape

(313562, 21)

In [5]:
# Geonames records to be matched (the source side), read from a tab-separated UTF-8 file
path_to_source_data = os.path.join(data_dir, 'source_data_augmented.csv')
df_source = pd.read_csv(
    path_to_source_data,
    sep='\t',
    encoding='utf-8',
)

## Rule-based matching

In [6]:
# for use on a row from swissnames, and a geonames name ('geoname')
def get_edit_distance(row, geoname):
    """Return the edit distance between a Geonames name and a SwissNames3D record's name.

    row: record (e.g. a DataFrame row) whose 'NAME' entry is the SwissNames3D name.
    geoname: the Geonames name to compare against.
    """
    return levenshtein(geoname, row['NAME'])

def similarity_points(row):
    """Combine text and geographic distance with equal weights (0.5 / 0.5).

    row: record exposing 'leven_dist' (name edit distance) and 'distances'
    (geographic distance). The result is a weighted sum of two distances,
    so smaller values mean a closer match.
    """
    text_weight, geo_weight = 0.5, 0.5
    return text_weight * row['leven_dist'] + geo_weight * row['distances']

def similarity_lines_pols(row):
    """Combine text and geographic distance, weighting the name more (0.7 / 0.3).

    row: record exposing 'leven_dist' (name edit distance) and 'distances'
    (geographic distance). The result is a weighted sum of two distances,
    so smaller values mean a closer match.
    """
    text_weight, geo_weight = 0.7, 0.3
    return text_weight * row['leven_dist'] + geo_weight * row['distances']

# --- settings and preparation ---

# candidates are kept only when their combined score is below this value
similarity_threshold = 2.0

# soft feature type alignments:
# Geonames feature code -> SwissNames3D 'OBJEKTART' values considered compatible
_elevation_types = [
    'Hauptgipfel', 'Gipfel', 'Huegel', 'Haupthuegel',
    'Alpiner Gipfel', 'Grat', 'Huegelzug', 'Felskopf',
]
soft_type_align = {
    'LK': ['Seeteil', 'See'],
    'GLCR': ['Gletscher', 'Alpiner Gipfel'],
    'STM': ['Fliessgewaesser'],
    'PK': list(_elevation_types),   # independent copies, one per feature code
    'PASS': ['Pass', 'Graben'],
    'HLL': list(_elevation_types),
    'MT': list(_elevation_types),
    'VAL': ['Tal', 'Haupttal', 'Graben'],
}
# quick check of how many feature codes are aligned (value is not used)
len(soft_type_align)

# SwissNames3D records grouped by name
sn_name_groups = swissnames.groupby(['NAME'])

# Geonames records grouped by feature code
type_groups = df_source.groupby(['feature code'])

# feature codes scored with similarity_points vs. similarity_lines_pols
points = ['PK', 'HLL', 'MT', 'GLCR', 'PASS']
lines_pols = ['STM', 'LK', 'VAL']

In [7]:
# Match every Geonames record against SwissNames3D, one feature type at a time,
# then evaluate the predictions against the ground truth (per type and overall).
matches_overall = []  # (geonameid, UUID_old) pairs predicted as matches
neg_overall = []      # (geonameid, mu.NO_MATCH_STRING) for records with no candidate below threshold
results_type = {}     # feature code -> result of mu.evaluate_results_list for that type
for ftype, group in type_groups:
    # Grouping by a one-element list yields 1-tuples as group keys on newer
    # pandas releases; unwrap so we always work with the plain feature code.
    gn_type = ftype[0] if isinstance(ftype, tuple) else ftype

    # The scoring rule depends only on the feature type, so pick it once per group.
    if gn_type in points:
        score_row = similarity_points
    elif gn_type in lines_pols:
        score_row = similarity_lines_pols
    else:
        raise ValueError("no similarity rule for feature code %r" % (gn_type,))

    # consider only features of compatible types: could also be replaced with a
    # very coarse spatial filter. Identical for every record of this type, so
    # it is computed once here instead of once per record.
    type_candidates = swissnames[swissnames['OBJEKTART'].isin(soft_type_align[gn_type])]

    matches_type = []
    neg_type = []
    gn_ids_type = []
    for _, gn_record in group.iterrows():
        name = gn_record['name']
        alternatenames = str(gn_record['alternatenames'])
        gn_point = (gn_record['latitude'], gn_record['longitude'])
        gn_id = gn_record['geonameid']
        gn_ids_type.append(gn_id)

        # get any exact name matches as candidates, including altnames and the comma-flip trick
        df_exact = mu.find_all_exact_name_matches(name, alternatenames, sn_name_groups, swissnames).copy()
        df_exact['leven_dist'] = 0

        # calculate levenshtein distance for just the feature-type candidates
        # (work on a copy so the shared per-type frame is never modified)
        df_candidates = type_candidates.copy()
        df_candidates['leven_dist'] = df_candidates.apply(lambda r: get_edit_distance(r, name), axis=1)

        # add the exact matches and calculate geo-distances;
        # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
        # Duplicates are not dropped here because we would lose info.
        df_candidates = pd.concat([df_candidates, df_exact])
        df_candidates['distances'] = df_candidates.apply(lambda r: mu.calculate_distance_local(r, gn_point), axis=1)

        # combine text distance with geodistance for a total score
        # (despite the column name, lower means a better match)
        df_candidates['similarity'] = df_candidates.apply(score_row, axis=1)

        # keep false positives low (i.e. keep precision high)
        df_subset = df_candidates[df_candidates['similarity'] < similarity_threshold]
        df_subset = df_subset.drop_duplicates()
        if not df_subset.empty:
            pairs = [(gn_id, sn_id) for sn_id in df_subset['UUID_old'].tolist()]
            matches_type.extend(pairs)
            matches_overall.extend(pairs)
        else:
            # no features were below threshold
            no_match = (gn_id, mu.NO_MATCH_STRING)
            neg_type.append(no_match)
            neg_overall.append(no_match)

    # filter ground truth to ids of current feature type only
    gn_ids_type_set = set(gn_ids_type)
    ground_truth_type = [item for item in ground_truth_flat if item[0] in gn_ids_type_set]
    results_type[gn_type] = mu.evaluate_results_list(matches_type, neg_type, ground_truth_type, verbose=False)

print("prec\trecall\tf1")
# NOTE(review): presumably evaluate_results_list prints the scores itself when
# verbose is left at its default -- confirm in matchutils.
p, r, f1 = mu.evaluate_results_list(matches_overall, neg_overall, ground_truth_flat)

prec	recall	f1
0.871	0.833	0.852


### Results by type

In [8]:
# One tab-separated line per feature type: precision, recall and F1 to three decimals.
row_format = "%s\t%.3f\t%.3f\t%.3f"
print("type\tprec\trecall\tf1")
for feature_code, scores in results_type.items():
    precision, recall, f1_score = scores[0], scores[1], scores[2]
    print(row_format % (feature_code, precision, recall, f1_score))

type	prec	recall	f1
GLCR	0.966	0.966	0.966
HLL	0.897	0.981	0.937
LK	0.855	1.000	0.922
MT	0.759	0.978	0.854
PASS	0.962	0.962	0.962
PK	0.902	0.958	0.929
STM	0.605	0.271	0.374
VAL	0.905	0.844	0.874
