# Matching preparation
- Build a flat ground-truth structure that is simply a list of matching pairs: instead of keeping extra matches in a third `extra_matches` column of the table, each extra match is treated as just another (geonameid, swissnamesid) pair.
- Augment the ground truth by 're-hydrating' our data: go from the GeoNames IDs that we annotated to a fuller table in which each ID is a row carrying the extra record information contained in GeoNames.

In [1]:
import pandas as pd
import pickle
import os

# own file
from gazmatch import gazetteers

In [2]:
# Directory that holds every input and output file of this notebook
data_dir = 'data'

# The annotated sample is a tab-separated table with one row per GeoNames id:
# the id itself, its top swissnames match and (optionally) further matches.
path_to_ground_truth = os.path.join(data_dir, 'annotated_sample.csv')
df_matches_grouped = pd.read_csv(
    path_to_ground_truth,
    sep='\t',
    encoding='utf-8',
)
df_matches_grouped.head()

Unnamed: 0,geonameid,swissnamesid,extra_matches
0,11397484,{BA8C7006-4DE0-482A-9444-6BD1D3385E4A},{E274F0C0-15B5-4EDC-8446-B3633E168F4C}
1,11397700,{DB24E683-81C2-40C9-801F-EE758BFC4E5D},
2,11397591,{C81F0B97-0601-4285-8939-6C9319345EA0},{1E01D03B-9A91-4643-B89A-0CAC3D7EEE8D};{4051AB...
3,11397623,{91C23277-FC70-4B88-B468-E69D1CCBB9DB},{7E30D82D-5794-4415-88E1-30133B45CB93};{5F8049...
4,11397558,{5F8049DC-CE8E-4A07-91EE-D23AE256C21B},{91C23277-FC70-4B88-B468-E69D1CCBB9DB};{7E30D8...


### Flatten extra matches

In [3]:
def _clean_swissnames_id(value):
    """Normalise a single annotated swissnames id.

    Returns the value as a whitespace-stripped string when it looks like a
    braced id (i.e. starts with "{"). Returns None for missing values
    (NaN/None) and for anything that is not a braced id.
    """
    if pd.isna(value):
        return None
    text = str(value).strip()
    return text if text.startswith("{") else None


# Flatten the grouped annotations into one (geonameid, swissnamesid) tuple per
# match: first the top match of a row, then each of that row's extra matches.
ground_truth_flat = []

# Only three columns are needed, so iterate over them directly instead of
# building a Series per row with iterrows().
id_columns = zip(
    df_matches_grouped['geonameid'],
    df_matches_grouped['swissnamesid'],
    df_matches_grouped['extra_matches'],
)
for gn_id, sn_primary_raw, sn_extras_raw in id_columns:
    # The top match is normalised exactly like the extra matches, so stray
    # whitespace neither drops the row nor ends up inside the stored id.
    sn_id = _clean_swissnames_id(sn_primary_raw)
    if sn_id is None:
        # we don't even have a top match; the row's extra matches are skipped too
        continue
    ground_truth_flat.append((gn_id, sn_id))

    # extra matches live in one ";"-separated string and are missing (NaN)
    # when there are none
    if pd.isna(sn_extras_raw):
        continue
    for sn_extra_raw in str(sn_extras_raw).split(";"):
        sn_extra_id = _clean_swissnames_id(sn_extra_raw)
        if sn_extra_id is not None:
            ground_truth_flat.append((gn_id, sn_extra_id))

print("flattened ground truth has %s pairs" %len(ground_truth_flat))

flattened ground truth has 468 pairs


In [4]:
# Wrap the flat list of (geonameid, swissnamesid) tuples in a two-column table
cols = ['geonameid', 'swissnamesid']
df_matches_flat = pd.DataFrame.from_records(
    data=ground_truth_flat,
    columns=cols,
)
df_matches_flat.head()

Unnamed: 0,geonameid,swissnamesid
0,11397484,{BA8C7006-4DE0-482A-9444-6BD1D3385E4A}
1,11397484,{E274F0C0-15B5-4EDC-8446-B3633E168F4C}
2,11397700,{DB24E683-81C2-40C9-801F-EE758BFC4E5D}
3,11397591,{C81F0B97-0601-4285-8939-6C9319345EA0}
4,11397591,{1E01D03B-9A91-4643-B89A-0CAC3D7EEE8D}


In [5]:
# Persist the flat ground truth twice:
# 1) as a tab-separated table without the index column
path_to_ground_truth_flat = os.path.join(data_dir, 'ground_truth_flat.csv')
df_matches_flat.to_csv(
    path_to_ground_truth_flat,
    sep='\t',
    index=False,
    encoding='utf-8',
)

# 2) as a pickle of the plain Python list of tuples
path_to_pickle = os.path.join(data_dir, 'ground_truth_flat.pkl')
with open(path_to_pickle, 'wb') as pickle_out:
    pickle.dump(ground_truth_flat, pickle_out)

### Augment ground truth ids

In [6]:
# Load the Swiss GeoNames gazetteer from the data directory; with verbose=True
# the loader prints a size summary of the table (see the cell output).
geonames = gazetteers.GeoNamesCH(
    data_dir=data_dir,
    verbose=True,
)

We have 67796 records in GeoNames for Switzerland and 19 columns.


In [7]:
# All annotated GeoNames ids, one per row of the grouped ground truth
# (rows without a match are included as well)
geoname_ids = df_matches_grouped.loc[:, 'geonameid'].tolist()
len(geoname_ids)

400

In [8]:
# Look up the GeoNames record behind every annotated id, keeping the order of
# geoname_ids, and stack the records into one table.
augmented_records = [
    geonames.retrieve_by_id(annotated_id)
    for annotated_id in geoname_ids
]

df_augmented = pd.DataFrame(augmented_records)
df_augmented.head()

Unnamed: 0,admin1 code,admin2 code,admin3 code,admin4 code,alternatenames,asciiname,cc2,country code,dem,elevation,feature class,feature code,geonameid,latitude,longitude,modification date,name,population,timezone
0,UR,400.0,1208.0,,,AElprigengletscher,,CH,2800,,H,GLCR,11397484,46.62199,8.48848,2016-12-28,Älprigengletscher,0,Europe/Zurich
1,VS,2309.0,6192.0,,,Birchgletscher,,CH,2788,,H,GLCR,11397700,46.40142,7.84096,2016-12-28,Birchgletscher,0,Europe/Zurich
2,UR,400.0,1220.0,,,Chueefadfirn,,CH,2754,,H,GLCR,11397591,46.77595,8.5097,2016-12-28,Chüefadfirn,0,Europe/Zurich
3,BE,250.0,782.0,,,Diechtergletscher,,CH,2870,,H,GLCR,11397623,46.65359,8.34409,2016-12-28,Diechtergletscher,0,Europe/Zurich
4,BE,250.0,782.0,,,Diechtergletscher,,CH,2975,,H,GLCR,11397558,46.63989,8.35707,2016-12-28,Diechtergletscher,0,Europe/Zurich


In [9]:
# Write the augmented records as a tab-separated table without the index column
path_to_augmented_data = os.path.join(data_dir, 'source_data_augmented.csv')
df_augmented.to_csv(
    path_to_augmented_data,
    sep='\t',
    index=False,
    encoding='utf-8',
)