A basic example of reading in the annotated data using the `pandas` Python library.

In [1]:
import os

import numpy as np
import pandas as pd

In [2]:
# Location of the annotated sample. The file is parsed as tab-separated
# (sep='\t') even though its name ends in .csv.
data_dir = 'data'
filename = 'annotated_sample.csv'
df = pd.read_csv(
    os.path.join(data_dir, filename),
    sep='\t',
    encoding='utf-8',
)
# preview the first rows of the loaded frame
df.head()

Unnamed: 0,geonameid,swissnamesid,extra_matches
0,11397484,{BA8C7006-4DE0-482A-9444-6BD1D3385E4A},{E274F0C0-15B5-4EDC-8446-B3633E168F4C}
1,11397700,{DB24E683-81C2-40C9-801F-EE758BFC4E5D},
2,11397591,{C81F0B97-0601-4285-8939-6C9319345EA0},{1E01D03B-9A91-4643-B89A-0CAC3D7EEE8D};{4051AB...
3,11397623,{91C23277-FC70-4B88-B468-E69D1CCBB9DB},{7E30D82D-5794-4415-88E1-30133B45CB93};{5F8049...
4,11397558,{5F8049DC-CE8E-4A07-91EE-D23AE256C21B},{91C23277-FC70-4B88-B468-E69D1CCBB9DB};{7E30D8...


In [3]:
# number of rows in the frame, one per geonames feature (400 in this sample)
len(df)

400

In [4]:
# A geonames feature without a convincing match carries the literal string
# 'no match' in its 'swissnamesid' column; select exactly those rows.
no_match_mask = df['swissnamesid'].eq('no match')
df.loc[no_match_mask]

Unnamed: 0,geonameid,swissnamesid,extra_matches
155,2661385,no match,
158,7576920,no match,
170,10194798,no match,
175,2660158,no match,
185,6697713,no match,
254,6934032,no match,
266,6936352,no match,
310,2660792,no match,
313,2660677,no match,
351,2661660,no match,


In [5]:
# Replacing the 'no match' sentinel with NaN can be useful.
# `replace` is applied to the whole frame, so any cell whose value is exactly
# 'no match' becomes NaN, whichever column it is in. Re-running this cell is
# harmless: a second pass finds nothing left to replace.
# Note: `df` is rebound here, so afterwards a `== 'no match'` filter on it
# matches no rows.
# (numpy is imported as `np` together with the other imports at the top.)
df = df.replace(to_replace='no match', value=np.nan)
# now we can use built-in missing-value helpers like 'isnull'
df[df['swissnamesid'].isnull()]

Unnamed: 0,geonameid,swissnamesid,extra_matches
155,2661385,,
158,7576920,,
170,10194798,,
175,2660158,,
185,6697713,,
254,6934032,,
266,6936352,,
310,2660792,,
313,2660677,,
351,2661660,,
