In [1]:
import re
import copy 
import math
import itertools
import jellyfish
from tqdm import tqdm
import pandas as pd
import numpy as np
from datetime import datetime
import xml.etree.ElementTree as ET

ns = {'xml': 'http://www.w3.org/XML/1998/namespace',
      'dflt': 'http://www.tei-c.org/ns/1.0',
      'frus':'http://history.state.gov/frus/ns/1.0',
      'xi':'http://www.w3.org/2001/XInclude'
      }

In [2]:
def extract_city(doc):

    # city
    place_tag = doc.find('.//dflt:placeName',ns)
    if place_tag is not None:
        txt = "".join(place_tag.itertext())
        city = " ".join(txt.split())
    else:
        city = None

    global city_df
    city_df = pd.concat((city_df, pd.DataFrame({'name':[city]})),ignore_index=True)
    return

In [3]:
import glob
volume_root = 'frus1969-76'

city_df = pd.DataFrame(columns=['name'])


#for file in glob.glob('volumes/*'):
for file in glob.glob('volumes/'+volume_root+'*'):

    tree = ET.parse(file)
    root = tree.getroot()

    docs = root.findall('./dflt:text/dflt:body//dflt:div[@type="document"]', ns)
    for doc in docs:
        extract_city(doc)


In [None]:
#pd.set_option('display.max_rows', None)

In [4]:
city_df.dropna(inplace=True)
city_df.drop_duplicates(inplace=True)
city_df.reset_index(drop=True,inplace=True)

In [5]:
extension_col = city_df['name'].apply(lambda x: " ".join(x.split(',')[1:]))
name_col = city_df['name'].apply(lambda x: x.split(',')[0])
city_df['name'] = name_col
city_df['extension'] = extension_col
city_df['extension'] = city_df['extension'].apply(lambda x: None if len(x)==0 else x)

## wc matching

In [6]:
wc_df = pd.read_csv('world-cities.csv')

In [7]:
# helpers
def geo_match(pattern,string):
    
    if pattern !=pattern:
        return None
    elif re.search(pattern,string):
        return pattern
    else:
        return None

def f(string):

    if not string:
        return None
    
    tl = list(wc_df[wc_df['country'].apply(lambda pattern: True if geo_match(pattern,string) else False)].drop_duplicates(subset='country')['country'].values)
    if len(tl)==0:
        tl = list(wc_df[wc_df['subcountry'].apply(lambda pattern: True if geo_match(pattern,string) else False)].drop_duplicates(subset='country')['country'].values)

    if len(tl)==0:
        return None
    elif len(tl)==1:
        return tl[0]
    else:
        print(f'multi-match for {string}. Check later!')
        return tl

def f2(string):

    if not string:
        return None

    tl = list(wc_df[wc_df['subcountry'].apply(lambda pattern: True if geo_match(pattern,string) else False)].drop_duplicates(subset='country')['country'].values)

    if len(tl)==0:
        return None
    elif len(tl)==1:
        return tl[0]
    else:
        print(f'multi-match for {string}. Check later!')
        return tl

def merger(row):

    d1 = row['extension_match']
    d2 = row['wc_guess']

    if not d1 and d2!=d2:
        return None
    elif not d1:
        return d2
    else:
        return d1

In [8]:
city_df['extension_match'] = city_df['extension'].apply(lambda x:f(x))
city_df['wc_guess'] = city_df[city_df['extension'].isna()]['name'].apply(lambda x:f2(x))
#city_df['merged']=city_df.apply(lambda x: merger(x),axis=1)
city_df['merged']=city_df['extension_match']

city_df=city_df[['name','merged','extension_match','wc_guess']]
city_df.rename(columns={'merged':'country'},inplace=True)

multi-match for  Maryland. Check later!
multi-match for  Florida. Check later!
multi-match for  Maryland. Check later!
multi-match for Salzburg White House. Check later!
multi-match for Dar es Salaam. Check later!
multi-match for Salzburg. Check later!
multi-match for La Paz. Check later!
multi-match for San José. Check later!
multi-match for San Salvador. Check later!
multi-match for Montevideo. Check later!


resolve multi-match cases by hand before proceeding next part!

In [9]:
# save and edit
city_df.to_csv('tables/city_69_76.csv')

In [10]:
# load corrected one
city_df = pd.read_csv('tables/city_69_76.csv')

### wikidata matching

In [11]:
import pandas as pd
import dask.dataframe as dd
from collections import Counter
from dask.diagnostics import ProgressBar
from SPARQLWrapper import SPARQLWrapper, JSON

from tqdm import tqdm
tqdm.pandas()

import ssl
ssl._create_default_https_context = ssl._create_unverified_context

In [12]:
user_agent = 'CoolBot/0.0 (https://example.org/coolbot/; coolbot@example.org)'

sparqlwd = SPARQLWrapper("https://query.wikidata.org/sparql", agent=user_agent)
sparqlwd.setReturnFormat(JSON)

In [13]:
def find_if_capital(name):

    try:
        query = """
        SELECT ?country ?countryLabel WHERE {
        SERVICE wikibase:mwapi {
            bd:serviceParam wikibase:endpoint "www.wikidata.org";
                            wikibase:api "EntitySearch";
                            mwapi:search  \'"""+name+"""\';
                            mwapi:language "en".
            ?city wikibase:apiOutputItem mwapi:item.
            ?num wikibase:apiOrdinal true.
        }
        ?city wdt:P31 wd:Q5119.
        ?city wdt:P17 ?country.
        SERVICE wikibase:label { bd:serviceParam wikibase:language "en".}
        }
        """
        
        sparqlwd.setQuery(query)

        return sparqlwd.query().convert()

    except Exception as e:
        print(f'name: {name}')
        print(f'error message: {e}')
        return {'head': {'vars': ['item']}, 'results': {'bindings': []}}


def find_if_bigcity(name):

    try:
        query = """
        SELECT ?country ?countryLabel WHERE {
        SERVICE wikibase:mwapi {
            bd:serviceParam wikibase:endpoint "www.wikidata.org";
                            wikibase:api "EntitySearch";
                            mwapi:search  \'"""+name+"""\';
                            mwapi:language "en".
            ?city wikibase:apiOutputItem mwapi:item.
            ?num wikibase:apiOrdinal true.
        }
        ?city (wdt:P31/wdt:P279*) wd:Q1549591.
        ?city wdt:P17 ?country.
        SERVICE wikibase:label { bd:serviceParam wikibase:language "en".}
        }
        """
        
        sparqlwd.setQuery(query)

        return sparqlwd.query().convert()

    except Exception as e:
        print(f'name: {name}')
        print(f'error message: {e}')
        return {'head': {'vars': ['item']}, 'results': {'bindings': []}}


def process_name(row, f):

    name = row['name']

    res = f(name)

    candidates = []
    selected_country = None
    selected_tag = None

    for binding in res['results']['bindings']:
        candidates.append(binding['countryLabel']['value'])

    if len(candidates)>0:
        temp_country = Counter(candidates).most_common(1)[0][0]
        temp_tag = None

        for binding in res['results']['bindings']:
            if binding['countryLabel']['value'] == temp_country:
                temp_tag = binding['country']['value']
                break
        
        #selected_country.add(temp_country)
        selected_country = temp_country
        #wiki_tag.add(temp_tag)
        selected_tag = temp_tag

    return selected_country

In [15]:
# find country if city is capital
wiki_df = city_df[city_df['extension_match'].apply(lambda x: True if x!=x else False)]
city_df['wiki_capital_guess'] = wiki_df.apply(process_name,axis=1,f=find_if_capital)

# find country if city is big city but not capital
wiki_df = city_df[city_df['extension_match'].apply(lambda x: True if x!=x else False) & city_df['wiki_capital_guess'].apply(lambda x: False if x else True)]
city_df['wiki_bigcity_guess'] = wiki_df.apply(process_name,axis=1,f=find_if_bigcity)

In [16]:
def merger2(row):

    d1 = row['wiki_capital_guess']
    d2 = row['wiki_bigcity_guess']

    if (not d2 or d2!=d2) and (not d1 or d1!=d1):
        return None
    elif (not d2 or d2!=d2):
        return d1
    else:
        return d2

city_df['merged_wiki'] = city_df.apply(merger2,axis=1)

In [19]:
def merger3(row):

    d1 = row['country']
    d2 = row['merged_wiki']

    if not d2 and d1!=d1:
        return None
    elif d1!=d1:
        return d2
    else:
        return d1

city_df['country'] = city_df.apply(merger3,axis=1)

In [26]:
# for observation only
city_df[city_df['country'].apply(lambda x: False if x else True) & city_df['wc_guess'].apply(lambda x: True if x==x else False)]

Unnamed: 0.1,Unnamed: 0,name,country,extension_match,wc_guess,wiki_capital_guess,wiki_bigcity_guess,merged_wiki
4,4,Salzburg White House,,,"['Austria', 'Cape Verde']",,,
27,27,Zurich,,,Switzerland,,,
42,42,Crimea,,,Ukraine,,,
65,65,Kabul,,,Afghanistan,,,
102,102,Miesbach,,,Japan,,,
103,103,Martinique,,,Martinique,,,
104,104,Conakry,,,Guinea,,,
132,132,Hawaii,,,United States,,,
200,200,Bali,,,Indonesia,,,
221,221,Belize City,,,Belize,,,


resolve multi-match cases by hand before proceeding next part!

In [None]:
# save and edit
city_df.to_csv('tables/city_69_76.csv')

In [None]:
# load corrected one
city_df = pd.read_csv('tables/city_69_76.csv')

## misspelling matching

In [None]:
all_names = city_df['name'].values

def compute_sim(s1,func,s2):
    return func(s1,s2)

def find_matches(s2):

    spiro_dist_df = pd.DataFrame({'name_set':all_names,
                                'dam_lev_dist':[compute_sim(x, jellyfish.damerau_levenshtein_distance,s2) for x in all_names]})
    
    misspelling_idx = set(spiro_dist_df[(spiro_dist_df['dam_lev_dist'] <=1)].index.values)

    return misspelling_idx

In [None]:
t = {}
for idx in tqdm(range(len(all_names))):
    name = all_names[idx]
    t[idx]=find_matches(name)

In [None]:
scratch_t = copy.deepcopy(t)
changed_flag = True

while changed_flag:

    changed_flag = False

    for key in t:
        
        for matched_idx in t[key]:

            if key != matched_idx:
                if scratch_t.get(key, None) and scratch_t.get(matched_idx, None):
                    changed_flag = True
                    t[key] = t[key].union(t[matched_idx])
                    scratch_t.pop(matched_idx, None)
        
    unwanted = set(t.keys()) - set(scratch_t.keys())
    print(f'removing {len(unwanted)} keys.')
    for unwanted_key in unwanted: del t[unwanted_key]
    scratch_t = copy.deepcopy(t)
    print('---')
    

In [None]:
for temp_key in t:
    
    te_df = city_df.iloc[list(t[temp_key])]

    name_list = te_df['name'].values

    country_list = te_df['country'].values
    country_list = [c for c in country_list if c==c]
    country_list = list(set(country_list))
    if len(country_list)==0:
        country_list = None
    elif len(country_list)==1:
        country_list = country_list[0]

    city_df.at[temp_key, 'name_list'] = name_list
    city_df.at[temp_key, 'country'] = country_list

city_df = city_df.loc[t.keys()]