In [1]:
# Make sure we have the new pandas 1.0 (Jan 2020) so we can use the new string dtype
! pip install pandas --upgrade

# Fuzzy text matching
! pip install "textdistance[extras]" --upgrade

import numpy as np
import pandas as pd
pd.set_option('display.max_rows', None)

import textdistance
# Broadcasting wrapper: lets the scalar string-similarity function be applied element-wise to arrays
textdist = np.vectorize(textdistance.levenshtein.normalized_similarity)

# Base folder that every read/write in this notebook is built from (see write_file and the read_* calls).
# NOTE(review): absolute Google Drive path — presumably requires Drive to be mounted in Colab; confirm.
path = '/content/drive/My Drive/active/m5362_20sp_data_warehousing/PD4SDG/PD4SDG_new/data sources'

def write_file(df, fname, sheet='raw'):
    """Save ``df`` as ``<path>/<fname>.xlsx`` on a single worksheet.

    Parameters
    ----------
    df : anything accepted by ``pd.DataFrame`` (a Series is wrapped into a frame)
    fname : file name without extension, relative to the global ``path``
    sheet : worksheet name, ``'raw'`` by default
    """
    frame = pd.DataFrame(df)
    target = f'{path}/{fname}.xlsx'
    frame.to_excel(target, sheet_name=sheet)

def formatter(X):
    """Return a cleaned copy of ``X`` with normalized column names and string values.

    The input frame is not modified. Cleaning applied to every string column:
    strip whitespace, turn missing values and strings of length 0 or 1 into '',
    expand a leading standalone "st" / "st." into "saint", replace '&' with
    'and', and title-case the result. Column names get the same treatment but
    end up lowercase.

    Parameters
    ----------
    X : pandas.DataFrame

    Returns
    -------
    pandas.DataFrame
        Copy of ``X`` converted with ``convert_dtypes()`` and cleaned as above.
    """
    # convert to modern dtypes like "string" introduced in Pandas 1.0 (Jan 2020)
    df = X.copy().convert_dtypes()

    def f(x):
        # `regex=` is spelled out on every call: the implicit default flipped from
        # True to False in pandas 2.0, which would silently disable the patterns.
        return (
            x.str.strip().fillna('').str.lower()
            # blank out empty / single-character strings
            .str.replace(r'^.{0,1}$', '', regex=True)
            # only a standalone leading "st" or "st." is an abbreviation;
            # "stockholm" or a column called "state" must be left alone
            .str.replace(r'^st(?:\.|\b)', 'saint', regex=True)
            .str.replace('&', 'and', regex=False)
            .str.title()
        )

    # format column names, but use lowercase (my preference)
    df.columns = f(df.columns).str.lower()

    # format string columns (skipped when the frame has none)
    strings = df.select_dtypes(include='string').columns
    if len(strings):
        df[strings] = df[strings].apply(f)
    return df

Collecting pandas
[?25l  Downloading https://files.pythonhosted.org/packages/bb/71/8f53bdbcbc67c912b888b40def255767e475402e9df64050019149b1a943/pandas-1.0.3-cp36-cp36m-manylinux1_x86_64.whl (10.0MB)
[K     |████████████████████████████████| 10.0MB 2.8MB/s 
[31mERROR: google-colab 1.0.0 has requirement pandas~=0.25.0; python_version >= "3.0", but you'll have pandas 1.0.3 which is incompatible.[0m
Installing collected packages: pandas
  Found existing installation: pandas 0.25.3
    Uninstalling pandas-0.25.3:
      Successfully uninstalled pandas-0.25.3
Successfully installed pandas-1.0.3


Collecting textdistance[extras]
  Downloading https://files.pythonhosted.org/packages/3f/18/31397b687f50ffae65469175f07faa68f288e27fcd8716276004c42e5637/textdistance-4.1.5-py3-none-any.whl
Collecting python-Levenshtein; extra == "extras"
[?25l  Downloading https://files.pythonhosted.org/packages/42/a9/d1785c85ebf9b7dfacd08938dd028209c34a0ea3b1bcdb895208bd40a67d/python-Levenshtein-0.12.0.tar.gz (48kB)
[K     |████████████████████████████████| 51kB 3.0MB/s 
Collecting jellyfish; extra == "extras"
[?25l  Downloading https://files.pythonhosted.org/packages/3f/80/bcacc7affb47be7279d7d35225e1a932416ed051b315a7f9df20acf04cbe/jellyfish-0.7.2.tar.gz (133kB)
[K     |████████████████████████████████| 143kB 8.0MB/s 
[?25hCollecting abydos; extra == "extras"
[?25l  Downloading https://files.pythonhosted.org/packages/7f/a5/ca258a571997be1c9483d6075bbc1b9487ae80f3bb3bf1f60db0b29f5aa6/abydos-0.5.0-py2.py3-none-any.whl (886kB)
[K     |████████████████████████████████| 890kB 42.6MB/s 
[?25hColle

In [0]:
# Raw country statistics; the file uses ',' as the decimal separator
country_data_raw = pd.read_csv(f"{path}/raw/countries of the world.csv", decimal=',')

# Spelling used in the CSV (after formatter's title-casing) -> spelling wanted here
rep =  {
    'Bahamas, The': 'Bahamas',
    'British Virgin Is.': 'British Virgin Islands',
    'Burma': 'Myanmar',
    'Central African Rep.': 'Central African Republic',
    'Congo, Dem. Rep.': 'Democratic Republic Of The Congo',
    'Congo, Repub. Of The': 'Republic Of The Congo',
    "Cote D'Ivoire": 'Ivory Coast',
    'Gambia, The': 'Gambia',
    # 'Gaza Strip':
    'Korea, North': 'North Korea',
    'Korea, South': 'South Korea',
    'Macau': 'Macao',
    'Micronesia, Fed. St.': 'Micronesia',
    'N. Mariana Islands': 'Northern Mariana Islands',
    # 'Netherlands Antilles':
    'Turks And Caicos Is': 'Turks And Caicos Islands',
    'Virgin Islands': 'U.S. Virgin Islands',
    # 'West Bank':
}

# Clean, rename countries, then index the table by country name
country_data = (
    formatter(country_data_raw)
    .assign(country=lambda frame: frame['country'].replace(rep).astype('string'))
    .set_index('country')
)

def combine(old, data=None):
    """Collapse several rows of a country table into one combined row.

    All column access is positional (``.iloc``), so the result depends on
    column order:
      * column 0 is copied from the first selected row,
      * columns 1 and 2 are summed and cast to int,
      * column 3 is recomputed as column 1 / column 2, rounded to 2 decimals,
      * columns 4 onward are averaged, rounded to 2 decimals.
    NOTE(review): presumably columns 1-3 are population, area and density in
    the source CSV — confirm against the file.

    Parameters
    ----------
    old : list of index labels to merge
    data : DataFrame to read from; defaults to the global ``country_data``

    Returns
    -------
    pandas.Series
        The combined row, named after the first label in ``old``.
    """
    if data is None:
        data = country_data
    rows = data.loc[old]
    merged = rows.iloc[0].copy()
    # .iloc instead of merged[1:3] / merged[3]: integer keys on a label-indexed
    # Series rely on a deprecated positional fallback.
    merged.iloc[1:3] = rows.iloc[:, 1:3].sum().astype(int).to_numpy()
    merged.iloc[3] = np.round(merged.iloc[1] / merged.iloc[2], 2)
    merged.iloc[4:] = np.round(rows.iloc[:, 4:].mean(), 2).to_numpy()
    return merged

# Add one combined row built from the two listed territories
pal = ['Gaza Strip', 'West Bank']
country_data.loc['Palestinian Territory'] = combine(pal)

# Write the untouched and the cleaned table side by side in one workbook
with pd.ExcelWriter(f'{path}/country_data.xlsx') as writer:
    for _sheet, _frame in (('raw', country_data_raw), ('clean', country_data)):
        _frame.to_excel(writer, sheet_name=_sheet)

In [3]:
world_raw = pd.read_json(f"{path}/raw/world-cities_json.json")

# Cleaned gazetteer: one row per geonameid, sorted by country / subcountry / city
world = (
    formatter(world_raw)
    .rename(columns={'name': 'city'})
    .loc[:, ['geonameid', 'city', 'subcountry', 'country']]
    .sort_values(['country', 'subcountry', 'city'])
    .set_index('geonameid')
)

# Composite "a, b" strings for every combination of the three levels
world = world.assign(
    nocountry=lambda w: w['city'] + ', ' + w['subcountry'],
    nosubcountry=lambda w: w['city'] + ', ' + w['country'],
    nocity=lambda w: w['subcountry'] + ', ' + w['country'],
    all=lambda w: w['city'] + ', ' + w['subcountry'] + ', ' + w['country'],
)

# Per level: the distinct values, sorted, still indexed by geonameid
world_mini = {lev: col.drop_duplicates().sort_values() for lev, col in world.items()}

world.head()

Unnamed: 0_level_0,city,subcountry,country,nocountry,nosubcountry,nocity,all
geonameid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1142170,Fayzabad,Badakhshan,Afghanistan,"Fayzabad, Badakhshan","Fayzabad, Afghanistan","Badakhshan, Afghanistan","Fayzabad, Badakhshan, Afghanistan"
1141089,Ghormach,Badghis,Afghanistan,"Ghormach, Badghis","Ghormach, Afghanistan","Badghis, Afghanistan","Ghormach, Badghis, Afghanistan"
1147290,Balkh,Balkh,Afghanistan,"Balkh, Balkh","Balkh, Afghanistan","Balkh, Afghanistan","Balkh, Balkh, Afghanistan"
1136575,Khulm,Balkh,Afghanistan,"Khulm, Balkh","Khulm, Afghanistan","Balkh, Afghanistan","Khulm, Balkh, Afghanistan"
1133616,Mazār-E Sharīf,Balkh,Afghanistan,"Mazār-E Sharīf, Balkh","Mazār-E Sharīf, Afghanistan","Balkh, Afghanistan","Mazār-E Sharīf, Balkh, Afghanistan"


In [0]:
# Raw project export, then the cleaned frame with the three key columns renamed
pd4sdg_raw = pd.read_excel(f"{path}/raw/PD4SDG.xlsx")
pd4sdg = formatter(pd4sdg_raw).rename(
    columns={'project location 1':'site', 'title':'title', 'project_idx':'un_idx'}
)

In [0]:
# One row per un_idx: longest title, longest site, and how often the id occurred
def f(L):
    """Return the longest element of ``L`` (the first one on ties)."""
    return max(L, key=len)

project = (
    pd4sdg
    .groupby('un_idx')
    .agg(title=('title', f), site=('site', f), repeats=('un_idx', 'count'))
    .reset_index()
)
write_file(project, 'project')

In [0]:
# Get project_entity links: reshape the wide "partner k ..." columns into one
# long table with one row per (project, partner).
# NOTE(review): 249 numbered partners is hard-coded — presumably the widest row
# in PD4SDG.xlsx; confirm if the source file changes.
partners = ['lead partner'] + [f"partner {i+1}" for i in range(249)]

# Temp list to hold one frame per partner slot
L = []
for i, c in enumerate(partners):
    # dict {original column name: new column name}
    col = {'un_idx':'un_idx', c:'name', c+' type':'type', c+' city':'city', c+' country':'country'}

    # get partner columns, rename cols
    s = pd4sdg[list(col)].rename(columns=col)

    # drop rows where name is '' and record the partner slot number
    # (in case precedence matters - we don't believe it does).
    # .assign builds a new frame, so nothing is written into a filtered slice
    # (the original `s['n'] = i` triggered SettingWithCopyWarning).
    s = s[s['name'] != ''].assign(n=i)

    L.append(s)

# concat the frames stored in L
project_entity_raw = pd.concat(L, ignore_index=True).astype(str).convert_dtypes()
write_file(project_entity_raw, 'project_entity')

# Number of links per entity type ('' rows removed)
entity_type_raw = (
    project_entity_raw
    .groupby('type')['un_idx'].count()
    .to_frame()
    .sort_index()
    .drop([''])
)
write_file(entity_type_raw, 'entity_type')

# Number of links per (city, country).
# NOTE(review): on this two-level index, drop(['','']) presumably removes every
# row whose city is '' rather than only the ('', '') pair — confirm the intent.
city_raw = (
    project_entity_raw
    .groupby(['city', 'country'])['un_idx'].count()
    .to_frame()
    .sort_index()
    .drop(['',''])
)
write_file(city_raw, 'city')

# Every place name seen as a project site or as a partner country, with counts
country_raw = (
    pd.concat([project['site'], project_entity_raw['country']])
    .value_counts()
    .to_frame()
    .sort_index()
    .drop('')
)
write_file(country_raw, 'country')

# Distinct entities (name / type / city / country), ignoring project and slot
entity_raw = (
    project_entity_raw
    .drop(columns=['un_idx', 'n'])
    .drop_duplicates()
    .sort_values(['name', 'country', 'city'])
)
write_file(entity_raw, 'entity')

In [46]:
# Entities as they appear in the project file, keeping the original spelling
entity_raw['subcountry'] = ''
entity = entity_raw[['name','city','subcountry','country']].rename(columns={'name':'name_orig'})

# Curated entity table: every row must have name, type, city and country
entity_data = formatter(pd.read_excel(f"{path}/raw/entity_data.xlsx"))
missing_idx = (entity_data[['name','type','city','country']]=='').any(axis=1)
if missing_idx.any():
    display(entity_data[missing_idx])
    raise Exception('Missing data in entity_data.xlsx')

# Manual corrections: name_orig -> name
entity_fix = formatter(pd.read_excel(f"{path}/raw/entity_fix.xlsx"))

# Apply the corrections; where none exists, fall back to the original name
entity = pd.merge(entity, entity_fix, how='left', on='name_orig')
idx = entity['name'].isna()
entity.loc[idx, 'name'] = entity.loc[idx, 'name_orig']
entity = pd.merge(entity, entity_data, how='left', on='name', suffixes=('_orig',''))

# Rows that found no partner in entity_data. The explicit copy means the columns
# added below are not written into a slice of `entity`.
idx = entity['type'].isna()
entity_nomatch = entity.loc[idx,:'country_orig'].copy()

# Similarity of every unmatched original name (rows) to every curated name (columns)
A = entity_nomatch[['name_orig']].to_numpy()
B = entity_data[['name']].to_numpy()
if A.size and B.size:
    C = textdist(A, B.T)
    entity_nomatch['score_best_match'] = C.max(axis=1)
    # index column 0 so a 1-D array is assigned, not a (k, 1) matrix
    entity_nomatch['name_best_match'] = B[C.argmax(axis=1), 0]
else:
    # np.vectorize cannot be called on size-0 input; this is the normal case
    # once every entity has been matched
    C = np.empty((len(A), len(B)))
    entity_nomatch['score_best_match'] = np.nan
    entity_nomatch['name_best_match'] = pd.NA
entity_nomatch.head(20)

Unnamed: 0,name_orig,city_orig,subcountry_orig,country_orig,score_best_match,name_best_match
0,35. Kamehameha Schools,,,,0.75,Kamehameha Schools
1,BEZEV,Essen,,Germany,0.222222,Solare Brucke E.V.
2,CSP;,,,,0.210526,Swiss Cert Pvt. Ltd
3,Centric,Gouda,,Netherlands,0.875,Centric
4,Ghostnets Australia,Smithfield,,Australia,0.95,Ghostnets Australia
5,Global Ocean Biodiversity Initiative,Romsey,,United Kingdom,0.972973,Global Ocean Biodiversity Initiative
6,IRC;,,,,0.25,Ab Inbev
7,KUA,Kane'Ohe,,United States,0.25,Kf
8,Koc University,Istanbul,,Turkey,0.933333,Koc University
9,Menschen für menschen,Munich,,Germany,0.409091,Tschenett Design


In [44]:
C

array([['Kamehameha Schools'],
       ['Solare Brucke E.V.'],
       ['Swiss Cert Pvt. Ltd'],
       ['Centric'],
       ['Ghostnets Australia'],
       ['Global Ocean Biodiversity Initiative'],
       ['Ab Inbev'],
       ['Kf'],
       ['Koc University'],
       ['Tschenett Design'],
       ['Morocco'],
       ['Nabbir Laboratory (Kl) Sdn. Bhd.'],
       ['National Defence University Of Malaysia'],
       ['Secretariat Of The South Pacific Regional Environment Program'],
       ['Solomon Islands'],
       ['Sri Lanka'],
       ['Thailand'],
       ['United Nations Environment Program'],
       ['United States'],
       ['Vanuatu'],
       ['Zai Na Tina Organic Demonstration Farms'],
       ['Banamex'],
       ["Nuestra Señora Del Pilar' Primary School"],
       ["We Are The Ocean'"],
       ['3M'],
       ['3M'],
       ['3M'],
       ['Energy Research Center Of The Netherlands'],
       ['Ac Coverts'],
       ["Acb - Réseau Des Bretons D'Influence"],
       ['Akatu Institute'],
    

In [37]:
# Label the score matrix: rows = unmatched original names, columns = curated names
row_names = A[:, 0]
col_names = B[:, 0]
D = pd.DataFrame(C, index=row_names, columns=col_names)

# For each row, the column label with the highest score
E = D.idxmax(axis=1)
E.head()

 35. Kamehameha Schools      Kamehameha Schools
 BEZEV                       Solare Brucke E.V.
 CSP;                       Swiss Cert Pvt. Ltd
 Centric                                Centric
 Ghostnets Australia        Ghostnets Australia
dtype: object

In [19]:
A.head()
# A
# entity_nomatch[['name_orig']].T.shape

AttributeError: ignored

In [0]:
def fix_site(orig):
    """Find the best-matching known location for every distinct site string.

    Each non-empty value of ``orig`` is scored with the global ``textdist``
    against every level in the global ``world_mini`` (city, subcountry,
    country and their combinations). The first level reaching the highest
    score wins; later levels replace it only with a strictly higher score.

    Parameters
    ----------
    orig : array-like of str
        Site strings; duplicates and '' are ignored.

    Returns
    -------
    dict
        ``{site: {'score': best score, 'n': number of rows tied at that score,
        'match': list of {'city', 'subcountry', 'country'} dicts}}``. Parts of
        a match that the winning level does not contain are set to ''.
    """
    # uniquify to avoid wasted effort and remove the trivial case if present
    orig = [s for s in np.unique(orig).tolist() if s != '']

    # site dict records original, best current score, number of ties for that score, and matches with that score
    site = {s: {'score': 0.0, 'n': 0, 'match': []} for s in orig}

    # levels that carry no city (the original listed ' nocity' with a leading
    # space, so subcountry+country matches kept an arbitrary city)
    levels_without_city = ('subcountry', 'country', 'nocity')

    for s, best in site.items():
        # iterate over known world locations at each level
        for lev, w in world_mini.items():
            # compute similarity scores & find the best one at this level
            score = textdist(w.to_numpy(), s)
            top = score.max()

            # only a strictly better score replaces the recorded match
            if top > best['score']:
                hits = w[score == top].index
                match = world.loc[hits, ['city', 'subcountry', 'country']].copy()
                if lev in levels_without_city:
                    match['city'] = ''
                    if lev == 'country':
                        match['subcountry'] = ''
                best['score'] = top
                best['n'] = len(hits)
                best['match'] = match.to_dict(orient='records')
    return site

# Match every project site, tabulate the result (one row per site), and save it
project_site_fix_dict = fix_site(project['site'].to_numpy())
project_site_fix = pd.DataFrame.from_dict(
    project_site_fix_dict,
    orient='index',
)
write_file(project_site_fix, 'project_site_fix')

Ngo                               3923
Private Sector                    1831
Academic Institution              1108
Subnational Government             498
Scientific Community               448
National Government                232
Intergovernmental Organization     196
United Nations Entity              125
Partnership                          6
Civil Society Organization           2
Academic Sector                      2
Other Relevant Actor                 1
Philanthropic Organization           1
Un Entity                            1
Supranational Government             1
Name: type, dtype: Int64

In [0]:
world_mini['nocountry'].to_numpy()[:10]

array(["'Ali Sabieh, Ali Sabieh", "'S-Gravenzande, South Holland",
       "'S-Hertogenbosch, North Brabant", 'A Coruña, Galicia',
       'A Estrada, Galicia', 'Aabenraa, South Denmark',
       'Aachen, North Rhine-Westphalia', 'Aalborg, North Denmark',
       'Aalen, Baden-Württemberg', 'Aalsmeer, North Holland'],
      dtype=object)

In [0]:
write_file(world_mini['country'], '/raw/countries_alt',)

In [0]:

# project_site_fix = fix_site(A)
write_file(project_site_fix, 'project_site_fix')
print(project_site_fix)

                   score  n                                              match
Austria         1.000000  1  [{'city': '', 'subcountry': '', 'country': 'Au...
Canada          1.000000  1  [{'city': '', 'subcountry': '', 'country': 'Ca...
Egypt           1.000000  1  [{'city': '', 'subcountry': '', 'country': 'Eg...
Ethiopia        1.000000  1  [{'city': '', 'subcountry': '', 'country': 'Et...
Fiji            1.000000  1  [{'city': '', 'subcountry': '', 'country': 'Fi...
France          1.000000  1  [{'city': '', 'subcountry': '', 'country': 'Fr...
Germany         1.000000  1  [{'city': '', 'subcountry': '', 'country': 'Ge...
Ghana           1.000000  1  [{'city': '', 'subcountry': '', 'country': 'Gh...
Global          0.666667  2  [{'city': 'Goba', 'subcountry': 'Oromiya', 'co...
Guatemala       1.000000  1  [{'city': '', 'subcountry': 'Guatemala', 'coun...
India           1.000000  1  [{'city': '', 'subcountry': '', 'country': 'In...
Israel          1.000000  1  [{'city': '', 'subcount

In [0]:
# Dump every entry of A: key, score, tie count and matches, then three blank lines
for key, val in A.items():
    print(key, val['score'], val['n'], val['match'], sep='\n')
    print('\n\n')

Albania
1.0
1
          city subcountry  country
geonameid                         
3186084                    Albania



Algeria
1.0
1
          city subcountry  country
geonameid                         
2508813                    Algeria



Antigua And Barbuda
1.0
1
          city subcountry              country
geonameid                                     
3576022                    Antigua And Barbuda



Argentina
1.0
1
          city subcountry    country
geonameid                           
10172104                   Argentina



Aruba
1.0
1
          city subcountry country
geonameid                        
3577284                     Aruba



Australia
1.0
1
          city subcountry    country
geonameid                           
2172517                    Australia



Austria
1.0
1
          city subcountry  country
geonameid                         
2774326                    Austria



Bahamas
1.0
1
          city subcountry  country
geonameid                         
357

In [0]:
display(project_site_fix.loc[:'Algeria','match'])

Albania              city subcountry  country
geonameid  ...
Algeria              city subcountry  country
geonameid  ...
Name: match, dtype: object

In [0]:
%%time
import xarray as xr
import textdistance
f = np.vectorize(textdistance.levenshtein.normalized_similarity)

site = project['site'].drop_duplicates().tolist()[:10]
site.remove('')

coords = {'site': site,
          'geonameid': world.index,
          'part': world.columns}
data = xr.DataArray(0.0,
                    dims=coords.keys(),
                    coords=coords)

for part, ser in world.iteritems():
    for geonameid, name in ser.iteritems():
        print(part, name)
        data.loc[:, geonameid, part] = f(site, name)
M = data.max(dim=['geonameid','part'])
match = data >= M


d = dict()
for raw in site:
    mask = match.loc[raw].to_pandas()
    d[raw] = world.where(mask).stack()#to_numpy().ravel()

for raw, match in d.items():
    print(raw)
    print(match)
    print()
    print()

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
4192205    country    United States
4192289    country    United States
4192375    country    United States
4192674    country    United States
4193699    country    United States
4194474    country    United States
4195701    country    United States
4196586    country    United States
4198322    country    United States
4200671    country    United States
6331909    country    United States
4203696    country    United States
4204007    country    United States
4204230    country    United States
4205196    country    United States
4205885    country    United States
4207226    country    United States
4207400    country    United States
4207783    country    United States
4207981    country    United States
4208442    country    United States
4209448    country    United States
6331908    country    United States
4212684    country    United States
4212992    country    United States
4212995    country    United States

In [0]:
# NOTE(review): `project` as built by the groupby/agg cell has the columns
# un_idx, title, site and repeats — no 'loc'. This line presumably relies on an
# older version of `project`; confirm before running on a fresh kernel.
project['site'] = project['loc']


# First ten distinct non-empty sites as the index of an otherwise empty frame
site = project['site'].drop_duplicates().to_frame().set_index('site').drop('')[:10]
# Placeholder columns: a 0.0 score and a separate empty list for each row
site['best_score'] = 0.0
site['best_matches'] = [[] for _ in site.iterrows()]

Unnamed: 0_level_0,best_score,best_matches
site,Unnamed: 1_level_1,Unnamed: 2_level_1
Global,0.0,[]
United States,0.0,[]
United Kingdom,0.0,[]
Sierra Leone,0.0,[]
Fiji,0.0,[]
Samoa,0.0,[]
Nicaragua,0.0,[]
Sri Lanka,0.0,[]
India,0.0,[]
Zimbabwe,0.0,[]


In [0]:
site

Unnamed: 0_level_0,score,match,"(Global, score)","(United States, score)","(United Kingdom, score)","(Sierra Leone, score)","(Fiji, score)","(Samoa, score)","(Nicaragua, score)","(Sri Lanka, score)","(India, score)","(Zimbabwe, score)"
site,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Global,0.0,[],0.285714,0.541667,0.518519,0.342857,0.210526,0.25,0.428571,0.391304,0.294118,0.333333
United States,0.0,[],0.285714,0.541667,0.518519,0.342857,0.210526,0.25,0.428571,0.391304,0.294118,0.333333
United Kingdom,0.0,[],0.285714,0.541667,0.518519,0.342857,0.210526,0.25,0.428571,0.391304,0.294118,0.333333
Sierra Leone,0.0,[],0.285714,0.541667,0.518519,0.342857,0.210526,0.25,0.428571,0.391304,0.294118,0.333333
Fiji,0.0,[],0.285714,0.541667,0.518519,0.342857,0.210526,0.25,0.428571,0.391304,0.294118,0.333333
Samoa,0.0,[],0.285714,0.541667,0.518519,0.342857,0.210526,0.25,0.428571,0.391304,0.294118,0.333333
Nicaragua,0.0,[],0.285714,0.541667,0.518519,0.342857,0.210526,0.25,0.428571,0.391304,0.294118,0.333333
Sri Lanka,0.0,[],0.285714,0.541667,0.518519,0.342857,0.210526,0.25,0.428571,0.391304,0.294118,0.333333
India,0.0,[],0.285714,0.541667,0.518519,0.342857,0.210526,0.25,0.428571,0.391304,0.294118,0.333333
Zimbabwe,0.0,[],0.285714,0.541667,0.518519,0.342857,0.210526,0.25,0.428571,0.391304,0.294118,0.333333


In [0]:
# Rebuild the gazetteer with only city / subcountry / country, indexed by geonameid.
# The composite columns (nocountry, nosubcountry, nocity, all) are not created here.
world = (
    formatter(pd.read_json(f"{path}/raw/world-cities_json.json"))
    .rename(columns={'name': 'city'})
    .loc[:, ['geonameid', 'city', 'subcountry', 'country']]
    .set_index('geonameid')
    .sort_values(['country', 'subcountry', 'city'])
)

world.head(10)

Unnamed: 0_level_0,city,subcountry,country
geonameid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1142170,Fayzabad,Badakhshan,Afghanistan
1141089,Ghormach,Badghis,Afghanistan
1147290,Balkh,Balkh,Afghanistan
1136575,Khulm,Balkh,Afghanistan
1133616,Mazār-E Sharīf,Balkh,Afghanistan
1147242,Bāmyān,Bāmīān,Afghanistan
1142264,Farah,Farah,Afghanistan
1148658,Andkhōy,Faryab,Afghanistan
1133453,Maymana,Faryab,Afghanistan
1141269,Ghazni,Ghaznī,Afghanistan


In [0]:
1+1

2

In [0]:
%%time

import textdistance
f = np.vectorize(textdistance.levenshtein.normalized_similarity)

# Make copy of index from country_raw
X = project[['loc']].drop_duplicates().set_index('loc').drop('').sort_index()

for x, _ in X.iterrows():
    print(x)
    

Albania
Algeria
Antigua And Barbuda
Argentina
Aruba
Australia
Austria
Bahamas
Bangladesh
Belgium
Belize
Benin
Brazil
Brussels
Burundi
California
Cambodia
Cameroon
Canada
Cape Verde
Chile
China
Colombia
Congo
Cook Islands
Costa Rica
Cote D'Ivoire
Crete
Cuba
Curacao
Cyprus
Denmark
Dominica
Dominican Republic
Ecuador
Egypt
El Salvador
Estonia
Ethiopia
European Union
Fiji
France
French Polynesia
Gambia
Germany
Ghana
Global
Greece
Grenada
Guatemala
Honduras
Iceland
India
Indonesia
Iran
Ireland
Israel
Italy
Jamaica
Japan
Kenya
Kirabati
Kiribati
Lebanon
Madagascar
Malaysia
Maldives
Malta
Marshall Islands
Mauritius
Mexico
Micronesia
Monaco
Montenegro
Montserrat
Mozambique
Myanmar
Nauru
Nepal
Netherlands
New Caledonia
New Zealand
Nicaragua
Nigeria
Niue
Norway
Ottawa
Pakistan
Panama
Papua New Guinea
Peru
Philippines
Phillipines
Portugal
Roatan
Romania
Russia
Saint Kitts And Nevis
Samoa
Sao Tome And Principe
Senegal
Seychelles
Sierra Leone
Singapore
Solomon Islands
South Africa
South Korea
Spain


In [0]:

# NOTE(review): as shown, this loop header has no body, so the cell cannot be
# parsed; the intended body is not recoverable from here — restore or delete it.
for idx, x in X.iterrows():




# For every row of `world`, print its country and add a column to X holding the
# similarity of each X index value to that row's 'all' string.
# NOTE(review): `f` and `X` come from another cell, and this needs a `world`
# that has an 'all' column — confirm which version of `world` is live.
for idx, y in world.iterrows():
    print(y['country'])


    X[y['all']] = f(X.index, y['all'])



# country_match.shape
# Per X row: the column label with the highest score, as a two-column frame
match = X.idxmax(axis=1).to_frame().reset_index()
# type(match)
# match['hit'] = match.iloc[:,0] == match.iloc[:,1]
# idx = ~match['hit']
# match[idx]
match.head()

In [0]:
%%time

import textdistance
# Element-wise wrapper around the normalized Levenshtein similarity
f = np.vectorize(textdistance.levenshtein.normalized_similarity)

# Column-less frame indexed by the distinct non-empty project['loc'] values, sorted
# NOTE(review): `project` as built by the groupby/agg cell has no 'loc' column — confirm.
X = project[['loc']].drop_duplicates().set_index('loc').drop('').sort_index()


# for y in world['country'].drop_duplicates():
#     X[y] = f(X.index, y)

# NOTE(review): as shown, this loop header has no body, so the cell cannot be
# parsed; the intended body is not recoverable from here — restore or delete it.
for idx, x in X.iterrows():




# For every row of `world`, print its country and add a column to X holding the
# similarity of each X index value to that row's 'all' string.
# NOTE(review): needs a `world` that has an 'all' column — confirm which version is live.
for idx, y in world.iterrows():
    print(y['country'])


    X[y['all']] = f(X.index, y['all'])



# country_match.shape
# Per X row: the column label with the highest score, as a two-column frame
match = X.idxmax(axis=1).to_frame().reset_index()
# type(match)
# match['hit'] = match.iloc[:,0] == match.iloc[:,1]
# idx = ~match['hit']
# match[idx]
match.head()

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Switzerland
Switzerland
Switzerland
Switzerland
Switzerland
Switzerland
Switzerland
Switzerland
Switzerland
Switzerland
Switzerland
Switzerland
Switzerland
Switzerland
Switzerland
Switzerland
Switzerland
Switzerland
Switzerland
Switzerland
Switzerland
Switzerland
Switzerland
Switzerland
Switzerland
Switzerland
Switzerland
Switzerland
Switzerland
Syria
Syria
Syria
Syria
Syria
Syria
Syria
Syria
Syria
Syria
Syria
Syria
Syria
Syria
Syria
Syria
Syria
Syria
Syria
Syria
Syria
Syria
Syria
Syria
Syria
Syria
Syria
Syria
Syria
Syria
Syria
Syria
Syria
Syria
Syria
Syria
Syria
Syria
Syria
Syria
Syria
Syria
Syria
Syria
Syria
Syria
Syria
Syria
Syria
Syria
Syria
Syria
Syria
Syria
Syria
Syria
Syria
Syria
Syria
Syria
Syria
Syria
Syria
Syria
Syria
Syria
Syria
Syria
Syria
Syria
Syria
Syria
Syria
Syria
Taiwan
Taiwan
Taiwan
Taiwan
Taiwan
Taiwan
Taiwan
Taiwan
Taiwan
Taiwan
Taiwan
Taiwan
Taiwan
Taiwan
Taiwan
Taiwan
Taiwan
Taiwan
Taiwan
Taiwan
Tai

In [0]:
match.head(100)

Unnamed: 0,loc,0
0,Albania,"Fier, Fier, Albania"
1,Algeria,"Mila, Mila, Algeria"
2,Antigua And Barbuda,"Saint John’S, Saint John, Antigua And Barbuda"
3,Argentina,"Salta, Salta, Argentina"
4,Aruba,"Babijn, N/A, Aruba"
5,Australia,"Kew, Victoria, Australia"
6,Austria,"Graz, Styria, Austria"
7,Bahamas,"Ati, Batha, Chad"
8,Bangladesh,"Dhaka, Dhaka, Bangladesh"
9,Belgium,"Mol, Flanders, Belgium"


In [0]:
world['nosubcountry'].drop_duplicates().shape

(22451,)

In [0]:
import textdistance
f = np.vectorize(textdistance.levenshtein.normalized_similarity)

# Start from country_raw's index with no columns
country_match = country_raw.loc[:, []].copy()

# One similarity column per entry of country_world
for y in country_world:
    country_match[y] = f(country_match.index, y)

# Best-scoring column label for every row
country_match.idxmax(axis=1)



Afghanistan                                              Afghanistan
Albania                                                      Albania
Alegre                                                       Algeria
Algeria                                                      Algeria
American Samoa                                        American Samoa
Andorra                                                      Andorra
Angola                                                        Angola
Antigua And Barbuda                              Antigua And Barbuda
Apia                                                         Albania
Argenitina                                                 Argentina
Argentina                                                  Argentina
Armenia                                                      Armenia
Aruba                                                          Aruba
Austraila                                                  Australia
Australia                         

In [0]:

# city_all = pd.read_json(f"{path}/raw/world-cities_json.json", dtype=str)\
#     .apply(lambda x: x.str.strip().str.title()).fillna('').replace(regex=r'^.{0,1}$', value='')\
#     .convert_dtypes()

# country_raw = pd.concat([project['loc'], project_entity_raw['country']]).value_counts().drop('').sort_index()

# %timeit country_all = city_all['country'].unique()
# %timeit country_all = city_all['country'].drop_duplicates()
# Distinct, whitespace-stripped country names in sorted order (original row labels kept).
# NOTE(review): `city_all` is only assigned in the commented-out statement above,
# so this presumably fails with NameError on a fresh kernel — confirm.
country_all = city_all['country'].str.strip().drop_duplicates().sort_values()
# type(country_all)
country_all

15                                        Afghanistan
575                                     Aland Islands
65                                            Albania
5460                                          Algeria
332                                    American Samoa
0                                             Andorra
102                                            Angola
64                                           Anguilla
63                                Antigua And Barbuda
129                                         Argentina
85                                            Armenia
572                                             Aruba
368                                         Australia
333                                           Austria
576                                        Azerbaijan
2304                                          Bahamas
1023                                          Bahrain
654                                        Bangladesh
653                         

In [0]:
city.dtypes

country       string
geonameid     string
name          string
subcountry    string
dtype: object

In [0]:
# ! pip install textdistance


Collecting textdistance[extras]
  Downloading https://files.pythonhosted.org/packages/3f/18/31397b687f50ffae65469175f07faa68f288e27fcd8716276004c42e5637/textdistance-4.1.5-py3-none-any.whl
Collecting python-Levenshtein; extra == "extras"
[?25l  Downloading https://files.pythonhosted.org/packages/42/a9/d1785c85ebf9b7dfacd08938dd028209c34a0ea3b1bcdb895208bd40a67d/python-Levenshtein-0.12.0.tar.gz (48kB)
[K     |████████████████████████████████| 51kB 2.8MB/s 
Collecting pyxDamerauLevenshtein; extra == "extras"
[?25l  Downloading https://files.pythonhosted.org/packages/b5/54/2d398545cae80d2fc8444345542ad5f3ffab0694c8efb8ed2fbe92017305/pyxDamerauLevenshtein-1.5.3.tar.gz (58kB)
[K     |████████████████████████████████| 61kB 6.6MB/s 
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Collecting abydos; extra == "extras"
[?25l  Downloading https://files.pythonhosted.org/pack

In [0]:
# ! pip install leven
# ! pip install StringDist

from leven import levenshtein
levenshtein('quinten','Quintin')


import stringdist
# stringdist.levenshtein(['test', 'gh'], 'testing')

# r = np.vectorize(stringdist.levenshtein)
# r(['bh','quinten'],['Quintin','k'])

import textdistance
textdistance.hamming.normalized_similarity(['test', 'gh'], 'text')



# levenshtein('quinten',['Quintin', 'gth'])

0.25

In [0]:
# Rows of `city` located in the United States
idx = city['country'] == "United States"

# Distinct subcountries for those rows. The original referenced `.unique`
# without calling it and never displayed the result.
display(city.loc[idx, 'subcountry'].unique())

# How many rows share each (name, country) pair, smallest counts first
city.groupby(['name', 'country'])['subcountry'].count().sort_values()

Output hidden; open in https://colab.research.google.com to view.

In [0]:
entity_type_raw
write_file(entity_type_raw, '/raw/entity_type_raw')

In [0]:
# Link counts per (city, country).
# NOTE(review): `project_entity` is not defined anywhere in the visible notebook
# (the long table is called `project_entity_raw`) — presumably a stale name; confirm.
# NOTE(review): on the two-level index, drop(['','']) presumably removes every
# row whose city is '' rather than only the ('', '') pair — confirm the intent.
city = project_entity.groupby(['city', 'country'])['un_idx'].count().drop(['',''])

# Occurrence count of every place named as a project location or partner country.
# NOTE(review): `project` as built by the groupby/agg cell has no 'loc' column — confirm.
country = pd.concat([project['loc'], project_entity['country']]).value_counts().drop('')
# X


# X = project['loc']
# project_country = X[X != ''].value_counts()

# X = project_entity['country']
# entity_country = X[X != ''].value_counts()

# all_country = entity_country.add(project_country, fill_value=0).astype(int).sort_values()

# X = X[X != '']
# entity_city = X[X != ''].value_counts()
# entity_city
# X.head()

ERROR! Session/line number was not unique in database. History logging moved to new session 65


United States                       3019
United Kingdom                       815
Switzerland                          782
France                               699
Global                               658
Kenya                                421
Italy                                406
Belgium                              393
Australia                            364
India                                343
Germany                              342
Canada                               306
Brazil                               287
Fiji                                 259
Netherlands                          256
Indonesia                            247
Japan                                241
Sweden                               216
China                                213
New Zealand                          206
Samoa                                206
Honduras                             203
Nigeria                              203
Dominican Republic                   196
Spain           