In [0]:
# Make sure we have the new pandas 1.0 (Jan 2020) so we can use the new string dtype
! pip install pandas --upgrade

# Fuzzy text matching
! pip install "textdistance[extras]" --upgrade

import numpy as np
import pandas as pd
# Show every row when a DataFrame is displayed (no truncation).
pd.set_option('display.max_rows', None)

# Root folder; the read/write calls in the cells below build their file names from it.
# NOTE(review): looks like a Google Drive mount inside Colab - confirm the drive is mounted before running.
path = '/content/drive/My Drive/active/m5362_20sp_data_warehousing/PD4SDG/PD4SDG_new/data sources'

def formatter(X):
    """Return a cleaned copy of ``X`` with normalised column names and text.

    Steps applied to every string/object column (and to the column names):
    trim whitespace, replace missing values and strings of length 0 or 1 with
    '', expand a leading "st." / "st" abbreviation to "saint", replace '&'
    with 'and', and title-case the result. Column names are lower-cased
    afterwards.

    Parameters
    ----------
    X : pandas.DataFrame
        Input frame; it is copied, never modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``X`` converted with ``convert_dtypes()`` whose text columns
        are cast to the "string" dtype.
    """
    # convert to modern dtypes like "string" introduced in Pandas 1.0 (Jan 2020)
    df = X.copy().convert_dtypes()

    def f(x):
        # regex= is passed explicitly: its default changed between pandas
        # versions, and the anchored patterns below are meaningless as literals.
        return (
            x.str.strip().fillna('').str.lower()
            .str.replace(r'^.{0,1}$', '', regex=True)
            .str.replace(r'^st\.', 'saint', regex=True)
            # \b keeps ordinary words such as "stockholm" from becoming "saintockholm"
            .str.replace(r'^st\b', 'saint', regex=True)
            .str.replace('&', 'and', regex=False)
            .str.title()
        )

    # format column names, but use lowercase (my preference)
    df.columns = f(df.columns).str.lower()

    # format string columns
    strings = df.select_dtypes(include=['string','object']).columns   # finds string columns
    df[strings] = df[strings].apply(f).astype('string')   # formats them
    return df

In [0]:
# Build the cleaned city table and export it.
world_raw = pd.read_json(f"{path}/raw/world-cities.json").set_index('geonameid')
# Columns kept in the export. This was previously only defined in a later cell,
# so the cell failed with NameError on a fresh kernel.
cols = ['city', 'subcountry', 'country']
world = formatter(world_raw) \
    .rename(columns={'name':'city'})
world = world[cols].drop_duplicates().sort_values(['country', 'subcountry', 'city'])
world.to_csv(f"{path}/city.csv")

In [0]:
# Country statistics: the source file uses ',' as the decimal separator.
country_data_raw = pd.read_csv(f"{path}/raw/countries of the world.csv", decimal=',')
country_data = formatter(country_data_raw)
# Shift the row index to start at 1 and label it 'id' before export.
country_data.index = (country_data.index + 1).rename('id')
country_data.to_csv(f"{path}/country_data.csv")

In [0]:
# Load the raw PD4SDG workbook and apply the common cleaning from formatter().
pd4sdg_raw = pd.read_excel(f"{path}/raw/PD4SDG.xlsx")
# Rename to the short column names used by the cells below ('title' maps to itself).
pd4sdg = formatter(pd4sdg_raw) \
    .rename(columns={'project location 1':'site', 'title':'title', 'project_idx':'un_id'})
# Remove apostrophe characters from the project identifiers.
pd4sdg['un_id'] = pd4sdg['un_id'].str.replace("'", "")

In [0]:
# One row per distinct un_id, keeping the longest title/site seen for it.
def f(L):
    """Return the longest string in ``L`` (the first one on ties)."""
    return max(L, key=len)

project = (
    pd4sdg.groupby('un_id')
    .agg(title=('title', f), site=('site', f), repeats=('un_id', 'count'))
    .reset_index()
)
# 1-based row index labelled 'id', written out as the first CSV column.
project.index = (project.index + 1).rename('id')
project.to_csv(f"{path}/project.csv")

In [0]:
# Get project_entity links: one row per (project, partner column) pair with a non-empty name
partners = ['lead partner'] + [f"partner {i+1}" for i in range(237)]

# Temp list to hold results
L = []
for (n, c) in enumerate(partners):
    # get partner columns, rename cols, drop rows where name is ''
    s = pd4sdg[['un_id', c]].rename(columns={c: 'name'})

    # record partner number on project (in case precedence matters - we don't believe it does).
    # assign() returns a new frame, which avoids writing into a filtered slice.
    s = s[s['name'] != ''].assign(n=n)

    # Append
    L.append(s)

# concat the frames stored in L; ignore_index gives every row its own label so
# the 1-based 'id' index below is unique (previously each partner column reused
# the row labels of pd4sdg).
project_entity = pd.concat(L, ignore_index=True)
project_entity.index += 1
project_entity.index.rename('id', inplace=True)
# The id index is kept in memory only; the CSV holds un_id, name and n.
project_entity.to_csv(f"{path}/project_entity.csv", index=False)

# Text matching code

In [0]:
import textdistance
def textdist_func(a, b):
    """Normalised Levenshtein similarity of two strings.

    Returns 0.0 when either argument is not a non-empty string-like value
    (empty string, None, NaN, ...); otherwise returns
    ``textdistance.levenshtein.normalized_similarity(a, b)``.
    """
    try:
        # Probe: both values must be indexable, non-empty and have str-like first items.
        a[0].lower() + b[0].lower()
    except (TypeError, IndexError, KeyError, AttributeError):
        # Only "not a usable string" failures are treated as no similarity;
        # KeyboardInterrupt and other unrelated errors now propagate.
        return 0.0
    return textdistance.levenshtein.normalized_similarity(a, b)

# Element-wise version of textdist_func (2 inputs, 1 output) producing object arrays.
textdist_ufunc = np.frompyfunc(textdist_func, 2, 1)

def textdist(A, B):
    """Similarity of every element of ``A`` against every element of ``B``.

    Returns a float array whose shape is ``A.shape + B.shape`` (ufunc outer product).
    """
    return textdist_ufunc.outer(A, B).astype('float')


def large(X, k=2, axis=0):
    """Return the ``k`` largest values along ``axis`` and their positions.

    Parameters
    ----------
    X : array_like
        Input values.
    k : int, default 2
        Number of values to keep. ``k <= 1`` keeps only the single largest.
    axis : int, default 0
        Axis to search along; negative values count from the last axis.

    Returns
    -------
    (values, indices) : tuple of numpy.ndarray
        Both have the shape of ``X`` with ``axis`` shortened to ``max(k, 1)``.
        Entries are ordered from largest to smallest along ``axis``;
        ``indices`` holds the positions of those entries in the original ``X``.
    """
    X = np.asarray(X)
    if k <= 1:
        srt = np.expand_dims(np.argmax(X, axis), axis)
        return np.take_along_axis(X, srt, axis), srt

    # Work with the search axis in front, then move it back at the end.
    X = np.moveaxis(X, axis, 0)
    # argpartition finds the top-k in arbitrary order ...
    top = np.argpartition(X, -k, axis=0)[-k:]
    X = np.take_along_axis(X, top, axis=0)
    # ... so sort just those k entries in descending order.
    order = np.argsort(X, axis=0)[::-1]
    X = np.take_along_axis(X, order, axis=0)
    srt = np.take_along_axis(top, order, axis=0)
    # moveaxis accepts negative destinations, unlike the previous rollaxis(X, 0, axis+1).
    return np.moveaxis(X, 0, axis), np.moveaxis(srt, 0, axis)


def text_match(orig, targ, num_matches=1):
    """Find the ``num_matches`` most similar rows of ``targ`` for each row of ``orig``.

    Both inputs are turned into DataFrames, de-duplicated, and rows whose
    cells are all '' are dropped. Every cell of ``orig`` is scored against
    every cell of ``targ`` with ``textdist``; the score of a row pair is the
    maximum over all their cell pairs.

    Parameters
    ----------
    orig, targ : pandas.Series or pandas.DataFrame
        Text to match and the candidates to match against.
    num_matches : int, default 1
        Number of best candidates to report per ``orig`` row.

    Returns
    -------
    (matches, all_scores)
        ``matches`` is the cleaned ``orig`` frame with, for i = 1..num_matches,
        the columns ``score_i`` (0-100, one decimal), ``index_i`` (index label
        of the matched ``targ`` row) and ``name_i`` (FIRST column of that
        ``targ`` row, even when another column produced the best score).
        ``all_scores`` is the raw 4-D score array
        (orig row, orig column, targ row, targ column).
    """
    def f(X):
        df = pd.DataFrame(X.copy()).drop_duplicates()
        m = (df!='').any(axis=1)
        # explicit copy: columns are added to the result below
        return df[m].copy()
    orig = f(orig)
    targ = f(targ)

    all_scores = textdist(orig.to_numpy(), targ.to_numpy())
    # best cell-pair score for every (orig row, targ row)
    max_scores = all_scores.max((1,3))
    # rank targ rows (axis 0 after the transpose) for each orig row
    best_score, srt = large(max_scores.T, k=num_matches, axis=0)

    index = targ.index.to_numpy()
    names = targ.iloc[:, 0].to_numpy()
    for i in range(num_matches):
        orig[f"score_{i+1}"] = (best_score[i]*100).round(1)
        orig[f"index_{i+1}"] = index[srt[i]]
        orig[f"name_{i+1}"] = names[srt[i]]

    return orig, all_scores

In [0]:
# Match entity sites against the known site list and export a review sheet.
num_matches = 3
all_site = pd.read_csv(f"{path}/output/site.csv").set_index('site_id')

entity   = pd.read_csv(f"{path}/output/entity.csv").set_index('id')[['name', 'type', 'site']]
# Boolean mask: only candidate sites whose site_id is above 5000 are used.
s = all_site.index > 5000
# NOTE(review): [:5] limits matching to the first five entities - looks like a debugging limit, confirm.
# Candidates are the columns of all_site from position 3 onwards; `s` is rebound to the raw score array here.
df, s = text_match(entity['site'][:5], all_site.iloc[s,3:], num_matches)
# Blank decision columns for manual review; the first match is pre-marked with 'x'.
for i in range(num_matches):
    df[f"use_match_{i+1}"] = ''
df[f"use_match_{1}"] = 'x'
df['no_match_found'] = ''
# Join on the index; overlapping entity columns get the '_orig' suffix. Sorted by ascending score_1.
entity_site_fix = entity.join(df, lsuffix='_orig').sort_values('score_1')
entity_site_fix.to_csv(f"{path}/output/entity_site_fix.csv")

# Old or Experimental Code

In [0]:
## Read and process entity data
# NOTE(review): experimental cell. `cat` and `world_site` are only defined in
# later cells, `entity_data_similarity` is only created in a later cell, and
# `write_file` is not defined anywhere in the visible notebook - this cell
# relies on out-of-order execution; confirm before reuse.
entity = pd.read_csv(f"{path}/output/entity.csv")


check_sites = True
look_for_duplicates = True
num_matches = 3

# Clean the entity sheet and stop if any required field is blank.
entity_data = formatter(pd.read_excel(f"{path}/raw/entity_data.xlsx"))
missing_idx = (entity_data[['name','type','city','country']]=='').any(axis=1)
if missing_idx.any():
    display(entity_data[missing_idx])
    raise Exception('Missing data in entity_data.xlsx')
# presumably a combined "city, subcountry, country" string per entity - depends on `cat`, verify
entity_data['site'] = cat(entity_data[['city', 'subcountry', 'country']])

if check_sites:
    # Fuzzy-match every distinct entity/project site against world_site; lowest scores first.
    site = pd.concat([entity_data['site'], project['site']]).drop_duplicates().reset_index(drop=True)
    site_fix, s = text_match(site, world_site, num_matches)
    site_fix.sort_values('score_1', ascending=True, inplace=True)

    # NOTE(review): site_fix is not used after this point; the lines below act on entity_data_similarity instead.
    for i in range(num_matches):
        entity_data_similarity[f"use_match_{i+1}"] = ''

    write_file(entity_data_similarity, 'entity_data_similarity')


# idx = proj_site_fix['score_1'] < 100
# proj_site_fix = proj_site_fix.loc[idx]

In [0]:
# Reload and validate the entity sheet: every row needs name, type, city and country.
entity_data = formatter(pd.read_excel(f"{path}/raw/entity_data.xlsx"))
missing_idx = (entity_data[['name','type','city','country']]=='').any(axis=1)
if missing_idx.any():
    # Show the offending rows before aborting.
    display(entity_data[missing_idx])
    raise Exception('Missing data in entity_data.xlsx')
# Display the 'type' column.
entity_data['type']

In [0]:
# Build the world location table at three levels (country, subcountry, city),
# each with up to four alternative text forms (form_1..form_4) used for matching.
world_raw = pd.read_json(f"{path}/raw/world-cities.json").set_index('geonameid')
cols = ['city', 'subcountry', 'country']
world = formatter(world_raw) \
    .rename(columns={'name':'city'})

world = world[cols].drop_duplicates().sort_values(cols[::-1])

F = [f"form_{i+1}" for i in range(4)]
for f in F:
    world[f] = ''

def cat(df):
    """Concatenate the columns of each row with ", " and strip leading/trailing commas and spaces."""
    return df.add(", ").sum(axis=1).str.strip(", ")

# An earlier, duplicated computation of country_df / subcountry_df (immediately
# overwritten below) and a dangling `pd.` statement were removed from this cell.

# City level: keeps the original geonameid index.
df = world.copy()
df[F[0]] = cat(df[['city', 'subcountry', 'country']])
df[F[1]] = cat(df[['city',               'country']])
df[F[2]] = cat(df[['city', 'subcountry'           ]])
df[F[3]] = cat(df[['city'                         ]])
city_df = df.copy()


# Subcountry level: city blanked.
df = world.copy()
df['city'] = ''
df[F[0]] = cat(df[['subcountry', 'country']])
df[F[1]] = cat(df[['subcountry']])
subcountry_df = df.copy().reset_index(drop=True)
subcountry_df.index += 1000


# Country level: city and subcountry blanked, plus two extra pseudo-countries.
df = world.copy()
df[['city', 'subcountry']] = ''
df2 = df.iloc[:2].copy()
df2['country'] = ['Global', 'European Union']
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0
df = pd.concat([df2, df])
df[F[0]] = cat(df[['country']])
country_df = df.copy().reset_index(drop=True)


# Countries and subcountries get fresh sequential labels; cities keep their geonameid.
world = pd.concat([country_df, subcountry_df]).drop_duplicates().reset_index(drop=True)
world = pd.concat([world, city_df]).drop_duplicates()
world.index.rename('geonameid', inplace=True)

# Only the text forms are needed for matching.
world_site = world[F]

with pd.ExcelWriter(f'{path}/world.xlsx') as writer:
    world_raw.to_excel(writer, sheet_name='raw')
    world.to_excel(writer, sheet_name='clean')
    world_site.to_excel(writer, sheet_name='sites')

In [0]:
# do this in sql

# name fix to make country names in country dataset compatible with country names in the city dataset
# NOTE(review): `rep` is only referenced by the commented-out code further down, so it currently has no effect.
rep =  {
    'Bahamas, The': 'Bahamas',
    'British Virgin Is.': 'British Virgin Islands',
    'Burma': 'Myanmar',
    'Central African Rep.': 'Central African Republic',
    'Congo, Dem. Rep.': 'Democratic Republic Of The Congo',
    'Congo, Repub. Of The': 'Republic Of The Congo',
    "Cote D'Ivoire": 'Ivory Coast',
    'Gambia, The': 'Gambia',
    # 'Gaza Strip':
    'Korea, North': 'North Korea',
    'Korea, South': 'South Korea',
    'Macau': 'Macao',
    'Micronesia, Fed. St.': 'Micronesia',
    'N. Mariana Islands': 'Northern Mariana Islands',
    # 'Netherlands Antilles':
    'Turks And Caicos Is': 'Turks And Caicos Islands',
    'Virgin Islands': 'U.S. Virgin Islands',
    # 'West Bank':
}
# Same load/clean/1-based 'id' index as the earlier country_data cell, but written to a different file.
country_data_raw = pd.read_csv(f"{path}/raw/countries of the world.csv", decimal=',')
country_data = formatter(country_data_raw)
country_data.index += 1
country_data.index.rename('id', inplace=True)

country_data.to_csv(f"{path}/raw/countries of the world cleaned.csv")

# country_data['country'] = country_data['country'].replace(rep).astype('string')
# country_data.set_index('country', inplace=True)

# # Create "Palestine" to be compatible with city dataset
# def combine(old):
#     x = country_data.loc[old]
#     y = x.iloc[0].copy()
#     y[1:3] = x.iloc[:,1:3].sum().astype(int)
#     y[3] = np.round(y[1] / y[2], 2)
#     y[4:] = np.round(x.iloc[:,4:].mean(), 2)
#     return y

# pal = ['Gaza Strip','West Bank']
# country_data.loc['Palestinian Territory'] = combine(pal)

# with pd.ExcelWriter(f'{path}/country_data.xlsx') as writer:  
#     country_data_raw.to_excel(writer, sheet_name='raw')
#     country_data.to_excel(writer, sheet_name='clean')
country_data.head()

In [0]:
entity_data['type'].value_counts()

In [0]:
## Read and process entity data
# Looks for near-duplicate entity names by matching the name list against itself.
# NOTE(review): `write_file` is not defined anywhere in the visible notebook - confirm.
look_for_duplicates = True
num_matches = 3

entity_data = formatter(pd.read_excel(f"{path}/raw/entity_data.xlsx"))
missing_idx = (entity_data[['name','type','city','country']]=='').any(axis=1)
if missing_idx.any():
    display(entity_data[missing_idx])
    raise Exception('Missing data in entity_data.xlsx')

if look_for_duplicates:
    # Only the first 100 names are compared.
    A = entity_data['name'][:100]
    B = A
    # One extra match is requested - presumably because each name's best match is itself; verify.
    left, s = text_match(A, B, num_matches+1)
    # display(left.head())
    # idx = left['score_1'] > 0.8
    # left = left.loc[idx]

    attr = ['score', 'index', 'name']
    # Column order score_1, index_1, name_1, score_2, ... (the comprehension variable reuses the name `s`).
    srt = [f"{s}_{i}" for i in range(1, num_matches+1) for s in attr]
    entity_data_similarity = left[srt].sort_values('score_1', ascending=False)

    # Review columns: keep_orig is pre-marked, the use_match_* columns start blank.
    entity_data_similarity['keep_orig'] = 'x'
    for i in range(num_matches):
        entity_data_similarity[f"use_match_{i+1}"] = ''

    write_file(entity_data_similarity, 'entity_data_similarity')
# entity_data_similarity.head()
# NOTE(review): this name is only assigned inside the `if` above.
entity_data_similarity

In [0]:
entity_data.head()

# sites = project['site']

In [0]:
# Bare expression; not displayed because it is not the last statement of the cell.
project['site']


# Match each distinct project site against world_site (defined in another cell) and keep the top 3 candidates.
proj_site_fix, s = text_match(project['site'].drop_duplicates(), world_site, num_matches=3)
proj_site_fix.sort_values('score_1', ascending=False, inplace=True)
# Keep only sites whose best score is below 100, i.e. drop exact matches.
idx = proj_site_fix['score_1'] < 100
proj_site_fix = proj_site_fix.loc[idx]


In [0]:
# Alternative construction of world_site from separate per-level frames.
# NOTE(review): reads "world-cities_json.json" whereas the first cells read "world-cities.json" - confirm which file is current.
# NOTE(review): `F` (the form_* column names) is not defined in this cell; it must come from another cell.
world_raw = pd.read_json(f"{path}/raw/world-cities_json.json").set_index('geonameid').drop_duplicates()

cols = ['city', 'subcountry', 'country']
world = formatter(world_raw) \
    .rename(columns={'name':'city'})
world = world[cols].sort_values(cols[::-1])

def cat(df):
        # Concatenate each row's columns with ", " and strip leading/trailing commas and spaces.
        return df.add(", ").sum(axis=1).str.strip(", ")

# A: city-level forms
A = pd.DataFrame()
A[F[0]] = cat(world[['city', 'subcountry', 'country']])
A[F[1]] = cat(world[['city',               'country']])
A[F[2]] = cat(world[['city', 'subcountry'           ]])
A[F[3]] = cat(world[['city'                         ]])

# B: subcountry-level forms (the second form here is the country alone)
B = pd.DataFrame()
B[F[0]] = cat(world[['subcountry', 'country']])
B[F[1]] = cat(world[[              'country']])

# C: country-level form
C = pd.DataFrame()
C[F[0]] = cat(world[['country']])

# D: two extra pseudo-countries, built by overwriting the first two rows of C
D = C.iloc[:2].copy()
D[F[0]] = ['Global', 'European Union']

# D, C and B get fresh sequential labels; A keeps its geonameid index.
world_site = pd.concat([D, C, B]).reset_index(drop=True)
world_site = pd.concat([world_site, A]).drop_duplicates()

world_site.shape

In [0]:
# Variant of the world-table build using frames A (city), B (subcountry), C (country).
world_raw = pd.read_json(f"{path}/raw/world-cities_json.json").set_index('geonameid')
cols = ['city', 'subcountry', 'country']
world = formatter(world_raw) \
    .rename(columns={'name':'city'})
world = world[cols].sort_values(cols[::-1])#['country', 'subcountry', 'city'])

F = [f"form_{i+1}" for i in range(4)]
for f in F:
    world[f] = ''

def cat(df):
    """Concatenate the columns of each row with ", " and strip leading/trailing commas and spaces."""
    return df.add(", ").sum(axis=1).str.strip(", ")

# City level
A = world.copy()
A[F[0]] = cat(A[cols])
A[F[1]] = cat(A[['city', 'country']])
A[F[2]] = cat(A[['city', 'subcountry']])
A[F[3]] = cat(A[['city']])


# Subcountry level: city blanked, duplicates removed
B = world.copy()
B['city'] = ''
B = B.drop_duplicates()
B[F[0]] = cat(B[['subcountry', 'country']])
B[F[1]] = cat(B[['subcountry']])

# Country level: city and subcountry blanked, plus two extra pseudo-countries
C = world.copy()
C[['city', 'subcountry']] = ''
D = C.iloc[:2].copy()
D['country'] = ['Global', 'European Union']
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0
C = pd.concat([D, C]).drop_duplicates()
C[F[0]] = cat(C[['country']])

# Country and subcountry rows get fresh sequential labels; city rows keep their geonameid.
D = pd.concat([C, B], ignore_index=True)
world = pd.concat([D, A])
world.index.rename('geonameid', inplace=True)


world.head()
# world.tail()

# A = world.copy()
# A['city'] = ''
# A = A.drop_duplicates()


# B = A.copy()
# B['subcountry'] = ''
# B = B.drop_duplicates()

# C = B.iloc[:2].copy()
# C['country'] = ['Global', 'European Union']

# D = pd.concat([C, B, A], ignore_index=True)
# world = pd.concat([D, world])
# world.index.rename('geonameid', inplace=True)

# with pd.ExcelWriter(f'{path}/world.xlsx') as writer:
#     world_raw.to_excel(writer, sheet_name='raw')
#     world.to_excel(writer, sheet_name='clean')
#     L = {'site':['city','subcountry','country'],
#         'nocountry':['city','subcountry'],
#         'nosubcountry':['city','country'],
#         'nocity':['subcountry','country'],
#         'city':['city'],
#         'subcountry':['subcountry'],
#         'country':['country'],
#     }
#     world_mini = dict()
#     for lev, cols in L.items():
#         X = world[cols].add(", ").sum(axis=1).str.strip(", ").drop_duplicates().sort_index()
#         X = X[X != '']
#         world_mini[lev] = X
#         X.to_excel(writer, sheet_name=lev)

In [0]:
# Resolve partner names to known entities and list fuzzy candidates for the unmatched ones.
# NOTE(review): `project_entity_raw` and `write_file` are not defined anywhere in the visible notebook - confirm.
entity_raw = project_entity_raw.drop(columns=['un_idx', 'n']).drop_duplicates()\
    .sort_values(['name', 'country', 'city']).rename(columns={'name':'name_orig'})

# apply entity_fix
entity_fix = formatter(pd.read_excel(f"{path}/raw/entity_fix.xlsx"))
entity = pd.merge(entity_raw, entity_fix, how='left', on='name_orig')
# Where entity_fix has no corrected name, fall back to the original name.
idx = entity['name'].isna()
entity.loc[idx, 'name'] = entity.loc[idx, 'name_orig']

# merge entity_data
entity = pd.merge(entity, entity_data, how='left', on='name', suffixes=('_orig',''))

# look for unmatched entities
idx = entity['type'].isna()
if idx.any():
    entity_nomatch = entity.loc[idx,:'country_orig'].drop_duplicates()

    num_matches = 3
    A = entity_nomatch['name_orig']
    B = entity_data['name']
    left, s = text_match(A, B, num_matches)

    # NOTE(review): the loops below use suffixes _0.._{num_matches-1}, but the
    # text_match defined in this notebook creates score_/index_/name_ columns
    # numbered from 1 - looks like this was written against an older version; confirm.
    attr = ['type', 'city', 'subcountry', 'country']
    for i in range(num_matches):
        attr_new = [f"{s}_{i}" for s in attr]
        right = entity_data[attr].rename(columns=dict(zip(attr, attr_new)))
        left = left.join(right, on=f"index_{i}", how='left')

    attr = ['score', 'index', 'name'] + attr
    # Select the 0-based columns and rename them to 1-based.
    srt = [f"{s}_{i}"   for i in range(num_matches) for s in attr]
    ren = [f"{s}_{i+1}" for i in range(num_matches) for s in attr]
    entity_nomatch_similarity = left[srt].rename(columns=dict(zip(srt,ren))).sort_values('score_1', ascending=False)

    # Blank columns for manual review.
    entity_nomatch_similarity['use_orig'] = ''
    for i in range(num_matches):
        entity_nomatch_similarity[f'use_match_{i+1}'] = ''

    for s in attr[2:]:
        entity_nomatch_similarity[f'{s}_new'] = ''

# NOTE(review): these two lines run even when the `if` above was skipped, in which case the name is unassigned by this cell.
write_file(entity_nomatch_similarity, 'entity_nomatch_similarity')
entity_nomatch_similarity.head()

In [0]:
# Build `world` plus `world_mini`: one Series of distinct location strings per
# combination of city / subcountry / country, each written to its own sheet.
world_raw = pd.read_json(f"{path}/raw/world-cities_json.json").set_index('geonameid')
world = formatter(world_raw) \
    .rename(columns={'name':'city'}) \
    [['city', 'subcountry', 'country']] \
    .sort_values(['country', 'subcountry', 'city'])

F = [f"form_{i+1}" for i in range(4)]
world[F] = ''

# level name -> columns that make up the location string at that level
L = {'site':['city','subcountry','country'],
    'nocountry':['city','subcountry'],
    'nosubcountry':['city','country'],
    'nocity':['subcountry','country'],
    'city':['city'],
    'subcountry':['subcountry'],
    'country':['country'],
}

# NOTE(review): this cell was not runnable as written (`for  in`, an undefined
# `d_mini`, and a `to_excel` call before `writer` existed). It is reconstructed
# here from the commented-out version in an earlier cell - confirm the intent.
# The broken `world = pd.concat(... world_mini.values() ...)` line is dropped:
# it would have replaced the `world` frame with a Series, while the fix_site
# cells index `world` by its city/subcountry/country columns.
world_mini = dict()
with pd.ExcelWriter(f'{path}/world.xlsx') as writer:
    world_raw.to_excel(writer, sheet_name='raw')
    world.to_excel(writer, sheet_name='clean')
    for lev, cols in L.items():
        X = world[cols].add(", ").sum(axis=1).str.strip(", ").drop_duplicates()#.sort_index()
        X = X[X != '']
        world_mini[lev] = X
        X.to_excel(writer, sheet_name=lev)


In [0]:
# Match the distinct, non-empty project sites against the full "site" strings in world_mini.
# NOTE(review): `world_mini` is not successfully assigned by any runnable cell in the visible notebook - confirm where it comes from.
num_matches = 2
A = project['site'].drop_duplicates()
A = A[A != '']
B = world_mini['site']
left, s = text_match(A, B, num_matches)

project_site_similarity = left
project_site_similarity.head()
# project_site_fix_dict = fix_site(project['site'].to_numpy())
# project_site_fix = pd.DataFrame.from_dict(project_site_fix_dict, orient='index')
# write_file(project_site_fix, 'project_site_fix')

In [0]:
project_site_similarity

In [0]:
def fix_site(orig, num_matches=1):
    """Unfinished draft: runs text_match of ``orig`` against world_mini['site'].

    NOTE(review): everything after the text_match call is commented out, so the
    function has no return statement and returns None. Later cells define
    other functions with the same name.
    """
    B = world_mini['site']
    left, s = text_match(orig, B, num_matches)

    
    # display(left.head(10))


    # attr = ['name']
    # for i in range(num_matches):
    #     attr_new = [f"{s}_{i}" for s in attr]
    #     right = entity_data[attr].rename(columns=dict(zip(attr, attr_new)))
    #     left = left.join(right, on=f"index_{i}", how='left')




    # site = {s:{'scores':[0.0 for _ in range(n)], 'matches':[None for _ in range(n)]} for s in orig}
    
    # # iterate over sites
    # for lev, w in world_mini.items():
    #     d = textdist(orig, w)
        
    #     for s, val in site.items():
            

    #     # iterate over known world location in multiple levels like city, subcountry, country and subset of these 3 items
        

    #         # compute distance scores & find best
    #         score = textdist(np.array(w), s)


    #         newus_scorus = score.max()

    #         # if there is a better score than the current best, we record this improved match
    #         if newus_scorus > biggus_scorus:
    #             biggus_scorus = newus_scorus
    #             site[s]['score'] = biggus_scorus
    #             hits = w[score==biggus_scorus].index
    #             n = len(hits)
    #             site[s]['n'] = n
    #             match = world.loc[hits, ['city','subcountry', 'country']]
    #             if lev in ['subcountry', 'country', ' nocity']:
    #                 match['city'] = ''
    #                 if lev in ['country']:
    #                     match['subcountry'] = ''
    #             site[s]['match'] = match.to_dict(orient='records')
    # return site

# project_site_fix_dict = fix_site(project['site'].to_numpy())
# project_site_fix = pd.DataFrame.from_dict(project_site_fix_dict, orient='index')
# write_file(project_site_fix, 'project_site_fix')


# NOTE(review): with the draft above, fix_site returns None, so from_dict receives None here - confirm.
# `write_file` is not defined anywhere in the visible notebook.
project_site_fix_dict = fix_site(project['site'].to_numpy())
project_site_fix = pd.DataFrame.from_dict(project_site_fix_dict, orient='index')
write_file(project_site_fix, 'project_site_fix')

In [0]:
# Earlier variant of the unmatched-entity review sheet.
# NOTE(review): `project_entity_raw` and `write_file` are not defined anywhere in the visible notebook - confirm.
entity_raw = project_entity_raw.drop(columns=['un_idx', 'n']).drop_duplicates().sort_values(['name', 'country', 'city'])
# entity_raw['subcountry'] = ''
entity_raw = entity_raw[['name','type','city','country']].rename(columns={'name':'name_orig'})



entity_fix = formatter(pd.read_excel(f"{path}/raw/entity_fix.xlsx"))
entity_data = formatter(pd.read_excel(f"{path}/raw/entity_data.xlsx"))
# Every entity_data row needs name, type, city and country.
missing_idx = (entity_data[['name','type','city','country']]=='').any(axis=1)
if missing_idx.any():
    display(entity_data[missing_idx])
    raise Exception('Missing data in entity_data.xlsx')


# Attach corrected names where entity_fix provides one ...
entity = pd.merge(entity_raw, entity_fix, how='left', on='name_orig')

# ... and fall back to the original name otherwise.
idx = entity['name'].isna()
entity.loc[idx, 'name'] = entity.loc[idx, 'name_orig']

entity = pd.merge(entity, entity_data, how='left', on='name', suffixes=('_orig',''))

# Rows with no 'type' after the merge found no entity_data record.
idx = entity['type'].isna()
entity_nomatch = entity.loc[idx,:'country_orig'].drop_duplicates()

num_matches=3
fix, _ = text_match(entity_nomatch['name_orig'], entity_data['name'], num_matches)
# NOTE(review): name_{i} / score_0 below are numbered from 0, but the text_match
# defined in this notebook numbers its columns from 1 - confirm.
for i in range(num_matches):
    fix = fix.join(entity_data.set_index('name'), how='left', on=f"name_{i}")
    L = ['type', 'city', 'subcountry', 'country']
    fix.rename(columns={c:f"{c}_{i}" for c in L}, inplace=True)

L = ['score', 'name'] + L
fix = fix[[f"{attr}_{i}" for i in range(num_matches) for attr in L]]
fix.sort_values('score_0', ascending=False, inplace=True)

entity_nomatch_fix = entity_nomatch.join(fix, on='name_orig', how='left').sort_values('score_0', ascending=False)


# Blank columns for manual review.
entity_nomatch_fix['use_orig'] = ''
for i in range(num_matches):
    entity_nomatch_fix[f'use_match_{i}'] = ''

for c in L[1:]:
    entity_nomatch_fix[f'{c}_new'] = ''

write_file(entity_nomatch_fix, 'entity_nomatch_fix')
entity_nomatch_fix

In [0]:
# Rebuild `entity` (same steps as the earlier entity cells) and inspect a slice of it.
# NOTE(review): `project_entity_raw` is not defined anywhere in the visible notebook - confirm.
entity_raw = project_entity_raw.drop(columns=['un_idx', 'n']).drop_duplicates()\
    .sort_values(['name', 'country', 'city']).rename(columns={'name':'name_orig'})

# apply entity_fix
entity_fix = formatter(pd.read_excel(f"{path}/raw/entity_fix.xlsx"))
entity = pd.merge(entity_raw, entity_fix, how='left', on='name_orig')


# Fall back to the original name where entity_fix has no corrected name.
idx = entity['name'].isna()
entity.loc[idx, 'name'] = entity.loc[idx, 'name_orig']
entity = pd.merge(entity, entity_data, how='left', on='name', suffixes=('_orig',''))


# Display rows 6120-6139 by position.
entity.iloc[6120:6140]

In [0]:
entity_fix.tail()

In [0]:
entity

In [0]:
# Re-run of the last steps of the entity_nomatch_fix cell: join the candidate
# table `fix` onto the unmatched entities and add blank review columns.
# NOTE(review): relies on `entity_nomatch`, `fix`, `num_matches` and `L` left over from other cells;
# `write_file` is not defined anywhere in the visible notebook.
entity_nomatch_fix = entity_nomatch.join(fix, on='name_orig', how='left').sort_values('score_0', ascending=False)
entity_nomatch_fix['use_orig'] = ''
for i in range(num_matches):
    entity_nomatch_fix[f'use_match_{i}'] = ''

for c in L[1:]:
    entity_nomatch_fix[f'{c}_new'] = ''

write_file(entity_nomatch_fix, 'entity_nomatch_fix')
entity_nomatch_fix

In [0]:
entity_nomatch.head()

In [0]:
entity

In [0]:
# entity_nomatch.join(df, on='name_orig').sort_values('score_0', ascending=False)
# Scratch: attach entity_data attributes to each of the name_0..name_2 candidates in `df`.
# NOTE(review): `df` is whatever an earlier cell left behind; the 0-based name_{i}/score_{i}
# columns do not match the 1-based columns produced by the text_match in this notebook - confirm.
num_matches=3
i = 0
X = df
for i in range(num_matches):
    # Default (inner) merge: rows whose candidate name is not in entity_data are dropped.
    X = pd.merge(X, entity_data, left_on=f"name_{i}", right_on='name').drop(columns=['name'])
    L = ['type', 'city', 'subcountry', 'country']
    X.rename(columns={c:f"{c}_{i}" for c in L}, inplace=True)

X = X[[f"{attr}_{i}" for i in range(num_matches) for attr in ['score', 'name', 'type', 'city', 'subcountry', 'country']]]
X
# for i in range(num_matches):
#     X = 

In [0]:
def fix_site(orig):
    """Find, for each distinct site string, the best-scoring known world location(s).

    Uses the notebook-level ``world_mini`` (level name -> Series of location
    strings), ``world`` (frame with city/subcountry/country columns) and
    ``textdist``.

    Parameters
    ----------
    orig : array_like of str
        Raw site strings; duplicates and '' are ignored.

    Returns
    -------
    dict
        ``{site: {'score': best score, 'n': number of ties at that score,
        'match': list of {'city', 'subcountry', 'country'} records}}``.
    """
    # uniquify to avoid wasted effort
    orig = np.unique(orig).tolist()

    # remove trivial case if present
    if '' in orig:
        orig.remove('')

    # site dict records original, best current score, number of ties for that score, and matches with that score
    site = {s:{'score':0.0, 'n':0, 'match':[]} for s in orig}

    # iterate over sites
    for s in site:
        # best current score
        biggus_scorus = 0.0

        # iterate over known world location in multiple levels like city, subcountry, country and subset of these 3 items
        for lev, w in world_mini.items():

            # compute distance scores & find best
            score = textdist(w.to_numpy(), s)
            newus_scorus = score.max()

            # if there is a better score than the current best, we record this improved match
            if newus_scorus > biggus_scorus:
                biggus_scorus = newus_scorus
                site[s]['score'] = biggus_scorus
                hits = w[score==biggus_scorus].index
                site[s]['n'] = len(hits)
                # copy so the blanking below never writes into `world`
                match = world.loc[hits, ['city','subcountry', 'country']].copy()
                # levels without a city component report an empty city
                # ('nocity' was previously spelled ' nocity' and never matched)
                if lev in ['subcountry', 'country', 'nocity']:
                    match['city'] = ''
                    if lev in ['country']:
                        match['subcountry'] = ''
                site[s]['match'] = match.to_dict(orient='records')
    return site

# NOTE(review): `write_file` is not defined anywhere in the visible notebook - confirm.
project_site_fix_dict = fix_site(project['site'].to_numpy())
project_site_fix = pd.DataFrame.from_dict(project_site_fix_dict, orient='index')
write_file(project_site_fix, 'project_site_fix')

In [0]:
def fix_site(orig):
    """Find, for each distinct site string, the best-scoring known world location(s).

    Uses the notebook-level ``world_mini`` (level name -> Series of location
    strings), ``world`` (frame with city/subcountry/country columns) and
    ``textdist``.

    Parameters
    ----------
    orig : array_like of str
        Raw site strings; duplicates and '' are ignored.

    Returns
    -------
    dict
        ``{site: {'score': best score, 'n': number of ties at that score,
        'match': list of {'city', 'subcountry', 'country'} records}}``.
    """
    # uniquify to avoid wasted effort
    orig = np.unique(orig).tolist()

    # remove trivial case if present
    if '' in orig:
        orig.remove('')

    # site dict records original, best current score, number of ties for that score, and matches with that score
    site = {s:{'score':0.0, 'n':0, 'match':[]} for s in orig}

    # iterate over sites
    for s in site:
        # best current score
        biggus_scorus = 0.0

        # iterate over known world location in multiple levels like city, subcountry, country and subset of these 3 items
        for lev, w in world_mini.items():

            # compute distance scores & find best
            score = textdist(w.to_numpy(), s)
            newus_scorus = score.max()

            # if there is a better score than the current best, we record this improved match
            if newus_scorus > biggus_scorus:
                biggus_scorus = newus_scorus
                site[s]['score'] = biggus_scorus
                hits = w[score==biggus_scorus].index
                site[s]['n'] = len(hits)
                # copy so the blanking below never writes into `world`
                match = world.loc[hits, ['city','subcountry', 'country']].copy()
                # levels without a city component report an empty city
                # ('nocity' was previously spelled ' nocity' and never matched)
                if lev in ['subcountry', 'country', 'nocity']:
                    match['city'] = ''
                    if lev in ['country']:
                        match['subcountry'] = ''
                site[s]['match'] = match.to_dict(orient='records')
    return site

# NOTE(review): `write_file` is not defined anywhere in the visible notebook - confirm.
project_site_fix_dict = fix_site(project['site'].to_numpy())
project_site_fix = pd.DataFrame.from_dict(project_site_fix_dict, orient='index')
write_file(project_site_fix, 'project_site_fix')

In [0]:
%%time
import xarray as xr
import textdistance
f = np.vectorize(textdistance.levenshtein.normalized_similarity)

site = project['site'].drop_duplicates().tolist()[:10]
site.remove('')

coords = {'site': site,
          'geonameid': world.index,
          'part': world.columns}
data = xr.DataArray(0.0,
                    dims=coords.keys(),
                    coords=coords)

for part, ser in world.iteritems():
    for geonameid, name in ser.iteritems():
        print(part, name)
        data.loc[:, geonameid, part] = f(site, name)
M = data.max(dim=['geonameid','part'])
match = data >= M


d = dict()
for raw in site:
    mask = match.loc[raw].to_pandas()
    d[raw] = world.where(mask).stack()#to_numpy().ravel()

for raw, match in d.items():
    print(raw)
    print(match)
    print()
    print()

In [0]:
# NOTE(review): overwrites project['site'] with project['loc']; no visible cell creates a 'loc' column - confirm.
project['site'] = project['loc']


# First 10 distinct non-empty sites as an index, with placeholders for the matching results.
site = project['site'].drop_duplicates().to_frame().set_index('site').drop('')[:10]
site['best_score'] = 0.0
# One separate empty list per row.
site['best_matches'] = [[] for _ in site.iterrows()]

In [0]:
site

In [0]:
# Load and clean the city list, indexed by geonameid and sorted by country, subcountry, city.
world = formatter(pd.read_json(f"{path}/raw/world-cities_json.json")) \
    .rename(columns={'name':'city'}) \
    [['geonameid', 'city', 'subcountry', 'country']].set_index('geonameid') \
    .sort_values(['country', 'subcountry', 'city'])
# NOTE(review): the combined columns below ('all', 'nosubcountry', ...) are commented out,
# yet later cells read world['all'] and world['nosubcountry'] - confirm.
# world['nocountry'] =    world['city'] + ', ' + world['subcountry']
# world['nosubcountry'] = world['city'] + ', '                              + world['country']
# world['nocity'] =                              world['subcountry'] + ', ' + world['country']
# world['all'] =          world['city'] + ', ' + world['subcountry'] + ', ' + world['country']

world.head(10)

In [0]:
1+1

In [0]:
%%time

import textdistance
# Element-wise wrapper around the normalised Levenshtein similarity.
f = np.vectorize(textdistance.levenshtein.normalized_similarity)

# Make copy of index from country_raw
# NOTE(review): the comment above mentions country_raw, but the code builds an empty frame
# indexed by the distinct non-empty project['loc'] values.
X = project[['loc']].drop_duplicates().set_index('loc').drop('').sort_index()

# Print every index value.
for x, _ in X.iterrows():
    print(x)
    

In [0]:

# One similarity column per world row: each location in X.index is scored
# against that row's 'all' string.
# The original cell opened an empty `for idx, x in X.iterrows():` loop here
# (a SyntaxError); its row variable was never used, so the header is removed.
# NOTE(review): world['all'] is only created in commented-out code - confirm it exists.
for idx, y in world.iterrows():
    print(y['country'])


    X[y['all']] = f(X.index, y['all'])



# country_match.shape
# Best-scoring column (world 'all' string) for each location.
match = X.idxmax(axis=1).to_frame().reset_index()
# type(match)
# match['hit'] = match.iloc[:,0] == match.iloc[:,1]
# idx = ~match['hit']
# match[idx]
match.head()

In [0]:
%%time

import textdistance
f = np.vectorize(textdistance.levenshtein.normalized_similarity)

# Make copy of index from country_raw
X = project[['loc']].drop_duplicates().set_index('loc').drop('').sort_index()


# for y in world['country'].drop_duplicates():
#     X[y] = f(X.index, y)

for idx, x in X.iterrows():




for idx, y in world.iterrows():
    print(y['country'])


    X[y['all']] = f(X.index, y['all'])



# country_match.shape
match = X.idxmax(axis=1).to_frame().reset_index()
# type(match)
# match['hit'] = match.iloc[:,0] == match.iloc[:,1]
# idx = ~match['hit']
# match[idx]
match.head()

In [0]:
match.head(100)

In [0]:
world['nosubcountry'].drop_duplicates().shape

In [0]:
import textdistance
# Element-wise wrapper around the normalised Levenshtein similarity.
f = np.vectorize(textdistance.levenshtein.normalized_similarity)

# Make copy of index from country_raw
# NOTE(review): `country_raw` is only assigned in commented-out code and `country_world`
# is not defined anywhere in the visible notebook - confirm.
country_match = country_raw[[]].copy()
# Bare expression; not displayed because it is not the last statement of the cell.
country_match.head()

# One similarity column per known country name.
for y in country_world:
    country_match[y] = f(country_match.index, y)
country_match.shape
# Best-scoring column for each row.
country_match.idxmax(axis=1)



In [0]:

# city_all = pd.read_json(f"{path}/raw/world-cities_json.json", dtype=str)\
#     .apply(lambda x: x.str.strip().str.title()).fillna('').replace(regex=r'^.{0,1}$', value='')\
#     .convert_dtypes()

# country_raw = pd.concat([project['loc'], project_entity_raw['country']]).value_counts().drop('').sort_index()

# %timeit country_all = city_all['country'].unique()
# %timeit country_all = city_all['country'].drop_duplicates()
# Sorted distinct country names.
# NOTE(review): `city_all` is only assigned in the commented-out code above - confirm.
country_all = city_all['country'].str.strip().drop_duplicates().sort_values()
# type(country_all)
country_all

In [0]:
city.dtypes

In [0]:
# ! pip install textdistance


In [0]:
# ! pip install leven
# ! pip install StringDist

# Scratch comparison of string-distance packages.
# NOTE(review): the installs for leven and stringdist above are commented out - confirm they are available.
from leven import levenshtein
# Bare expression; not displayed because it is not the last statement of the cell.
levenshtein('quinten','Quintin')


import stringdist
# stringdist.levenshtein(['test', 'gh'], 'testing')

# r = np.vectorize(stringdist.levenshtein)
# r(['bh','quinten'],['Quintin','k'])

import textdistance
# NOTE(review): the first argument is a list and the second a str, so the two are compared as
# sequences of different element types - presumably not the intended element-wise comparison; verify.
textdistance.hamming.normalized_similarity(['test', 'gh'], 'text')



# levenshtein('quinten',['Quintin', 'gth'])

In [0]:
# NOTE(review): `city` is not assigned by any cell above this one in the visible notebook - confirm.
idx = city['country'] == "United States"

# NOTE(review): `.unique` is referenced without parentheses, so the method is not called;
# as a mid-cell expression its value is discarded either way.
city.loc[idx, 'subcountry'].unique

# Number of subcountry values per (name, country) pair, smallest first.
city.groupby(['name', 'country'])['subcountry'].count().sort_values()

In [0]:
# NOTE(review): neither `entity_type_raw` nor `write_file` is defined anywhere in the visible notebook - confirm.
entity_type_raw
write_file(entity_type_raw, '/raw/entity_type_raw')

In [0]:
# Count rows per (city, country) pair, then drop the empty label.
# NOTE(review): the project_entity built earlier in this notebook has columns un_id, name and n -
# no 'city', 'country' or 'un_idx' - so this presumably targets a different version; verify.
city = project_entity.groupby(['city', 'country'])['un_idx'].count().drop(['',''])

# Occurrences of each value across project['loc'] and project_entity['country'], excluding ''.
country = pd.concat([project['loc'], project_entity['country']]).value_counts().drop('')
# X


# X = project['loc']
# project_country = X[X != ''].value_counts()

# X = project_entity['country']
# entity_country = X[X != ''].value_counts()

# all_country = entity_country.add(project_country, fill_value=0).astype(int).sort_values()

# X = X[X != '']
# entity_city = X[X != ''].value_counts()
# entity_city
# X.head()