<a href="https://colab.research.google.com/github/MarciaFG/skill-flow/blob/main/preprocess_20230104.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Scientists' Topic Mobility**

**Author:** Marcia R. Ferreira (Complexity Science Hub Vienna & TU Wien)

**Date:** 03-01-2023

**Task:** Match the GRID city strings to World cities strings

**Input:** `org_metadata.csv` and `World_Cities.csv`

**Output:** 


# Colab Initialization

In [2]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Select the Runtime → "Change runtime type" menu to enable a GPU accelerator, ')
  print('and then re-execute this cell.')
else:
  print(gpu_info)

from psutil import virtual_memory
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

if ram_gb < 20:
  print('To enable a high-RAM runtime, select the Runtime → "Change runtime type"')
  print('menu, and then select High-RAM in the Runtime shape dropdown. Then, ')
  print('re-execute this cell.')
else:
  print('You are using a high-RAM runtime!')

Select the Runtime → "Change runtime type" menu to enable a GPU accelerator, 
and then re-execute this cell.
Your runtime has 13.6 gigabytes of available RAM

To enable a high-RAM runtime, select the Runtime → "Change runtime type"
menu, and then select High-RAM in the Runtime shape dropdown. Then, 
re-execute this cell.


In [None]:
#!pip install -U mxnet-cu101==1.7.0
#!pip install d2l==1.0.0-beta0

# Import required libraries

In [3]:
import numpy as np
import requests
import pandas as pd
from tqdm import tqdm
import torch
import nltk
import io
import matplotlib.pyplot as plt
from google.cloud import bigquery
import humanize
#!pip install google-sheets-to-csv
# Enable Colab's data table formatter for DataFrames displayed in this notebook
from google.colab import data_table
data_table.enable_dataframe_formatter()

## Google Drive Access

You will be asked to click a link to generate a secret key to access your Google Drive. 

Copy the secret key and paste it into the space provided in the notebook.

In [None]:
import os.path
from google.colab import drive

# Mount Google Drive at /content/drive, unless the "My Drive" folder is
# already visible there from an earlier mount.
if not os.path.isdir("/content/drive/My Drive"):
  drive.mount('/content/drive')
else:
  print("Google Drive already mounted")

## Fetch Data

In [4]:
import os.path
import urllib.request
import tarfile
import zipfile
import gzip
from shutil import copy

def fetch_remote_datafile(filename, remote_url):
  """Download `remote_url` into the workspace as `./filename`.

  Nothing is downloaded when a file with that name is already present in the
  current working directory; a status line is printed either way.
  """
  workspace_path = "./" + filename
  if not os.path.isfile(workspace_path):
    print("fetching " + filename + " from " + remote_url + "...")
    urllib.request.urlretrieve(remote_url, workspace_path)
  else:
    print("already have " + filename + " in workspace")

def cache_datafile_in_drive(filename):
  """Copy `./filename` from the workspace into the Google Drive data folder.

  Prints a message and does nothing when the file is missing from the
  workspace or when a file of the same name already exists in the Drive folder.
  """
  workspace_path = "./" + filename
  if not os.path.isfile(workspace_path):
    print("cannot cache " + filename + ", it is not in workspace")
    return

  data_drive_path = "/content/drive/My Drive/individualAlign/data/"
  if not os.path.isfile(data_drive_path + filename):
    print("copying " + filename + " to " + data_drive_path)
    copy(workspace_path, data_drive_path)
  else:
    print(filename + " has already been stored in Google Drive")
  

def load_datafile_from_drive(filename, remote_url=None):
  """Make `filename` available in the workspace, trying the cheapest source first.

  Order of preference: a copy already in the workspace, a copy in the Google
  Drive data folder, then a download from `remote_url` when one is given.
  When none of these applies, an error message is printed.
  """
  data_drive_path = "/content/drive/My Drive/individualAlign/data/"

  if os.path.isfile("./" + filename):
    print("already have " + filename + " in workspace")
    return

  drive_copy = data_drive_path + filename
  if os.path.isfile(drive_copy):
    print("have " + filename + " in Google Drive, copying to workspace...")
    copy(drive_copy, ".")
    return

  if remote_url is None:
    print("error: you need to manually download " + filename + " and put in drive")
    return

  fetch_remote_datafile(filename, remote_url)
    
def extract_datafile(filename, expected_extract_artifact=None):
  """Unpack `filename` (zip, tar, tar.gz or plain gz) into the current directory.

  Args:
    filename: archive name, looked up in the workspace (current directory).
      The format is chosen from the text after the last '.' in the name.
    expected_extract_artifact: optional path of a file or directory that the
      extraction produces; when it already exists the extraction is skipped.

  Prints a status line for every outcome. A `.csv` file needs no extraction
  and any other extension is reported as not extractable.
  """
  already_extracted = expected_extract_artifact is not None and (
      os.path.isfile(expected_extract_artifact) or os.path.isdir(expected_extract_artifact))
  if already_extracted:
    print("files in " + filename + " have already been extracted")
    return
  if not os.path.isfile("./" + filename):
    print("error: cannot extract " + filename + ", it is not in the workspace")
    return

  name_parts = filename.split('.')
  extension = name_parts[-1]
  if extension == "zip":
    print("extracting " + filename + "...")
    # context manager closes the archive even when a member fails to extract
    with zipfile.ZipFile(filename) as archive:
      for name in archive.namelist():
        print("    extracting file", name)
        archive.extract(name, "./")
  elif extension == "tar" or name_parts[-2:] == ["tar", "gz"]:
    # slicing (instead of indexing [-2]) cannot raise for a dot-less name
    print("extracting " + filename + "...")
    # NOTE: extractall() writes wherever the member paths point; only use this
    # on archives from a trusted source.
    with tarfile.open(filename) as archive:
      archive.extractall()
  elif extension == "gz":
    print("extracting " + filename + "...")
    # a plain .gz holds one file; the output name is the input name minus ".gz"
    with gzip.open(filename, 'rb') as compressed:
      data = compressed.read()
    with open('.'.join(name_parts[:-1]), 'wb') as extracted_file:
      extracted_file.write(data)
  elif extension == "csv":
    print("do not need to extract csv")
  else:
    print("cannot extract " + filename)
      
def load_cache_extract_datafile(filename, expected_extract_artifact=None, remote_url=None):
  """Run the three data steps for one file: load it, extract it, cache it.

  `filename` and `remote_url` go to load_datafile_from_drive,
  `expected_extract_artifact` goes to extract_datafile, and the file is
  finally handed to cache_datafile_in_drive.
  """
  load_datafile_from_drive(filename, remote_url=remote_url)
  extract_datafile(filename, expected_extract_artifact=expected_extract_artifact)
  cache_datafile_in_drive(filename)
  

In [None]:
# Template call: replace the placeholder strings before running.
# The positional parameters are (filename, expected_extract_artifact, remote_url),
# so the first placeholder stands for the data file's name and the second for
# the file or directory that the extraction is expected to produce.
load_cache_extract_datafile("folder name", "file name", "url")

## From Github

In [None]:
# From Github (Files < 25MB)
# `url` is a placeholder: paste the raw GitHub link of the CSV before running.
url = 'copied_raw_GH_link'
# NOTE(review): the name `df1` is assigned again by the exact-matching cell further down.
df1 = pd.read_csv(url) # Dataset is now stored in a Pandas Dataframe

## From a local drive

In [None]:
# From a local drive
# Alternatively upload the files manually
# upload files
from google.colab import files
uploaded = files.upload()

# import it into a pandas dataframe
# `uploaded` is indexed by file name, so this line expects an uploaded file
# called 'dataset.csv'; its content is wrapped in BytesIO for read_csv.
df = pd.read_csv(io.BytesIO(uploaded['dataset.csv']))

## From Google Drive 

In [5]:
# Code to read csv file into Colaboratory:
#!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
# Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
# NOTE(review): this rebinds the name `drive`, which the "Google Drive Access"
# cell imports from google.colab. After this cell `drive` is the PyDrive client
# (used via drive.CreateFile below); re-running the mount cell re-imports the
# google.colab module under the same name.
drive = GoogleDrive(gauth)

In [5]:
# link = 'https://drive.google.com/file/d/1jPx0AKsYABIBAUo22YnMUljM4nSOfcaA/view?usp=sharing' # The shareable link contains the id, copy it to code below

In [13]:
# Load the organisation metadata file from Google Drive.
# `file_id` is the id part of the file's shareable Drive link; the name avoids
# shadowing Python's built-in id().
file_id = '1jPx0AKsYABIBAUo22YnMUljM4nSOfcaA'
print(file_id)  # Verify that you have everything
downloaded = drive.CreateFile({'id': file_id})
downloaded.GetContentFile('org_metadata.csv')
# The file is semicolon-separated.
# NOTE(review): with pandas' default NA handling the literal string "NA" is read
# as missing, which would blank the ISO country code of Namibia. Confirm
# whether keep_default_na=False (or an na_values override) is needed here.
df_org = pd.read_csv('org_metadata.csv', sep=";")  # Dataset is now stored in a Pandas Dataframe
df_org.head(1)

1jPx0AKsYABIBAUo22YnMUljM4nSOfcaA


Unnamed: 0,grid_id,organization_name,alias,country_code,country,city,state_code,state,organization_type_id,organization_type,latitude,longitude
0,grid.10041.34,University of La Laguna,University of San Fernando de La Laguna; Unive...,ES,Spain,San Cristóbal de La Laguna,,,3.0,Education,28484815,-1631285


In [12]:
# Load the World Cities file from Google Drive.
# `file_id` is the id part of the file's shareable Drive link; the name avoids
# shadowing Python's built-in id().
file_id = '1MnlMaMhf_5ftGS1M-h-fkO-0PIODPY-d'
print(file_id)  # Verify that you have everything
downloaded = drive.CreateFile({'id': file_id})
downloaded.GetContentFile('World_Cities.csv')
# The frame keeps the name `df_cites` because the subsetting cell reads it under that name.
df_cites = pd.read_csv('World_Cities.csv')  # Dataset is now stored in a Pandas Dataframe
df_cites.head(1)

1MnlMaMhf_5ftGS1M-h-fkO-0PIODPY-d


Unnamed: 0,X,Y,FID,OBJECTID,CITY_NAME,GMI_ADMIN,ADMIN_NAME,FIPS_CNTRY,CNTRY_NAME,STATUS,POP,POP_RANK,POP_CLASS,PORT_ID,LABEL_FLAG,POP_SOURCE
0,-56.093004,-15.615,1,1,Cuiaba,BRA-MGR,Mato Grosso,BR,Brazil,Provincial capital,540814,3,"500,000 to 999,999",0,0,UN_Data_2010_2020


In [11]:
# Load the additional world cities file from Google Drive.
# downloaded from here: https://simplemaps.com/data/world-cities
# `file_id` is the id part of the file's shareable Drive link; the name avoids
# shadowing Python's built-in id().
file_id = '184CAgNBWzOYAD90OO69akSj30mDYYc7p'
print(file_id)  # Verify that you have everything
downloaded = drive.CreateFile({'id': file_id})
downloaded.GetContentFile('worldcities.csv.csv')
# NOTE(review): with pandas' default NA handling the literal string "NA" is read
# as missing, which would blank the `iso2` code of Namibia. Confirm whether
# keep_default_na=False (or an na_values override) is needed here.
df_cities_new = pd.read_csv('worldcities.csv.csv')  # Dataset is now stored in a Pandas Dataframe
df_cities_new.head(1)

184CAgNBWzOYAD90OO69akSj30mDYYc7p


Unnamed: 0,city,city_ascii,lat,lng,country,iso2,iso3,admin_name,capital,population,id
0,Tokyo,Tokyo,35.6839,139.7744,Japan,JP,JPN,Tōkyō,primary,39105000.0,1392685764


# If everything else fails, the easiest way

In [None]:
%cd /content/drive/MyDrive/IndividualAlign/data
df_org = pd.read_csv('/content/drive/MyDrive/IndividualAlign/data/org_metadata.csv', sep=";")
df_org = pd.read_csv('/content/drive/MyDrive/IndividualAlign/data/World_Cities.csv')

# String matching


In [34]:
# Keep only the columns needed for matching; each subset is sorted,
# de-duplicated and re-indexed from 0.
dfsub_cities = (
    df_cites[['CITY_NAME', 'X', 'Y']]
    .sort_values(by=['CITY_NAME'])
    .drop_duplicates()
    .reset_index(drop=True)
)
dfsub_cities_new = (
    df_cities_new[['city', 'city_ascii', 'iso2', 'lat', 'lng']]
    .sort_values(by=['city_ascii', 'iso2'])
    .drop_duplicates()
    .reset_index(drop=True)
)
dfsub_orgs = (
    df_org[['city', 'country_code']]
    .sort_values(by=['city', 'country_code'])
    .drop_duplicates()
    .reset_index(drop=True)
)

# first row of each subset as a quick sanity check
for preview in (dfsub_cities, dfsub_cities_new, dfsub_orgs):
    print(preview.head(1))

  CITY_NAME     X          Y
0     'Ataq  46.8  14.550003
          city   city_ascii iso2   lat    lng
0  ’Aïn Abessa  'Ain Abessa   DZ  36.3  5.295
       city country_code
0  A Coruña           ES


### Punctuation removal

In [38]:
# columns CITY_NAME, city
# first lets clean the strings and do an exact match
import string
string.punctuation

# helper that strips punctuation characters from a string
def remove_punctuation(text):
    """Return `text` without any character found in `string.punctuation`."""
    kept = filter(lambda ch: ch not in string.punctuation, text)
    return "".join(kept)

# Build a cleaned matching key next to each raw city column, in three steps:
# strip punctuation -> lowercase -> NFKD-normalise and drop whatever cannot be
# encoded as ASCII (errors='ignore').
for frame, raw_col, clean_col in (
    (dfsub_cities, 'CITY_NAME', 'CITY_NAME_v2'),
    (dfsub_cities_new, 'city_ascii', 'city_ascii_v2'),
    (dfsub_orgs, 'city', 'city_v2'),
):
    cleaned = frame[raw_col].apply(remove_punctuation)
    cleaned = cleaned.apply(str.lower)
    frame[clean_col] = (
        cleaned.str.normalize('NFKD')
        .str.encode('ascii', errors='ignore')
        .str.decode('utf-8')
    )

# The same normalisation could be applied to every string column of a frame
# at once by selecting the object-dtype columns first.

# de-duplicate each dataset on its cleaned key
dfsub_orgs = dfsub_orgs.drop_duplicates(subset=['city_v2', 'country_code'])
dfsub_cities_new = dfsub_cities_new.drop_duplicates(subset=['city_ascii_v2', 'iso2'])
dfsub_cities = dfsub_cities.drop_duplicates(subset=['CITY_NAME_v2', 'X', 'Y'])

# first row of each cleaned frame
for preview in (dfsub_cities, dfsub_cities_new, dfsub_orgs):
    print(preview.head(1))

# number of distinct cleaned city strings per dataset
print(dfsub_cities['CITY_NAME_v2'].unique().size)
print(dfsub_cities_new['city_ascii_v2'].unique().size)
print(dfsub_orgs['city_v2'].unique().size)

  CITY_NAME     X          Y CITY_NAME_v2
0     'Ataq  46.8  14.550003         ataq
          city   city_ascii iso2   lat    lng city_ascii_v2
0  ’Aïn Abessa  'Ain Abessa   DZ  36.3  5.295    ain abessa
       city country_code   city_v2
0  A Coruña           ES  a coruna
2502
39174
5266


### Exact Matching
*Let's do an exact match first and then we work with the remaining non-matched ones*

In [39]:
# Exact match on the cleaned keys: (city_ascii_v2, iso2) from the additional
# world cities list against (city_v2, country_code) from the organisations.
df1 = pd.merge(
    dfsub_cities_new,
    dfsub_orgs,
    how='inner',
    left_on=['city_ascii_v2', 'iso2'],
    right_on=['city_v2', 'country_code'],
)  # author's count: 4184 out of 5266 not bad!
df1

# Matching on the original strings instead (left_on=['city', 'iso2'],
# right_on=['city', 'country_code']) gave less good results according to the
# author, so df1 is the one used in further analysis.

Unnamed: 0,city_x,city_ascii,iso2,lat,lng,city_ascii_v2,city_y,country_code,city_v2
0,A Coruña,A Coruna,ES,43.3667,-8.3833,a coruna,A Coruña,ES,a coruna
1,Aachen,Aachen,DE,50.7762,6.0838,aachen,Aachen,DE,aachen
2,Aalborg,Aalborg,DK,57.0337,9.9166,aalborg,Aalborg,DK,aalborg
3,Aalen,Aalen,DE,48.8372,10.0936,aalen,Aalen,DE,aalen
4,Aalst,Aalst,BE,50.9383,4.0392,aalst,Aalst,BE,aalst
...,...,...,...,...,...,...,...,...,...
4179,Zutphen,Zutphen,NL,52.1400,6.1950,zutphen,Zutphen,NL,zutphen
4180,Zvolen,Zvolen,SK,48.5831,19.1331,zvolen,Zvolen,SK,zvolen
4181,Zwingenberg,Zwingenberg,DE,49.7225,8.6139,zwingenberg,Zwingenberg,DE,zwingenberg
4182,Zwolle,Zwolle,NL,52.5167,6.1000,zwolle,Zwolle,NL,zwolle


In [72]:
# Collect the organisation cities that the exact match did not cover.
# The join keys are spelled out so that this step uses the same cleaned
# (city, country) keys as the exact match. Without explicit keys pandas joins
# on the only column name the two frames share, the raw `city` string, and
# ignores the country.
outer = dfsub_orgs.merge(
    dfsub_cities_new,
    how='outer',
    left_on=['city_v2', 'country_code'],
    right_on=['city_ascii_v2', 'iso2'],
    suffixes=('', '_ref'),  # the organisation's `city` column keeps its name
    indicator=True,
)

# anti-join: rows that exist only on the organisation side
anti_join = outer[outer['_merge'] == 'left_only'].drop('_merge', axis=1)

# Try to match the leftover strings against the World Cities dataset.
# NOTE(review): this join uses the cleaned city name only (dfsub_cities carries
# no country column), so a same-named city in another country can match -
# confirm this is acceptable.
df2 = dfsub_cities.merge(anti_join, how='inner', left_on=['CITY_NAME_v2'], right_on=['city_v2'])

# Bring both sets of matches to one layout: city, city_v2, country_code, x, y
df3 = (
    df2[['city', 'city_v2', 'country_code', 'X', 'Y']]
    .sort_values(by=['city_v2'])
    .drop_duplicates()
    .reset_index(drop=True)
)
df3 = df3.rename(str.lower, axis='columns')

df4 = (
    df1[['city_y', 'city_v2', 'country_code', 'lng', 'lat']]
    .sort_values(by=['city_v2'])
    .drop_duplicates()
    .reset_index(drop=True)
)
# city_y is the organisation-side city name; lng/lat become x/y to line up with df3
df4 = df4.rename(columns={"city_y": "city", "lng": "x", "lat": "y"})

# Union of both match sets; on a duplicate (city, country_code) the row from
# df4 wins because it comes last.
matched_cities = pd.concat([df3, df4]).drop_duplicates(subset=['city', 'country_code'], keep='last')

# Second anti-join: organisation cities still unmatched after both passes.
# It has to be computed from `outer_n`; filtering `outer` again would just
# reproduce the first anti-join.
outer_n = dfsub_orgs.merge(
    matched_cities,
    how='outer',
    on=['city', 'country_code', 'city_v2'],
    indicator=True,
)
anti_join_n = outer_n[outer_n['_merge'] == 'left_only'].drop('_merge', axis=1)

anti_join_n.sort_values('city')

## Fuzzy Matching of the remaining *cities*