# Experimentation

Loading the datasets and cleaning. The following datasets are expected:
* `locations_clean_user_location.tsv`: The originally provided list of raw locations with the corresponding number of occurrences
* In `/data`:
  * `cities1000.tsv`, cities with > 1000 pop. (GeoNames):
    * https://download.geonames.org/export/dump/cities1000.zip
    * Unzipped and renamed to `.tsv`
  * `countryInfo.tsv`, countries (GeoNames):
    * https://download.geonames.org/export/dump/countryInfo.txt
    * Plain-text download (nothing to unzip), renamed to `.tsv`
  * `admin1CodesASCII.txt`, states and provinces (admin1) (GeoNames)
    * https://download.geonames.org/export/dump/admin1CodesASCII.txt

In [168]:
import os
import pandas as pd
import geopandas as gpd

# Every dataset path below is built from the directory the notebook runs in
# plus the "data" sub-folder.
data_dir = "data"
current_dir = os.getcwd()

## Load datasets

In [169]:
# Tweets user locations list
# Loaded with pandas' read_csv (tab-delimited); 'tweet_id' is forced to int.
# keep_default_na=False keeps free-text locations such as "None", "NA" or
# "null" as the literal strings they are instead of converting them to NaN
# (LOCATION_DISCARD and the string cleaning further down expect strings).

tweets_user_locations = os.path.join(current_dir, "locations_clean_user_location.tsv")
df = pd.read_csv(
    tweets_user_locations,
    sep='\t',
    dtype={'tweet_id': int},
    keep_default_na=False,
)
df.head(3)

Unnamed: 0,tweet_user_location,tweet_id
0,,4994911
1,United States,190257
2,India,97652


In [170]:
# GeoNames (cities with more than 1000 inhabitants)
# https://download.geonames.org/export/dump/cities1000.zip
# Read through geopandas so the frame carries a geometry column (usefulness tbd)

cities = os.path.join(current_dir, data_dir, "cities1000.tsv")
cities_df = gpd.read_file(cities)
# Normalise the header names to lowercase
cities_df.columns = [column.lower() for column in cities_df.columns]
cities_df.head(3)

Unnamed: 0,geonameid,name,asciiname,altnames,latitude,longitude,featclass,featcode,country,cc2,admin1,admin2,admin3,admin4,population,elevation,gtopo30,timezone,moddate,geometry
0,3039154,El Tarter,El Tarter,"Ehl Tarter,Эл Тартер",42.57952,1.65362,P,PPL,AD,,2,,,,1052.0,,1721,Europe/Andorra,2012-11-03,POINT (1.65362 42.57952)
1,3039163,Sant Julià de Lòria,Sant Julia de Loria,"San Julia,San Julià,Sant Julia de Loria,Sant J...",42.46372,1.49129,P,PPLA,AD,,6,,,,8022.0,,921,Europe/Andorra,2013-11-23,POINT (1.49129 42.46372)
2,3039604,Pas de la Casa,Pas de la Casa,"Pas de la Kasa,Пас де ла Каса",42.54277,1.73361,P,PPL,AD,,3,,,,2363.0,2050.0,2106,Europe/Andorra,2008-06-09,POINT (1.73361 42.54277)


In [171]:
# Test cities finding
# The three conditions are named once and combined below. Only the last
# expression of the cell is displayed; the intermediate `city_test` values
# are there to be inspected by hand.
is_london = cities_df['name'] == 'London'
in_england = cities_df['admin1'] == 'ENG'
in_great_britain = cities_df['country'] == 'GB'

# City
city_test = cities_df[is_london]
city_test

# City & admin1
city_test = cities_df[is_london & in_england]
city_test

# City & admin1 & country
city_test = cities_df[is_london & in_england & in_great_britain]
city_test

# City & country
city_test = cities_df[is_london & in_great_britain]
city_test

# City (with the largest population)
cities_df[is_london].nlargest(1, ['population'])

Unnamed: 0,geonameid,name,asciiname,altnames,latitude,longitude,featclass,featcode,country,cc2,admin1,admin2,admin3,admin4,population,elevation,gtopo30,timezone,moddate,geometry
48817,2643743,London,London,"ILondon,LON,Lakana,Landan,Landen,Ljondan,Llund...",51.50853,-0.12574,P,PPLC,GB,,ENG,GLA,,,7556900.0,,25,Europe/London,2019-09-18,POINT (-0.12574 51.50853)


In [172]:
# GeoNames (Countries info)
# https://download.geonames.org/export/dump/countryInfo.txt
# Loaded with pandas' read_csv (tab-delimited). header=49 takes the column
# names from the 50th line of the file; the lines before it are skipped.
# keep_default_na=False + na_values=[''] treats only empty fields as missing:
# with pandas' default NA strings the ISO code "NA" (Namibia) and the
# continent code "NA" (North America) would silently be read as NaN.

countries = os.path.join(current_dir, data_dir, "countryInfo.tsv")
countries_df = pd.read_csv(
    countries,
    sep='\t',
    header=49,
    keep_default_na=False,
    na_values=[''],
)
countries_df.head(3)

Unnamed: 0,#ISO,ISO3,ISO-Numeric,fips,Country,Capital,Area(in sq km),Population,Continent,tld,CurrencyCode,CurrencyName,Phone,Postal Code Format,Postal Code Regex,Languages,geonameid,neighbours,EquivalentFipsCode
0,AD,AND,20,AN,Andorra,Andorra la Vella,468.0,77006,EU,.ad,EUR,Euro,376,AD###,^(?:AD)*(\d{3})$,ca,3041565,"ES,FR",
1,AE,ARE,784,AE,United Arab Emirates,Abu Dhabi,82880.0,9630959,AS,.ae,AED,Dirham,971,,,"ar-AE,fa,en,hi,ur",290557,"SA,OM",
2,AF,AFG,4,AF,Afghanistan,Kabul,647500.0,37172386,AS,.af,AFN,Afghani,93,,,"fa-AF,ps,uz-AF,tk",1149361,"TM,CN,IR,TJ,PK,UZ",


In [173]:
# GeoNames (states and provinces, admin1)
# https://download.geonames.org/export/dump/admin1CodesASCII.txt
# Tab-delimited file read with pandas' read_csv; the first line is data, so
# the column names are supplied here.
# Column names from https://download.geonames.org/export/dump/readme.txt
# 'code' is '<country>.<admin1 for country>'

admin1 = os.path.join(current_dir, data_dir, "admin1CodesASCII.txt")
admin1_columns = ['code', 'name', 'name ascii', 'geonameid']
admin1_df = pd.read_csv(admin1, sep='\t', header=None, names=admin1_columns)
admin1_df.head(3)

Unnamed: 0,code,name,name ascii,geonameid
0,AD.06,Sant Julià de Loria,Sant Julia de Loria,3039162
1,AD.05,Ordino,Ordino,3039676
2,AD.04,La Massana,La Massana,3040131


In [174]:
# GeoNames 'admin1' (admin1_df) for Canadian provinces uses a 2-digit code
# Use postal abbreviation which people use

# CA.01	Alberta	Alberta	5883102
# CA.02	British Columbia	British Columbia	5909050
# CA.03	Manitoba	Manitoba	6065171
# CA.04	New Brunswick	New Brunswick	6087430
# CA.13	Northwest Territories	Northwest Territories	6091069
# CA.07	Nova Scotia	Nova Scotia	6091530
# CA.14	Nunavut	Nunavut	6091732
# CA.08	Ontario	Ontario	6093943
# CA.09	Prince Edward Island	Prince Edward Island	6113358
# CA.10	Quebec	Quebec	6115047
# CA.11	Saskatchewan	Saskatchewan	6141242
# CA.12	Yukon	Yukon	6185811
# CA.05	Newfoundland and Labrador	Newfoundland and Labrador	6354959

# Maps each GeoNames numeric admin1 code for Canada to the code built from the
# two-letter postal abbreviation people actually write.
province_abbr = {
    'CA.01': 'CA.AB', # Alberta
    'CA.02': 'CA.BC', # British Columbia
    'CA.03': 'CA.MB', # Manitoba
    'CA.04': 'CA.NB', # New Brunswick
    'CA.05': 'CA.NL', # Newfoundland and Labrador
    'CA.07': 'CA.NS', # Nova Scotia
    'CA.08': 'CA.ON', # Ontario
    'CA.09': 'CA.PE', # Prince Edward Island
    'CA.10': 'CA.QC', # Quebec
    'CA.11': 'CA.SK', # Saskatchewan
    'CA.12': 'CA.YT', # Yukon (the postal abbreviation is YT, not YK)
    'CA.13': 'CA.NT', # Northwest Territories
    'CA.14': 'CA.NU'  # Nunavut
}

# Copy exactly the rows we have a mapping for. Selecting on the dictionary
# keys (rather than the regex '^CA.', whose unescaped '.' matches any
# character) guarantees every selected code maps to an abbreviation, so no
# NaN codes are created.
new_provinces = admin1_df[admin1_df['code'].isin(list(province_abbr))].copy()
new_provinces['code'] = new_provinces['code'].map(province_abbr)

# Only append abbreviations that are not in admin1_df yet, so re-running this
# cell does not pile up duplicate rows.
new_provinces = new_provinces[~new_provinces['code'].isin(admin1_df['code'])]
admin1_df = pd.concat([admin1_df, new_provinces], ignore_index=True)

# Literal prefix match; na=False keeps the mask boolean even if a code is missing
admin1_df[admin1_df['code'].str.startswith('CA.', na=False)]

Unnamed: 0,code,name,name ascii,geonameid
466,CA.01,Alberta,Alberta,5883102
467,CA.02,British Columbia,British Columbia,5909050
468,CA.03,Manitoba,Manitoba,6065171
469,CA.04,New Brunswick,New Brunswick,6087430
470,CA.13,Northwest Territories,Northwest Territories,6091069
471,CA.07,Nova Scotia,Nova Scotia,6091530
472,CA.14,Nunavut,Nunavut,6091732
473,CA.08,Ontario,Ontario,6093943
474,CA.09,Prince Edward Island,Prince Edward Island,6113358
475,CA.10,Quebec,Quebec,6115047


In [175]:
# Test when there's more than 1 admin1

# admin1_df[admin1_df['code'].str.contains('^US.')]
# admin1_df[admin1_df['name'] == 'La Paz']
country_code = 'HN'
element = 'La Paz'

# Every admin1 whose name equals <element>, in any country; copied so that
# admin1_df itself is not modified by the assignment below.
test1 = admin1_df.loc[admin1_df['name'].eq(element)].copy()
# Overwrite the ids with the 99 marker
test1['geonameid'] = 99
# First row only
test11 = test1.iloc[:1]
test1

Unnamed: 0,code,name,name ascii,geonameid
351,BO.04,La Paz,La Paz,99
1190,HN.12,La Paz,La Paz,99
3288,SV.06,La Paz,La Paz,99


In [176]:
# Add alternative country names (e.g. USA, UK, etc.)
# (we can't easily get alternative country names)
# Keys are GeoNames ids of the countries, values the extra name to recognise.
alternative_country_names = {
    6252001: 'USA',    # United States
    2510769: 'España', # [Kingdom of] Spain
    2635167: 'UK',     # United Kingdom
    1861060: '日本',    # Japan
    298795: 'Türkiye', # Turkey
    3469034: 'Brasil', # Brazil
    3175395: 'Italia', # Italy
    1694008: 'Republic of the Philippines', # Philippines
    2921044: 'Deutschland' # Germany
}

# Build one renamed copy of the matching country row per alternative name and
# append them all with a single concat (instead of growing countries_df inside
# the loop).
alt_rows = []
for geo, alt_name in alternative_country_names.items():
    # Skip names that are already present so re-running the cell does not add
    # duplicate rows (duplicates make the later `.item()` lookups raise).
    if countries_df['Country'].eq(alt_name).any():
        continue
    alt_country = countries_df[countries_df['geonameid'] == geo].copy()
    alt_country['Country'] = alt_name
    alt_rows.append(alt_country)

# `new_countries` holds the rows added by this run (empty when nothing was added)
if alt_rows:
    new_countries = pd.concat(alt_rows, ignore_index=True)
    countries_df = pd.concat([countries_df, new_countries], ignore_index=True)
else:
    new_countries = pd.DataFrame([], columns=countries_df.columns)

# TODO: City alternatives
# And others like abbreviations (e.g. CDMX)
# Add to list of city alternatives
# #3527646: 'CDMX',   # Mexico City 

## Constants

In [177]:
# 'tweet_user_location' strings that are discarded: a location that is
# exactly one of these values is not looked up.
LOCATION_DISCARD = [
    'None',
    '\\N',
    'Global',
    'Earth',
    'Planet Earth',
    'Worldwide',
    'Everywhere',
    'Internet',
    'En todas partes',
]

## Helper functions

In [178]:
# Displays the percentage of tweets that have a 'geonameid'
# Skipping the ones we know aren't valid (discards)
def print_geonameid_completeness(df):
    """Print the percentage of tweets whose location has a 'geonameid'.

    Both totals are sums of the 'tweet_id' column. The denominator covers the
    rows whose 'tweet_user_location' is not listed in LOCATION_DISCARD, the
    numerator the rows whose 'geonameid' is not null. The ratio is printed
    with three decimals followed by '%'.
    """
    is_candidate = ~df['tweet_user_location'].isin(LOCATION_DISCARD)
    candidate_total = df.loc[is_candidate, 'tweet_id'].sum()

    has_geonameid = df.geonameid.notnull()
    resolved_total = df.loc[has_geonameid, 'tweet_id'].sum()

    print(f'{resolved_total/candidate_total*100:.3f}%')

 ## Clean

In [179]:
# Work on a copy of 'tweet_user_location' so the original column stays intact
df['tweet_user_location_copy'] = df['tweet_user_location']

# Discard specific 'tweet_user_location' strings
# tweet_user_location_discard = ['None', '\\N']
# df = df[~df['tweet_user_location'].isin(tweet_user_location_discard)]

# Discard locations that don't exist more than 2 times.
# .copy() turns the filtered selection into an independent frame, so the
# column assignments that follow do not raise SettingWithCopyWarning.
df = df[df['tweet_id'] > 2].copy()

# Filter out emojis and other symbols
# * https://stackoverflow.com/a/49986645
# * https://www.ling.upenn.edu/courses/Spring_2003/ling538/UnicodeRanges.html (Unicode symbol ranges)
import re
# Compiled once here rather than on every call of deEmojify.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emojis: emoticons
    "\U0001F300-\U0001F5FF"  # emojis: symbols & pictographs
    "\U0001F680-\U0001F6FF"  # emojis: transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # emojis: flags (iOS)
    "\U00002700-\U000027BF"  # 'Dingbats' http://www.unicode.org/charts/PDF/U2700.pdf
    "\uFE0E\uFE0F"           # variation selectors that follow many emoji (e.g. U+2708 U+FE0F)
    "]+",
    flags=re.UNICODE,
)


def deEmojify(text):
    """Remove emojis and related symbols from a location string.

    Parameters
    ----------
    text : str or any
        The raw location. Values that are not strings (e.g. NaN or None for
        a missing location) are returned unchanged instead of raising.

    Returns
    -------
    The text with every character in the ranges of `_EMOJI_PATTERN` removed.
    Surrounding whitespace is left in place; the caller strips it.
    """
    if not isinstance(text, str):
        return text
    # The variation selectors are removed too: they are invisible, are not
    # whitespace (so str.strip() keeps them) and would otherwise prevent an
    # exact match such as 'Chicago' == 'Chicago\ufe0f'.
    return _EMOJI_PATTERN.sub('', text)
# na_action='ignore' leaves missing values untouched instead of passing them to deEmojify
df['tweet_user_location_copy'] = df['tweet_user_location_copy'].map(deEmojify, na_action='ignore')

# Truncate leading and trailing spaces (.str methods skip missing values)
df['tweet_user_location_copy'] = df['tweet_user_location_copy'].str.strip()

# Truncate any trailing run of ",", "." and whitespace in a single pass, so
# mixed endings such as "Canada,." or "Canada ." are cleaned completely
df['tweet_user_location_copy'] = df['tweet_user_location_copy'].str.replace(r'[\s,.]+$', '', regex=True)

df

Unnamed: 0,tweet_user_location,tweet_id,tweet_user_location_copy
0,,4994911,
1,United States,190257,United States
2,India,97652,India
3,"London, England",77542,"London, England"
4,USA,67336,USA
...,...,...,...
338210,N 52°27' 0'' / W 1°49' 0'',3,N 52°27' 0'' / W 1°49' 0''
338211,Villerupt-Luxembourg-Oslo-Stoc,3,Villerupt-Luxembourg-Oslo-Stoc
338212,Chicago ✈,3,Chicago
338213,Catch Me If You Can,3,Catch Me If You Can


## Countries

In [180]:
# Once cleaned, several raw locations are verbatim the name of a country, e.g.:
df.loc[df['tweet_user_location_copy'].eq('Canada')]

Unnamed: 0,tweet_user_location,tweet_id,tweet_user_location_copy
18,Canada,40858,Canada
1497,Canada,723,Canada
2846,Canada 🇨🇦,352,Canada
4756,Canada.,199,Canada
5333,Canada,177,Canada
9208,🇨🇦 Canada,100,Canada
14493,Canada 🇨🇦,63,Canada
17363,Canada🇨🇦,52,Canada
52402,🇨🇦Canada🇨🇦,17,Canada
78821,🍁 Canada 🍁,12,Canada


In [181]:
# # df[df['tweet_user_location'].isin(simple_countries_df['Country'])]
# # simple_countries_df = countries_df[['#ISO','Country', 'geonameid']]#.set_index('Country')

# # Merge in country info (with goenameid) when there's an exact country match

# # Keep the columns of countries_df we need.
# simple_countries_df = countries_df[['#ISO','Country', 'geonameid']]
# #df = pd.merge(df, simple_countries_df, how='left', left_on='tweet_user_location_copy', right_on='Country')
# df

In [182]:
#print_geonameid_completeness(df)

## Cities

In [183]:
#simple_cities_df = cities_df[['geonameid', 'name', 'asciiname', 'altnames']]
# df = pd.merge(df, simple_cities_df, how='left', left_on='tweet_user_location', right_on='name')
#df_copy = df[df['geonameid'].isnull()]
#pd.merge(df_copy, simple_cities_df, how='left', left_on='tweet_user_location_copy', right_on='name')

# NB: this can't work b/c cities name (unlike countries) aren't unique, e.g. there's a lot of "London"

In [184]:
# df[df['tweet_user_location_copy'].str.count(',') > 2]

In [185]:
# test = "Toronto, Ontario, Canada, World"
# test = "Toronto, Canada"
# test.split(',')

In [186]:
# import numpy as np

# def split_fixed_parts(num_parts, location):
#     parts = location.split(',')
#     if num_parts > len(parts):
#         for i in range(num_parts - len(parts)):
#             parts.insert(0, None)
#     else:
#         for i in range(len(parts) - num_parts):
#             parts.pop(0)
#     return parts

# def parts_dict(num_parts, location):
#     parts = split_fixed_parts(num_parts, location)
#     return {f'el-{k}':parts[k] for k in range(num_parts)}

def split_parts(location):
    """Split a raw location on commas and return the list of pieces.

    The pieces are returned exactly as they appear between the commas;
    surrounding whitespace is not removed.
    """
    pieces = location.split(sep=',')
    return pieces

# print(split_fixed_parts(3, 'Toronto'))
print(split_parts('Toronto'))

['Toronto']


In [187]:
# Mini test dataset: the first 200 locations that contain no comma.
# .copy() makes test_df an independent frame, so adding columns to it later
# neither raises SettingWithCopyWarning nor depends on df.
# df[['column_new_1', 'column_new_2', 'column_new_3']] = pd.DataFrame([[np.nan, 'dogs', 3]], index=df.index)
test_df = df[df['tweet_user_location_copy'].str.count(',') == 0].head(200).copy()
test_df

Unnamed: 0,tweet_user_location,tweet_id,tweet_user_location_copy
0,,4994911,
1,United States,190257,United States
2,India,97652,India
4,USA,67336,USA
5,London,66315,London
10,France,54481,France
12,United Kingdom,48965,United Kingdom
13,México,46987,México
15,España,44838,España
16,Nigeria,44185,Nigeria


In [188]:
num_parts = 3

# https://stackoverflow.com/a/16242202
#test_df.tweet_user_location_copy.apply(lambda s: pd.Series(parts_dict(num_parts, s)))
#test_df = pd.concat([test_df, test_df.tweet_user_location_copy.apply(lambda s: pd.Series(parts_dict(num_parts, s)))], axis=1)

# One list of comma-separated pieces per cleaned location
test_df['elements'] = test_df['tweet_user_location_copy'].map(split_parts)
# test_df['elements'] = test_df['elements'].map(lambda x: [i.strip() for i in x if i is not None])
test_df

Unnamed: 0,tweet_user_location,tweet_id,tweet_user_location_copy,elements
0,,4994911,,[None]
1,United States,190257,United States,[United States]
2,India,97652,India,[India]
4,USA,67336,USA,[USA]
5,London,66315,London,[London]
10,France,54481,France,[France]
12,United Kingdom,48965,United Kingdom,[United Kingdom]
13,México,46987,México,[México]
15,España,44838,España,[España]
16,Nigeria,44185,Nigeria,[Nigeria]


In [189]:
# TODO: Case-insensitive mask/match
# e.g. df2 = df1['company_name'].str.contains("apple", na=False, case=False)

def get_country(countries_df, element):
    """Look up a country by its exact name.

    Parameters
    ----------
    countries_df : DataFrame
        Countries table with a 'Country' column.
    element : str
        The name to look for (exact, case-sensitive comparison).

    Returns
    -------
    DataFrame or None
        A one-row frame with the matching country, or None when no row of
        'Country' equals `element`.
    """
    # Filter 'Country' field with 'element'
    country = countries_df[countries_df['Country'] == element]

    # No results
    if len(country) == 0:
        return None

    # There can be only one result: callers read it with `.item()`, which
    # raises on more than one row, so if the table holds the same name twice
    # (e.g. an alternative name that was added more than once) keep the first.
    return country.head(1)


def get_admin1(admin1_df, element, country_code=None):
    """Look up a state/province (admin1) by name, optionally within a country.

    Parameters
    ----------
    admin1_df : DataFrame
        admin1 table with the columns 'code' ('<country>.<admin1>'), 'name',
        'name ascii' and 'geonameid'.
    element : str
        Name to look for; compared exactly against 'name' and 'name ascii'.
    country_code : str, optional
        When given, only rows whose 'code' starts with '<country_code>.' are
        considered.

    Returns
    -------
    DataFrame or None
        None when nothing matches, otherwise the matching row(s). Without a
        country, an ambiguous name yields a single row whose 'geonameid' is
        set to 99 (see #ERROR:99 below); `admin1_df` itself is not modified.
    """
    # admin1 matching name (or ascii name)
    name_match = (admin1_df['name'] == element) | \
                 (admin1_df['name ascii'] == element)

    if country_code is None:
        admin1 = admin1_df[name_match]

        if len(admin1) > 1:
            # #ERROR:99
            # This error happens when, w/o a country, there is
            # more than 1 admin1 by that name/ascii name.
            # There is not enough data to infer which one.
            # e.g. "La Paz" district (Bolivia, Honduras, El Salvador)
            # The first of the rows matched above is kept (re-filtering on
            # 'name' alone would come back empty for ascii-only matches).
            admin1 = admin1.head(1).copy()
            admin1['geonameid'] = 99

    else:
        # Literal '<country>.' prefix; na=False keeps the mask boolean when a
        # code is missing.
        in_country = admin1_df['code'].str.startswith(f'{country_code}.', na=False)
        admin1 = admin1_df[in_country & name_match]
        # TODO: Look for an abbreviation when there is no result

    if len(admin1) == 0:
        # No results
        return None

    return admin1


def get_city(cities_df, element, admin1_code=None, country_code=None):
    """Find the city called `element`, optionally within an admin1 and/or country.

    The 'name' column must equal `element`; `admin1_code` and `country_code`
    are compared against the 'admin1' and 'country' columns only when given.

    Returns None when no row matches, otherwise a one-row frame holding the
    match with the largest 'population'.
    """
    # Start from the name match and narrow it with whichever codes were passed
    selector = cities_df['name'] == element
    if admin1_code is not None:
        selector = selector & (cities_df['admin1'] == admin1_code)
    if country_code is not None:
        selector = selector & (cities_df['country'] == country_code)

    candidates = cities_df[selector]

    # No results
    if len(candidates) == 0:
        return None

    # One or more results: take the city with the largest population
    return candidates.nlargest(1, ['population'])

"""
Cases:

                  country
      state/prov, country
            city, country
city, state/prov, country

city, state/prov, country
city, state/prov
city, country
city

neighboorhood, city, country
neighboorhood, city

state/prov, country
state/prov
"""

def infer_geonameid(elements):
    """Infer a geonameid from the comma-separated pieces of a location.

    Uses the notebook-level datasets `countries_df`, `admin1_df` and
    `cities_df`.

    Only single-element locations are resolved so far: the element is tried
    as a country, then as a state/province, then as a city, and the first hit
    wins. Two- and three-element locations are planned (see the outlines
    below) but not implemented yet.

    Returns the geonameid as a string, or np.nan when nothing could be
    inferred (including discarded locations).
    """
    # print(elements)

    first = elements[0]

    # Don't try to infer if element should be ignored
    if first in LOCATION_DISCARD:
        return np.nan

    count = len(elements)

    # One item
    # TODO: Invert? Check city first, then state, then country?
    # e.g. New York is always the city, not the state.
    if count == 1:
        # Lookups are wrapped in lambdas so each one only runs when the
        # previous one found nothing.
        lookups = (
            lambda: get_country(countries_df, first),  # "<country>" as-is
            lambda: get_admin1(admin1_df, first),      # "<state/province>" as-is
            lambda: get_city(cities_df, first),        # "<city>" as-is
        )
        for lookup in lookups:
            match = lookup()
            if match is not None:
                return str(match['geonameid'].item())
        return np.nan

    # Two items (not implemented yet)
    if count == 2:

        # if element[1] is country:
            # if element[0] is <state/province> within <country>:
                # return <state/province>

            # if element[0] is <city> within <country>:
                # return <city>

            # return country

        # if element[1] is <state/province>:
            # if element[0] is <city> within <country>:
                # return <city>

            # return <state/province>

        # if element[1] is <city>:

            # return <city>

        return np.nan

    # Three items (not implemented yet)
    if count == 3:

        # if element[2] is country:
            # if element[1] is <state/province> within <country>:
                # if element[0] if <city> within <state/province>
                    # return <city

                # return <state/province>

            # if element[1] is city within country:
                # return <city>

            # return <country>

        # if element[2] is <state/province>:
            # if element[1] if <city> within <state/province>
                    # return <city>

        return np.nan

    return np.nan

In [190]:
import numpy as np
import pandas as pd
# Allow up to 200 rows to be displayed
pd.options.display.max_rows = 200
# pd.set_option('display.max_columns', None)
# pd.set_option('display.width', None)
# pd.set_option('display.max_colwidth', None)

# Start from an all-NaN column, then fill it with the id inferred from each
# row's list of elements.
test_df['geonameid'] = np.nan
test_df['geonameid'] = test_df['elements'].map(infer_geonameid)
test_df

Unnamed: 0,tweet_user_location,tweet_id,tweet_user_location_copy,elements,geonameid
0,,4994911,,[None],
1,United States,190257,United States,[United States],6252001.0
2,India,97652,India,[India],1269750.0
4,USA,67336,USA,[USA],6252001.0
5,London,66315,London,[London],2643743.0
10,France,54481,France,[France],3017382.0
12,United Kingdom,48965,United Kingdom,[United Kingdom],2635167.0
13,México,46987,México,[México],3523272.0
15,España,44838,España,[España],2510769.0
16,Nigeria,44185,Nigeria,[Nigeria],2328926.0


In [191]:
# Percentage of tweets in the mini dataset (discarded locations excluded) that got a geonameid
print_geonameid_completeness(test_df)

89.603%
