# Experimentation

Loading the datasets and cleaning. The following datasets are expected:
* `locations_clean_user_location.tsv`: The original provided list of raw locations with corresponding number of occurances
* In `/data`:
  * `cities1000.tsv`, cities with > 1000 pop. (GeoNames):
    * https://download.geonames.org/export/dump/cities1000.zip
    * Unzipped and renamed to `.tsv`
  * `countryInfo.tsv`, countries (GeoNames):
    * https://download.geonames.org/export/dump/countryInfo.txt
    * Unzipped and renamed to `.tsv`
  * `admin1CodesASCII.txt`, states and provinces (admin1) (GeoNames)
    * https://download.geonames.org/export/dump/admin1CodesASCII.txt

In [1]:
import os
import pandas as pd
import geopandas as gpd

current_dir = os.getcwd()
data_dir = "data"

## Load datasets

In [2]:
# Tweets user locations list
# Loading using pandas' read_csv (tab-deleted) to set 'tweet_id' dtype to int

tweets_user_locations = os.path.join(current_dir, "locations_clean_user_location.tsv")
df = pd.read_csv(tweets_user_locations, sep='\t', dtype={'tweet_id': int})
df.head(3)

Unnamed: 0,tweet_user_location,tweet_id
0,,4994911
1,United States,190257
2,India,97652


In [3]:
# GeoNames (Cities with > 1000 inabitants)
# https://download.geonames.org/export/dump/cities1000.zip
# Loading using geopandas for geometry (usefulness tbd)

cities = os.path.join(current_dir, data_dir, "cities1000.tsv")
cities_df = gpd.read_file(cities)
cities_df.columns = cities_df.columns.str.lower() # lowercase headers
cities_df.head(3)

Unnamed: 0,geonameid,name,asciiname,altnames,latitude,longitude,featclass,featcode,country,cc2,admin1,admin2,admin3,admin4,population,elevation,gtopo30,timezone,moddate,geometry
0,3039154,El Tarter,El Tarter,"Ehl Tarter,Эл Тартер",42.57952,1.65362,P,PPL,AD,,2,,,,1052.0,,1721,Europe/Andorra,2012-11-03,POINT (1.65362 42.57952)
1,3039163,Sant Julià de Lòria,Sant Julia de Loria,"San Julia,San Julià,Sant Julia de Loria,Sant J...",42.46372,1.49129,P,PPLA,AD,,6,,,,8022.0,,921,Europe/Andorra,2013-11-23,POINT (1.49129 42.46372)
2,3039604,Pas de la Casa,Pas de la Casa,"Pas de la Kasa,Пас де ла Каса",42.54277,1.73361,P,PPL,AD,,3,,,,2363.0,2050.0,2106,Europe/Andorra,2008-06-09,POINT (1.73361 42.54277)


In [4]:
# GeoNames (Countries info)
# https://download.geonames.org/export/dump/countryInfo.txt
# Loading using pandas' read_csv (tab-deleted), ignore lines 1-48

countries = os.path.join(current_dir, data_dir, "countryInfo.tsv")
countries_df = pd.read_csv(countries, sep='\t', header=49)
countries_df.head(3)

Unnamed: 0,#ISO,ISO3,ISO-Numeric,fips,Country,Capital,Area(in sq km),Population,Continent,tld,CurrencyCode,CurrencyName,Phone,Postal Code Format,Postal Code Regex,Languages,geonameid,neighbours,EquivalentFipsCode
0,AD,AND,20,AN,Andorra,Andorra la Vella,468.0,77006,EU,.ad,EUR,Euro,376,AD###,^(?:AD)*(\d{3})$,ca,3041565,"ES,FR",
1,AE,ARE,784,AE,United Arab Emirates,Abu Dhabi,82880.0,9630959,AS,.ae,AED,Dirham,971,,,"ar-AE,fa,en,hi,ur",290557,"SA,OM",
2,AF,AFG,4,AF,Afghanistan,Kabul,647500.0,37172386,AS,.af,AFN,Afghani,93,,,"fa-AF,ps,uz-AF,tk",1149361,"TM,CN,IR,TJ,PK,UZ",


In [5]:
# GeoNames (states and provinces, admin1)
# https://download.geonames.org/export/dump/admin1CodesASCII.txt
# Loading using pandas' read_csv (tab-deleted),
# Column names from https://download.geonames.org/export/dump/readme.txt
# 'code' is '<country>.<admin1 for country>'

admin1 = os.path.join(current_dir, data_dir, "admin1CodesASCII.txt")
admin1_df = pd.read_csv(admin1, sep='\t', names=['code', 'name', 'name ascii', 'geonameid'])
admin1_df.head(3)

Unnamed: 0,code,name,name ascii,geonameid
0,AD.06,Sant Julià de Loria,Sant Julia de Loria,3039162
1,AD.05,Ordino,Ordino,3039676
2,AD.04,La Massana,La Massana,3040131


In [28]:
# GeoNames 'admin1' (admin1_df) for Canadian provinces uses a 2-digit code
# CA.01	Alberta	Alberta	5883102
# CA.02	British Columbia	British Columbia	5909050
# CA.03	Manitoba	Manitoba	6065171
# CA.04	New Brunswick	New Brunswick	6087430
# CA.13	Northwest Territories	Northwest Territories	6091069
# CA.07	Nova Scotia	Nova Scotia	6091530
# CA.14	Nunavut	Nunavut	6091732
# CA.08	Ontario	Ontario	6093943
# CA.09	Prince Edward Island	Prince Edward Island	6113358
# CA.10	Quebec	Quebec	6115047
# CA.11	Saskatchewan	Saskatchewan	6141242
# CA.12	Yukon	Yukon	6185811
# CA.05	Newfoundland and Labrador	Newfoundland and Labrador	6354959
new_provinces = admin1_df['code'].str.contains('^CA.')
new_provinces

0        False
1        False
2        False
3        False
4        False
         ...  
3951     False
3952     False
3953     False
3954     False
CA.AB    False
Name: code, Length: 3956, dtype: bool

## Helper functions

In [6]:
# Displays the percentage of tweets that have a 'geonameid'
def print_geonameid_completeness(df):
    all_tweets = df['tweet_id'].sum()
    geonameid_tweets = df[df.geonameid.notnull()]['tweet_id'].sum()
    print(f'{geonameid_tweets/all_tweets*100:.3f}%')

 ## Clean

In [7]:
# Make a copy of 'tweet_user_location' so we leave the original intact
df['tweet_user_location_copy'] = df['tweet_user_location']

# Discard specific 'tweet_user_location' strings
tweet_user_location_discard = ['None', '\\N']
df = df[~df['tweet_user_location'].isin(tweet_user_location_discard)]

# Discard locations that don't exist more than 2 times
df = df[df['tweet_id'] > 2]

# Filter out emojis and other symbols
# * https://stackoverflow.com/a/49986645
# * https://www.ling.upenn.edu/courses/Spring_2003/ling538/UnicodeRanges.html (Unicode symbol ranges)
import re
def deEmojify(text):
    regrex_pattern = re.compile(pattern = "["
        u"\U0001F600-\U0001F64F"  # emojis: emoticons
        u"\U0001F300-\U0001F5FF"  # emojis: symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # emojis: transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # emojis: flags (iOS)
        u"\U00002700-\U000027BF"  # 'Dingbats' http://www.unicode.org/charts/PDF/U2700.pdf
                           "]+", flags = re.UNICODE)
    return regrex_pattern.sub(r'',text)
df['tweet_user_location_copy'] = df['tweet_user_location_copy'].map(lambda x: deEmojify(x))

# Truncate leading and trailing spaces
df['tweet_user_location_copy'] = df['tweet_user_location_copy'].map(lambda x: x.strip())

# Truncate trailing "," and "." characters
df['tweet_user_location_copy'] = df['tweet_user_location_copy'].map(lambda x: x.rstrip(','))
df['tweet_user_location_copy'] = df['tweet_user_location_copy'].map(lambda x: x.rstrip('.'))

df

Unnamed: 0,tweet_user_location,tweet_id,tweet_user_location_copy
1,United States,190257,United States
2,India,97652,India
3,"London, England",77542,"London, England"
4,USA,67336,USA
5,London,66315,London
...,...,...,...
338210,N 52°27' 0'' / W 1°49' 0'',3,N 52°27' 0'' / W 1°49' 0''
338211,Villerupt-Luxembourg-Oslo-Stoc,3,Villerupt-Luxembourg-Oslo-Stoc
338212,Chicago ✈,3,Chicago
338213,Catch Me If You Can,3,Catch Me If You Can


## Countries

In [8]:
# Some locations are verbatim the name of a country, e.g.:
df[df['tweet_user_location_copy'] == 'Canada']

Unnamed: 0,tweet_user_location,tweet_id,tweet_user_location_copy
18,Canada,40858,Canada
1497,Canada,723,Canada
2846,Canada 🇨🇦,352,Canada
4756,Canada.,199,Canada
5333,Canada,177,Canada
9208,🇨🇦 Canada,100,Canada
14493,Canada 🇨🇦,63,Canada
17363,Canada🇨🇦,52,Canada
52402,🇨🇦Canada🇨🇦,17,Canada
78821,🍁 Canada 🍁,12,Canada


In [9]:
# df[df['tweet_user_location'].isin(simple_countries_df['Country'])]
# simple_countries_df = countries_df[['#ISO','Country', 'geonameid']]#.set_index('Country')

# Merge in country info (with goenameid) when there's an exact country match

# Keep the columns of countries_df we need.
simple_countries_df = countries_df[['#ISO','Country', 'geonameid']]
#df = pd.merge(df, simple_countries_df, how='left', left_on='tweet_user_location_copy', right_on='Country')
df

Unnamed: 0,tweet_user_location,tweet_id,tweet_user_location_copy
1,United States,190257,United States
2,India,97652,India
3,"London, England",77542,"London, England"
4,USA,67336,USA
5,London,66315,London
...,...,...,...
338210,N 52°27' 0'' / W 1°49' 0'',3,N 52°27' 0'' / W 1°49' 0''
338211,Villerupt-Luxembourg-Oslo-Stoc,3,Villerupt-Luxembourg-Oslo-Stoc
338212,Chicago ✈,3,Chicago
338213,Catch Me If You Can,3,Catch Me If You Can


In [10]:
#print_geonameid_completeness(df)

## Cities

In [11]:
#simple_cities_df = cities_df[['geonameid', 'name', 'asciiname', 'altnames']]
# df = pd.merge(df, simple_cities_df, how='left', left_on='tweet_user_location', right_on='name')
#df_copy = df[df['geonameid'].isnull()]
#pd.merge(df_copy, simple_cities_df, how='left', left_on='tweet_user_location_copy', right_on='name')

# NB: this can't work b/c cities name (unlike countries) aren't unique, e.g. there's a lot of "London"

In [12]:
df[df['tweet_user_location_copy'].str.count(',') > 2]

Unnamed: 0,tweet_user_location,tweet_id,tweet_user_location_copy
2988,"LA,Houston, NYC, ATL,Tampa",333,"LA,Houston, NYC, ATL,Tampa"
4053,"Gedung Graha Pena Lt.10, Jl. Raya Kebayoran La...",238,"Gedung Graha Pena Lt.10, Jl. Raya Kebayoran La..."
4698,"Paris, France/La Habana, Cuba/México/Roma, Italia",202,"Paris, France/La Habana, Cuba/México/Roma, Italia"
4753,"Lucerne, Suisse, Kinshasa,Bruxelles,Boston RDC",199,"Lucerne, Suisse, Kinshasa,Bruxelles,Boston RDC"
5556,"Durham, England, UK, EU till I die.",169,"Durham, England, UK, EU till I die"
...,...,...,...
336979,"Central, Southside, SW, VA",3,"Central, Southside, SW, VA"
337241,"EX:Sugar,A2B,eG,RL,CZ,aV,Eyy",3,"EX:Sugar,A2B,eG,RL,CZ,aV,Eyy"
337521,"Chicago, Austin, NYC, LA",3,"Chicago, Austin, NYC, LA"
337587,"Chicago, DC, Los Angeles, New York",3,"Chicago, DC, Los Angeles, New York"


In [13]:
test = "Toronto, Ontario, Canada, World"
test = "Toronto, Canada"
test.split(',')

['Toronto', ' Canada']

In [14]:
import numpy as np

def split_fixed_parts(num_parts, location):
    parts = location.split(',')
    if num_parts > len(parts):
        for i in range(num_parts - len(parts)):
            parts.insert(0, None)
    else:
        for i in range(len(parts) - num_parts):
            parts.pop(0)
    return parts

def parts_dict(num_parts, location):
    parts = split_fixed_parts(num_parts, location)
    return {f'el-{k}':parts[k] for k in range(num_parts)}

In [15]:
# df[['column_new_1', 'column_new_2', 'column_new_3']] = pd.DataFrame([[np.nan, 'dogs', 3]], index=df.index)
test_df = df[df['tweet_user_location_copy'].str.count(',') == 2].head(10)
test_df

Unnamed: 0,tweet_user_location,tweet_id,tweet_user_location_copy
55,"Bogotá, D.C., Colombia",19367,"Bogotá, D.C., Colombia"
572,"Toronto, Ontario, Canada",2052,"Toronto, Ontario, Canada"
843,"Melbourne, Victoria, Australia",1398,"Melbourne, Victoria, Australia"
975,"Bogotá, DC, Colombia",1176,"Bogotá, DC, Colombia"
1111,"Ottawa, Ontario, Canada",1022,"Ottawa, Ontario, Canada"
1235,"Vancouver, BC, Canada",901,"Vancouver, BC, Canada"
1335,"Edmonton, Alberta, Canada",818,"Edmonton, Alberta, Canada"
1340,"Teesdale Park, Thornaby, UK",813,"Teesdale Park, Thornaby, UK"
1380,"Calgary, Alberta, Canada",791,"Calgary, Alberta, Canada"
1427,"Greater Portland, Oregon, USA",764,"Greater Portland, Oregon, USA"


In [16]:
num_parts = 3

# https://stackoverflow.com/a/16242202
#test_df.tweet_user_location_copy.apply(lambda s: pd.Series(parts_dict(num_parts, s)))
#test_df = pd.concat([test_df, test_df.tweet_user_location_copy.apply(lambda s: pd.Series(parts_dict(num_parts, s)))], axis=1)

test_df['elements'] = test_df['tweet_user_location_copy'].map(lambda row: split_fixed_parts(num_parts, row))
test_df['elements'] = test_df['elements'].map(lambda x: [i.strip() for i in x])
test_df

Unnamed: 0,tweet_user_location,tweet_id,tweet_user_location_copy,elements
55,"Bogotá, D.C., Colombia",19367,"Bogotá, D.C., Colombia","[Bogotá, D.C., Colombia]"
572,"Toronto, Ontario, Canada",2052,"Toronto, Ontario, Canada","[Toronto, Ontario, Canada]"
843,"Melbourne, Victoria, Australia",1398,"Melbourne, Victoria, Australia","[Melbourne, Victoria, Australia]"
975,"Bogotá, DC, Colombia",1176,"Bogotá, DC, Colombia","[Bogotá, DC, Colombia]"
1111,"Ottawa, Ontario, Canada",1022,"Ottawa, Ontario, Canada","[Ottawa, Ontario, Canada]"
1235,"Vancouver, BC, Canada",901,"Vancouver, BC, Canada","[Vancouver, BC, Canada]"
1335,"Edmonton, Alberta, Canada",818,"Edmonton, Alberta, Canada","[Edmonton, Alberta, Canada]"
1340,"Teesdale Park, Thornaby, UK",813,"Teesdale Park, Thornaby, UK","[Teesdale Park, Thornaby, UK]"
1380,"Calgary, Alberta, Canada",791,"Calgary, Alberta, Canada","[Calgary, Alberta, Canada]"
1427,"Greater Portland, Oregon, USA",764,"Greater Portland, Oregon, USA","[Greater Portland, Oregon, USA]"


In [17]:
def get_country(countries_df, element):
    country = countries_df[countries_df['Country'] == element]
    if len(country) == 0:
        return None
    return country
    
def is_city(element):
    pass

"""
Cases:

                  country
      state/prov, country
            city, country
city, state/prov, country



city, state/prov, country
city, state/prov
city, country
city

neighboorhood, city, country
neighboorhood, city

state/prov, country
state/prov
"""

def infer_geonameid(elements):
    # Datasets
    # * countries_df
    # * admin1_df
    # * cities_df
    
    # print(elements)
    
    # Last item is country
    country = get_country(countries_df, elements[-1])
    if country is not None:
        
        # Before last item is empty
        if elements[-2] is None:
            return country['geonameid'].item()
        
        # Before last item is not empty
        else:
            country_iso_code = country['#ISO'].item()
            return country_iso_code
            
    return np.nan

In [18]:
test_df['geonameid'] = np.nan
test_df['geonameid'] = test_df['elements'].map(lambda elements: infer_geonameid(elements))
test_df

not empty
not empty
not empty
not empty
not empty
not empty
not empty
not empty


Unnamed: 0,tweet_user_location,tweet_id,tweet_user_location_copy,elements,geonameid
55,"Bogotá, D.C., Colombia",19367,"Bogotá, D.C., Colombia","[Bogotá, D.C., Colombia]",CO
572,"Toronto, Ontario, Canada",2052,"Toronto, Ontario, Canada","[Toronto, Ontario, Canada]",CA
843,"Melbourne, Victoria, Australia",1398,"Melbourne, Victoria, Australia","[Melbourne, Victoria, Australia]",AU
975,"Bogotá, DC, Colombia",1176,"Bogotá, DC, Colombia","[Bogotá, DC, Colombia]",CO
1111,"Ottawa, Ontario, Canada",1022,"Ottawa, Ontario, Canada","[Ottawa, Ontario, Canada]",CA
1235,"Vancouver, BC, Canada",901,"Vancouver, BC, Canada","[Vancouver, BC, Canada]",CA
1335,"Edmonton, Alberta, Canada",818,"Edmonton, Alberta, Canada","[Edmonton, Alberta, Canada]",CA
1340,"Teesdale Park, Thornaby, UK",813,"Teesdale Park, Thornaby, UK","[Teesdale Park, Thornaby, UK]",
1380,"Calgary, Alberta, Canada",791,"Calgary, Alberta, Canada","[Calgary, Alberta, Canada]",CA
1427,"Greater Portland, Oregon, USA",764,"Greater Portland, Oregon, USA","[Greater Portland, Oregon, USA]",


In [19]:
print_geonameid_completeness(test_df)

94.581%
