# Experimentation

Loading the datasets and cleaning. The following datasets are expected:
* `locations_clean_user_location.tsv`: The originally provided list of raw locations with the corresponding number of occurrences
* In `/data`:
  * `cities1000.tsv` (GeoNames):
    * https://download.geonames.org/export/dump/cities1000.zip
    * Unzipped and renamed to `.tsv`
  * `countryInfo.tsv` (GeoNames):
    * https://download.geonames.org/export/dump/countryInfo.txt
    * Renamed to `.tsv` (the download is plain text, so no unzipping is needed)

In [65]:
import os
import pandas as pd
import geopandas as gpd

# Name of the sub-folder that holds the downloaded GeoNames files.
data_dir = "data"

# Working directory of the kernel; every dataset path is built from it.
current_dir = os.getcwd()

## Load datasets

In [66]:
# Raw tweet user locations together with their occurrence counts.
# Read with pandas' read_csv as tab-delimited text; 'tweet_id' is forced to int.

tweets_user_locations = os.path.join(
    current_dir,
    "locations_clean_user_location.tsv",
)
df = pd.read_csv(
    tweets_user_locations,
    sep="\t",
    dtype={"tweet_id": int},
)
df.head(3)

Unnamed: 0,tweet_user_location,tweet_id
0,,4994911
1,United States,190257
2,India,97652


In [67]:
# GeoNames: cities with more than 1000 inhabitants
# Source: https://download.geonames.org/export/dump/cities1000.zip
# Read through geopandas so the frame carries a geometry column (usefulness tbd)

cities = os.path.join(
    current_dir,
    data_dir,
    "cities1000.tsv",
)
cities_df = gpd.read_file(cities)
cities_df.head(3)

Unnamed: 0,GEONAMEID,NAME,ASCIINAME,ALTNAMES,LATITUDE,LONGITUDE,FEATCLASS,FEATCODE,COUNTRY,CC2,ADMIN1,ADMIN2,ADMIN3,ADMIN4,POPULATION,ELEVATION,GTOPO30,TIMEZONE,MODDATE,geometry
0,3039154,El Tarter,El Tarter,"Ehl Tarter,Эл Тартер",42.57952,1.65362,P,PPL,AD,,2,,,,1052.0,,1721,Europe/Andorra,2012-11-03,POINT (1.65362 42.57952)
1,3039163,Sant Julià de Lòria,Sant Julia de Loria,"San Julia,San Julià,Sant Julia de Loria,Sant J...",42.46372,1.49129,P,PPLA,AD,,6,,,,8022.0,,921,Europe/Andorra,2013-11-23,POINT (1.49129 42.46372)
2,3039604,Pas de la Casa,Pas de la Casa,"Pas de la Kasa,Пас де ла Каса",42.54277,1.73361,P,PPL,AD,,3,,,,2363.0,2050.0,2106,Europe/Andorra,2008-06-09,POINT (1.73361 42.54277)


In [68]:
# GeoNames (Countries info)
# https://download.geonames.org/export/dump/countryInfo.txt
# Loading using pandas' read_csv (tab-delimited). header=49 selects the
# column-name row (the one starting with '#ISO') that follows the file's
# preamble; everything before it is ignored.
#
# Only empty fields are treated as missing. With pandas' default NA strings the
# literal code 'NA' (Namibia's ISO code and the North America continent code)
# would silently be parsed as NaN.

countries = os.path.join(current_dir, data_dir, "countryInfo.tsv")
countries_df = pd.read_csv(
    countries,
    sep='\t',
    header=49,
    keep_default_na=False,
    na_values=[''],
)
countries_df.head(3)

Unnamed: 0,#ISO,ISO3,ISO-Numeric,fips,Country,Capital,Area(in sq km),Population,Continent,tld,CurrencyCode,CurrencyName,Phone,Postal Code Format,Postal Code Regex,Languages,geonameid,neighbours,EquivalentFipsCode
0,AD,AND,20,AN,Andorra,Andorra la Vella,468.0,77006,EU,.ad,EUR,Euro,376,AD###,^(?:AD)*(\d{3})$,ca,3041565,"ES,FR",
1,AE,ARE,784,AE,United Arab Emirates,Abu Dhabi,82880.0,9630959,AS,.ae,AED,Dirham,971,,,"ar-AE,fa,en,hi,ur",290557,"SA,OM",
2,AF,AFG,4,AF,Afghanistan,Kabul,647500.0,37172386,AS,.af,AFN,Afghani,93,,,"fa-AF,ps,uz-AF,tk",1149361,"TM,CN,IR,TJ,PK,UZ",


 ## Clean
_Work in progress!_

In [69]:
# Discard placeholder 'tweet_user_location' values.
# The string 'None' marks a missing location. Depending on the pandas version,
# read_csv may already have parsed 'None' as NaN (it is a default NA string in
# pandas >= 2.0), so NaN locations are discarded as well; otherwise the
# placeholder row would survive the isin() filter.
tweet_user_location_discard = ['None']
is_placeholder = (
    df['tweet_user_location'].isna()
    | df['tweet_user_location'].isin(tweet_user_location_discard)
)
df = df[~is_placeholder]

# Keep only locations that occur more than 2 times
# ('tweet_id' holds the number of occurrences of each location)
df = df[df['tweet_id'] > 2]

df

Unnamed: 0,tweet_user_location,tweet_id
1,United States,190257
2,India,97652
3,"London, England",77542
4,USA,67336
5,London,66315
...,...,...
338210,N 52°27' 0'' / W 1°49' 0'',3
338211,Villerupt-Luxembourg-Oslo-Stoc,3
338212,Chicago ✈,3
338213,Catch Me If You Can,3
