In [235]:
import folium
import pandas as pd
import geopandas as gpd
import glob
import re
import numpy as np
from tqdm.auto import tqdm

In [355]:
# Never truncate columns when a wide frame is displayed.
pd.options.display.max_columns = None
# To go back to the default row truncation: pd.reset_option('max_rows')

In [237]:
# Load the country boundaries; the preview shows one row per country with
# its ADMIN name, ISO_A3 code and geometry.
geo_countries = gpd.read_file('../data/countries.geojson')
geo_countries.head()

Unnamed: 0,ADMIN,ISO_A3,geometry
0,Aruba,ABW,"POLYGON ((-69.99694 12.57758, -69.93639 12.531..."
1,Afghanistan,AFG,"POLYGON ((71.04980 38.40866, 71.05714 38.40903..."
2,Angola,AGO,"MULTIPOLYGON (((11.73752 -16.69258, 11.73851 -..."
3,Anguilla,AIA,"MULTIPOLYGON (((-63.03767 18.21296, -63.09952 ..."
4,Albania,ALB,"POLYGON ((19.74777 42.57890, 19.74601 42.57993..."


In [238]:
# Only the name, country, ISO3 code and population columns are read from
# the world-cities table.
world_cities = pd.read_csv(
    '../data/worldcities/worldcities.csv',
    usecols=['city_ascii', 'country', 'iso3', 'population'],
)
world_cities.head()

Unnamed: 0,city_ascii,country,iso3,population
0,Tokyo,Japan,JPN,39105000.0
1,Jakarta,Indonesia,IDN,35362000.0
2,Delhi,India,IND,31870000.0
3,Manila,Philippines,PHL,23971000.0
4,Sao Paulo,Brazil,BRA,22495000.0


In [257]:
# Rank cities by population once, so that "first occurrence" in the
# de-duplications below means "most populous".
cities_by_pop = world_cities.sort_values('population', ascending=False)

# City name -> ISO3; when several cities share a name, keep the largest.
city_iso3 = (
    cities_by_pop
    .drop_duplicates('city_ascii')
    .rename(columns={'city_ascii': 'city'})
    .loc[:, ['city', 'iso3']]
)

# US states -> 'USA' (STATE / STATE2 are presumably full name and
# abbreviation — confirm against the CSV).
state_iso3 = (
    pd.read_csv('../data/US_states.csv', usecols=['STATE', 'STATE2'])
    .rename(columns={'STATE': 'state', 'STATE2': 'state2'})
    .assign(iso3='USA')
)

# Country name -> ISO3, one row per ISO3 code.
country_iso3 = cities_by_pop.drop_duplicates('iso3')[['country', 'iso3']]

In [261]:
# Preview the country-name / ISO3 lookup.
country_iso3.head()

Unnamed: 0,country,iso3
0,Japan,JPN
1,Indonesia,IDN
2,India,IND
3,Philippines,PHL
4,Brazil,BRA


In [None]:
# Total city population per ISO3 code, used as the choropleth value.
#
# `population` is selected explicitly before summing: `world_cities` is
# loaded with only city_ascii / country / iso3 / population, so dropping
# 'lat', 'lng' and 'id' would raise KeyError, and the first positional
# argument of GroupBy.sum is `numeric_only`, not a column name.
country_pop = (
    world_cities
    .groupby('iso3', as_index=False)['population']
    .sum()
    .rename(columns={'population': 'country_pop'})
    # China is excluded from the map data.
    # NOTE(review): the reason is not recorded in the notebook — confirm.
    .query("iso3 != 'CHN'")
)


In [None]:
# Rank the ISO3 codes by their summed city population, largest first.
country_pop.sort_values('country_pop', ascending = False)

In [None]:
# Base map; the initial view is centred on [40, -95] at zoom level 4.
m = folium.Map(location=[40, -95], zoom_start=4)

# Shade each country by its summed city population: `country_pop.iso3` is
# joined to the ISO_A3 property of every GeoJSON feature.
population_layer = folium.Choropleth(
    geo_data='../data/countries.geojson',
    key_on="feature.properties.ISO_A3",
    data=country_pop,
    columns=["iso3", "country_pop"],
    name="choropleth",
    legend_name="population",
    fill_color="YlGn",
    fill_opacity=0.7,
    line_opacity=0.1,
)
population_layer.add_to(m)

# Layer toggle control.
folium.LayerControl().add_to(m)

# Last expression of the cell: render the map.
m

In [11]:
# List the tweet CSV files available under ../data/tweets_en/.
glob.glob('../data/tweets_en/*')

['../data/tweets_en/tweets_ukraine_en.csv',
 '../data/tweets_en/tweets_russia_en.csv',
 '../data/tweets_en/tweets_eu_en.csv',
 '../data/tweets_en/tweets_zelenskyy_en.csv',
 '../data/tweets_en/tweets_biden_en.csv',
 '../data/tweets_en/tweets_putin_en.csv',
 '../data/tweets_en/tweets_johnson_en.csv',
 '../data/tweets_en/tweets_nato_en.csv',
 '../data/tweets_en/tweets_scholz_en.csv',
 '../data/tweets_en/tweets_macron_en.csv']

In [None]:
# The 100 most frequent raw `location` strings.
# NOTE(review): `df` is only created by the later
# `pd.read_csv('../data/tweets_en/tweets_scholz_en.csv', ...)` cell, so this
# cell fails on a fresh kernel unless that cell has been run first.
df.value_counts('location', ascending = False).head(100)

In [414]:
# Spot check: `iso_final` for rows whose raw location is exactly "Italia".
# NOTE(review): `iso_final` is created by a cell further down, so this cell
# depends on out-of-order execution.
df.query('location == "Italia"')['iso_final'][:5]

7       NaN
1568    NaN
3582    NaN
6598    NaN
6658    NaN
Name: iso_final, dtype: object

In [402]:
# Check how the country lookup spells names containing "Czech".
country_iso3[country_iso3['country'].str.contains('Czech')]

Unnamed: 0,country,iso3
519,Czechia,CZE


### Convert location to iso3

In [347]:
# Load the tweet file whose `location` column is converted to ISO3 codes.
# NOTE(review): decoding as latin-1 never raises, but it garbles non-ASCII
# text if the file is actually UTF-8 — confirm the file's real encoding.
df = pd.read_csv('../data/tweets_en/tweets_scholz_en.csv', lineterminator='\n', encoding='latin-1')

#### Manual transformation of location

- "Deutschland"
- 'USA'
- Democrat in the USA
- 'UK'
- 'CANADA'
- 'England'
- 'Scotland'
- 'The Netherlands' to 'Netherlands'
- Polska to Poland
- Czech Republic to Czechia
- New York City
- Italia

In [348]:
# Split every raw location on commas; missing values become the string
# 'nan' through str(), so each entry yields at least one part.
test = df['location'].apply(lambda raw: str(raw).split(','))

# loc1 = first comma-separated part, loc2 = second part (the placeholder
# 'nan' when there is none). Any parts after the second are ignored.
loc1, loc2 = [], []
for loc in tqdm(test):
    loc1.append(loc[0])
    loc2.append(loc[1] if len(loc) > 1 else 'nan')

  0%|          | 0/67558 [00:00<?, ?it/s]

In [349]:
# Attach the two parsed location parts as columns `loc1` and `loc2`.
df = df.assign(loc1 = loc1, loc2 = loc2)

In [350]:
# Trim the spaces left around each part by the comma split
# (e.g. the second part of "Berlin, Germany").
for part_col in ('loc1', 'loc2'):
    df[part_col] = df[part_col].str.strip(' ')

In [351]:
df = (df.
 merge(city_iso3, how = 'left', left_on = 'loc1', right_on = 'city').
 merge(city_iso3, how = 'left', left_on = 'loc2', right_on = 'city').
 merge(state_iso3, how = 'left', left_on = 'loc1', right_on = 'state').
 merge(state_iso3, how = 'left', left_on = 'loc2', right_on = 'state').
 merge(country_iso3, how = 'left', left_on = 'loc1', right_on = 'country').
 merge(country_iso3, how = 'left', left_on = 'loc2', right_on = 'country')
)


In [353]:
df['iso_final'] = (df['iso3_x'].iloc[:,0].
 combine_first(df['iso3_x'].iloc[:,1]).
 combine_first(df['iso3_x'].iloc[:,2]).
 combine_first(df['iso3_y'].iloc[:,0]).
 combine_first(df['iso3_y'].iloc[:,1]).
 combine_first(df['iso3_y'].iloc[:,2])
)