In [1]:
import numpy as np
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt 
#Plot Maps
import geopandas
import pycountry 
from ipywidgets import interact, IntSlider

from sklearn.impute import SimpleImputer
import datetime

from geopy.exc import GeocoderTimedOut
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter

from functools import partial
import csv

import os

In [2]:
# Load the EM-DAT natural-disaster export (1900-2021 per the file name).
# The path is relative to the notebook's working directory.
nd = pd.read_csv('dataset/EMDAT_1900-2021_NatDis.csv')

In [3]:
# Dataset country spelling -> replacement spelling.
# Presumably the replacements are the names pycountry can look up by `name`
# (they feed alpha2code further down) -- verify when adding new entries.
# Keeping the pairs in one mapping makes them reviewable as data instead of
# 28 hand-copied assignment statements.
COUNTRY_RENAMES = {
    'Bahamas (the)': 'Bahamas',
    'Bolivia (Plurinational State of)': 'Bolivia, Plurinational State of',
    'Comoros (the)': 'Comoros',
    'Congo (the Democratic Republic of the)': 'Congo, the Democratic Republic of the',
    'Congo (the)': 'Congo',
    # The dataset spelling uses a typographic apostrophe (’); the target uses a plain one.
    'Côte d’Ivoire': "Côte d'Ivoire",
    'Czech Republic (the)': "Czechia",
    'Dominican Republic (the)': "Dominican Republic",
    'Gambia (the)': "Gambia",
    'Iran (Islamic Republic of)': "Iran, Islamic Republic of",
    "Korea (the Democratic People's Republic of)": "Korea, Democratic People's Republic of",
    "Korea (the Republic of)": "Korea, Republic of",
    "Lao People's Democratic Republic (the)": "Lao People's Democratic Republic",
    "Macedonia (the former Yugoslav Republic of)": "North Macedonia",
    "Marshall Islands (the)": "Marshall Islands",
    "Micronesia (Federated States of)": "Micronesia, Federated States of",
    "Moldova (the Republic of)": "Moldova, Republic of",
    "Netherlands (the)": "Netherlands",
    "Niger (the)": "Niger",
    "Philippines (the)": "Philippines",
    "Russian Federation (the)": "Russian Federation",
    "Sudan (the)": "Sudan",
    "Taiwan (Province of China)": "Taiwan",
    "United Arab Emirates (the)": "United Arab Emirates",
    "United Kingdom of Great Britain and Northern Ireland (the)": "United Kingdom",
    'United States of America (the)': "United States",
    'Venezuela (Bolivarian Republic of)': 'Venezuela, Bolivarian Republic of',
    'Swaziland': 'Eswatini',
}

# Exact-match replacement of whole cell values; names not listed above (and
# missing values) are left untouched. No replacement value is itself a key,
# so the result does not depend on the order of the entries.
nd['Country'] = nd['Country'].replace(COUNTRY_RENAMES)

In [4]:
def alpha2code(column):
    """Look up the two-letter ``alpha_2`` code of each country name via pycountry.

    Parameters
    ----------
    column : iterable
        Country names, e.g. a pandas Series. Each value is passed to
        ``pycountry.countries.get(name=...)``.

    Returns
    -------
    list of str
        One entry per input value, in input order: the ``alpha_2`` attribute
        of the matched record, or the literal string ``'None'`` when no record
        is found. The sentinel is a string (not ``None``) because later cells
        filter on ``!= 'None'``.
    """
    codes = []
    for country in column:
        try:
            record = pycountry.countries.get(name=country)
        except (LookupError, AttributeError):
            # Depending on the pycountry version, an unknown or non-string
            # name (e.g. NaN) may raise instead of returning None. Only those
            # lookup failures are treated as "not found"; anything else
            # (including KeyboardInterrupt) propagates instead of being
            # silently swallowed by a bare `except:`.
            record = None
        codes.append('None' if record is None else record.alpha_2)
    return codes

In [5]:
# Add a two-letter country code column; names pycountry cannot match get the
# string 'None' (see alpha2code), which later cells use as a filter value.
nd['CountryAlpha2'] = alpha2code(nd.Country)

In [6]:
# None removes pandas' display truncation: any DataFrame shown after this cell
# renders every column and every row, so prefer .head()/.sample() on large frames.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [7]:
# Identification, classification and location columns needed for geocoding.
USEFUL_COLUMNS = [
    'Dis No',
    'Disaster Type',
    'Disaster Subtype',
    'Continent',
    'Region',
    'Country',
    'CountryAlpha2',
    'Location',
    'Latitude',
    'Longitude',
]
useful = nd[USEFUL_COLUMNS]

In [8]:
# Nominatim (OpenStreetMap) geocoding client, identified by a custom user agent.
geolocator = Nominatim(user_agent="epfl.lookingforthird.locationsmaybe")
# Wrap geocode so consecutive calls are spaced at least 1 second apart
# (min_delay_seconds=1) and a failed call is not retried (max_retries=0).
rateLimiter = RateLimiter(geolocator.geocode, min_delay_seconds=1, max_retries=0)

def findCoordsForRow(row, nameCol, codeCol):
    """Geocode a single row through the module-level ``rateLimiter``.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series from ``DataFrame.apply(axis=1)``)
        Must support ``row[nameCol]`` and ``row[codeCol]``.
    nameCol : str
        Key of the free-text place name used as the geocoding query.
    codeCol : str
        Key of the country code passed as ``country_codes`` to restrict
        the search.

    Returns
    -------
    Whatever ``rateLimiter`` returns for the query, or ``None`` when the
    lookup raises an ordinary exception (best-effort: one bad row must not
    abort the whole batch).
    """
    try:
        return rateLimiter(row[nameCol], country_codes=row[codeCol])
    except Exception:
        # Deliberately not a bare `except:`: KeyboardInterrupt / SystemExit
        # must propagate so a long-running geocoding batch can be stopped
        # instead of silently skipping the row that was in flight.
        return None

def applyRateLimiter(df, nameCol, codeCol, foundLocation, foundLat, foundLong, fileName):
    """Geocode every row of ``df``, store the results in new columns, and save to CSV.

    ``df`` is modified in place: three columns are added (or overwritten).

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to geocode.
    nameCol : str
        Column holding the place name to query.
    codeCol : str
        Column holding the country code used to restrict the query.
    foundLocation : str
        Name of the column that receives the raw result of
        ``findCoordsForRow`` (``None`` when nothing was found).
    foundLat, foundLong : str
        Names of the columns that receive ``point[0]`` and ``point[1]`` of
        each result (latitude and longitude), or ``None`` for missing results.
    fileName : str or path-like
        Destination CSV, written with UTF-16 encoding. Missing parent
        directories are created.

    Returns
    -------
    None
    """
    # Create the destination directory *before* geocoding: the lookups are
    # rate limited and slow, so a missing folder should fail (or be fixed)
    # up-front rather than after all the requests have been spent.
    if isinstance(fileName, (str, os.PathLike)):
        outDir = os.path.dirname(os.fspath(fileName))
        if outDir:
            os.makedirs(outDir, exist_ok=True)

    lookup = partial(findCoordsForRow, nameCol=nameCol, codeCol=codeCol)
    df[foundLocation] = df.apply(lookup, axis=1)
    df[foundLat] = df[foundLocation].apply(lambda loc: loc.point[0] if loc else None)
    df[foundLong] = df[foundLocation].apply(lambda loc: loc.point[1] if loc else None)
    df.to_csv(fileName, encoding='utf-16')

In [46]:
geolocator.geocode('Saint Barthélemy')

Location(Saint-Barthélemy, 97133, France, (17.9036287, -62.81152921501989, 0.0))

In [329]:
# Number of disaster records per (country, alpha-2 code), most frequent first.
grouped = (
    useful
    .groupby(['Country', 'CountryAlpha2'])
    .size()
    .reset_index(name='Count')
    .sort_values('Count', ascending=False)
)

In [330]:
grouped.head()

Unnamed: 0,Country,CountryAlpha2,Count
212,United States,US,1059
40,China,CN,970
90,India,IN,740
155,Philippines,PH,659
91,Indonesia,ID,560


In [None]:
# Geocode each country by name and write the result to FoundCoords/Countries.csv.
# This adds the three 'Found ...' columns to `grouped` in place.
# NOTE(review): unlike the location-level query, `grouped` is not filtered on
# CountryAlpha2 != 'None', so the placeholder string 'None' is passed as
# country_codes for unmatched countries -- confirm this is intended.
applyRateLimiter(grouped, 'Country', 'CountryAlpha2', 'Found Location', 'Found Latitude', 'Found Longitude', 'FoundCoords/Countries.csv')

In [9]:
# Number of records per (country, code, location) for rows that have a real
# alpha-2 code (the 'None' placeholder is excluded), most frequent first.
locGrouped = (
    useful
    .loc[useful['CountryAlpha2'] != 'None']
    .groupby(['Country', 'CountryAlpha2', 'Location'])
    .size()
    .reset_index(name='Count')
    .sort_values('Count', ascending=False)
)

In [10]:
locGrouped.head()

Unnamed: 0,Country,CountryAlpha2,Location,Count
11899,United States,US,Texas,21
11269,United States,US,California,18
2753,China,CN,Yunnan province,17
2611,China,CN,Sichuan province,17
5114,India,IN,Uttar Pradesh,16


In [11]:
len(locGrouped.index)

12515

In [12]:
# Split the location table into 100 nearly equal chunks so each one can be
# geocoded and saved separately.
# np.array_split is applied to the row *positions* rather than to the
# DataFrame itself: splitting a DataFrame directly goes through
# DataFrame.swapaxes, which recent pandas versions deprecate. The partition
# (chunk sizes and row order) is the same as np.array_split(locGrouped, 100).
# .copy() makes every chunk an independent frame, so adding the result
# columns later does not trigger SettingWithCopyWarning.
locSplits = [
    locGrouped.iloc[rowPositions].copy()
    for rowPositions in np.array_split(np.arange(len(locGrouped)), 100)
]

In [None]:
# Geocode each chunk in turn and save it to its own zero-padded, numbered CSV
# (Locations_000.csv, Locations_001.csv, ...).
for chunkIndex, chunk in enumerate(locSplits):
    outPath = 'FoundCoords/Locations_' + str(chunkIndex).zfill(3) + '.csv'
    applyRateLimiter(
        chunk,
        'Location',
        'CountryAlpha2',
        'Found Location',
        'Found Latitude',
        'Found Longitude',
        outPath,
    )