# Data Scraping and Cleaning


In [687]:
import requests
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
import tabula
import pycountry

warnings.filterwarnings(action = 'ignore')

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

### 1. Data Source: [ROCS](http://rocs.hu-berlin.de/viz/sgb/)

- Airport name and three-letter symbol
- Country
- Passenger flux per day
- Connected airports
- Effective distance to outbreak center
- Geodesic distance to outbreak center (km)

In [690]:
response1 = requests.get('http://rocs.hu-berlin.de/viz/sgb/airports0.95_2014/airports.json')
rocs_airport_info_df = pd.DataFrame(response1.json()['nodes'])

response2 = requests.get('http://rocs.hu-berlin.de/viz/sgb/airports0.95_2014/spt_673.json')
rocs_data_df = pd.DataFrame(response2.json())  

# Join on index
rocs_df = rocs_data_df.join(rocs_airport_info_df, how= 'outer')
rocs_df2 = rocs_df[['Dfrom', 'Dgeo', 'name', 'continent_name', 'region_name', 'country_name', 'lid', 'F',
                    'k', 'lon', 'lat']]

rocs_df2.rename(columns = {'Dfrom':'effective_dist', 'Dgeo':'geo_dist', 'name':'airport_name', 
                     'continent_name':'continent_name', 'region_name':'region_name', 'country_name':'country_name',
                    'lid':'airport_id', 'F':'ann_pass_flux', 'k':'connected_airports', 'lon':'long', 'lat':'lat'},
               inplace = True)
rocs_df2.head()

Unnamed: 0,effective_dist,geo_dist,airport_name,continent_name,region_name,country_name,airport_id,ann_pass_flux,connected_airports,long,lat
0,9.626587,8111.034829,Addis Ababa,Africa,Eastern Africa,Ethiopia,ADD,3887132.5,56.5,38.799444,8.977778
1,13.897219,7763.569189,Asmara,Africa,Eastern Africa,Eritrea,ASM,307820.5,10.5,38.910556,15.291944
2,14.024414,9712.495427,Bujumbura,Africa,Eastern Africa,Burundi,BJM,252165.0,6.5,29.318611,-3.323889
3,13.695018,8990.314878,Dar es Salaam,Africa,Eastern Africa,Tanzania United Republic of,DAR,1840029.0,17.0,39.2025,-6.878056
4,13.038729,9221.694455,Entebbe,Africa,Eastern Africa,Uganda,EBB,1079933.0,16.5,32.443611,0.0425


In [None]:
# Rename country to normalized convention 

search_results = np.array()

for index, val in enumerate(rocs_df.country_name):
    try:
        spec = pycountry.countries.search_fuzzy(val)
        search_results.append(spec)
    except:
        search_results.append(np.nan)
    

print(len(rocs_df.country_name), len(search_results))

In [699]:
search_results[0][0].name

'Ethiopia'

In [725]:
country_names = np.array([])

for index,val in enumerate(search_results):
    try:
        country_names = np.append(country_names, search_results[index][0].name)
    except:
        country_names = np.append(country_names, np.nan)

country_names[:10]

array(['Ethiopia', 'Eritrea', 'Burundi', 'nan', 'Uganda', 'Zimbabwe',
       'nan', 'Rwanda', 'Kenya', 'Zambia'], dtype='<U33')

In [729]:
country_names[3] == 'nan'

True

In [730]:
rocs_df2['searched_country_names'] = country_names

for index, val in enumerate(rocs_df2['searched_country_names']):
    if val == 'nan':
        print(index, rocs_df.country_name[index])

3 Tanzania United Republic of
6 Tanzania United Republic of
13 Tanzania United Republic of
19 Tanzania United Republic of
23 Congo Democratic Republic of
33 Libyan Arab Jamahiriya
46 Libyan Arab Jamahiriya
54 Libyan Arab Jamahiriya
57 Libyan Arab Jamahiriya
75 Cape Verde
90 Cape Verde
92 Cape Verde
93 Antigua and Barbuda, Leeward Islands
103 Grenada, Windward Islands
118 Saint Kitts and Nevis, Leeward Islands
121 Virgin Islands, US
122 Virgin Islands, US
123 St Maarten (Dutch Part)
527 Korea Republic of
528 Korea Republic of
549 Korea Republic of
557 Hong Kong (SAR) China
568 Korea Republic of
596 Korea Republic of
600 Korea Republic of
612 Macao (SAR) China
635 Korea Republic of
638 Korea Republic of
648 Korea Republic of
667 Korea Republic of
697 Iran Islamic Republic of
698 Iran Islamic Republic of
703 Iran Islamic Republic of
705 Iran Islamic Republic of
719 Iran Islamic Republic of
720 Iran Islamic Republic of
736 Iran Islamic Republic of
737 Iran Islamic Republic of
742 Iran Isla

### 2. Data Source: [US Dept of Transportation](https://data.transportation.gov/Aviation/International_Report_Passengers/xgub-n9bw)
This data only provides passenger information on international US flights.

- Year, month/datetime
- US airport and foreign airport IDs
- Number of passengers

In [468]:
response = requests.get('https://data.transportation.gov/resource/xgub-n9bw.json')
airports_df = pd.DataFrame(response.json())
airports_df.head()

Unnamed: 0,data_dte,year,month,usg_apt_id,usg_apt,usg_wac,fg_apt_id,fg_apt,fg_wac,airlineid,carrier,carriergroup,type,scheduled,charter,total
0,2019-06-01T00:00:00.000,2019,6,12478,JFK,22,12972,LHR,493,19682,VS,0,Passengers,85790,0,85790
1,2019-06-01T00:00:00.000,2019,6,12478,JFK,22,10920,CDG,427,19532,AF,0,Passengers,85324,0,85324
2,2019-06-01T00:00:00.000,2019,6,12953,LGA,22,16271,YYZ,936,19531,AC,0,Passengers,82389,0,82389
3,2019-06-01T00:00:00.000,2019,6,12478,JFK,22,12972,LHR,493,19540,BA,0,Passengers,79975,0,79975
4,2019-06-01T00:00:00.000,2019,6,12892,LAX,91,16271,YYZ,936,19531,AC,0,Passengers,73172,0,73172


In [470]:
print('Number of US Airports: ', len(airports_df.usg_apt.unique()))
print('Number of Foreign Airports: ', len(airports_df.fg_apt.unique()))

Number of US Airports:  52
Number of Foreign Airports:  132


In [569]:
airports_df.year.value_counts()

2019    1000
Name: year, dtype: int64

In [570]:
airports_df.data_dte.value_counts()

2019-06-01T00:00:00.000    1000
Name: data_dte, dtype: int64

In [571]:
airports_df.type.value_counts()

Passengers    1000
Name: type, dtype: int64

In [572]:
airports_df.scheduled = airports_df.scheduled.astype(int)
airports_df.total = airports_df.total.astype(int)

for i in range (0, len(airports_df)):
    if (airports_df.scheduled[i] == airports_df.total[i]):
        pass
    elif (airports_df.scheduled[i] != airports_df.total[i]):
        print(i, (airports_df.scheduled[i] - airports_df.total[i]))

16 -108
465 -53
521 -58


Take total passengers instead of scheduled passengers since scheduled passengers differ for at least three flights.

### 3. Data Source: [International Health Regulations SPAR Index](http://apps.who.int/gho/data/view.main.IHRSPARCTRYALLv?lang=en)
WHO International Health Regulations State Parties Self-Assessment Annual Reporting index, a combination of indicators (country-level). The latest data is available for 2018. Index values range from 0-100.

- Country
- Legislation and Financing
- IHR Coordination and National IHR Focal Point Functions
- Zoonotic Events and the Human-animal Interface
- Food Safety
- Laboratory
- Surveillance
- Human Resources
- National Health Emergency Framework
- Health Service Provision
- Risk Communication
- Points of Entry
- Chemical Events
- Radiation Emergencies

In [598]:
spar_df = pd.read_csv('../xmart.csv')
spar_df.head()

Unnamed: 0,Country,Legislation and Financing; 2018,IHR Coordination and National IHR Focal Point Functions; 2018,Zoonotic Events and the Human-animal Interface; 2018,Food Safety; 2018,Laboratory; 2018,Surveillance; 2018,Human Resources; 2018,National Health Emergency Framework; 2018,Health Service Provision; 2018,Risk Communication; 2018,Points of Entry; 2018,Chemical Events; 2018,Radiation Emergencies; 2018
0,Afghanistan,13,60,80,20,40,80,40,27,40,20,10,0,20
1,Albania,No data,No data,No data,No data,No data,No data,No data,No data,No data,No data,No data,No data,No data
2,Algeria,100,90,80,80,60,80,80,53,93,60,60,100,100
3,Andorra,47,50,60,80,67,20,20,0,53,60,20,0,0
4,Angola,47,80,80,20,60,90,60,60,47,80,40,40,60


In [599]:
int_cols = list(spar_df.columns)
int_cols.remove('Country')

for col in int_cols:
    for index, val in enumerate(spar_df[col]):
        if val == 'No data':
            spar_df[col][index] = np.nan
        elif val != 'No data':
            spar_df[col][index] = int(val)

spar_df['Index'] = df.apply(lambda _: 0, axis=1)

for index in range(0, len(spar_df)):
    spar_df['Index'][index] = sum(spar_df.iloc[index][int_cols])/len(int_cols)
    
spar_df2 = spar_df[['Country', 'Index']]

# impute mean into 13 country entries
spar_df2.Index.fillna(spar_df2.Index.mean(), inplace= True)

spar_df2.Index.isna().value_counts()

False    194
Name: Index, dtype: int64

### 4. Data Source: [Johns Hopkins University](https://github.com/CSSEGISandData/COVID-19/blob/master/csse_covid_19_data/csse_covid_19_time_series/time_series_19-covid-Confirmed.csv)
Sourcing daily cumulative confirmed cases, recoveries, and deaths per location (Jan 21 - present) from:
- World Health Organization (WHO): https://www.who.int/
- DXY.cn. Pneumonia. 2020. http://3g.dxy.cn/newh5/view/pneumonia.
- BNO News: https://bnonews.com/index.php/2020/02/the-latest-coronavirus-cases/
- National Health Commission of the People’s Republic of China (NHC):
http://www.nhc.gov.cn/xcs/yqtb/list_gzbd.shtml
- China CDC (CCDC): http://weekly.chinacdc.cn/news/TrackingtheEpidemic.htm
- Hong Kong Department of Health: https://www.chp.gov.hk/en/features/102465.html
- Macau Government: https://www.ssm.gov.mo/portal/
- Taiwan CDC: https://sites.google.com/cdc.gov.tw/2019ncov/taiwan?authuser=0
- US CDC: https://www.cdc.gov/coronavirus/2019-ncov/index.html
- Government of Canada: https://www.canada.ca/en/public-health/services/diseases/coronavirus.html
- Australia Government Department of Health: https://www.health.gov.au/news/coronavirus-update-at-a-glance
- European Centre for Disease Prevention and Control (ECDC): https://www.ecdc.europa.eu/en/geographical-distribution-2019-ncov-cases
- Ministry of Health Singapore (MOH): https://www.moh.gov.sg/covid-19

In [391]:
jhu_df = pd.read_csv('../COVID-19/csse_covid_19_data/csse_covid_19_time_series/time_series_19-covid-Confirmed.csv')
jhu_df.head()

Unnamed: 0,Province/State,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,1/28/20,1/29/20,1/30/20,1/31/20,2/1/20,2/2/20,2/3/20,2/4/20,2/5/20,2/6/20,2/7/20,2/8/20,2/9/20,2/10/20,2/11/20,2/12/20,2/13/20,2/14/20,2/15/20,2/16/20,2/17/20,2/18/20,2/19/20
0,Anhui,Mainland China,31.82571,117.2264,1,9,15,39,60,70,106,152,200,237,297,340,408,480,530,591,665,733,779,830,860,889,910,934,950,962,973,982,986
1,Beijing,Mainland China,40.18238,116.4142,14,22,36,41,68,80,91,111,114,139,168,191,212,228,253,274,297,315,326,337,342,352,366,372,375,380,381,387,393
2,Chongqing,Mainland China,30.05718,107.874,6,9,27,57,75,110,132,147,182,211,247,300,337,366,389,411,426,428,468,486,505,518,529,537,544,551,553,555,560
3,Fujian,Mainland China,26.07783,117.9895,1,5,10,18,35,59,80,84,101,120,144,159,179,194,205,215,224,239,250,261,267,272,279,281,285,287,290,292,293
4,Gansu,Mainland China,36.0611,103.8343,0,2,2,4,7,14,19,24,26,29,40,51,55,57,62,62,67,79,83,83,86,87,90,90,90,90,91,91,91


In [392]:
jhu_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 76 entries, 0 to 75
Data columns (total 33 columns):
Province/State    53 non-null object
Country/Region    76 non-null object
Lat               76 non-null float64
Long              76 non-null float64
1/22/20           76 non-null int64
1/23/20           76 non-null int64
1/24/20           76 non-null int64
1/25/20           76 non-null int64
1/26/20           76 non-null int64
1/27/20           76 non-null int64
1/28/20           76 non-null int64
1/29/20           76 non-null int64
1/30/20           76 non-null int64
1/31/20           76 non-null int64
2/1/20            76 non-null int64
2/2/20            76 non-null int64
2/3/20            76 non-null int64
2/4/20            76 non-null int64
2/5/20            76 non-null int64
2/6/20            76 non-null int64
2/7/20            76 non-null int64
2/8/20            76 non-null int64
2/9/20            76 non-null int64
2/10/20           76 non-null int64
2/11/20           76 non-nu

### 5. Data Sources: 
### [UN, source = World Bank](http://data.un.org/Data.aspx?q=GDP+per+capita&d=WDI&f=Indicator_Code%3aNY.GDP.PCAP.PP.CD)

- GDP per capita for 2018, 230 country records

### [London Datastore, source = UN](https://data.london.gov.uk/dataset/global-city-population-estimates)

- City populations over 300k time series (1950-2030, 5 yr) with latitude/longitude

In [506]:
# GDP per capita, PPP (current international $)
gdp_df = pd.read_csv('../UNdata_Export_20200221_065525496.csv')
gdp_df.head()

Unnamed: 0,Country or Area,Year,Value,Value Footnotes
0,Afghanistan,2018,1955.006208,
1,Albania,2018,13364.155397,
2,Algeria,2018,15481.78762,
3,Angola,2018,6452.355165,
4,Antigua and Barbuda,2018,26868.133524,


In [507]:
gdp_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 230 entries, 0 to 229
Data columns (total 4 columns):
Country or Area    230 non-null object
Year               230 non-null int64
Value              230 non-null float64
Value Footnotes    0 non-null float64
dtypes: float64(2), int64(1), object(1)
memory usage: 7.3+ KB


In [508]:
city_pop_df = pd.read_excel('../global-city-population-estimates.xls', sheet_name=1)
city_pop_df.head()

Unnamed: 0,Country Code,Country or area,City Code,Urban Agglomeration,Note,Latitude,Longitude,1950,1955,1960,1965,1970,1975,1980,1985,1990,1995,2000,2005,2010,2015,2020,2025,2030
0,392,Japan,21671,Tokyo,325.0,35.6895,139.69171,11274.641,13712.679,16678.821,20284.371,23297.503,26614.733,28548.512,30303.794,32530.003,33586.573,34449.908,35621.544,36833.979,38001.018,38323.229,37875.951,37190.489
1,356,India,21228,Delhi,318.0,28.66667,77.21667,1369.369,1781.624,2282.962,2845.042,3530.693,4425.964,5558.481,7325.185,9725.885,12407.372,15732.304,18670.494,21935.142,25703.168,29347.622,32726.564,36060.1
2,156,China,20656,Shanghai,202.0,31.22222,121.45806,4300.942,5846.383,6819.634,6428.131,6036.492,5626.64,5966.171,6846.765,7823.028,10449.535,13958.981,16763.455,19979.977,23740.778,27137.316,29441.569,30750.671
3,76,Brazil,20287,São Paulo,,-23.5475,-46.63611,2334.038,3043.828,3969.759,5494.15,7620.49,9614.016,12089.454,13394.815,14775.84,15913.473,17014.078,18288.134,19659.808,21066.245,22118.9,22899.066,23444.363
4,356,India,21206,Mumbai (Bombay),,19.073975,72.880838,2857.359,3432.147,4060.373,4853.571,5811.304,7081.96,8657.886,10390.912,12436.423,14309.845,16366.787,17890.736,19421.983,21042.538,22838.483,25207.329,27796.555


In [593]:
city_pop_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1692 entries, 0 to 1691
Data columns (total 24 columns):
Country Code           1692 non-null int64
Country or area        1692 non-null object
City Code              1692 non-null int64
Urban Agglomeration    1692 non-null object
Note                   550 non-null float64
Latitude               1692 non-null float64
Longitude              1692 non-null float64
1950                   1692 non-null float64
1955                   1692 non-null float64
1960                   1692 non-null float64
1965                   1692 non-null float64
1970                   1692 non-null float64
1975                   1692 non-null float64
1980                   1692 non-null float64
1985                   1692 non-null float64
1990                   1692 non-null float64
1995                   1692 non-null float64
2000                   1692 non-null float64
2005                   1692 non-null float64
2010                   1692 non-null float64


### 5. Data Source: World Bank

[Doctors](https://data.worldbank.org/indicator/SH.MED.PHYS.ZS) and [hospital beds per 1000 people/country](https://data.worldbank.org/indicator/SH.MED.BEDS.ZS)


In [614]:
doctors_df = pd.read_csv('../API_SH.MED.PHYS.ZS_DS2_en_csv_v2_713164/API_SH.MED.PHYS.ZS_DS2_en_csv_v2_713164.csv', 
                         skiprows=4)
doctors_df.head()

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,1966,1967,1968,1969,1970,1971,1972,1973,1974,1975,1976,1977,1978,1979,1980,1981,1982,1983,1984,1985,1986,1987,1988,1989,1990,1991,1992,1993,1994,1995,1996,1997,1998,1999,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,Unnamed: 64
0,Aruba,ABW,"Physicians (per 1,000 people)",SH.MED.PHYS.ZS,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.12,,,,,,,,,,,,,,,,,,,,,,,,,
1,Afghanistan,AFG,"Physicians (per 1,000 people)",SH.MED.PHYS.ZS,0.035,,,,,0.063,,,,,0.065,,,,,,,,,,,0.077,,,,,0.183,0.179,,0.129,0.109,,,0.143,,,,0.11,,,,0.1957,,,,,0.163,0.1774,0.1771,0.2156,0.2396,0.2553,0.245,0.2894,0.3039,0.2907,0.284,,,,
2,Angola,AGO,"Physicians (per 1,000 people)",SH.MED.PHYS.ZS,0.067,,,,,0.076,,,,,0.116,,,,,,,,,,,,,,0.059,,,,,,0.042,,,,,,,0.0584,,,,,,,0.0618,,,,,0.1311,,,,,,,,0.2149,,,
3,Albania,ALB,"Physicians (per 1,000 people)",SH.MED.PHYS.ZS,0.276,,,,,0.481,,,,,0.739,0.911,0.845,,,,,1.036,,,1.367,,,1.415,1.407,1.406,,,,1.403,1.374,1.47,1.65,1.425,,1.306,1.354,1.295,1.289,1.282,1.389,,1.305,,,,,1.146,,1.144,1.2379,1.2225,1.2658,1.2706,,,1.1998,,,,
4,Andorra,AND,"Physicians (per 1,000 people)",SH.MED.PHYS.ZS,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2.231,,2.435,2.47,2.594,2.549,2.594,,3.3333,,,3.64,3.716,,3.112,4.0,,,,,3.3333,,,,,


In [615]:
doctors_df['latest_doctors_per_1000'] = doctors_df.ffill(axis=1).iloc[:, -1] 

for index, val in enumerate(doctors_df.latest_doctors_per_1000):
    if val == 'SH.MED.PHYS.ZS':
        doctors_df.latest_doctors_per_1000[index] = np.nan
    else:
        doctors_df.latest_doctors_per_1000[index] = float(val)

# impute median into 11 country entries
doctors_df.latest_doctors_per_1000.fillna(doctors_df.latest_doctors_per_1000.median(), inplace = True)

doctors_df.latest_doctors_per_1000.isna().value_counts()

False    264
Name: latest_doctors_per_1000, dtype: int64

In [616]:
hosp_beds_df = pd.read_csv('../API_SH.MED.BEDS.ZS_DS2_en_csv_v2_717551/API_SH.MED.BEDS.ZS_DS2_en_csv_v2_717551.csv', 
                           skiprows=4)
hosp_beds_df.head()

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,1966,1967,1968,1969,1970,1971,1972,1973,1974,1975,1976,1977,1978,1979,1980,1981,1982,1983,1984,1985,1986,1987,1988,1989,1990,1991,1992,1993,1994,1995,1996,1997,1998,1999,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,Unnamed: 64
0,Aruba,ABW,"Hospital beds (per 1,000 people)",SH.MED.BEDS.ZS,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,Afghanistan,AFG,"Hospital beds (per 1,000 people)",SH.MED.BEDS.ZS,0.170627,,,,,,,,,,0.199,,,,,,,,,,,0.2756,,,,,,0.3091,,,0.2498,,,,,,,,,,0.3,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.5,0.5,0.5,0.5,,,,,
2,Angola,AGO,"Hospital beds (per 1,000 people)",SH.MED.BEDS.ZS,2.061462,,,,,,,,,,2.721,,,,,,,,,,,,,,,,,,,,1.2913,,,,,,,,,,,,,,,0.8,,,,,,,,,,,,,,,
3,Albania,ALB,"Hospital beds (per 1,000 people)",SH.MED.BEDS.ZS,5.102676,,,,,,,,,,,,,,,,,,,,4.2717,4.1871,4.1607,4.0862,4.139,4.1388,4.0697,3.97,3.9355,4.1321,4.0249,3.9987,4.0134,3.8314,3.0171,3.19,3.14,3.05,3.05,3.03,3.3,3.3,3.1,3.1,3.0,3.1,3.1,3.1,,2.8,3.0,2.6,2.9,2.9,,,,,,,
4,Andorra,AND,"Hospital beds (per 1,000 people)",SH.MED.BEDS.ZS,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,3.18,3.0,3.09,3.2,3.2,2.59,,3.3,,2.7,2.6,2.6,,2.5,,,,,,,,,,,


In [617]:
hosp_beds_df['latest_hosp_beds_per_1000'] = hosp_beds_df.ffill(axis=1).iloc[:, -1] 

for index, val in enumerate(hosp_beds_df.latest_hosp_beds_per_1000):
    if val == 'SH.MED.BEDS.ZS':
        hosp_beds_df.latest_hosp_beds_per_1000[index] = np.nan
    else:
        hosp_beds_df.latest_hosp_beds_per_1000[index] = float(val)

# impute median into 18 country entries
hosp_beds_df.latest_hosp_beds_per_1000.fillna(hosp_beds_df.latest_hosp_beds_per_1000.median(), inplace = True)

hosp_beds_df.latest_hosp_beds_per_1000.isna().value_counts()

False    264
Name: latest_hosp_beds_per_1000, dtype: int64

### 6. Begin merging country-level data

- SPAR index
- GDP per capita
- Physicians per 1000 people
- Hospital beds per 100 people

In [620]:
health_df = hosp_beds_df[['Country Name', 
                          'latest_hosp_beds_per_1000']].merge(doctors_df[['Country Name',
                                                                        'latest_doctors_per_1000']], 
                                                             how = 'inner', 
                                                             on = 'Country Name')

In [None]:
gdp_df.rename(columns={'Country or Area': 'Country Name', 'Value': 'GDP_per_capita'}, inplace = True)

country_stats_df = health_df.merge(gdp_df[['Country Name', 'GDP_per_capita']], how = 'outer', on = 'Country Name')

spar_df2.rename(columns = {'Country': 'Country Name', 'Index': 'SPAR_index'}, inplace = True)
country_stats_df = country_stats_df.merge(spar_df2, how = 'outer', on = 'Country Name')

# impute median SPAR index for 112 country entries
country_stats_df.SPAR_index.fillna(country_stats_df.SPAR_index.median(), inplace = True)

# impute median GDP into 51 country entries
country_stats_df.GDP_per_capita.fillna(country_stats_df.GDP_per_capita.median(), inplace = True)

# impute median hosp beds and physicians into 17 country entries
country_stats_df.latest_hosp_beds_per_1000.fillna(country_stats_df.latest_hosp_beds_per_1000.median(), inplace = True)
country_stats_df.latest_doctors_per_1000.fillna(country_stats_df.latest_doctors_per_1000.median(), inplace = True)

country_stats_df.info()

In [688]:
# Rename country to normalized convention in rocs_df, spar_df2, jhu_df, gdp_df, city_pop_df, health_df, IVDI_df

search_results = []

for index, val in enumerate(rocs_df.country_name):
    spec = pycountry.countries.search_fuzzy(val)
    search_results.append(spec)

print(len(rocs_df.country_name), len(search_results))

LookupError: tanzania united republic of

### 7. Data Source: [Rand Corporation](https://www.rand.org/pubs/research_reports/RR1605.html)

Infectious Disease Vulnerability Index (IDVI) takes data from various international organizations to determine scores per each of seven domains as well as an overall score. The domain categories are:
- Demographic
- Health Care
- Public Health
- Disease Dynamics
- Political-Domestic
- Political-International
- Economic

In [669]:
file = '../RAND_RR1605.pdf'
tables = tabula.read_pdf(file, pages = 'all', multiple_tables = True)

Got stderr: Feb 21, 2020 3:18:32 PM org.apache.pdfbox.pdmodel.font.PDSimpleFont toUnicode
Feb 21, 2020 3:18:32 PM org.apache.pdfbox.pdmodel.font.PDSimpleFont toUnicode



In [657]:
tabula.convert_into(file, "../Rand_IVDI.csv", pages = 'all')

Got stderr: Feb 21, 2020 3:08:27 PM org.apache.pdfbox.pdmodel.font.PDSimpleFont toUnicode
Feb 21, 2020 3:08:27 PM org.apache.pdfbox.pdmodel.font.PDSimpleFont toUnicode



In [680]:
IVDI_df = pd.read_csv('../Rand_IVDI.csv')
# IVDI_df.rename(columns = {'Unnamed: 2': 'Country Name', 'Normed Score':'Rand_IVDI'}, inplace = True)
IVDI_df = IVDI_df[[' Country', ' Overall Score']]
IVDI_df.rename(columns = {' Country': 'Country Name', ' Overall Score': 'IVDF_score'}, inplace = True)
IVDI_df.head()

Unnamed: 0,Country Name,IVDF_score
0,Somalia,0.0
1,Central African Republic,6.1e-05
2,Chad,0.09845
3,South Sudan,0.100836
4,Mauritania,0.107294


In [685]:
print(len(IVDI_df), len(country_stats_df))
IVDI_df.tail()
# country_stats_df = IVDI_df.merge(country_stats_df, how = 'outer', on = 'Country Name')
# country_stats_df

195 306


Unnamed: 0,Country Name,IVDF_score
190,Sweden,0.955625
191,Germany,0.96689
192,Finland,0.968274
193,Canada,0.9734
194,Norway,1.0
