# Project 1 

## Group One: Covid 19 | Part One: Data Cleansing 

Team Members: Michael Bien, Lupita Lopez, Jinah Porter, Debra Wu

### Research Questions to Answer:

1. WORLD: Compare COVID deaths by GDP by Country: Does a higher GDP result in a lower number of COVID deaths?

2. US: Examine death rates by age group and determine is children under 10 die at a lower rate, adults 60+ at a higher rate

3. US: Compare cases/deaths by state

4. US: Does pre-COVID state GDP related to the cases/deaths in a given state

5. March shutdown, 4th of July and Memorial Day: compare 2 months of shutdown vs. reopening: how did reopening affect case and death rates? 

6. US: Is there a relationship between COVID cases/deaths and gender?

7. US: Is there a relationship between COVID cases/deaths and ethnicity?


In [42]:
#Import necessary libraries
import pandas as pd
import os
import requests
import numpy as np
import requests
from pprint import pprint
import json

Import World Bank data by Country; source: https://databank.worldbank.org/reports.aspx?source=2&series=NY.GDP.MKTP.CD&country=#

In [43]:
# Import world_bank_data.csv
file_path = os.path.join("..","source_data","world_bank_data.csv")
world_bank_df = pd.read_csv(file_path)
print(world_bank_df.shape)
world_bank_df.head()

(264, 3)


Unnamed: 0,Country Name,Country Code,2018 [YR2018]
0,Afghanistan,AFG,19484384937.0
1,Albania,ALB,15147020535.0
2,Algeria,DZA,174000000000.0
3,American Samoa,ASM,636000000.0
4,Andorra,AND,3218316013.0


Johns Hopkins Country Data with iso: https://documenter.getpostman.com/view/10724784/SzYXWz3x?version=latest

In [45]:
# Grab Johns Hopkins country list with 3 character country code
JH_countries = requests.get("https://covid-api.com/api/regions").json()
JH_countries_list_df = pd.DataFrame(JH_countries["data"])
JH_countries_list_df = JH_countries_list_df[["name","iso"]]
JH_countries_list_df = JH_countries_list_df.sort_values(by='name', ascending=True)

In [46]:
# create empty columns for deaths, confirmed, recovered
JH_countries_list_df["JH Deaths"] = ""
JH_countries_list_df["JH Confirmed"] = ""
JH_countries_list_df["JH Recovered"] = ""
# create empty list to gather country lookuop errors
JH_country_no_data = []

# John Hopkins base url
base_url = "https://covid19-stats-api.herokuapp.com/api/v1/cases?country="
for index,row in JH_countries_list_df.iterrows():
    country_lookup = row["name"]
    request_url = base_url + country_lookup
    response = requests.get(request_url).json()
    try:
        confirm_JH = response["confirmed"]
        row["JH Confirmed"] = confirm_JH
        death_JH = response["deaths"]
        row["JH Deaths"] = death_JH
        recovered_JH = response["recovered"]
        row["JH Recovered"] = recovered_JH
        print(f"Processing country | {country_lookup} | {response}")
        
    except KeyError:
        print(f"<<Data not found for {country_lookup}")
        JH_country_no_data.append(country_lookup)
        pass

Processing country | Afghanistan | {'confirmed': 38716, 'deaths': 1420, 'recovered': 31638}
Processing country | Albania | {'confirmed': 11353, 'deaths': 334, 'recovered': 6569}
Processing country | Algeria | {'confirmed': 48254, 'deaths': 1612, 'recovered': 34037}
Processing country | Andorra | {'confirmed': 1344, 'deaths': 53, 'recovered': 943}
Processing country | Angola | {'confirmed': 3388, 'deaths': 134, 'recovered': 1301}
Processing country | Antigua and Barbuda | {'confirmed': 95, 'deaths': 3, 'recovered': 91}
Processing country | Argentina | {'confirmed': 555537, 'deaths': 11352, 'recovered': 419513}
Processing country | Armenia | {'confirmed': 45862, 'deaths': 916, 'recovered': 41659}
<<Data not found for Aruba
Processing country | Australia | {'confirmed': 26692, 'deaths': 816, 'recovered': 23463}
Processing country | Austria | {'confirmed': 33159, 'deaths': 756, 'recovered': 26760}
Processing country | Azerbaijan | {'confirmed': 38327, 'deaths': 562, 'recovered': 35756}
Pro

In [48]:
print(JH_country_no_data)

['Aruba', 'Cayman Islands', 'Channel Islands', 'Cruise Ship', 'Curacao', 'Faroe Islands', 'French Guiana', 'Gibraltar', 'Greenland', 'Guadeloupe', 'Guam', 'Guernsey', 'Jersey', 'Macao SAR', 'Martinique', 'Mayotte', 'Others', 'Puerto Rico', 'Reunion', 'Saint Barthelemy', 'Saint Martin', 'Taipei and environs']


In [49]:
#filter dataframe to find rows with blank JH Deaths
JH_no_data_df = JH_countries_list_df[JH_countries_list_df["JH Deaths"] == ""]
JH_no_data_df

Unnamed: 0,name,iso,JH Deaths,JH Confirmed,JH Recovered
132,Aruba,ABW,,,
126,Cayman Islands,CYM,,,
112,Channel Islands,GGY-JEY,,,
115,Cruise Ship,cruise,,,
141,Curacao,CUW,,,
81,Faroe Islands,FRO,,,
98,French Guiana,GUF,,,
82,Gibraltar,GIB,,,
159,Greenland,GRL,,,
127,Guadeloupe,GLP,,,


In [52]:
# merge World Bank and Johns Hopkins dataframes on iso
#rename columns for merge
JH_countries_list_df.rename(columns={"iso":"Country Code"}, inplace=True)
WB_JH_merge_df = pd.merge(world_bank_df,JH_countries_list_df,on="Country Code",how="outer")
# examine blank country names
cities_not_on_WB = WB_JH_merge_df[WB_JH_merge_df["Country Name"].isnull()]
cities_not_on_JH = WB_JH_merge_df[WB_JH_merge_df["name"].isnull()]
cities_not_on_JH

Unnamed: 0,Country Name,Country Code,2018 [YR2018],name,JH Deaths,JH Confirmed,JH Recovered
3,American Samoa,ASM,636000000,,,,
21,Bermuda,BMU,..,,,,
27,British Virgin Islands,VGB,..,,,,
39,Channel Islands,CHI,..,,,,
69,French Polynesia,PYF,..,,,,
...,...,...,...,...,...,...,...
259,Sub-Saharan Africa,SSF,1.71E+12,,,,
260,Sub-Saharan Africa (excluding high income),SSA,1.70E+12,,,,
261,Sub-Saharan Africa (IDA & IBRD countries),TSS,1.71E+12,,,,
262,Upper middle income,UMC,2.53E+13,,,,


In [53]:
cities_not_on_WB

Unnamed: 0,Country Name,Country Code,2018 [YR2018],name,JH Deaths,JH Confirmed,JH Recovered
264,,GGY-JEY,,Channel Islands,,,
265,,cruise,,Cruise Ship,,,
266,,NA-SHIP-DP,,Diamond Princess,13.0,712.0,651.0
267,,GUF,,French Guiana,,,
268,,GLP,,Guadeloupe,,,
269,,GGY,,Guernsey,,,
270,,VAT,,Holy See,0.0,12.0,12.0
271,,JEY,,Jersey,,,
272,,RKS,,Kosovo,488.0,12683.0,8788.0
273,,NA-SHIP-MSZ,,MS Zaandam,4.0,18.0,0.0


In [87]:
test_df = pd.read_html("https://en.wikipedia.org/wiki/List_of_countries_and_dependencies_by_population_density")
len(test_df)

3

In [88]:
test_df[0]

Unnamed: 0_level_0,Rank,Country (or dependent territory),Area,Area,Population,Density,Density,Date,Population source
Unnamed: 0_level_1,Rank,Country (or dependent territory),km2,mi2,Population,pop./km2,pop./mi2,Date,Population source
0,–,Macau (China),32.9,13,696100,21158.05,54799,"September 30, 2019",Official quarterly estimate
1,1,Monaco,2.02,0.78,38300,18960.4,49107,"December 31, 2018",Official estimate
2,2,Singapore,722.5,279,5703600,7894.26,20446,"July 1, 2019",Official estimate
3,–,Hong Kong (China),1106,427,7500700,6781.83,17565,"December 31, 2019",Official estimate
4,–,Gibraltar (United Kingdom),6.8,2.6,33701,4956.03,12836,"July 1, 2019",UN projection
...,...,...,...,...,...,...,...,...,...
250,–,Falkland Islands (United Kingdom),12173,4700,2563,0.21,0.54,"April 15, 2012",2012 census result
251,–,Svalbard and Jan Mayen (Norway),61399,23706,2655,0.04,0.10,"September 1, 2012",Official estimate
252,–,Greenland (Denmark),2166000,836297,55877,0.03,0.08,"January 1, 2018",Official estimate
253,Rank,Country (or dependent territory),km2,mi2,Population,pop./km2,pop./mi2,Date,Population source


In [89]:
test_df = test_df[0]

In [93]:
#test_df.rename(columns={("Rank","Rank"): "Rank"})
.columns =

Unnamed: 0_level_0,"R,a,n,k","C,o,u,n,t,r,y, ,(,o,r, ,d,e,p,e,n,d,e,n,t, ,t,e,r,r,i,t,o,r,y,)","A,r,e,a","A,r,e,a","P,o,p,u,l,a,t,i,o,n","D,e,n,s,i,t,y","D,e,n,s,i,t,y","D,a,t,e","P,o,p,u,l,a,t,i,o,n, ,s,o,u,r,c,e"
Unnamed: 0_level_1,"R,a,n,k","C,o,u,n,t,r,y, ,(,o,r, ,d,e,p,e,n,d,e,n,t, ,t,e,r,r,i,t,o,r,y,)","k,m,2","m,i,2","P,o,p,u,l,a,t,i,o,n","p,o,p,.,/,k,m,2","p,o,p,.,/,m,i,2","D,a,t,e","P,o,p,u,l,a,t,i,o,n, ,s,o,u,r,c,e"
0,–,Macau (China),32.9,13,696100,21158.05,54799,"September 30, 2019",Official quarterly estimate
1,1,Monaco,2.02,0.78,38300,18960.4,49107,"December 31, 2018",Official estimate
2,2,Singapore,722.5,279,5703600,7894.26,20446,"July 1, 2019",Official estimate
3,–,Hong Kong (China),1106,427,7500700,6781.83,17565,"December 31, 2019",Official estimate
4,–,Gibraltar (United Kingdom),6.8,2.6,33701,4956.03,12836,"July 1, 2019",UN projection
...,...,...,...,...,...,...,...,...,...
250,–,Falkland Islands (United Kingdom),12173,4700,2563,0.21,0.54,"April 15, 2012",2012 census result
251,–,Svalbard and Jan Mayen (Norway),61399,23706,2655,0.04,0.10,"September 1, 2012",Official estimate
252,–,Greenland (Denmark),2166000,836297,55877,0.03,0.08,"January 1, 2018",Official estimate
253,Rank,Country (or dependent territory),km2,mi2,Population,pop./km2,pop./mi2,Date,Population source


In [95]:
test_df.columns = ["_".join(col) for col in test_df.columns]

In [96]:
test_df

Unnamed: 0,Rank_Rank,Country (or dependent territory)_Country (or dependent territory),Area_km2,Area_mi2,Population_Population,Density_pop./km2,Density_pop./mi2,Date_Date,Population source_Population source
0,–,Macau (China),32.9,13,696100,21158.05,54799,"September 30, 2019",Official quarterly estimate
1,1,Monaco,2.02,0.78,38300,18960.4,49107,"December 31, 2018",Official estimate
2,2,Singapore,722.5,279,5703600,7894.26,20446,"July 1, 2019",Official estimate
3,–,Hong Kong (China),1106,427,7500700,6781.83,17565,"December 31, 2019",Official estimate
4,–,Gibraltar (United Kingdom),6.8,2.6,33701,4956.03,12836,"July 1, 2019",UN projection
...,...,...,...,...,...,...,...,...,...
250,–,Falkland Islands (United Kingdom),12173,4700,2563,0.21,0.54,"April 15, 2012",2012 census result
251,–,Svalbard and Jan Mayen (Norway),61399,23706,2655,0.04,0.10,"September 1, 2012",Official estimate
252,–,Greenland (Denmark),2166000,836297,55877,0.03,0.08,"January 1, 2018",Official estimate
253,Rank,Country (or dependent territory),km2,mi2,Population,pop./km2,pop./mi2,Date,Population source
