# Project 1 

## Group One: Covid 19 | Part One: Data Cleansing 

Team Members: Michael Bien, Lupita Lopez, Jinah Porter, Debra Wu

### Research Questions to Answer:

1. WORLD: Compare COVID deaths by GDP by Country: Does a higher GDP result in a lower number of COVID deaths?

2. US: Examine death rates by age group and determine is children under 10 die at a lower rate, adults 60+ at a higher rate

3. US: Compare cases/deaths by state

4. US: Does pre-COVID state GDP related to the cases/deaths in a given state

5. March shutdown, 4th of July and Memorial Day: compare 2 months of shutdown vs. reopening: how did reopening affect case and death rates? 

6. US: Is there a relationship between COVID cases/deaths and gender?

7. US: Is there a relationship between COVID cases/deaths and ethnicity?


In [173]:
#Import necessary libraries
import pandas as pd
import os
import requests
import numpy as np
import requests
from pprint import pprint
import json

Import World Bank data by Country; source: https://databank.worldbank.org/reports.aspx?source=2&series=NY.GDP.MKTP.CD&country=#

In [174]:
# Import world_bank_data.csv
file_path = os.path.join("..","source_data","world_bank_data.csv")
world_bank_df = pd.read_csv(file_path)
# Drop countries where the 2018 GDP is null, this is indicated by ".."
#Logic: Countries with no GDP data do not contribute to the analysis by GDP
world_bank_df = world_bank_df[world_bank_df["2018 [YR2018]"] != ".."]
world_bank_df.tail(45)

Unnamed: 0,Country Name,Country Code,2018 [YR2018]
216,Zimbabwe,ZWE,24311560500.0
217,Arab World,ARB,2770000000000.0
218,Caribbean small states,CSS,73459161489.0
219,Central Europe and the Baltics,CEB,1640000000000.0
220,Early-demographic dividend,EAR,11600000000000.0
221,East Asia & Pacific,EAS,26300000000000.0
222,East Asia & Pacific (excluding high income),EAP,16600000000000.0
223,East Asia & Pacific (IDA & IBRD countries),TEA,16600000000000.0
224,Euro area,EMU,13700000000000.0
225,Europe & Central Asia,ECS,23100000000000.0


In [175]:
# dropping rows by index number range. These rows represent aggregations of the country data and are not useful in the analysis
world_bank_df.drop(world_bank_df.loc[217:263].index, inplace=True)
world_bank_df.tail(5)

Unnamed: 0,Country Name,Country Code,2018 [YR2018]
211,Vietnam,VNM,245000000000.0
213,West Bank and Gaza,PSE,14615900000.0
214,"Yemen, Rep.",YEM,27591261663.0
215,Zambia,ZMB,27005238896.0
216,Zimbabwe,ZWE,24311560500.0


Johns Hopkins Country Data with iso: https://documenter.getpostman.com/view/10724784/SzYXWz3x?version=latest
Johns Hopkins Case Data by Country: https://documenter.getpostman.com/view/5352730/SzYbyxR5?version=latest

In [176]:
# Grab Johns Hopkins country list with 3 character country code
JH_countries = requests.get("https://covid-api.com/api/regions").json()
JH_countries_list_df = pd.DataFrame(JH_countries["data"])
JH_countries_list_df = JH_countries_list_df[["name","iso"]]
JH_countries_list_df = JH_countries_list_df.sort_values(by='name', ascending=True)
JH_countries_list_df.head()

Unnamed: 0,name,iso
37,Afghanistan,AFG
106,Albania,ALB
41,Algeria,DZA
70,Andorra,AND
178,Angola,AGO


In [258]:
# create empty columns for deaths, confirmed, recovered
JH_countries_list_df["JH Deaths"] = ""
JH_countries_list_df["JH Confirmed"] = ""
JH_countries_list_df["JH Recovered"] = ""
# create empty list to gather country lookuop errors
JH_country_no_data = []

# John Hopkins base url
base_url = "https://covid19-stats-api.herokuapp.com/api/v1/cases?country="
for index,row in JH_countries_list_df.iterrows():
    country_lookup = row["name"]
    request_url = base_url + country_lookup
    response = requests.get(request_url).json()
    try:
        confirm_JH = response["confirmed"]
        row["JH Confirmed"] = confirm_JH
        death_JH = response["deaths"]
        row["JH Deaths"] = death_JH
        recovered_JH = response["recovered"]
        row["JH Recovered"] = recovered_JH
        print(f"Processing country | {country_lookup} | {response}")
        
    except KeyError:
        print(f"<<Data not found for {country_lookup}")
        JH_country_no_data.append(country_lookup)
        pass
print("** API call complete**")

Processing country | Afghanistan | {'confirmed': 38815, 'deaths': 1426, 'recovered': 32098}
Processing country | Albania | {'confirmed': 11672, 'deaths': 340, 'recovered': 6668}
Processing country | Algeria | {'confirmed': 48734, 'deaths': 1632, 'recovered': 34385}
Processing country | Andorra | {'confirmed': 1438, 'deaths': 53, 'recovered': 945}
Processing country | Angola | {'confirmed': 3569, 'deaths': 139, 'recovered': 1332}
Processing country | Antigua and Barbuda | {'confirmed': 95, 'deaths': 3, 'recovered': 91}
Processing country | Argentina | {'confirmed': 577338, 'deaths': 11852, 'recovered': 438883}
Processing country | Armenia | {'confirmed': 46119, 'deaths': 920, 'recovered': 41941}
Processing country | Australia | {'confirmed': 26778, 'deaths': 824, 'recovered': 23650}
Processing country | Austria | {'confirmed': 34305, 'deaths': 757, 'recovered': 27354}
Processing country | Azerbaijan | {'confirmed': 38517, 'deaths': 566, 'recovered': 35998}
Processing country | Bahamas |

In [257]:
print(JH_country_no_data)

[]


In [284]:
# Drop rows with zero death using the following logic:
# 1. The list of countries and case data were obtained independently from two different Johns Hopkins apis
# 2. Instance where the Johns Hopkins country returns a null for Johns Hopkins cases indicates that the countries with null cases can be droppped as the case data is not being tracked
JH_countries_list_df = JH_countries_list_df[JH_countries_list_df["JH Deaths"] != ""]
JH_countries_list_df.shape

(188, 5)

In [285]:
# merge World Bank and Johns Hopkins dataframes on iso
#rename columns for merge
JH_countries_list_df.rename(columns={"iso":"Country Code"}, inplace=True)
WB_JH_merge_df = pd.merge(world_bank_df,JH_countries_list_df,on="Country Code",how="outer")
WB_JH_merge_df.head()

Unnamed: 0,Country Name,Country Code,2018 [YR2018],name,JH Deaths,JH Confirmed,JH Recovered
0,Afghanistan,AFG,19484384937.0,Afghanistan,1426.0,38815.0,32098.0
1,Albania,ALB,15147020535.0,Albania,340.0,11672.0,6668.0
2,Algeria,DZA,174000000000.0,Algeria,1632.0,48734.0,34385.0
3,American Samoa,ASM,636000000.0,,,,
4,Andorra,AND,3218316013.0,Andorra,53.0,1438.0,945.0


In [286]:
# capture rows where there is no JH data
JH_no_data = WB_JH_merge_df[WB_JH_merge_df["JH Deaths"].isnull()]
JH_no_data

Unnamed: 0,Country Name,Country Code,2018 [YR2018],name,JH Deaths,JH Confirmed,JH Recovered
3,American Samoa,ASM,636000000.0,,,,
33,Cayman Islands,CYM,5485419417.0,,,,
46,Curacao,CUW,3127908045.0,,,,
69,Greenland,GRL,3051626390.0,,,,
71,Guam,GUM,5920000000.0,,,,
78,"Hong Kong SAR, China",HKG,362000000000.0,,,,
92,Kiribati,KIR,196737896.0,,,,
94,Kosovo,XKX,7942961738.0,,,,
105,"Macao SAR, China",MAC,55084050790.0,,,,
112,Marshall Islands,MHL,221278000.0,,,,


In [347]:
# drop nulls where there is no JH data - logic:
#This analysis is of case data:
# 1.coutries without case data do not contribute to the analysis 
# 2. The dataset was joined on the iso3, which provides an exact match between datasets
WB_JH_merge_df = WB_JH_merge_df[WB_JH_merge_df["JH Deaths"].notnull()]
# Replace blank Country Names with name
WB_JH_merge_df["Country Name"].fillna(WB_JH_merge_df["name"], inplace=True)
# Fix minor Country Name value issues
WB_JH_merge_df["Country Name"]= WB_JH_merge_df["Country Name"].replace("Taiwan*", "Taiwan")
WB_JH_merge_df["Country Name"]= WB_JH_merge_df["Country Name"].replace("Korea, Rep.", "South Korea")
WB_JH_merge_df["Country Name"]= WB_JH_merge_df["Country Name"].replace("Gambia, The", "Gambia")
WB_JH_merge_df["Country Name"]= WB_JH_merge_df["Country Name"].replace("St. Lucia", "Saint Lucia")
WB_JH_merge_df["Country Name"]= WB_JH_merge_df["Country Name"].replace("Egypt, Arab Rep.", "Egypt")
WB_JH_merge_df["Country Name"]= WB_JH_merge_df["Country Name"].replace("Yemen, Rep.", "Yemen")
WB_JH_merge_df["Country Name"]= WB_JH_merge_df["Country Name"].replace("Congo, Dem. Rep.", "Democratic Republic of the Congo")
WB_JH_merge_df["Country Name"]= WB_JH_merge_df["Country Name"].replace("Congo, Rep.", "Republic of the Congo")
WB_JH_merge_df["Country Name"]= WB_JH_merge_df["Country Name"].replace("Kyrgyz Republic", "Kyrgyzstan")
WB_JH_merge_df["Country Name"]= WB_JH_merge_df["Country Name"].replace("Bahamas, The", "The Bahamas")
WB_JH_merge_df["Country Name"]= WB_JH_merge_df["Country Name"].replace("Congo, Rep.", "Republic of the Congo")
WB_JH_merge_df.head()

Unnamed: 0,Country Name,Country Code,2018 [YR2018],name,JH Deaths,JH Confirmed,JH Recovered
0,Afghanistan,AFG,19484384937.0,Afghanistan,1426,38815,32098
1,Albania,ALB,15147020535.0,Albania,340,11672,6668
2,Algeria,DZA,174000000000.0,Algeria,1632,48734,34385
4,Andorra,AND,3218316013.0,Andorra,53,1438,945
5,Angola,AGO,101000000000.0,Angola,139,3569,1332


In [348]:
# export merged dataframe to csv
output_path = os.path.join("..","source_data","temp.csv")
WB_JH_merge_df.to_csv(output_path)

In [349]:
countries_not_on_WB = WB_JH_merge_df[WB_JH_merge_df["Country Name"].isnull()]
countries_not_on_WB.shape

(0, 7)

In [359]:
# Read population table from wiki
WIKI_population_df = pd.read_html("https://en.wikipedia.org/wiki/List_of_countries_and_dependencies_by_population_density")
# Return the first table in the list of tables
WIKI_population_df = WIKI_population_df[0]
# use join to rename double header of table
WIKI_population_df.columns = ["_".join(col) for col in WIKI_population_df.columns]
# remove rows that represent the double footer
WIKI_population_df = WIKI_population_df[WIKI_population_df["Rank_Rank"] != "Rank"]
# rename columns
WIKI_population_df.rename(columns={"Rank_Rank": "Rank", "Country (or dependent territory)_Country (or dependent territory)": "Country Name","Area_km2": "Area (km2)","Area_mi2": "Area (mi2)","Population_Population": "Population"}, inplace=True)
WIKI_population_df.rename(columns={"Density_pop./km2": "Population Density (km2)","Density_pop./mi2": "Population Density (mi2)"}, inplace=True)
# limit the df to the necessary columns
WIKI_population_df = WIKI_population_df[["Country Name", "Area (mi2)" , "Population", "Population Density (mi2)"]]
WIKI_population_df
# normalize WIKI_population Country Name values
WIKI_population_df["Country Name"]= WIKI_population_df["Country Name"].replace("Vatican City[note 1]", "Holy See")
WIKI_population_df["Country Name"]= WIKI_population_df["Country Name"].replace("Kosovo[note 2]", "Kosovo")
WIKI_population_df["Country Name"]= WIKI_population_df["Country Name"].replace("Slovakia", "Slovak Republic")
WIKI_population_df["Country Name"]= WIKI_population_df["Country Name"].replace("Saint Vincent and the Grenadines", "St. Vincent and the Grenadines")
WIKI_population_df["Country Name"]= WIKI_population_df["Country Name"].replace("Saint Kitts and Nevis", "St. Kitts and Nevis")
WIKI_population_df["Country Name"]= WIKI_population_df["Country Name"].replace("São Tomé and Príncipe", "Sao Tome and Principe")
WIKI_population_df["Country Name"]= WIKI_population_df["Country Name"].replace("Northern Cyprus[note 4]", "Cyprus")
WIKI_population_df["Country Name"]= WIKI_population_df["Country Name"].replace("Ivory Coast", "Cote d'Ivoire")
WIKI_population_df["Country Name"]= WIKI_population_df["Country Name"].replace("Ukraine [note 5]", "Ukraine")
WIKI_population_df["Country Name"]= WIKI_population_df["Country Name"].replace("Eswatini (Swaziland)", "Eswatini")
WIKI_population_df["Country Name"]= WIKI_population_df["Country Name"].replace("Brunei", "Brunei Darussalam")
WIKI_population_df["Country Name"]= WIKI_population_df["Country Name"].replace("Bahamas", "The Bahamas")
WIKI_population_df["Country Name"]= WIKI_population_df["Country Name"].replace("Uruguay[note 7][clarification needed]", "Uruguay")
WIKI_population_df["Country Name"]= WIKI_population_df["Country Name"].replace("Somaliland[note 8]", "Somalia")
WIKI_population_df["Country Name"]= WIKI_population_df["Country Name"].replace("Russia[note 11]", "Russian Federation")
WIKI_population_df["Country Name"]= WIKI_population_df["Country Name"].replace("Western Sahara[note 12]", "Western Sahara")

In [360]:
# merge WB_JH_merge_df with WIKI_population_df
WB_JH_WIKI_merge_df = pd.merge(WB_JH_merge_df,WIKI_population_df,on="Country Name",how="outer")
WB_JH_WIKI_merge_df.head()

Unnamed: 0,Country Name,Country Code,2018 [YR2018],name,JH Deaths,JH Confirmed,JH Recovered,Area (mi2),Population,Population Density (mi2)
0,Afghanistan,AFG,19484384937.0,Afghanistan,1426,38815,32098,249347,31575018,127
1,Albania,ALB,15147020535.0,Albania,340,11672,6668,11082,2862427,258
2,Algeria,DZA,174000000000.0,Algeria,1632,48734,34385,919595,43000000,47
3,Andorra,AND,3218316013.0,Andorra,53,1438,945,179,76177,425
4,Angola,AGO,101000000000.0,Angola,139,3569,1332,481354,29250009,61


In [361]:
WB_JH_WIKI_NaNs = WB_JH_WIKI_merge_df[WB_JH_WIKI_merge_df["Country Code"].isnull()]
WB_JH_WIKI_NaNs.tail(50)

Unnamed: 0,Country Name,Country Code,2018 [YR2018],name,JH Deaths,JH Confirmed,JH Recovered,Area (mi2),Population,Population Density (mi2)
208,Guam (United States),,,,,,,209,175200,839.0
209,Marshall Islands,,,,,,,70,55900,800.0
210,United States Virgin Islands (United States),,,,,,,136,104909,772.0
211,American Samoa (United States),,,,,,,76,57100,751.0
212,Cayman Islands (United Kingdom),,,,,,,100,65813,658.0
213,Guadeloupe (France),,,,,,,629,390253,621.0
214,British Virgin Islands (United Kingdom),,,,,,,58,32206,552.0
215,North Korea,,,,,,,46541,25549604,549.0
216,Kiribati,,,,,,,313,125000,399.0
217,Sint Eustatius (Netherlands),,,,,,,8,3193,394.0


In [366]:
# All nulls below have been researched and name normalization has been completed
clean_data_by_country = WB_JH_WIKI_merge_df.dropna()
# Final data cleaning
del clean_data_by_country["name"]
clean_data_by_country.rename(columns={"2018 [YR2018]": "2018 GDP"}, inplace=True)
clean_data_by_country

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Unnamed: 0,Country Name,Country Code,2018 GDP,JH Deaths,JH Confirmed,JH Recovered,Area (mi2),Population,Population Density (mi2)
0,Afghanistan,AFG,19484384937,1426,38815,32098,249347,31575018,127
1,Albania,ALB,15147020535,340,11672,6668,11082,2862427,258
2,Algeria,DZA,1.74E+11,1632,48734,34385,919595,43000000,47
3,Andorra,AND,3218316013,53,1438,945,179,76177,425
4,Angola,AGO,1.01E+11,139,3569,1332,481354,29250009,61
...,...,...,...,...,...,...,...,...,...
170,Uzbekistan,UZB,50392607758,402,48429,44942,172742,32653900,189
171,Vietnam,VNM,2.45E+11,35,1063,931,127882,96208984,752
173,Yemen,YEM,27591261663,583,2016,1219,175676,28915284,165
174,Zambia,ZMB,27005238896,324,13819,12590,290585,16405229,56


https://public.opendatasoft.com/explore/dataset/countries-codes/export/

In [392]:
# Import world_bank_data.csv
file_path = os.path.join("..","source_data","countries-codes.xlsx")
country_codes = pd.read_excel(file_path)

FileNotFoundError: [Errno 2] No such file or directory: '..\\source_data\\countries-codes.xlsx'

In [385]:
print(country_codes)

    ISO3 CODE                      LABEL EN  \
0         SSD  South Sudan, The Republic of   
1         TUR                        Turkey   
2         CHN                         China   
3         LBY        Libyan Arab Jamahiriya   
4         GNB                 Guinea-Bissau   
..        ...                           ...   
242       TCA      Turks and Caicos Islands   
243       BEL                       Belgium   
244       OMN                          Oman   
245       VNM                      Viet Nam   
246       PSE      West Bank and Gaza Strip   

                                             Geo Shape  \
0                                                  NaN   
1    {"type": "MultiPolygon", "coordinates": [[[[26...   
2    {"type": "MultiPolygon", "coordinates": [[[[11...   
3    {"type": "Polygon", "coordinates": [[[19.29167...   
4    {"type": "MultiPolygon", "coordinates": [[[[-1...   
..                                                 ...   
242  {"type": "MultiPolygon",

CDC Data with Ethnicity: https://dev.socrata.com/foundry/data.cdc.gov/vbim-akqf

In [394]:
# Import CDC Data from CSV
file_path = os.path.join("..","source_data","COVID-19_Case_Surveillance_Public_Use_Data.txt")
cdc_raw_data = pd.read_csv(file_path)

FileNotFoundError: [Errno 2] File ..\source_data\COVID-19_Case_Surveillance_Public_Use_Data.txt does not exist: '..\\source_data\\COVID-19_Case_Surveillance_Public_Use_Data.txt'

In [368]:
cdc_raw_data.head()

Unnamed: 0,cdc_report_dt,pos_spec_dt,onset_dt,current_status,sex,age_group,Race and ethnicity (combined),hosp_yn,icu_yn,death_yn,medcond_yn
0,2020/03/03,2020/03/03,,Laboratory-confirmed case,Male,0 - 9 Years,Unknown,Missing,Missing,Missing,Missing
1,2020/03/03,2020/03/03,,Laboratory-confirmed case,Female,0 - 9 Years,Unknown,Missing,Missing,Missing,Missing
2,2020/04/07,2020/03/03,2020/03/03,Laboratory-confirmed case,Unknown,0 - 9 Years,Unknown,No,Missing,Missing,Missing
3,2020/08/04,2020/08/04,,Probable Case,Male,0 - 9 Years,Unknown,Missing,Missing,Missing,Missing
4,2020/07/28,2020/08/04,2020/07/28,Laboratory-confirmed case,Male,0 - 9 Years,Unknown,No,No,No,Missing


In [373]:
df.groupby(['col5','col2']).reset_index()

Unnamed: 0_level_0,cdc_report_dt,pos_spec_dt,onset_dt,current_status,sex,age_group,Race and ethnicity (combined),hosp_yn,icu_yn,medcond_yn
death_yn,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Missing,1231997,390842,300702,1231997,1231988,1231961,1231993,1231997,1231997,1231997
No,1644402,532951,1233918,1644402,1644395,1644357,1644401,1644402,1644402,1644402
Unknown,660056,354804,289467,660056,660054,660049,660055,660056,660056,660056
Yes,125870,51986,65540,125870,125870,125864,125870,125870,125870,125870
