# Project 1 

## Group One: Covid 19 | Part One: Data Cleansing 

Team Members: Michael Bien, Lupita Lopez, Jinah Porter, Debra Wu

### Research Questions to Answer:

1. WORLD: Compare COVID deaths by GDP by Country: Does a higher GDP result in a lower number of COVID deaths?

2. US: Examine death rates by age group and determine if children under 10 die at a lower rate and adults 60+ at a higher rate

3. US: Compare cases/deaths by state

4. US: Is pre-COVID state GDP related to the cases/deaths in a given state?

5. March shutdown, 4th of July and Memorial Day: compare 2 months of shutdown vs. reopening: how did reopening affect case and death rates? 

6. US: Is there a relationship between COVID cases/deaths and gender?

7. US: Is there a relationship between COVID cases/deaths and ethnicity?


In [62]:
#Import necessary libraries
import pandas as pd
import os
import requests
import numpy as np
import requests
from pprint import pprint
import json

Import World Bank data by Country; source: https://databank.worldbank.org/reports.aspx?source=2&series=NY.GDP.MKTP.CD&country=#

In [85]:
# Load the World Bank GDP extract (world_bank_data.csv) from ../source_data
file_path = os.path.join("..", "source_data", "world_bank_data.csv")
world_bank_df = pd.read_csv(file_path)
# A missing 2018 GDP is written as ".." in this file. Keep only the rows that
# have a value: a country without a GDP figure cannot contribute to the
# GDP-based analysis.
has_2018_gdp = world_bank_df["2018 [YR2018]"].ne("..")
world_bank_df = world_bank_df.loc[has_2018_gdp]
world_bank_df.head()

Unnamed: 0,Country Name,Country Code,2018 [YR2018]
0,Afghanistan,AFG,19484384937.0
1,Albania,ALB,15147020535.0
2,Algeria,DZA,174000000000.0
3,American Samoa,ASM,636000000.0
4,Andorra,AND,3218316013.0


Johns Hopkins Country Data with iso: https://documenter.getpostman.com/view/10724784/SzYXWz3x?version=latest
Johns Hopkins Case Data by Country: https://documenter.getpostman.com/view/5352730/SzYbyxR5?version=latest

In [86]:
# Fetch the Johns Hopkins country list, which carries a 3-character country code
regions_response = requests.get("https://covid-api.com/api/regions")
JH_countries = regions_response.json()
# Keep only the country name and its code, ordered alphabetically by name
JH_countries_list_df = (
    pd.DataFrame(JH_countries["data"])[["name", "iso"]]
    .sort_values(by="name", ascending=True)
)

In [87]:
# Add placeholder columns for deaths, confirmed and recovered. The empty string
# marks "no data returned"; rows still holding it can be filtered out afterwards.
JH_countries_list_df["JH Deaths"] = ""
JH_countries_list_df["JH Confirmed"] = ""
JH_countries_list_df["JH Recovered"] = ""
# Collects the names of countries whose lookup produced no usable case data
JH_country_no_data = []

# Johns Hopkins case-totals endpoint; the country name is appended to the query string
base_url = "https://covid19-stats-api.herokuapp.com/api/v1/cases?country="
for index, row in JH_countries_list_df.iterrows():
    country_lookup = row["name"]
    request_url = base_url + country_lookup
    try:
        # The timeout keeps one unresponsive request from hanging the whole run
        response = requests.get(request_url, timeout=30).json()
    except (requests.exceptions.RequestException, ValueError) as error:
        # Network failure or a body that is not JSON: record the country and
        # continue with the next one instead of aborting the loop
        print(f"<<Request failed for {country_lookup}: {error}")
        JH_country_no_data.append(country_lookup)
        continue

    try:
        # Read all three figures before writing anything, so a payload that is
        # missing one of them never leaves a half-filled row behind
        confirm_JH = response["confirmed"]
        death_JH = response["deaths"]
        recovered_JH = response["recovered"]
    except (KeyError, TypeError):
        # KeyError: a field is absent; TypeError: the payload is not a JSON object
        print(f"<<Data not found for {country_lookup}")
        JH_country_no_data.append(country_lookup)
        continue

    # Write through .loc on the frame itself. The row yielded by iterrows() may
    # be a copy, so assigning to it is not guaranteed to update the DataFrame.
    JH_countries_list_df.loc[index, "JH Confirmed"] = confirm_JH
    JH_countries_list_df.loc[index, "JH Deaths"] = death_JH
    JH_countries_list_df.loc[index, "JH Recovered"] = recovered_JH
    print(f"Processing country | {country_lookup} | {response}")

Processing country | Afghanistan | {'confirmed': 38815, 'deaths': 1426, 'recovered': 32098}
Processing country | Albania | {'confirmed': 11672, 'deaths': 340, 'recovered': 6668}
Processing country | Algeria | {'confirmed': 48734, 'deaths': 1632, 'recovered': 34385}
Processing country | Andorra | {'confirmed': 1438, 'deaths': 53, 'recovered': 945}
Processing country | Angola | {'confirmed': 3569, 'deaths': 139, 'recovered': 1332}
Processing country | Antigua and Barbuda | {'confirmed': 95, 'deaths': 3, 'recovered': 91}
Processing country | Argentina | {'confirmed': 577338, 'deaths': 11852, 'recovered': 438883}
Processing country | Armenia | {'confirmed': 46119, 'deaths': 920, 'recovered': 41941}
<<Data not found for Aruba
Processing country | Australia | {'confirmed': 26778, 'deaths': 824, 'recovered': 23650}
Processing country | Austria | {'confirmed': 34305, 'deaths': 757, 'recovered': 27354}
Processing country | Azerbaijan | {'confirmed': 38517, 'deaths': 566, 'recovered': 35998}
Pro

In [66]:
# List the countries that were recorded as having no case data during the lookup
print(JH_country_no_data)

['Aruba', 'Cayman Islands', 'Channel Islands', 'Cruise Ship', 'Curacao', 'Faroe Islands', 'French Guiana', 'Gibraltar', 'Greenland', 'Guadeloupe', 'Guam', 'Guernsey', 'Jersey', 'Macao SAR', 'Martinique', 'Mayotte', 'Others', 'Puerto Rico', 'Reunion', 'Saint Barthelemy', 'Saint Martin', 'Taipei and environs']


In [92]:
# Keep only the countries that received case data, using the following logic:
# 1. The list of countries and the case data were obtained independently from
#    two different Johns Hopkins APIs
# 2. A country whose "JH Deaths" is still the empty placeholder got no case data
#    back, which indicates its cases are not being tracked, so it can be dropped
has_case_data = JH_countries_list_df["JH Deaths"].ne("")
JH_countries_list_df = JH_countries_list_df.loc[has_case_data]
JH_countries_list_df.head()

Unnamed: 0,name,Country Code,JH Deaths,JH Confirmed,JH Recovered
37,Afghanistan,AFG,1426,38815,32098
106,Albania,ALB,340,11672,6668
41,Algeria,DZA,1632,48734,34385
70,Andorra,AND,53,1438,945
178,Angola,AGO,139,3569,1332


In [89]:
# Merge the World Bank and Johns Hopkins frames on the country code.
# First give the Johns Hopkins "iso" column the same name as the World Bank key.
JH_countries_list_df.rename(columns={"iso": "Country Code"}, inplace=True)
# An outer merge keeps rows that are present in only one of the two sources
WB_JH_merge_df = world_bank_df.merge(JH_countries_list_df, how="outer", on="Country Code")
# Rows with no World Bank country name had no World Bank match
missing_wb_name = WB_JH_merge_df["Country Name"].isna()
cities_not_on_WB = WB_JH_merge_df.loc[missing_wb_name]
# Rows with no Johns Hopkins name had no Johns Hopkins match
missing_jh_name = WB_JH_merge_df["name"].isna()
cities_not_on_JH = WB_JH_merge_df.loc[missing_jh_name]
cities_not_on_JH.head(50)

Unnamed: 0,Country Name,Country Code,2018 [YR2018],name,JH Deaths,JH Confirmed,JH Recovered
3,American Samoa,ASM,636000000.0,,,,
33,Cayman Islands,CYM,5485419417.0,,,,
46,Curacao,CUW,3127908045.0,,,,
69,Greenland,GRL,3051626390.0,,,,
71,Guam,GUM,5920000000.0,,,,
78,"Hong Kong SAR, China",HKG,362000000000.0,,,,
92,Kiribati,KIR,196737896.0,,,,
94,Kosovo,XKX,7942961738.0,,,,
105,"Macao SAR, China",MAC,55084050790.0,,,,
112,Marshall Islands,MHL,221278000.0,,,,


In [8]:
# Display the merged rows that have no World Bank country name
cities_not_on_WB
# TODO: drop name: Others, Cruise Ship, Diamond Princess

Unnamed: 0,Country Name,Country Code,2018 [YR2018],name,JH Deaths,JH Confirmed,JH Recovered
264,,GGY-JEY,,Channel Islands,,,
265,,cruise,,Cruise Ship,,,
266,,NA-SHIP-DP,,Diamond Princess,13.0,712.0,651.0
267,,GUF,,French Guiana,,,
268,,GLP,,Guadeloupe,,,
269,,GGY,,Guernsey,,,
270,,VAT,,Holy See,0.0,12.0,12.0
271,,JEY,,Jersey,,,
272,,RKS,,Kosovo,488.0,12683.0,8788.0
273,,NA-SHIP-MSZ,,MS Zaandam,4.0,18.0,0.0


In [57]:
# Read every HTML table from the Wikipedia population-density page
wiki_tables = pd.read_html("https://en.wikipedia.org/wiki/List_of_countries_and_dependencies_by_population_density")
# The table of interest is the first one in the returned list
population_df = wiki_tables[0]
# Flatten the double header by joining both header levels with "_"
population_df.columns = ["_".join(levels) for levels in population_df.columns]
# Drop the rows that repeat the header as a double footer
is_footer_row = population_df["Rank_Rank"].eq("Rank")
population_df = population_df.loc[~is_footer_row]
# Replace the flattened header labels with readable column names
readable_names = {
    "Rank_Rank": "Rank",
    "Country (or dependent territory)_Country (or dependent territory)": "Country",
    "Area_km2": "Area (km2)",
    "Area_mi2": "Area (mi2)",
    "Population_Population": "Population",
    "Density_pop./km2": "Population Density (km2)",
    "Density_pop./mi2": "Population Density (mi2)",
}
population_df = population_df.rename(columns=readable_names)
# Limit the frame to the columns needed downstream
needed_columns = ["Country", "Area (mi2)", "Population", "Population Density (mi2)"]
population_df = population_df[needed_columns]
population_df

Unnamed: 0,Country,Area (mi2),Population,Population Density (mi2)
0,Macau (China),13,696100,54799
1,Monaco,0.78,38300,49107
2,Singapore,279,5703600,20446
3,Hong Kong (China),427,7500700,17565
4,Gibraltar (United Kingdom),2.6,33701,12836
...,...,...,...,...
248,Mongolia,603902,3238479,5.4
249,Pitcairn Islands (United Kingdom),18,56,3.1
250,Falkland Islands (United Kingdom),4700,2563,0.54
251,Svalbard and Jan Mayen (Norway),23706,2655,0.10
