# Data Exploration and Cleanup

In [2]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import requests
import json
import seaborn as sns
import os
import datetime
from pprint import pprint
import plotly.express as px
import warnings
warnings.filterwarnings("ignore")


In [3]:
parent_folder = 'historical_data'
csv_list = [f for f in os.listdir(parent_folder) if '.csv' in f]
df_list = []
for csv in csv_list:
    df = pd.read_csv(os.path.join(parent_folder, csv))
    df_list.append(df)
master_df = pd.concat(df_list, ignore_index=True)

In [4]:
master_df.head()

Unnamed: 0,Date,Country,City,Specie,count,min,max,median,variance
0,6/01/2015,KR,Jeonju,co,124,0.1,12.3,4.5,55.74
1,22/01/2015,KR,Jeonju,co,116,4.5,10.0,6.7,16.09
2,30/03/2015,KR,Jeonju,co,118,1.2,11.2,5.6,35.98
3,27/05/2015,KR,Jeonju,co,93,2.3,5.6,3.4,6.54
4,3/02/2015,KR,Jeonju,co,133,4.5,13.4,7.8,39.24


# * Data taken from kaggle.
https://www.kaggle.com/imdevskp/corona-virus-report

In [27]:
# Read in Covid Data
covid_df = pd.read_csv('covid_19_clean_complete.csv', parse_dates=['Date'])
# Replace NaN values by 0
covid_df.fillna(0, inplace=True)
# Remove columns
covid_df = covid_df.drop(['Lat','Long','Province/State'], axis=1)
# Rename the columns so that they make sense
covid_df.rename (columns = {'Country/Region':'Country'}, inplace = True)
#change date time format
covid_df['Date'] = covid_df['Date'].dt.strftime(('%d/%m/%Y'))
#print(covid_df.shape)
covid_df.head()


Unnamed: 0,Country,Date,Confirmed,Deaths,Recovered,Active,WHO Region
0,Afghanistan,22/01/2020,0,0,0,0,Eastern Mediterranean
1,Albania,22/01/2020,0,0,0,0,Europe
2,Algeria,22/01/2020,0,0,0,0,Africa
3,Andorra,22/01/2020,0,0,0,0,Europe
4,Angola,22/01/2020,0,0,0,0,Africa


In [26]:
#Find current active cases
covid_df['active'] = covid_df['Confirmed'] - covid_df['Deaths'] - covid_df['Recovered']
top = covid_df[covid_df['Date'] == covid_df['Date'].max()]

# Group datas by Countryworld = top.groupby('Country')['Confirmed','active','Deaths'].sum()
world = top.groupby('Country')['Confirmed','active','Deaths'].sum().reset_index()

#Dispaly first five countries
print(covid_df.shape)
world.head()

(49068, 8)


Unnamed: 0,Country,Confirmed,active,Deaths
0,Afghanistan,15205,13620,257
1,Albania,1137,232,33
2,Algeria,9394,2993,653
3,Andorra,764,19,51
4,Angola,86,64,4


In [28]:
#plot world map per 10k active infections
figure = px.choropleth(world, locations="Country", locationmode='country names' 
                       , color="active",hover_name="Country",range_color=[1,10000],
                      color_continuous_scale="Peach",title="Active Cases in World")
figure.show()

In [21]:
#Calculate totals 
world_numbers = []
total_deaths = [] 
total_recovered = [] 
recovery_rate = [] 
mortality_rate = []

confirmed_sum = covid_df.Confirmed.sum()
recovered_sum = covid_df.Recovered.sum()
death_sum = covid_df.Deaths.sum()
world_numbers.append(confirmed_sum)
total_recovered.append(recovered_sum)
total_deaths.append(death_sum)
recovery_rate.append(recovered_sum/confirmed_sum)
mortality_rate.append(death_sum/confirmed_sum)
mean_mortality_rate = np.mean(mortality_rate)
mean_recovery_rate = np.mean(recovery_rate)

In [22]:
#Print Output
print ('Total Confirmed Cases = '+ str(confirmed_sum))
print ('Total Recovered Cases = '+ str(recovered_sum))
print ('Total Deaths Cases = '+ str(death_sum))
print ('Total Recovery Rate(%) = '+ str(mean_recovery_rate*100))
print ('Total Fatality Rate(%) = '+ str(mean_mortality_rate*100))

Total Confirmed Cases= 828508482
Total Recovered Cases= 388408229
Total Deaths Cases= 43384903
Total Recovery Rate(%)= 46.88041672939686
Total Fatality Rate(%)= 5.23650680018023


In [20]:
#Group by top 20 Countries and status of COVID infections 
top = top.groupby('Country').sum()
top_active = top.sort_values('active', ascending=False).reset_index()
top_active.head(20).style.background_gradient(cmap='Reds')

Unnamed: 0,Country,Confirmed,Deaths,Recovered,Active,active
0,US,4290259,148011,1325804,2816444,2816444
1,Brazil,2442375,87618,1846641,508116,508116
2,India,1480073,33408,951166,495499,495499
3,United Kingdom,301708,45844,1437,254427,254427
4,Russia,816680,13334,602249,201097,201097
5,South Africa,452529,7067,274925,170537,170537
6,Colombia,257101,8777,131161,117163,117163
7,France,220352,30212,81212,108928,108928
8,Canada,116458,8944,0,107514,107514
9,Peru,389717,18418,272547,98752,98752


In [24]:
# Display an overview of the Specie column
master_df["Specie"].unique()

array(['co', 'pm10', 'o3', 'so2', 'no2', 'pm25', 'psi', 'uvi', 'neph',
       'aqi', 'mepaqi', 'pol', 'temperature', 'humidity', 'pressure',
       'wd', 'wind-speed', 'd', 'pm1', 'wind-gust', 'precipitation',
       'dew', 'wind speed', 'wind gust'], dtype=object)

In [25]:
# Display an overview of the number of rows each Specie has in the dataframe
master_df["Specie"].value_counts()

no2              565392
pm25             552899
pm10             547363
o3               541019
so2              469871
co               429370
temperature      338991
humidity         338874
pressure         336973
wind-speed       327923
dew              230768
wind-gust        214730
wd                30465
precipitation     30222
aqi               19417
uvi               14001
wind speed         4517
pol                4243
wind gust          2875
d                  2031
pm1                1560
neph               1537
mepaqi             1298
psi                 363
Name: Specie, dtype: int64

In [26]:
species_to_remove = ["temperature", "humidity", "pressure", "wind-speed", "dew", "wind-gust",
                     "wind speed", "wind gust", "precipitation", "wd", "aqi", "uvi", "pol", "pm1", "mepaqi", "neph"]

short_airdf = master_df[~master_df["Specie"].isin(species_to_remove)].reset_index(drop=True).copy()

In [27]:
short_airdf.head()

Unnamed: 0,Date,Country,City,Specie,count,min,max,median,variance
0,6/01/2015,KR,Jeonju,co,124,0.1,12.3,4.5,55.74
1,22/01/2015,KR,Jeonju,co,116,4.5,10.0,6.7,16.09
2,30/03/2015,KR,Jeonju,co,118,1.2,11.2,5.6,35.98
3,27/05/2015,KR,Jeonju,co,93,2.3,5.6,3.4,6.54
4,3/02/2015,KR,Jeonju,co,133,4.5,13.4,7.8,39.24


In [None]:
# Overview of the remaining air pollutant species in focus
short_airdf["Specie"].value_counts()

In [None]:
short_airdf.info()

In [None]:
short_airdf["Date"] = pd.to_datetime(short_airdf["Date"], format="%d/%m/%Y")

In [None]:
# Double check the Date data type
short_airdf.info()

In [None]:
# Find the earliest date the air quality dataset covers:
short_airdf["Date"].min()

In [None]:
# Find the latest date the air quality dataset covers:
short_airdf["Date"].max()

In [None]:
# Because we only focus on air data from 2019 to 2020H1, we'll remove the few data points on 2018-12-31 and the first days of July 2020.
clean_airdf = short_airdf[(short_airdf["Date"] >= pd.to_datetime("2019-01-01")) & (short_airdf["Date"] <= pd.to_datetime("2020-07-01"))].copy()
clean_airdf.head()

In [None]:
# Set the starting date for the air quality analysis
starting_date = clean_airdf["Date"].min()
starting_date

In [None]:
# Set the cutoff date for the air quality analysis
end_date = clean_airdf["Date"].max()
end_date

In [None]:
# Display an overview of the Country column
clean_airdf["Country"].unique()

In [None]:
clean_airdf["Country"].nunique()

In [None]:
# Display an overview of the City column
clean_airdf["City"].unique()

In [None]:
clean_airdf["City"].nunique()

In [None]:
clean_airdf.loc[clean_airdf["Country"] == "AU", "City"].value_counts()