# Data Exploration and Cleanup

In [8]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import requests
import json
import seaborn as sns
import os
import datetime
from pprint import pprint
import plotly.express as px
import warnings
warnings.filterwarnings("ignore")


In [9]:
# Combine every CSV file found in the historical-data folder into one frame.
parent_folder = 'historical_data'
# os.listdir() returns names in arbitrary order, so sort them to keep the row
# order of master_df reproducible between runs. endswith() avoids picking up
# stray files that merely contain ".csv" in their name (e.g. "x.csv.bak").
csv_list = sorted(f for f in os.listdir(parent_folder) if f.endswith('.csv'))
if not csv_list:
    # pd.concat() would fail on an empty list with a less helpful message.
    raise ValueError(f"No .csv files found in {parent_folder!r}")
df_list = []
for csv in csv_list:
    df = pd.read_csv(os.path.join(parent_folder, csv))
    df_list.append(df)
# ignore_index=True renumbers the rows 0..n-1 across all files.
master_df = pd.concat(df_list, ignore_index=True)

In [10]:
# Preview the first rows of the combined historical-data frame
master_df.head()

Unnamed: 0,Date,Country,City,Specie,count,min,max,median,variance
0,6/01/2015,KR,Jeonju,co,124,0.1,12.3,4.5,55.74
1,22/01/2015,KR,Jeonju,co,116,4.5,10.0,6.7,16.09
2,30/03/2015,KR,Jeonju,co,118,1.2,11.2,5.6,35.98
3,27/05/2015,KR,Jeonju,co,93,2.3,5.6,3.4,6.54
4,3/02/2015,KR,Jeonju,co,133,4.5,13.4,7.8,39.24


* COVID-19 case data taken from Kaggle:
https://www.kaggle.com/imdevskp/corona-virus-report

In [13]:
# Load the COVID-19 case file (Date parsed as datetimes) and tidy it in one
# chain: replace NaN with 0 in every column, drop the Lat, Long and
# Province/State columns, and shorten 'Country/Region' to 'Country'.
covid_df = (
    pd.read_csv('covid_19_clean_complete.csv', parse_dates=['Date'])
    .fillna(0)
    .drop(columns=['Lat', 'Long', 'Province/State'])
    .rename(columns={'Country/Region': 'Country'})
)
covid_df.head()

Unnamed: 0,Country,Date,Confirmed,Deaths,Recovered,Active,WHO Region
0,Afghanistan,2020-01-22,0,0,0,0,Eastern Mediterranean
1,Albania,2020-01-22,0,0,0,0,Europe
2,Algeria,2020-01-22,0,0,0,0,Africa
3,Andorra,2020-01-22,0,0,0,0,Europe
4,Angola,2020-01-22,0,0,0,0,Africa


In [15]:
# Group data by Country
# Active cases = confirmed cases minus deaths and recoveries.
# NOTE(review): the loaded data already appears to carry an 'Active' column;
# this lowercase 'active' is recomputed here and is the one the map cell uses.
covid_df['active'] = covid_df['Confirmed'] - covid_df['Deaths'] - covid_df['Recovered']
# Keep only the rows for the most recent date in the dataset.
top = covid_df[covid_df['Date'] == covid_df['Date'].max()]

# Select the columns with a list: indexing a GroupBy with a bare tuple of
# names was deprecated in pandas 1.0 and is no longer supported in pandas 2.x.
world = top.groupby('Country')[['Confirmed', 'active', 'Deaths']].sum().reset_index()
world.head()

Unnamed: 0,Country,Confirmed,active,Deaths
0,Afghanistan,36263,9796,1269
1,Albania,4880,1991,144
2,Algeria,27973,7973,1163
3,Andorra,907,52,52
4,Angola,950,667,41


In [16]:
# Plot the per-country active-case totals on a world map.
# Countries are matched by the names in the "Country" column.
# NOTE(review): the colour range is fixed at 1-1000, while the preview of
# `world` shows countries with several thousand active cases, which fall
# outside that range. Confirm this is intended.
figure = px.choropleth(
    world,
    locations="Country",
    locationmode='country names',
    color="active",
    hover_name="Country",
    range_color=[1, 1000],
    color_continuous_scale="Peach",
    title="Active Cases in World",
)
figure.show()

In [None]:
# List the distinct values found in the Specie column
master_df["Specie"].unique()

In [None]:
# Count how many rows each Specie value has in the dataframe
master_df["Specie"].value_counts()

In [None]:
# Specie values to exclude from the air-quality analysis.
species_to_remove = [
    "temperature",
    "humidity",
    "pressure",
    "wind-speed",
    "dew",
    "wind-gust",
    "wind speed",
    "wind gust",
    "precipitation",
    "wd",
    "aqi",
    "uvi",
    "pol",
    "pm1",
    "mepaqi",
    "neph",
]

# Keep only the rows whose Specie is not in the exclusion list, renumber the
# index from 0, and take an independent copy of the result.
short_airdf = (
    master_df
    .loc[lambda frame: ~frame["Specie"].isin(species_to_remove)]
    .reset_index(drop=True)
    .copy()
)

In [None]:
# Preview the first rows of the filtered air-quality frame
short_airdf.head()

In [None]:
# Row counts per Specie value remaining after the exclusion filter
short_airdf["Specie"].value_counts()

In [None]:
# Inspect column dtypes and non-null counts before converting the Date column
short_airdf.info()

In [None]:
# Convert the Date column to datetimes, reading the values as day/month/year
short_airdf["Date"] = pd.to_datetime(short_airdf["Date"], format="%d/%m/%Y")

In [None]:
# Re-run info() to double check that Date now has a datetime dtype
short_airdf.info()

In [None]:
# Find the earliest date the air quality dataset covers
# (computed before any date-range filtering is applied)
short_airdf["Date"].min()

In [None]:
# Find the latest date the air quality dataset covers
# (computed before any date-range filtering is applied)
short_airdf["Date"].max()

In [None]:
# We only analyse air data from 2019 through 2020 H1, so drop the few data
# points on 2018-12-31 and everything dated July 2020 or later.
# The upper bound is exclusive so that 1 July 2020 itself, which is not part
# of H1, is removed as well.
clean_airdf = short_airdf[
    (short_airdf["Date"] >= pd.Timestamp("2019-01-01"))
    & (short_airdf["Date"] < pd.Timestamp("2020-07-01"))
].copy()
clean_airdf.head()

In [None]:
# Set the starting date for the air quality analysis:
# the earliest Date remaining in clean_airdf
starting_date = clean_airdf["Date"].min()
starting_date

In [None]:
# Set the cutoff date for the air quality analysis:
# the latest Date remaining in clean_airdf
end_date = clean_airdf["Date"].max()
end_date

In [None]:
# List the distinct values found in the Country column
clean_airdf["Country"].unique()

In [None]:
# Number of distinct values in the Country column
clean_airdf["Country"].nunique()

In [None]:
# List the distinct values found in the City column
clean_airdf["City"].unique()

In [None]:
# Number of distinct values in the City column
clean_airdf["City"].nunique()

In [None]:
# Row counts per City for the rows whose Country value is "AU"
# (presumably the country code for Australia; confirm against the dataset docs)
clean_airdf.loc[clean_airdf["Country"] == "AU", "City"].value_counts()