**We start off by importing the frameworks required for our analysis** 

In [50]:
import numpy as np 
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import plotly.express as px
import warnings
# NOTE(review): this silences every warning for the whole session, including
# deprecation warnings emitted by the libraries used below -- consider
# narrowing the filter to specific categories.
warnings.filterwarnings('ignore')
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

**The dataset below records confirmed cases of the COVID-19 virus by date and location, as compiled by Johns Hopkins University in collaboration with the World Health Organisation**

In [51]:
# Read the case file from the working directory; the 'Date' column is parsed
# into datetimes at load time so it can be grouped and compared later.
df = pd.read_csv(filepath_or_buffer='covid19clean.csv', parse_dates=['Date'])

In [52]:
# Preview the first five rows to confirm the columns loaded as expected.
df.head(n=5)

Unnamed: 0,Province/State,Country/Region,Lat,Long,Date,Confirmed,Deaths,Recovered
0,Anhui,Mainland China,31.8257,117.2264,2020-01-22,1,0,0
1,Beijing,Mainland China,40.1824,116.4142,2020-01-22,14,0,0
2,Chongqing,Mainland China,30.0572,107.874,2020-01-22,6,0,0
3,Fujian,Mainland China,26.0789,117.9874,2020-01-22,1,0,0
4,Gansu,Mainland China,36.0611,103.8343,2020-01-22,0,0,0


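**It is good practice to check how many null values the dataset has. (This dataset has already been cleaned, so the count shown is zero. Note that the execution count of the cell below shows it was last run after the later cells, including the step that fills missing values.)**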

In [67]:
# Total number of missing cells: per-column counts, then summed across columns.
df.isna().sum().sum()

0

**It is helpful to separate this data into values that are more relevant to our analysis of the effect of the COVID19 virus**

In [54]:
# Outcome columns used by the analysis; 'Still Infected' is derived below.
totcases = ['Confirmed', 'Deaths', 'Recovered', 'Still Infected']

# Fill gaps BEFORE deriving anything. If the subtraction ran first, a single
# missing component would make 'Still Infected' NaN for that row, and the
# fill would then turn it into 0 instead of the value implied by the
# zero-filled inputs.
df[['Province/State']] = df[['Province/State']].fillna('NA')  # placeholder label so rows survive groupby
reported = totcases[:-1]  # the three columns that come from the source file
df[reported] = df[reported].fillna(0)

# Cases that are neither resolved as a death nor as a recovery.
df['Still Infected'] = df['Confirmed'] - df['Deaths'] - df['Recovered']

**We use temporary variables so as not to accidentally remove values from the dataset**

In [55]:
# Sum every location per reporting date. The columns are selected with a list
# (double brackets): the tuple-style form `['a', 'b']` directly after
# groupby was deprecated in pandas 1.0 and raises on pandas 2.x.
temp = df.groupby('Date')[['Confirmed', 'Deaths', 'Recovered', 'Still Infected']].sum().reset_index()
# Keep only the row for the most recent date in the data.
temp = temp[temp['Date'] == temp['Date'].max()].reset_index(drop=True)
temp

Unnamed: 0,Date,Confirmed,Deaths,Recovered,Still Infected
0,2020-03-10,119303,4290,64411,50602


**Now we melt the dataframe in order to use Date as the identifier**

In [56]:
# Reshape the single totals row to long form: one row per outcome, with the
# outcome name in 'variable' and its count in 'value' (melt's default names).
tmp = pd.melt(
    temp,
    id_vars="Date",
    value_vars=['Still Infected', 'Deaths', 'Recovered'],
)
# One tile per outcome, sized by its count.
fig = px.treemap(data_frame=tmp, path=["variable"], values="value", height=200)
fig.show()

**A cursory analysis shows that the COVID-19 virus does not seem to have a high lethality rate; however, it can be inferred that transmission rates are high**

In [57]:
# Per-location totals. Only the numeric columns are selected, and with a list:
# tuple-style selection after groupby no longer works on pandas 2.x, and
# re-selecting the grouping keys conflicts with as_index=False, which already
# returns them as ordinary columns.
# NOTE(review): this sums each location over every date in `df`. If the counts
# are cumulative per date (the totals cell reads only the latest date, which
# suggests so), these sums overstate the true totals -- confirm and consider
# using the latest record per location instead.
dat = df.groupby(['Province/State', 'Country/Region'], as_index=False)[['Confirmed', 'Recovered', 'Deaths']].sum()

**An example of querying one country, the United States**

In [58]:
# Boolean-mask selection of the rows whose country label is exactly 'US'.
dat.loc[dat['Country/Region'].eq('US')]

Unnamed: 0,Province/State,Country/Region,Confirmed,Recovered,Deaths
0,"Adams, IN",US,1,0,0
1,"Alameda County, CA",US,13,0,0
2,Alaska,US,0,0,0
5,"Arapahoe, CO",US,2,0,0
6,Arizona,US,6,1,0
...,...,...,...,...,...
339,"Wilton, CT",US,1,0,0
340,Wisconsin,US,3,1,0
341,"Worcester, MA",US,1,0,0
342,Wyoming,US,0,0,0


**A treemap can be used to show the prevalence of the virus around the world.**

In [59]:
# Constant root label column; not used by this figure's path, but kept so the
# frame has the same columns as before.
dat["all"] = "All"

# One tile per country, sized by confirmed cases (rows sharing a country are
# summed by the treemap). The colour-scale arguments were dropped: without a
# `color=` column Plotly Express has nothing to map them to, and the weighted
# average that fed the midpoint could raise when every weight is zero.
fig = px.treemap(dat, path=['Country/Region'], values='Confirmed')

# Fixed canvas size; autosize is off so it does not compete with width/height.
fig.update_layout(
    autosize=False,
    width=1000,
    height=1000)

fig.show()

**China has by far the highest incidence of the virus, followed by Italy, the Republic of Korea and Iran**

**It is helpful to see which provinces in China recorded deaths, as this enables us to better understand protective measures**

In [60]:
# Constant column that acts as the single root node of the hierarchy.
dat["all"] = "All"

# Hierarchy: All -> country -> province/state, with tiles sized by deaths.
# `color_continuous_scale` was dropped: no `color=` column is supplied, so the
# scale was never applied to the figure.
fig = px.treemap(dat, path=['all', 'Country/Region', 'Province/State'], values='Deaths')
fig.update_traces(textposition='top center')
fig.update_layout(
    autosize=False,
    width=1000,
    height=1000)

fig.show()

**Now we should look at recovery rates around the world. We do this by computing the proportion of confirmed cases that have recovered, by Country and Region**

In [61]:
# Per-location ratio, kept on `dat` exactly as before (0/0 rows are NaN).
dat['recoverpct'] = dat['Recovered']/dat['Confirmed']
dat["all"] = "All"

# The treemap sums `values` over all rows that share a country, and a sum of
# per-province ratios is not a recovery rate. Aggregate the counts per country
# first, then take one ratio per country.
country_recovery = dat.groupby('Country/Region', as_index=False)[['Confirmed', 'Recovered']].sum()
# A rate is undefined where nothing was confirmed; drop those rows instead of
# feeding NaN into the figure.
country_recovery = country_recovery[country_recovery['Confirmed'] > 0].copy()
country_recovery['recoverpct'] = country_recovery['Recovered'] / country_recovery['Confirmed']
country_recovery['all'] = 'All'  # single root node for the hierarchy

# `color_continuous_scale` is omitted: with no `color=` column it had no effect.
fig = px.treemap(country_recovery, path=['all', 'Country/Region'], values='recoverpct')

fig.update_traces(textposition='top center')
fig.update_layout(
    autosize=False,
    width=1000,
    height=1000)
fig.show()

In [62]:
# Drop the name `temp`; it is assigned again from `df` in the next code cell.
# NOTE(review): re-running this cell on its own raises NameError once `temp`
# has already been deleted.
del temp

**In order to determine the transmission rate of the virus, we need to look at confirmed cases over time**

In [63]:
# Daily worldwide totals per outcome. The columns are selected with a list:
# tuple-style selection after groupby raises on pandas 2.x.
temp = df.groupby('Date')[['Still Infected', 'Deaths', 'Recovered']].sum().reset_index()
# Long form: one row per (date, outcome) so the bar chart can stack by outcome.
temp = temp.melt(id_vars="Date", value_vars=['Still Infected', 'Deaths', 'Recovered'],
                 var_name='Case', value_name='Count')

# (A mid-cell `temp.head()` was removed: only a cell's last expression is
# displayed, so it produced no output.)
fig = px.bar(temp, x="Date", y="Count", color='Case',
             title='Confirmed COVID-19 Cases over time')

fig.show()


**Now we look at the deaths per day versus recoveries per day to determine how lethal this virus is**

In [64]:
# Totals per country and date. The columns are selected with a list:
# tuple-style selection after groupby raises on pandas 2.x.
temp = df.groupby(['Country/Region', 'Date'])[['Confirmed', 'Deaths', 'Recovered']].sum()
temp = temp.reset_index()

# Same stacked bar chart for each outcome; only the plotted column and the
# title differ. The continuous colour scale was dropped because the bars are
# coloured by country, a categorical column, so a continuous scale is not used.
for metric in ('Deaths', 'Recovered'):
    fig = px.bar(temp, x="Date", y=metric, color='Country/Region', orientation='v', height=600,
                 title=f'Confirmed COVID-19 {metric}')
    fig.show()

**It would be helpful to remove Mainland China in order to see the incidence rates of the rest of the affected countries**

In [65]:
# Filter into a new frame instead of dropping rows from `df` in place: the
# in-place drop permanently removed Mainland China from the shared frame, so
# re-running any earlier cell afterwards silently gave different results.
df_ex_china = df[df['Country/Region'] != 'Mainland China']

# Totals per country and date. The columns are selected with a list:
# tuple-style selection after groupby raises on pandas 2.x.
temp = df_ex_china.groupby(['Country/Region', 'Date'])[['Confirmed', 'Deaths', 'Recovered']].sum()
temp = temp.reset_index()

# Same stacked bar chart for each outcome; only the plotted column and the
# title differ. No continuous colour scale is passed because the bars are
# coloured by country, a categorical column.
for metric in ('Deaths', 'Recovered'):
    fig = px.bar(temp, x="Date", y=metric, color='Country/Region', orientation='v', height=600,
                 title=f'Confirmed COVID-19 {metric} Excluding Mainland China')
    fig.show()

**An animated map shows the spread of the virus over time (use the play button and slider below the map to run it)**

In [66]:
# Build one row per (country, date) from whatever rows `df` holds at this
# point. The previous version animated over the *values* of 'Confirmed' on a
# frame with no date column, so the frames were not time steps.
spread = (
    df.groupby(['Country/Region', 'Date'], as_index=False)[['Confirmed']]
      .sum()
      .sort_values('Date')  # frames are created in order of first appearance
)
# Animation frames are labelled by value; plain date strings give readable
# slider labels.
spread['Day'] = spread['Date'].dt.strftime('%Y-%m-%d')

# NOTE(review): locationmode "country names" needs labels Plotly recognises;
# labels such as 'US' or 'Mainland China' may not match -- verify and remap.
fig = px.choropleth(spread,
                    locations="Country/Region",
                    locationmode="country names",
                    color="Confirmed",
                    hover_name="Country/Region",
                    animation_frame="Day",
                    # Pin the colour range so every frame shares one scale.
                    range_color=[0, spread['Confirmed'].max()]
                   )

fig.update_layout(
    title_text='Spread of COVID-19 over time',
    title_x=0.5,
    geo=dict(
        showframe=False,
        showcoastlines=False,
    ))

fig.show()