In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from ydata_profiling import ProfileReport
import seaborn as sns
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.io as pio
pio.templates.default='seaborn'

In [2]:
covid = pd.read_csv('covid_19_data.csv')
covid.describe()

Unnamed: 0,SNo,Confirmed,Deaths,Recovered
count,306429.0,306429.0,306429.0,306429.0
mean,153215.0,85670.91,2036.403268,50420.29
std,88458.577156,277551.6,6410.938048,201512.4
min,1.0,-302844.0,-178.0,-854405.0
25%,76608.0,1042.0,13.0,11.0
50%,153215.0,10375.0,192.0,1751.0
75%,229822.0,50752.0,1322.0,20270.0
max,306429.0,5863138.0,112385.0,6399531.0


In [3]:
covid.head(1)

Unnamed: 0,SNo,ObservationDate,Province/State,Country/Region,Last Update,Confirmed,Deaths,Recovered
0,1,01/22/2020,Anhui,Mainland China,1/22/2020 17:00,1.0,0.0,0.0


In [4]:
covid.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 306429 entries, 0 to 306428
Data columns (total 8 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   SNo              306429 non-null  int64  
 1   ObservationDate  306429 non-null  object 
 2   Province/State   228329 non-null  object 
 3   Country/Region   306429 non-null  object 
 4   Last Update      306429 non-null  object 
 5   Confirmed        306429 non-null  float64
 6   Deaths           306429 non-null  float64
 7   Recovered        306429 non-null  float64
dtypes: float64(3), int64(1), object(4)
memory usage: 18.7+ MB


In [5]:
covid['ObservationDate']=pd.to_datetime(covid['ObservationDate'])
covid['Last Update'] = pd.to_datetime(covid['Last Update'])
covid.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 306429 entries, 0 to 306428
Data columns (total 8 columns):
 #   Column           Non-Null Count   Dtype         
---  ------           --------------   -----         
 0   SNo              306429 non-null  int64         
 1   ObservationDate  306429 non-null  datetime64[ns]
 2   Province/State   228329 non-null  object        
 3   Country/Region   306429 non-null  object        
 4   Last Update      306429 non-null  datetime64[ns]
 5   Confirmed        306429 non-null  float64       
 6   Deaths           306429 non-null  float64       
 7   Recovered        306429 non-null  float64       
dtypes: datetime64[ns](2), float64(3), int64(1), object(2)
memory usage: 18.7+ MB


In [6]:
covid.isnull().sum()

SNo                    0
ObservationDate        0
Province/State     78100
Country/Region         0
Last Update            0
Confirmed              0
Deaths                 0
Recovered              0
dtype: int64

In [7]:
covid.drop(['SNo'], 1, inplace=True)
#SNo seems useless here, so I am getting rid of it. 

  covid.drop(['SNo'], 1, inplace=True)


In [28]:
covid['Active Cases'] = covid['Confirmed']-covid['Recovered']-covid['Deaths']
covid.info()
#Showing that Sno has been dropped and added 'Active Cases' column.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 306429 entries, 0 to 306428
Data columns (total 8 columns):
 #   Column           Non-Null Count   Dtype         
---  ------           --------------   -----         
 0   ObservationDate  306429 non-null  datetime64[ns]
 1   Province/State   228329 non-null  object        
 2   Country/Region   306429 non-null  object        
 3   Last Update      306429 non-null  datetime64[ns]
 4   Confirmed        306429 non-null  float64       
 5   Deaths           306429 non-null  float64       
 6   Recovered        306429 non-null  float64       
 7   Active Cases     306429 non-null  float64       
dtypes: datetime64[ns](2), float64(4), object(2)
memory usage: 18.7+ MB


In [38]:
#masking for China, USA, and India
covid_china = covid[covid['Country/Region']=='Mainland China']

covid_us = covid[covid['Country/Region']=='USA']

covid_india = covid[covid['Country/Region']=='India']

covid_china.tail()

Unnamed: 0,ObservationDate,Province/State,Country/Region,Last Update,Confirmed,Deaths,Recovered,Active Cases
306340,2021-05-29,Tibet,Mainland China,2021-05-30 04:20:55,1.0,0.0,1.0,0.0
306368,2021-05-29,Unknown,Mainland China,2021-05-30 04:20:55,0.0,0.0,10.0,-10.0
306412,2021-05-29,Xinjiang,Mainland China,2021-05-30 04:20:55,980.0,3.0,977.0,0.0
306420,2021-05-29,Yunnan,Mainland China,2021-05-30 04:20:55,352.0,2.0,331.0,19.0
306426,2021-05-29,Zhejiang,Mainland China,2021-05-30 04:20:55,1364.0,1.0,1324.0,39.0


In [39]:
covid_china= covid_china.groupby(['Country/Region', 'ObservationDate']).agg({'Confirmed':'sum', 'Recovered':'sum', 'Deaths':'sum'})
covid_us= covid_us.groupby(['Country/Region', 'ObservationDate']).agg({'Confirmed':'sum', 'Recovered':'sum', 'Deaths':'sum'})
covid_india= covid_india.groupby(['Country/Region', 'ObservationDate']).agg({'Confirmed':'sum', 'Recovered':'sum', 'Deaths':'sum'})
covid_china.tail()

Unnamed: 0_level_0,Unnamed: 1_level_0,Confirmed,Recovered,Deaths
Country/Region,ObservationDate,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Mainland China,2021-05-25,91019.0,86063.0,4636.0
Mainland China,2021-05-26,91038.0,86075.0,4636.0
Mainland China,2021-05-27,91045.0,86097.0,4636.0
Mainland China,2021-05-28,91061.0,86112.0,4636.0
Mainland China,2021-05-29,91072.0,86117.0,4636.0


In [42]:
fig = make_subplots(rows=1, cols=3)
fig.add_trace(
    go.Line(x=covid_china['ObservationDate'], y=covid_china['Confirmed', 'Deaths', 'Recovered','Active Cases']),
    row=1, col=1
)
fig.add_trace(
    go.Line(x=covid_us['ObservationDate'], y=covid_us['Confirmed', 'Deaths', 'Recovered','Active Cases']),
    row=1, col=2
)
fig.add_trace(
    go.Line(x=covid_india['ObservationDate'], y=covid_india['Confirmed', 'Deaths', 'Recovered','Active Cases']),
    row=1, col=3
)

fig.show()

KeyError: 'ObservationDate'