In [31]:
import pandas as pd
import numpy as np
covid_data=pd.read_csv('COVID19-Data.csv')

In [32]:
from scipy import stats

In [33]:
covid_data.head()
#confirming that our data has been imported

Unnamed: 0,ID,Updated,Confirmed,ConfirmedChange,Deaths,DeathsChange,Recovered,RecoveredChange,Latitude,Longitude,ISO2,ISO3,Country_Region,AdminRegion1,AdminRegion2
0,338995,01/21/2020,262.0,,0.0,,,,,,,,Worldwide,,
1,338996,01/22/2020,313.0,51.0,0.0,0.0,,,,,,,Worldwide,,
2,338997,01/23/2020,578.0,265.0,0.0,0.0,,,,,,,Worldwide,,
3,338998,01/24/2020,841.0,263.0,0.0,0.0,,,,,,,Worldwide,,
4,338999,01/25/2020,1320.0,479.0,0.0,0.0,,,,,,,Worldwide,,


In [34]:
#Cleaning the data

#removing columns we don't need
columns_to_drop=['ConfirmedChange','DeathsChange',
                 'RecoveredChange','Latitude','Longitude','ISO2','ISO3','AdminRegion1','AdminRegion2']
covid_data.drop(columns_to_drop,inplace=True,axis=1)
#inplace tells our program that we want to manipulate the covid_data object


In [35]:
covid_data.head(100)

Unnamed: 0,ID,Updated,Confirmed,Deaths,Recovered,Country_Region
0,338995,01/21/2020,262.0,0.0,,Worldwide
1,338996,01/22/2020,313.0,0.0,,Worldwide
2,338997,01/23/2020,578.0,0.0,,Worldwide
3,338998,01/24/2020,841.0,0.0,,Worldwide
4,338999,01/25/2020,1320.0,0.0,,Worldwide
...,...,...,...,...,...,...
95,6955177,04/25/2020,2868539.0,201502.0,811660.0,Worldwide
96,7002306,04/26/2020,2965363.0,206265.0,863464.0,Worldwide
97,7055144,04/27/2020,3002303.0,208131.0,878813.0,Worldwide
98,7098114,04/28/2020,3083467.0,213824.0,915988.0,Worldwide


In [36]:
covid_data['ID'].is_unique
#checking if values in 'ID' column are unique so as to make it our identifier

True

In [37]:
covid_data=covid_data.set_index('ID')
#setting the index to 'ID' since it's unique

In [38]:
covid_data.head(1000)

Unnamed: 0_level_0,Updated,Confirmed,Deaths,Recovered,Country_Region
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
338995,01/21/2020,262.0,0.0,,Worldwide
338996,01/22/2020,313.0,0.0,,Worldwide
338997,01/23/2020,578.0,0.0,,Worldwide
338998,01/24/2020,841.0,0.0,,Worldwide
338999,01/25/2020,1320.0,0.0,,Worldwide
...,...,...,...,...,...
69664694,09/25/2020,13045.0,373.0,7309.0,Albania
70103884,09/26/2020,13153.0,375.0,7397.0,Albania
70512017,09/27/2020,13259.0,377.0,7534.0,Albania
70893024,09/28/2020,13391.0,380.0,7629.0,Albania


In [39]:
covid_data.dtypes.value_counts()
#we are checking for the daya types

float64    3
object     2
dtype: int64

In [40]:
covid_data=covid_data.loc[(covid_data['Country_Region']=='Kenya')]
#selecting only those rows that contain data from Kenya

In [41]:
covid_data.head()

Unnamed: 0_level_0,Updated,Confirmed,Deaths,Recovered,Country_Region
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
7174960,03/13/2020,1.0,0.0,0.0,Kenya
7174962,03/14/2020,1.0,0.0,,Kenya
7174964,03/15/2020,1.0,0.0,0.0,Kenya
7174965,03/16/2020,3.0,0.0,0.0,Kenya
7174966,03/17/2020,3.0,0.0,0.0,Kenya


In [42]:
covid_data=covid_data.dropna()
#dropping all rows with 'Nan' as a value 

In [43]:
#checking for duplicate values
covid_data.drop_duplicates(['Updated'])

Unnamed: 0_level_0,Updated,Confirmed,Deaths,Recovered,Country_Region
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
7174960,03/13/2020,1.0,0.0,0.0,Kenya
7174964,03/15/2020,1.0,0.0,0.0,Kenya
7174965,03/16/2020,3.0,0.0,0.0,Kenya
7174966,03/17/2020,3.0,0.0,0.0,Kenya
7174980,03/18/2020,7.0,0.0,0.0,Kenya
...,...,...,...,...,...
122067117,03/07/2021,108827.0,1876.0,87570.0,Kenya
122067118,03/08/2021,109164.0,1879.0,87623.0,Kenya
122067119,03/09/2021,109643.0,1886.0,87623.0,Kenya
122067121,03/10/2021,110356.0,1898.0,87903.0,Kenya


In [44]:
covid_data.describe()

Unnamed: 0,Confirmed,Deaths,Recovered
count,362.0,362.0,362.0
mean,44168.066298,779.662983,32680.720994
std,39771.378537,688.899252,32979.688567
min,1.0,0.0,0.0
25%,3343.0,97.0,1178.25
50%,35709.5,611.5,21999.5
75%,90809.75,1578.5,71497.75
max,111185.0,1899.0,87994.0


In [46]:
print(covid_data['Updated'].dtype)

object


In [48]:
covid_data['Updated'] = pd.to_datetime(covid_data['Updated'], format='%m/%d/%Y')
#changing the data type of the data in 'Updated' column from a String to datetime

In [51]:
print(covid_data['Updated'].dtype)

datetime64[ns]


In [50]:
covid_data[(np.abs(stats.zscore(covid_data))<3).all(axis=1)]
#keeping values whose zscore is < 3

TypeError: unsupported operand type(s) for +: 'Timestamp' and 'Timestamp'

In [53]:
covid_data.head(1000)

Unnamed: 0_level_0,Updated,Confirmed,Deaths,Recovered,Country_Region
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
7174960,2020-03-13,1.0,0.0,0.0,Kenya
7174964,2020-03-15,1.0,0.0,0.0,Kenya
7174965,2020-03-16,3.0,0.0,0.0,Kenya
7174966,2020-03-17,3.0,0.0,0.0,Kenya
7174980,2020-03-18,7.0,0.0,0.0,Kenya
...,...,...,...,...,...
122067117,2021-03-07,108827.0,1876.0,87570.0,Kenya
122067118,2021-03-08,109164.0,1879.0,87623.0,Kenya
122067119,2021-03-09,109643.0,1886.0,87623.0,Kenya
122067121,2021-03-10,110356.0,1898.0,87903.0,Kenya
