In [None]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the raw dataset; the path is resolved relative to the working directory.
source_path = 'data.csv'
df = pd.read_csv(source_path)

# Preview the first rows to check that the columns were parsed as expected.
df.head()

In [None]:
# column names, dtypes and non-null counts of the freshly loaded frame
df.info()

In [None]:
# summary statistics of the numeric columns
df.describe()

In [None]:
# Count the missing values per column and list the worst columns first.
missing_per_column = df.isna().sum()
missing_per_column.sort_values(ascending=False)

In [None]:
# Keep only the rows that actually report a 'Daily tests' value,
# then renumber the index from 0.
has_daily_tests = df['Daily tests'].notna()
df = df[has_daily_tests].reset_index(drop=True)

In [None]:
# re-check the per-column missing counts after dropping rows without 'Daily tests'
df.isnull().sum().sort_values(ascending=False)

Remove countries with insufficient data

In [None]:
# Drop countries whose records cover too little of the study period.

# total days between 1/1/2020 and 28/2/2021 (366 days of 2020 + 31 + 28)
total_days = 425
# a country is removed when its distinct dates are less than this share of total_days
min_coverage = 0.15

# number of distinct countries before filtering
total_countries = len(df['Entity'].unique())

# distinct dates recorded per country, broadcast back onto every row of that country
days_per_country = df.groupby('Entity')['Date'].nunique()
coverage = df['Entity'].map(days_per_country) / total_days

# Rows with a missing 'Entity' get NaN coverage; NaN < min_coverage is False,
# so those rows are kept rather than silently dropped.
too_sparse = coverage < min_coverage
df = df[~too_sparse].reset_index(drop=True)

# print how many countries are left (the remaining count, not the removed count)
countries_left = len(df['Entity'].unique())
print('Number of countries left: ', countries_left)

Replace the NaN values that precede each country's first reported cases and deaths with 0

In [None]:
# Group the rows by country. Later cells iterate over this same grouping,
# so it keeps its original name.
entityGroup = df.groupby('Entity')

# For each country, set every NaN that comes before the first recorded value
# to 0. NaNs that appear after the first recorded value are left untouched.
for column in ('Cases', 'Deaths'):
    for entity, group in entityGroup:
        recorded = group[column].notna()
        # cummax() becomes True at the first recorded value and stays True,
        # so its negation marks exactly the leading run of NaNs.
        leading_gap = ~recorded.cummax()
        # Address rows by label instead of rescanning the whole frame per row.
        df.loc[group.index[leading_gap.to_numpy()], column] = 0

# show what is still missing after the fill
df.isnull().sum().sort_values(ascending=False)

Convert the cumulative totals of cases and deaths to daily numbers

In [None]:
# Cumulative totals must never go down. Walk each country's rows in order and,
# whenever a total is lower than the row before it, overwrite it with the
# previous total. The walk is sequential on purpose: a corrected value is what
# the next row is compared against. A comparison involving NaN is never "< 0",
# so rows next to a missing value are left as they are.
for _, rows in entityGroup:
    first_label = rows.index[0]
    last_label = rows.index[-1]
    for current in range(first_label + 1, last_label + 1):
        previous = current - 1
        # both differences are taken before either column is corrected
        cases_step = df.at[current, 'Cases'] - df.at[previous, 'Cases']
        deaths_step = df.at[current, 'Deaths'] - df.at[previous, 'Deaths']
        if cases_step < 0:
            df.at[current, 'Cases'] = df.at[previous, 'Cases']
        if deaths_step < 0:
            df.at[current, 'Deaths'] = df.at[previous, 'Deaths']

In [None]:
# Derive per-day figures from the cumulative 'Cases' and 'Deaths' columns.
#
# A row takes the difference from the row directly above it when both rows
# belong to the same country. The first row of every country has no previous
# day and gets 0, as does any row whose 'Entity' is missing.
# Assumes each country's rows are contiguous and in date order -- TODO confirm
# against the raw file.
#
# The columns are built in one vectorized step, so no integer-initialised
# column receives float/NaN values one cell at a time, and no full copy of the
# frame is needed. diff() yields floats, so both new columns are float.
same_country_as_previous = df['Entity'].eq(df['Entity'].shift())

# add a new column to the DataFrame for cases today
df['Daily cases'] = df['Cases'].diff().where(same_country_as_previous, 0)
# add a new column to the DataFrame for deaths today
df['Daily deaths'] = df['Deaths'].diff().where(same_country_as_previous, 0)

Drop the rows where daily cases exceed daily tests

In [None]:
# Remove the rows that report more daily cases than daily tests.
# The comparison is False when either value is NaN, so such rows are kept.
more_cases_than_tests = df['Daily tests'] < df['Daily cases']
df = df.loc[~more_cases_than_tests].reset_index(drop=True)

Create Seasons Column\
1 (winter) -> December - February \
2 (spring) -> March - May \
3 (summer) -> June - August \
4 (autumn) -> September - November

In [None]:
# season number -> season name
seasons = {1: 'Winter', 2: 'Spring', 3: 'Summer', 4: 'Autumn'}

df['Date'] = pd.to_datetime(df['Date'])

# month % 12 sends December to 0, so Dec/Jan/Feb become 0, 1, 2; integer
# division by 3 then buckets the months into four groups numbered 1..4.
season_number = df['Date'].dt.month % 12 // 3 + 1
# map() leaves a missing date as a missing season instead of raising KeyError
df['Season'] = season_number.map(seasons)

Correlation Heatmap

In [None]:
# Pairwise correlations between the numeric columns only.
correlations = df.corr(numeric_only=True)

# Draw the annotated heatmap on a 10x10 inch figure.
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(correlations, annot=True, square=False, ax=ax)
plt.show()

Export the cleaned data to a CSV file

In [None]:
# export the cleaned data to csv; index=False leaves the row index out of the file
df.to_csv('data_cleaned.csv', index=False)