# Coronavirus(COVID-19) Data Cleaning(Worldwide) & Bangladesh
- Confirmed Cases 
- Deaths Cases 
- Recovered Cases 
![Coronavirus](./img/corona.png)

## Library Import

In [2]:
import pandas as pd 
from datetime import datetime, timedelta
import matplotlib as plt 
import seaborn as sns 

## Exploring Raw Data 

In [2]:
# Loading cumulative raw data 
raw_confirmed = pd.read_csv("https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_19-covid-Confirmed.csv")
raw_recovered   = pd.read_csv("https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_19-covid-Recovered.csv") 
raw_deaths = pd.read_csv("https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_19-covid-Deaths.csv")

In [3]:
# Shape of cumulative data
print(f"The Shape of Confirmed is: {raw_confirmed.shape}") 
print(f"The Shape of Recovered is: {raw_recovered.shape}")
print(f"The Shape of Deaths is: {raw_deaths.shape}")

The Shape of Confirmed is: (501, 66)
The Shape of Recovered is: (501, 66)
The Shape of Deaths is: (501, 66)


In [4]:
# Examine first few rows of raw_confirmed 
raw_confirmed.head() 

Unnamed: 0,Province/State,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,3/14/20,3/15/20,3/16/20,3/17/20,3/18/20,3/19/20,3/20/20,3/21/20,3/22/20,3/23/20
0,,Thailand,15.0,101.0,2,3,5,7,8,8,...,82,114,147,177,212,272,322,411,599,599.0
1,,Japan,36.0,138.0,2,1,2,2,4,4,...,773,839,825,878,889,924,963,1007,1086,1086.0
2,,Singapore,1.2833,103.8333,0,1,3,3,4,5,...,212,226,243,266,313,345,385,432,455,455.0
3,,Nepal,28.1667,84.25,0,0,0,1,1,1,...,1,1,1,1,1,1,1,1,2,2.0
4,,Malaysia,2.5,112.5,0,0,0,3,4,4,...,238,428,566,673,790,900,1030,1183,1306,1306.0


In [5]:
# Examine first few rows of raw_recovered 
raw_recovered.head() 

Unnamed: 0,Province/State,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,3/14/20,3/15/20,3/16/20,3/17/20,3/18/20,3/19/20,3/20/20,3/21/20,3/22/20,3/23/20
0,,Thailand,15.0,101.0,0,0,0,0,2,2,...,35,35,35,41,42,42,42,42,44,44.0
1,,Japan,36.0,138.0,0,0,0,0,1,1,...,118,118,144,144,144,150,191,232,235,235.0
2,,Singapore,1.2833,103.8333,0,0,0,0,0,0,...,105,105,109,114,114,114,124,140,144,144.0
3,,Nepal,28.1667,84.25,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,1.0
4,,Malaysia,2.5,112.5,0,0,0,0,0,0,...,35,42,42,49,60,75,87,114,139,139.0


In [6]:
# Examine first few rows of raw_recovered 
raw_deaths.head() 

Unnamed: 0,Province/State,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,3/14/20,3/15/20,3/16/20,3/17/20,3/18/20,3/19/20,3/20/20,3/21/20,3/22/20,3/23/20
0,,Thailand,15.0,101.0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,1.0
1,,Japan,36.0,138.0,0,0,0,0,0,0,...,22,22,27,29,29,29,33,35,40,40.0
2,,Singapore,1.2833,103.8333,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2,2,2.0
3,,Nepal,28.1667,84.25,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.0
4,,Malaysia,2.5,112.5,0,0,0,0,0,0,...,0,0,0,2,2,2,3,4,10,10.0


In [7]:
# Un-Pivoting the data 
raw_confirmed2 = pd.melt(raw_confirmed, id_vars=['Province/State', 'Country/Region', 'Lat', 'Long'], var_name=['Date'])
raw_recovered2 = pd.melt(raw_recovered, id_vars=['Province/State', 'Country/Region', 'Lat', 'Long'], var_name=['Date'])
raw_deaths2 = pd.melt(raw_deaths, id_vars=['Province/State', 'Country/Region', 'Lat', 'Long'], var_name=['Date'])

In [8]:
# Show after un-pivoting
raw_confirmed2
raw_recovered2
raw_deaths2

Unnamed: 0,Province/State,Country/Region,Lat,Long,Date,value
0,,Thailand,15.0000,101.0000,1/22/20,0.0
1,,Japan,36.0000,138.0000,1/22/20,0.0
2,,Singapore,1.2833,103.8333,1/22/20,0.0
3,,Nepal,28.1667,84.2500,1/22/20,0.0
4,,Malaysia,2.5000,112.5000,1/22/20,0.0
...,...,...,...,...,...,...
31057,,Jersey,49.1900,-2.1100,3/23/20,0.0
31058,,Puerto Rico,18.2000,-66.5000,3/23/20,1.0
31059,,Republic of the Congo,-1.4400,15.5560,3/23/20,0.0
31060,,The Bahamas,24.2500,-76.0000,3/23/20,0.0


In [9]:
# Now take a look at the shape of data 
print(f"The Shape of Confirmed is: {raw_confirmed2.shape}")
print(f"The Shape of Recovered is: {raw_recovered2.shape}") 
print(f"The Shape of Deaths is: {raw_deaths2.shape}") 

The Shape of Confirmed is: (31062, 6)
The Shape of Recovered is: (31062, 6)
The Shape of Deaths is: (31062, 6)


In [10]:
# Examine first few rows of new dataframe 
raw_confirmed2.head()
raw_recovered2.head() 
raw_deaths2.head() 

Unnamed: 0,Province/State,Country/Region,Lat,Long,Date,value
0,,Thailand,15.0,101.0,1/22/20,0.0
1,,Japan,36.0,138.0,1/22/20,0.0
2,,Singapore,1.2833,103.8333,1/22/20,0.0
3,,Nepal,28.1667,84.25,1/22/20,0.0
4,,Malaysia,2.5,112.5,1/22/20,0.0


In [11]:
# Convert date 
raw_confirmed2['Date'] = pd.to_datetime(raw_confirmed2['Date']) 
raw_recovered2['Date'] = pd.to_datetime(raw_recovered2['Date']) 
raw_deaths2['Date'] = pd.to_datetime(raw_deaths2['Date'])  

In [12]:
# Ranaming the values 
raw_confirmed2.columns = raw_confirmed2.columns.str.replace('value', 'Confirmed')
raw_recovered2.columns = raw_recovered2.columns.str.replace('value', 'Recovered')
raw_deaths2.columns = raw_deaths2.columns.str.replace('value', 'Deaths')

In [13]:
# After renaming 
raw_confirmed2.head()
raw_recovered2.head() 
raw_deaths2.head() 

Unnamed: 0,Province/State,Country/Region,Lat,Long,Date,Deaths
0,,Thailand,15.0,101.0,2020-01-22,0.0
1,,Japan,36.0,138.0,2020-01-22,0.0
2,,Singapore,1.2833,103.8333,2020-01-22,0.0
3,,Nepal,28.1667,84.25,2020-01-22,0.0
4,,Malaysia,2.5,112.5,2020-01-22,0.0


In [14]:
# Handling missing values of confirmed 
raw_confirmed2.isnull().sum() 

Province/State    10788
Country/Region        0
Lat                   0
Long                  0
Date                  0
Confirmed           192
dtype: int64

In [15]:
# Handling missing values of recovered 
raw_recovered2.isnull().sum() 

Province/State    10788
Country/Region        0
Lat                   0
Long                  0
Date                  0
Recovered           192
dtype: int64

In [16]:
# Handling missing values of deaths 
raw_deaths2.isnull().sum() 

Province/State    10788
Country/Region        0
Lat                   0
Long                  0
Date                  0
Deaths              192
dtype: int64

In [17]:
# Filling missing values 
raw_confirmed2['Province/State'].fillna(raw_confirmed2['Country/Region'], inplace=True)
raw_recovered2['Province/State'].fillna(raw_recovered2['Country/Region'], inplace=True)
raw_deaths2['Province/State'].fillna(raw_deaths2['Country/Region'], inplace=True)

In [18]:
# Printing shapes before joining 
print(f"Shape of Confirmed is: {raw_confirmed2.shape}")
print(f"Shape of Recovered is: {raw_recovered2.shape}")
print(f"Shape of Deaths is: {raw_deaths2.shape}")

Shape of Confirmed is: (31062, 6)
Shape of Recovered is: (31062, 6)
Shape of Deaths is: (31062, 6)


In [19]:
# Confirmed with Deaths
full_join = raw_confirmed2.merge(raw_deaths2[['Province/State','Country/Region','Date','Deaths']], 
                                      how = 'outer', 
                                      left_on = ['Province/State','Country/Region','Date'], 
                                      right_on = ['Province/State', 'Country/Region','Date'])


# full join with Recovered
full_join = full_join.merge(raw_recovered2[['Province/State','Country/Region','Date','Recovered']], 
                                      how = 'outer', 
                                      left_on = ['Province/State','Country/Region','Date'], 
                                      right_on = ['Province/State', 'Country/Region','Date'])

full_join.head()

Unnamed: 0,Province/State,Country/Region,Lat,Long,Date,Confirmed,Deaths,Recovered
0,Thailand,Thailand,15.0,101.0,2020-01-22,2.0,0.0,0.0
1,Japan,Japan,36.0,138.0,2020-01-22,2.0,0.0,0.0
2,Singapore,Singapore,1.2833,103.8333,2020-01-22,0.0,0.0,0.0
3,Nepal,Nepal,28.1667,84.25,2020-01-22,0.0,0.0,0.0
4,Malaysia,Malaysia,2.5,112.5,2020-01-22,0.0,0.0,0.0


In [20]:
print("Shape of First Join: ", full_join.shape)
print("Shape of Second Join: ", full_join.shape)

Shape of First Join:  (31062, 8)
Shape of Second Join:  (31062, 8)


In [21]:
# Check missing values 
full_join.isnull().sum()

Province/State      0
Country/Region      0
Lat                 0
Long                0
Date                0
Confirmed         192
Deaths            192
Recovered         192
dtype: int64

In [56]:
# Fill missing value and convert to integer
full_join = full_join.astype({'Confirmed':'int64', 'Deaths':'int64', 'Recovered':'int64'})
full_join.loc[:, 'Confirmed'].fillna(value=0, inplace=True)
full_join.loc[:, 'Deaths'].fillna(value=0, inplace=True)
full_join.loc[:, 'Recovered'].fillna(value=0, inplace=True)

In [57]:
full_join.isnull().sum()

Province/State    0
Country/Region    0
Lat               0
Long              0
Date              0
Confirmed         0
Deaths            0
Recovered         0
dtype: int64

In [67]:
# Store clean data 
cleaned_data = full_join
cleaned_data.head() 

Unnamed: 0,Province/State,Country/Region,Lat,Long,Date,Confirmed,Deaths,Recovered
0,Thailand,Thailand,15.0,101.0,2020-01-22,2,0,0
1,Japan,Japan,36.0,138.0,2020-01-22,2,0,0
2,Singapore,Singapore,1.2833,103.8333,2020-01-22,0,0,0
3,Nepal,Nepal,28.1667,84.25,2020-01-22,0,0,0
4,Malaysia,Malaysia,2.5,112.5,2020-01-22,0,0,0


In [68]:
# Export 
cleaned_data.to_csv("world_data/covid-19_cleaned_data.csv(updated)", index=False)

In [6]:
# Take a look at new dataset 
df = pd.read_csv("world_data/covid-19_cleaned_data.csv(updated)")

In [7]:
df.head()

Unnamed: 0,Province/State,Country/Region,Lat,Long,Date,Confirmed,Deaths,Recovered
0,Thailand,Thailand,15.0,101.0,2020-01-22,2,0,0
1,Japan,Japan,36.0,138.0,2020-01-22,2,0,0
2,Singapore,Singapore,1.2833,103.8333,2020-01-22,0,0,0
3,Nepal,Nepal,28.1667,84.25,2020-01-22,0,0,0
4,Malaysia,Malaysia,2.5,112.5,2020-01-22,0,0,0


In [11]:
df_bd = pd.read_csv("./bd_data/COVID-19_in_bd.csv")
df_bd

Unnamed: 0,Date,Confirmed,Deaths,Recovered
0,2020-03-08,3,0,0
1,2020-03-09,3,0,0
2,2020-03-10,3,0,0
3,2020-03-11,3,0,0
4,2020-03-12,3,0,0
5,2020-03-13,3,0,0
6,2020-03-14,3,0,0
7,2020-03-15,5,0,0
8,2020-03-16,8,0,0
9,2020-03-17,10,0,0
