# This notebook is for test dataset creation

## Data Dictionary

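| Field          | Description                                                   |
|----------------|---------------------------------------------------------------|
| Date           | Date of incident                                              |
| Country        | Country the record belongs to                                 |
| Province/State | State or province, where a country is broken down by state    |
| Lat            | Latitude                                                      |
| Lon            | Longitude                                                     |
| Confirmed      | Confirmed cases                                               |
| Recovered      | Recovered cases                                               |
| Deaths         | Death cases                                                   |
| Active         | Active cases                                                  |

Note: `testdataset.csv` as loaded below contains only Date, Country, Confirmed, Recovered, Deaths and Active; Province/State, Lat and Lon are not present in it.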

### Import Libraries

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import datetime
import scipy.stats

%matplotlib inline
#sets the default autosave frequency in seconds
%autosave 60 
# Apply the 'dark' style and the font scale in a single call. sns.set() resets
# the style to its own default ('darkgrid') whenever `style` is not passed, so
# calling it after sns.set_style('dark') would silently discard that style.
sns.set(style='dark', font_scale=1.2)

import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_columns',None)
#pd.set_option('display.max_rows',None)
pd.set_option('display.width', 1000)

np.random.seed(0)
np.set_printoptions(suppress=True)

Autosaving every 60 seconds


In [3]:
# Load the raw CSV, parsing the Date column as datetimes and using it as the row index.
df = pd.read_csv(
    "testdataset.csv",
    index_col=["Date"],
    parse_dates=["Date"],
)

In [4]:
df

Unnamed: 0_level_0,Country,Confirmed,Recovered,Deaths,Active
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-01-22,Afghanistan,0,0,0,0
2020-01-23,Afghanistan,0,0,0,0
2020-01-24,Afghanistan,0,0,0,0
2020-01-25,Afghanistan,0,0,0,0
2020-01-26,Afghanistan,0,0,0,0
...,...,...,...,...,...
2020-11-24,Timor-Leste,0,30,0,-30
2020-11-25,Timor-Leste,0,30,0,-30
2020-11-26,Timor-Leste,0,30,0,-30
2020-11-27,Timor-Leste,0,30,0,-30


### Extract Malaysia's records from the dataset for analysis and machine learning

In [5]:
# Keep only Malaysia's rows. The explicit .copy() makes `malaysia` an
# independent frame, so the in-place edits and column assignments in later
# cells act on it directly instead of on a slice of `df` (which is what
# triggers pandas' SettingWithCopyWarning).
malaysia = df.loc[df["Country"] == "Malaysia"].copy()

In [6]:
malaysia

Unnamed: 0_level_0,Country,Confirmed,Recovered,Deaths,Active
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-01-22,Malaysia,0,0,0,0
2020-01-23,Malaysia,0,0,0,0
2020-01-24,Malaysia,0,0,0,0
2020-01-25,Malaysia,3,0,0,3
2020-01-26,Malaysia,4,0,0,4
...,...,...,...,...,...
2020-11-24,Malaysia,58847,44153,341,14353
2020-11-25,Malaysia,59817,46501,345,12971
2020-11-26,Malaysia,60752,49056,348,11348
2020-11-27,Malaysia,61861,50204,350,11307


### Drop unwanted features

In [7]:
malaysia.columns

Index(['Country', 'Confirmed', 'Recovered', 'Deaths', 'Active'], dtype='object')

In [8]:
# Remove the Country column. Rebinding to the frame returned by drop()
# (instead of inplace=True) avoids mutating a filtered slice of `df` and keeps
# the cell safe to re-run from a fresh `malaysia`.
malaysia = malaysia.drop(columns=['Country'])

In [9]:
malaysia

Unnamed: 0_level_0,Confirmed,Recovered,Deaths,Active
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2020-01-22,0,0,0,0
2020-01-23,0,0,0,0
2020-01-24,0,0,0,0
2020-01-25,3,0,0,3
2020-01-26,4,0,0,4
...,...,...,...,...
2020-11-24,58847,44153,341,14353
2020-11-25,59817,46501,345,12971
2020-11-26,60752,49056,348,11348
2020-11-27,61861,50204,350,11307


In [10]:
# Move the Date index back into a regular column; rows get a default integer index.
malaysia = malaysia.reset_index()

In [11]:
malaysia

Unnamed: 0,Date,Confirmed,Recovered,Deaths,Active
0,2020-01-22,0,0,0,0
1,2020-01-23,0,0,0,0
2,2020-01-24,0,0,0,0
3,2020-01-25,3,0,0,3
4,2020-01-26,4,0,0,4
...,...,...,...,...,...
307,2020-11-24,58847,44153,341,14353
308,2020-11-25,59817,46501,345,12971
309,2020-11-26,60752,49056,348,11348
310,2020-11-27,61861,50204,350,11307


In [12]:
# Change in Confirmed relative to the previous row (one row per date).
# The first row has no predecessor, so its value is NaN.
malaysia = malaysia.assign(ConfirmDiff=malaysia["Confirmed"].diff())

In [13]:
# Change in Deaths relative to the previous row; NaN for the first row.
malaysia = malaysia.assign(DeathsDiff=malaysia["Deaths"].diff())

In [14]:
# Change in Recovered relative to the previous row; NaN for the first row.
malaysia = malaysia.assign(RecoverDiff=malaysia["Recovered"].diff())

In [15]:
# Change in Active relative to the previous row; NaN for the first row.
malaysia = malaysia.assign(ActiveDiff=malaysia["Active"].diff())

In [16]:
malaysia.head()

Unnamed: 0,Date,Confirmed,Recovered,Deaths,Active,ConfirmDiff,DeathsDiff,RecoverDiff,ActiveDiff
0,2020-01-22,0,0,0,0,,,,
1,2020-01-23,0,0,0,0,0.0,0.0,0.0,0.0
2,2020-01-24,0,0,0,0,0.0,0.0,0.0,0.0
3,2020-01-25,3,0,0,3,3.0,0.0,0.0,3.0
4,2020-01-26,4,0,0,4,1.0,0.0,0.0,1.0


In [17]:
# Renumber the rows from 0, discarding the old index instead of keeping it as a column.
malaysia = malaysia.reset_index(drop=True)

In [18]:
malaysia.head()

Unnamed: 0,Date,Confirmed,Recovered,Deaths,Active,ConfirmDiff,DeathsDiff,RecoverDiff,ActiveDiff
0,2020-01-22,0,0,0,0,,,,
1,2020-01-23,0,0,0,0,0.0,0.0,0.0,0.0
2,2020-01-24,0,0,0,0,0.0,0.0,0.0,0.0
3,2020-01-25,3,0,0,3,3.0,0.0,0.0,3.0
4,2020-01-26,4,0,0,4,1.0,0.0,0.0,1.0


In [19]:
malaysia.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 312 entries, 0 to 311
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   Date         312 non-null    datetime64[ns]
 1   Confirmed    312 non-null    int64         
 2   Recovered    312 non-null    int64         
 3   Deaths       312 non-null    int64         
 4   Active       312 non-null    int64         
 5   ConfirmDiff  311 non-null    float64       
 6   DeathsDiff   311 non-null    float64       
 7   RecoverDiff  311 non-null    float64       
 8   ActiveDiff   311 non-null    float64       
dtypes: datetime64[ns](1), float64(4), int64(4)
memory usage: 22.1 KB


In [20]:
# Zero-fill missing values with the dedicated fillna() API. Per the .info()
# output above, the only NaNs at this point are the first-row entries of the
# four *Diff columns, which have no previous row to compare against.
malaysia = malaysia.fillna(0.0)

In [21]:
malaysia.head()

Unnamed: 0,Date,Confirmed,Recovered,Deaths,Active,ConfirmDiff,DeathsDiff,RecoverDiff,ActiveDiff
0,2020-01-22,0,0,0,0,0.0,0.0,0.0,0.0
1,2020-01-23,0,0,0,0,0.0,0.0,0.0,0.0
2,2020-01-24,0,0,0,0,0.0,0.0,0.0,0.0
3,2020-01-25,3,0,0,3,3.0,0.0,0.0,3.0
4,2020-01-26,4,0,0,4,1.0,0.0,0.0,1.0


### Treat Missing Values

In [22]:
malaysia.isnull().sum()

Date           0
Confirmed      0
Recovered      0
Deaths         0
Active         0
ConfirmDiff    0
DeathsDiff     0
RecoverDiff    0
ActiveDiff     0
dtype: int64

In [23]:
# Calendar month number extracted from the Date column.
malaysia = malaysia.assign(Month=malaysia["Date"].dt.month)

In [24]:
# Day of the month extracted from the Date column.
malaysia = malaysia.assign(Day=malaysia["Date"].dt.day)

In [25]:
malaysia.head()

Unnamed: 0,Date,Confirmed,Recovered,Deaths,Active,ConfirmDiff,DeathsDiff,RecoverDiff,ActiveDiff,Month,Day
0,2020-01-22,0,0,0,0,0.0,0.0,0.0,0.0,1,22
1,2020-01-23,0,0,0,0,0.0,0.0,0.0,0.0,1,23
2,2020-01-24,0,0,0,0,0.0,0.0,0.0,0.0,1,24
3,2020-01-25,3,0,0,3,3.0,0.0,0.0,3.0,1,25
4,2020-01-26,4,0,0,4,1.0,0.0,0.0,1.0,1,26


In [26]:
malaysia.columns

Index(['Date', 'Confirmed', 'Recovered', 'Deaths', 'Active', 'ConfirmDiff', 'DeathsDiff', 'RecoverDiff', 'ActiveDiff', 'Month', 'Day'], dtype='object')

In [27]:
# Drop the date, the raw count columns and ActiveDiff, leaving only
# ConfirmDiff, DeathsDiff, RecoverDiff, Month and Day.
malaysia = malaysia.drop(
    columns=['Date', 'Confirmed', 'Recovered', 'Deaths', 'Active', 'ActiveDiff'],
)

In [28]:
malaysia.head()

Unnamed: 0,ConfirmDiff,DeathsDiff,RecoverDiff,Month,Day
0,0.0,0.0,0.0,1,22
1,0.0,0.0,0.0,1,23
2,0.0,0.0,0.0,1,24
3,3.0,0.0,0.0,1,25
4,1.0,0.0,0.0,1,26


In [29]:
# Reorder so Month and Day come first and the three daily-change columns last.
# .copy() detaches the result from the frame it was selected from, so the
# column assignments in the following cells operate on an independent frame
# and do not raise SettingWithCopyWarning.
malaysia = malaysia[['Month', 'Day','ConfirmDiff','RecoverDiff', 'DeathsDiff']].copy()

In [30]:
malaysia.head()

Unnamed: 0,Month,Day,ConfirmDiff,RecoverDiff,DeathsDiff
0,1,22,0.0,0.0,0.0
1,1,23,0.0,0.0,0.0
2,1,24,0.0,0.0,0.0
3,1,25,3.0,0.0,0.0
4,1,26,1.0,0.0,0.0


In [31]:
# Treat Month as a categorical label so get_dummies one-hot encodes it.
malaysia = malaysia.astype({"Month": "category"})

In [32]:
# Treat Day as a categorical label so get_dummies one-hot encodes it.
malaysia = malaysia.astype({"Day": "category"})

In [33]:
malaysia.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 312 entries, 0 to 311
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   Month        312 non-null    category
 1   Day          312 non-null    category
 2   ConfirmDiff  312 non-null    float64 
 3   RecoverDiff  312 non-null    float64 
 4   DeathsDiff   312 non-null    float64 
dtypes: category(2), float64(3)
memory usage: 9.9 KB


In [34]:
# One-hot encode the categorical Month and Day columns, dropping the first
# level of each. The dummy dtype is pinned because pandas' default changed
# (uint8 before 2.0, bool from 2.0 on); pinning keeps the 0/1 encoding shown in
# the outputs below regardless of the installed pandas version.
df2 = pd.get_dummies(data=malaysia, drop_first=True, dtype=np.uint8)

In [35]:
df2.head()

Unnamed: 0,ConfirmDiff,RecoverDiff,DeathsDiff,Month_2,Month_3,Month_4,Month_5,Month_6,Month_7,Month_8,Month_9,Month_10,Month_11,Day_2,Day_3,Day_4,Day_5,Day_6,Day_7,Day_8,Day_9,Day_10,Day_11,Day_12,Day_13,Day_14,Day_15,Day_16,Day_17,Day_18,Day_19,Day_20,Day_21,Day_22,Day_23,Day_24,Day_25,Day_26,Day_27,Day_28,Day_29,Day_30,Day_31
0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
1,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
2,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
3,3.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
4,1.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0


In [36]:
# Put the one-hot feature columns first and the three daily-change columns
# last. The feature list is derived from the frame itself rather than spelled
# out, so the cell keeps working if the data covers a different set of months
# or days (a hard-coded list would raise KeyError for a missing dummy column
# and silently drop an unexpected one). Existing column order is preserved.
target_columns = ['ConfirmDiff', 'RecoverDiff', 'DeathsDiff']
feature_columns = [col for col in df2.columns if col not in target_columns]
df2 = df2[feature_columns + target_columns]

In [37]:
df2.head()

Unnamed: 0,Month_2,Month_3,Month_4,Month_5,Month_6,Month_7,Month_8,Month_9,Month_10,Month_11,Day_2,Day_3,Day_4,Day_5,Day_6,Day_7,Day_8,Day_9,Day_10,Day_11,Day_12,Day_13,Day_14,Day_15,Day_16,Day_17,Day_18,Day_19,Day_20,Day_21,Day_22,Day_23,Day_24,Day_25,Day_26,Day_27,Day_28,Day_29,Day_30,Day_31,ConfirmDiff,RecoverDiff,DeathsDiff
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0.0,0.0,0.0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0.0,0.0,0.0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,3.0,0.0,0.0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1.0,0.0,0.0


In [38]:
df2.tail()

Unnamed: 0,Month_2,Month_3,Month_4,Month_5,Month_6,Month_7,Month_8,Month_9,Month_10,Month_11,Day_2,Day_3,Day_4,Day_5,Day_6,Day_7,Day_8,Day_9,Day_10,Day_11,Day_12,Day_13,Day_14,Day_15,Day_16,Day_17,Day_18,Day_19,Day_20,Day_21,Day_22,Day_23,Day_24,Day_25,Day_26,Day_27,Day_28,Day_29,Day_30,Day_31,ConfirmDiff,RecoverDiff,DeathsDiff
307,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,2188.0,1673.0,4.0
308,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,970.0,2348.0,4.0
309,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,935.0,2555.0,3.0
310,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1109.0,1148.0,2.0
311,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1315.0,1110.0,4.0


In [39]:
# Hold out every row from index label 309 onward (.loc slicing is label-based
# and includes the start label); in the output below that is rows 309-311.
test_start_label = 309
test = df2.loc[test_start_label:]

In [40]:
test

Unnamed: 0,Month_2,Month_3,Month_4,Month_5,Month_6,Month_7,Month_8,Month_9,Month_10,Month_11,Day_2,Day_3,Day_4,Day_5,Day_6,Day_7,Day_8,Day_9,Day_10,Day_11,Day_12,Day_13,Day_14,Day_15,Day_16,Day_17,Day_18,Day_19,Day_20,Day_21,Day_22,Day_23,Day_24,Day_25,Day_26,Day_27,Day_28,Day_29,Day_30,Day_31,ConfirmDiff,RecoverDiff,DeathsDiff
309,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,935.0,2555.0,3.0
310,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1109.0,1148.0,2.0
311,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1315.0,1110.0,4.0


In [41]:
# Export is currently disabled: uncomment the line below to write `test` to
# testcovid.csv without the index column.
#test.to_csv("testcovid.csv",index=False)