In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

## Read Data

In [2]:
cov_train = pd.read_csv("./train.csv")

## Check Data

In [3]:
print("Data Shape:", cov_train.shape)

Data Shape: (907306, 9)


In [4]:
cov_train.head(5)

Unnamed: 0,Id,County,Province_State,Country_Region,Population,Weight,Date,Target,TargetValue
0,1,,,Afghanistan,27657145,0.058359,2020-01-23,ConfirmedCases,0.0
1,2,,,Afghanistan,27657145,0.583587,2020-01-23,Fatalities,0.0
2,3,,,Afghanistan,27657145,0.058359,2020-01-24,ConfirmedCases,0.0
3,4,,,Afghanistan,27657145,0.583587,2020-01-24,Fatalities,0.0
4,5,,,Afghanistan,27657145,0.058359,2020-01-25,ConfirmedCases,0.0


In [5]:
cov_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 907306 entries, 0 to 907305
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   Id              907306 non-null  int64  
 1   County          823466 non-null  object 
 2   Province_State  858312 non-null  object 
 3   Country_Region  907306 non-null  object 
 4   Population      907306 non-null  int64  
 5   Weight          907306 non-null  float64
 6   Date            907306 non-null  object 
 7   Target          907306 non-null  object 
 8   TargetValue     907306 non-null  float64
dtypes: float64(2), int64(2), object(5)
memory usage: 62.3+ MB


In [6]:
cov_train.isnull().sum()

Id                    0
County            83840
Province_State    48994
Country_Region        0
Population            0
Weight                0
Date                  0
Target                0
TargetValue           0
dtype: int64

## Reshape Data

In [7]:
### Hide Cases after 2020-05-10
cov_510 = cov_train[cov_train["Date"] <= "2020-05-10"]
cov_510.tail(5)

Unnamed: 0,Id,County,Province_State,Country_Region,Population,Weight,Date,Target,TargetValue
907257,969574,,,Zimbabwe,14240168,0.607106,2020-05-08,Fatalities,0.0
907258,969575,,,Zimbabwe,14240168,0.060711,2020-05-09,ConfirmedCases,1.0
907259,969576,,,Zimbabwe,14240168,0.607106,2020-05-09,Fatalities,0.0
907260,969577,,,Zimbabwe,14240168,0.060711,2020-05-10,ConfirmedCases,1.0
907261,969578,,,Zimbabwe,14240168,0.607106,2020-05-10,Fatalities,0.0


In [8]:
## Omitting Id, County and Province for country-level prediction
cov_510 = cov_510.drop(["Id", "County", "Province_State"], axis=1)
cov_510.head(5)

Unnamed: 0,Country_Region,Population,Weight,Date,Target,TargetValue
0,Afghanistan,27657145,0.058359,2020-01-23,ConfirmedCases,0.0
1,Afghanistan,27657145,0.583587,2020-01-23,Fatalities,0.0
2,Afghanistan,27657145,0.058359,2020-01-24,ConfirmedCases,0.0
3,Afghanistan,27657145,0.583587,2020-01-24,Fatalities,0.0
4,Afghanistan,27657145,0.058359,2020-01-25,ConfirmedCases,0.0


In [9]:
## Country list
country_list = list(cov_510["Country_Region"].unique())
country_list[:10]

['Afghanistan',
 'Albania',
 'Algeria',
 'Andorra',
 'Angola',
 'Antigua and Barbuda',
 'Argentina',
 'Armenia',
 'Australia',
 'Austria']

In [10]:
## Create Timestamp
def create_timestamp(df):
    df["Year"] = pd.to_datetime(df['Date']).dt.year
    df["Month"] = pd.to_datetime(df['Date']).dt.month
    df["Day"] = pd.to_datetime(df['Date']).dt.day
    df["Timestamp"] = df["Month"] * 365 + df["Day"]
    
    return df

cov_with_ts = create_timestamp(cov_510)
cov_with_ts.head(5)

Unnamed: 0,Country_Region,Population,Weight,Date,Target,TargetValue,Year,Month,Day,Timestamp
0,Afghanistan,27657145,0.058359,2020-01-23,ConfirmedCases,0.0,2020,1,23,388
1,Afghanistan,27657145,0.583587,2020-01-23,Fatalities,0.0,2020,1,23,388
2,Afghanistan,27657145,0.058359,2020-01-24,ConfirmedCases,0.0,2020,1,24,389
3,Afghanistan,27657145,0.583587,2020-01-24,Fatalities,0.0,2020,1,24,389
4,Afghanistan,27657145,0.058359,2020-01-25,ConfirmedCases,0.0,2020,1,25,390


In [11]:
## Remove all date time column, only keep timestamp
cov_with_ts = cov_with_ts.drop(["Date", "Year", "Month", "Day"], axis=1)
cov_with_ts.head(5)

Unnamed: 0,Country_Region,Population,Weight,Target,TargetValue,Timestamp
0,Afghanistan,27657145,0.058359,ConfirmedCases,0.0,388
1,Afghanistan,27657145,0.583587,Fatalities,0.0,388
2,Afghanistan,27657145,0.058359,ConfirmedCases,0.0,389
3,Afghanistan,27657145,0.583587,Fatalities,0.0,389
4,Afghanistan,27657145,0.058359,ConfirmedCases,0.0,390


In [12]:
## Encode Country_Region
cov_with_ts["Country_Region"] = LabelEncoder().fit_transform(cov_with_ts["Country_Region"])
cov_with_ts.head(5)

Unnamed: 0,Country_Region,Population,Weight,Target,TargetValue,Timestamp
0,0,27657145,0.058359,ConfirmedCases,0.0,388
1,0,27657145,0.583587,Fatalities,0.0,388
2,0,27657145,0.058359,ConfirmedCases,0.0,389
3,0,27657145,0.583587,Fatalities,0.0,389
4,0,27657145,0.058359,ConfirmedCases,0.0,390


In [13]:
## Split into Fatal and Confirm and omit Target
cov_conf = cov_with_ts[cov_with_ts["Target"] == "ConfirmedCases"].drop(["Target"], axis=1)
cov_fatal = cov_with_ts[cov_with_ts["Target"] == "Fatalities"].drop(["Target"], axis=1)

In [14]:
cov_conf.head(5)

Unnamed: 0,Country_Region,Population,Weight,TargetValue,Timestamp
0,0,27657145,0.058359,0.0,388
2,0,27657145,0.058359,0.0,389
4,0,27657145,0.058359,0.0,390
6,0,27657145,0.058359,0.0,391
8,0,27657145,0.058359,0.0,392


In [15]:
cov_fatal.head(5)

Unnamed: 0,Country_Region,Population,Weight,TargetValue,Timestamp
1,0,27657145,0.583587,0.0,388
3,0,27657145,0.583587,0.0,389
5,0,27657145,0.583587,0.0,390
7,0,27657145,0.583587,0.0,391
9,0,27657145,0.583587,0.0,392


In [24]:
cov_fatal.tail(5)

Unnamed: 0,Country_Region,Population,Weight,TargetValue,Timestamp
907253,186,14240168,0.607106,0.0,1831
907255,186,14240168,0.607106,0.0,1832
907257,186,14240168,0.607106,0.0,1833
907259,186,14240168,0.607106,0.0,1834
907261,186,14240168,0.607106,0.0,1835


## Modelling

In [16]:
## Linear Model
from sklearn import linear_model

In [22]:
## Fatalities
fat_x = cov_fatal.drop(["TargetValue"], axis = 1)
fat_y = cov_fatal[["TargetValue"]]

## Confirmed
con_x = cov_conf.drop(["TargetValue"], axis = 1)
con_y = cov_conf[["TargetValue"]]

In [23]:
lr_fat = linear_model.LinearRegression()
lr_fat.fit(fat_x, fat_y)

LinearRegression()

In [31]:
data = {"Country_Region": [0,0], "Population": [27657145, 27657145], "Weight": [0.583587, 0.583587], "Timestamp": [1836, 1836]}
test = pd.DataFrame.from_dict(data)

In [33]:
pred = lr_fat.predict(test)
pred

array([[7.49289156],
       [7.49289156]])