# Outline

## 1. Data Cleaning
## 2. Data Exploration
## 3. Feature Engineering
## 4. Data Preprocessing
## 5. Model Building

In [6]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import pickle
import seaborn as sns
import math
from sklearn.model_selection import train_test_split
import random
%matplotlib inline

### Data Cleaning

In [7]:
data = pd.read_csv('owid-covid-data.csv')
data.dropna(subset=['iso_code'],inplace=True)
pd.set_option("max_rows",10)

In [8]:
data.columns

Index(['iso_code', 'continent', 'location', 'date', 'total_cases', 'new_cases',
       'new_cases_smoothed', 'total_deaths', 'new_deaths',
       'new_deaths_smoothed', 'total_cases_per_million',
       'new_cases_per_million', 'new_cases_smoothed_per_million',
       'total_deaths_per_million', 'new_deaths_per_million',
       'new_deaths_smoothed_per_million', 'reproduction_rate', 'icu_patients',
       'icu_patients_per_million', 'hosp_patients',
       'hosp_patients_per_million', 'weekly_icu_admissions',
       'weekly_icu_admissions_per_million', 'weekly_hosp_admissions',
       'weekly_hosp_admissions_per_million', 'total_tests', 'new_tests',
       'total_tests_per_thousand', 'new_tests_per_thousand',
       'new_tests_smoothed', 'new_tests_smoothed_per_thousand',
       'tests_per_case', 'positive_rate', 'tests_units', 'stringency_index',
       'population', 'population_density', 'median_age', 'aged_65_older',
       'aged_70_older', 'gdp_per_capita', 'extreme_poverty',
     

In [9]:
newData = data[['iso_code','continent','location',
                'population','human_development_index','gdp_per_capita',
                'date','total_cases','total_deaths',
                'new_cases','new_deaths','new_cases_smoothed',
                'new_deaths_smoothed','total_cases_per_million','total_deaths_per_million',
                'new_cases_per_million','new_deaths_per_million','new_cases_smoothed_per_million',
                'new_deaths_smoothed_per_million','diabetes_prevalence','cardiovasc_death_rate',
                'median_age','handwashing_facilities','hospital_beds_per_thousand',
                'life_expectancy','stringency_index','total_tests',
                'new_tests','total_tests_per_thousand','new_tests_per_thousand',
                'new_tests_smoothed','new_tests_smoothed_per_thousand','tests_per_case',
                'extreme_poverty']]

newData['id'] = range(1,len(newData)+1)
cols = newData.columns.tolist()
cols = cols[-1:] + cols[:-1]
newData = newData[cols]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [10]:
newData

Unnamed: 0,id,iso_code,continent,location,population,human_development_index,gdp_per_capita,date,total_cases,total_deaths,...,life_expectancy,stringency_index,total_tests,new_tests,total_tests_per_thousand,new_tests_per_thousand,new_tests_smoothed,new_tests_smoothed_per_thousand,tests_per_case,extreme_poverty
0,1,AFG,Asia,Afghanistan,3.892834e+07,0.498,1803.987,2019-12-31,,,...,64.83,,,,,,,,,
1,2,AFG,Asia,Afghanistan,3.892834e+07,0.498,1803.987,2020-01-01,,,...,64.83,0.0,,,,,,,,
2,3,AFG,Asia,Afghanistan,3.892834e+07,0.498,1803.987,2020-01-02,,,...,64.83,0.0,,,,,,,,
3,4,AFG,Asia,Afghanistan,3.892834e+07,0.498,1803.987,2020-01-03,,,...,64.83,0.0,,,,,,,,
4,5,AFG,Asia,Afghanistan,3.892834e+07,0.498,1803.987,2020-01-04,,,...,64.83,0.0,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56640,56641,OWID_WRL,,World,7.794799e+09,,15469.207,2020-11-11,51626568.0,1275434.0,...,72.58,,,,,,,,,10.0
56641,56642,OWID_WRL,,World,7.794799e+09,,15469.207,2020-11-12,52250094.0,1286026.0,...,72.58,,,,,,,,,10.0
56642,56643,OWID_WRL,,World,7.794799e+09,,15469.207,2020-11-13,52895696.0,1295138.0,...,72.58,,,,,,,,,10.0
56643,56644,OWID_WRL,,World,7.794799e+09,,15469.207,2020-11-14,53521318.0,1305050.0,...,72.58,,,,,,,,,10.0


In [11]:
# Spitting data into testing and training.
training = pd.DataFrame()
testing = pd.DataFrame()

# Split 20% for each location in the data.
for i in newData['location'].value_counts().index:
    train, test = train_test_split(newData[newData['location']==i], test_size=0.2, shuffle=False)
    training = pd.concat([training,train])
    testing = pd.concat([testing,test])


In [12]:
training['train_data'] = 1
testing['train_data'] = 0
newData = pd.concat([training,testing])

### Data Exploration & Visualization

In [None]:
training

In [None]:
training.info()

In [None]:
training.describe()

In [None]:
training.describe().columns

In [None]:
df_cat = training[['iso_code','continent','location','date']]
df_num = training[['population', 'human_development_index', 'gdp_per_capita',
       'total_cases', 'total_deaths', 'new_cases', 'new_deaths',
       'new_cases_smoothed', 'new_deaths_smoothed', 'total_cases_per_million',
       'total_deaths_per_million', 'new_cases_per_million',
       'new_deaths_per_million', 'new_cases_smoothed_per_million',
       'new_deaths_smoothed_per_million', 'diabetes_prevalence',
       'cardiovasc_death_rate', 'median_age', 'handwashing_facilities',
       'hospital_beds_per_thousand', 'life_expectancy', 'stringency_index',
       'total_tests', 'new_tests', 'total_tests_per_thousand',
       'new_tests_per_thousand', 'new_tests_smoothed',
       'new_tests_smoothed_per_thousand', 'tests_per_case', 'extreme_poverty']]

In [None]:
colors = ['red','orange','blue','green','yellow','purple','darkblue','pink','lightblue']

In [None]:
for i in df_num:
    plt.hist(df_num[i],range=(df_num[i].min(),df_num[i].max()),color=colors[random.randrange(0,len(colors)-1)])
    plt.title("Histogram of "+i)
    plt.ylabel('Frequency of Occurrence')
    plt.xlabel(i)
    plt.show()

In [None]:
for i in df_num:
    df_num[i].value_counts().plot(kind='bar')
    plt.title("Bar Chart of "+i)
    plt.ylabel('Frequency of Occurrence')
    plt.xlabel(i)
    sns.despine
    break

In [None]:
print(df_num.corr())

In [None]:
sns.heatmap(df_num.corr())

In [None]:
for i in df_cat.columns:
    sns.barplot(df_cat[i].value_counts().index , df_cat[i].value_counts()).set_title(i)
    plt.show()

In [None]:
for i in newData['location'].value_counts().index:
    y_axis = newData[newData['location']== i]['new_deaths_smoothed']
    x_axis = newData[newData['location']== i]['date']
    plt.plot(x_axis,y_axis,color=colors[random.randrange(0,len(colors)-1)])
    plt.title("Graph of the number of death in "+i)
    plt.ylabel('New Deaths')
    plt.xlabel('Date')
    plt.show()

In [None]:
for i in newData['continent'].value_counts().index:
    for j in newData[newData['continent']==i]['location'].value_counts().index:
        x_axis = newData[newData['location']== j]['date']
        plt.plot(x_axis,newData[newData['location']== j]['new_deaths_smoothed'],label=j)
    plt.xlabel('Date')
    plt.ylabel('Number of New Death')
    plt.legend(framealpha=1,bbox_to_anchor=(1.05, 1),loc='upper left');
    plt.title('New death in '+i+' continent')
    plt.show()

In [None]:
for i in newData['location'].value_counts().index:
    y_axis = newData[newData['location']== i]['new_deaths_smoothed']
    x_axis = newData[newData['location']== i]['stringency_index']
    plt.scatter(x_axis,y_axis,color=colors[random.randrange(0,len(colors)-1)])
    plt.title("Correlation of "+ "the number of death and the Stringency Index in "+i)
    plt.ylabel('New Deaths')
    plt.xlabel('Stringency Index')
    plt.show()

In [None]:
for i in newData['continent'].value_counts().index:
    population = newData[newData['continent'] == i]['population'].value_counts().index
    tick_label = newData[newData['continent'] == i]['location'].value_counts().index
    x_coordinate = np.arange(1,len(tick_label)+1)
    ax = sns.barplot(population, tick_label)
    ax.set_title("Barplot of population in "+ j +" continent")
    ax.set_xlabel("Population")
    ax.set_ylabel("Location")
    plt.show()

In [None]:
for i in newData['location'].value_counts().index:
    y_axis = newData[newData['location']== i]['new_deaths_smoothed']
    x_axis = newData[newData['location']== i]['tests_per_case']
    plt.scatter(x_axis,y_axis,color=colors[random.randrange(0,len(colors)-1)])
    plt.title("Correlation of "+ "the number of death and the number of Test per case "+i)
    plt.ylabel('New Deaths')
    plt.xlabel('Test per case')
    plt.show()

In [None]:
pd.pivot_table(training, index = 'continent', columns = 'location', values = 'id' ,aggfunc ='count')

In [None]:
pd.pivot_table(training, index = 'continent', columns = 'new_deaths', values = 'id' ,aggfunc ='count')

In [None]:
extremePoverty_newDeaths_corr = newData[['extreme_poverty','new_deaths_smoothed']]
extremePoverty_newDeaths_corr.dropna(subset=['extreme_poverty'],inplace = True)

In [None]:
sns.scatterplot(data=extremePoverty_newDeaths_corr ,x = 'extreme_poverty' , y = 'new_deaths_smoothed',hue="extreme_poverty",marker="+")

In [None]:
pd.pivot_table(training, index = 'continent', values = ['population','new_deaths','handwashing_facilities','stringency_index','extreme_poverty','handwashing_facilities','tests_per_case','gdp_per_capita'])

In [None]:
newData.columns

### Feature Engineering

In [13]:
newData['date_to_integer'] = newData['date'].apply(lambda x : float(''.join(x.split('-'))) if str(x) else x)

### Data Preprocessing

In [14]:
newData['continent'] = newData['continent'].apply(lambda x : "World" if pd.isna(x) else x )

In [15]:
hdi_continent_mean = dict()
hdi_location_mean = dict()
gdp_location_mean = dict()

for i in (newData['continent'].value_counts().index):
    hdi_continent_mean[i] = newData[newData['continent'] == i]['human_development_index'].mean()

for i in newData['location'].value_counts().index:
    hdi_location_mean[i] = newData[newData['location'] == i]['human_development_index'].mean()

for key,value in hdi_location_mean.items():
    if pd.isna(value):
        hdi_location_mean[key]= hdi_continent_mean[newData[newData['location']==key]['continent'].values[0]] 
        
for i in newData['location'].value_counts().index:
    if pd.isna(newData[newData['location']==i]['gdp_per_capita'].values[0]):
        gdp_location_mean[i] = newData[newData['continent'] == newData[newData['location']==i]['continent'].values[0]]['gdp_per_capita'].mean()
    else:
        gdp_location_mean[i] = newData[newData['location']==i]['gdp_per_capita'].mean()
        
hdi_location_mean.pop('World')
hdi_continent_mean.pop('World')
hdi_location_mean['World'] = sum(hdi_location_mean.values())/len(hdi_location_mean)
hdi_continent_mean['World'] = sum(hdi_continent_mean.values())/len(hdi_continent_mean)

In [16]:
newData['human_development_index'] = newData['location'].apply(lambda x:  hdi_location_mean[x])  
newData['gdp_per_capita'] = newData['location'].apply(lambda x:  gdp_location_mean[x])
newData['total_cases'] = newData['total_cases'].apply(lambda x: 0 if pd.isna(x) else x)
newData['total_deaths'] = newData['total_deaths'].apply(lambda x: 0 if pd.isna(x) else x)
newData['new_deaths'] = newData['new_deaths'].apply(lambda x: 0 if pd.isna(x) else x)
newData['new_cases'] = newData['new_cases'].apply(lambda x: 0 if pd.isna(x) else x)
newData['new_cases_smoothed'] = newData['new_cases_smoothed'].apply(lambda x: 0 if pd.isna(x) else x)
newData['new_deaths_smoothed'] = newData['new_deaths_smoothed'].apply(lambda x: 0 if pd.isna(x) else x)
newData['total_cases_per_million'] = newData['total_cases_per_million'].apply(lambda x: 0 if pd.isna(x) else x)
newData['total_deaths_per_million'] = newData['total_deaths_per_million'].apply(lambda x: 0 if pd.isna(x) else x)
newData['new_cases_per_million'] = newData['new_cases_per_million'].apply(lambda x: 0 if pd.isna(x) else x)
newData['new_deaths_per_million'] = newData['new_deaths_per_million'].apply(lambda x: 0 if pd.isna(x) else x)
newData['new_cases_smoothed_per_million'] = newData['new_cases_smoothed_per_million'].apply(lambda x: 0 if pd.isna(x) else x)
newData['new_deaths_smoothed_per_million'] = newData['new_deaths_smoothed_per_million'].apply(lambda x: 0 if pd.isna(x) else x)
newData['total_tests'] = newData['total_tests'].apply(lambda x: 0 if pd.isna(x) else x)
newData['new_tests'] = newData['new_tests'].apply(lambda x: 0 if pd.isna(x) else x)
newData['total_tests_per_thousand'] = newData['total_tests_per_thousand'].apply(lambda x: 0 if pd.isna(x) else x)
newData['new_tests_per_thousand'] = newData['new_tests_per_thousand'].apply(lambda x: 0 if pd.isna(x) else x)
newData['new_tests_smoothed'] = newData['new_tests_smoothed'].apply(lambda x: 0 if pd.isna(x) else x)
newData['new_tests_smoothed_per_thousand'] = newData['new_tests_smoothed_per_thousand'].apply(lambda x: 0 if pd.isna(x) else x)
newData['tests_per_case'] = newData['tests_per_case'].apply(lambda x: 0 if pd.isna(x) else x) 

In [17]:
diabetes_prevalence_location = dict()
cardiovasc_death_rate_location = dict()
median_age_location = dict()
handwashing_facilities_location = dict()
hospital_beds_per_thousand_location = dict()
life_expectancy_location = dict()
stringency_index_location = dict()
extreme_poverty_location = dict()

for i in newData['location'].value_counts().index:
    diabetes_prevalence_location[i] = newData[newData['location']==i]['diabetes_prevalence'].values[0]
    cardiovasc_death_rate_location[i] = newData[newData['location']==i]['cardiovasc_death_rate'].values[0]
    median_age_location[i] = newData[newData['location']==i]['median_age'].values[0]
    handwashing_facilities_location[i] = newData[newData['location']==i]['handwashing_facilities'].values[0]
    hospital_beds_per_thousand_location[i] = newData[newData['location']==i]['hospital_beds_per_thousand'].values[0]
    life_expectancy_location[i] = newData[newData['location']==i]['life_expectancy'].values[0]
    stringency_index_location[i] = newData[newData['location']==i]['stringency_index'].values[0]
    extreme_poverty_location[i] = newData[newData['location']==i]['extreme_poverty'].values[0]
    
    if pd.isna(newData[newData['location']==i]['diabetes_prevalence'].values[0]):
        diabetes_prevalence_location[i] = newData[newData['continent'] == newData[newData['location']==i]['continent'].values[0]]['diabetes_prevalence'].median()
    if pd.isna(newData[newData['location']==i]['cardiovasc_death_rate'].values[0]):
        cardiovasc_death_rate_location[i] = newData[newData['continent'] == newData[newData['location']==i]['continent'].values[0]]['cardiovasc_death_rate'].median()
    if pd.isna(newData[newData['location']==i]['median_age'].values[0]):
        median_age_location[i] = newData[newData['continent'] == newData[newData['location']==i]['continent'].values[0]]['median_age'].median() 
    if pd.isna(newData[newData['location']==i]['handwashing_facilities'].values[0]):
        handwashing_facilities_location[i] = newData[newData['continent'] == newData[newData['location']==i]['continent'].values[0]]['handwashing_facilities'].mean() 
    if pd.isna(newData[newData['location']==i]['hospital_beds_per_thousand'].values[0]):
        hospital_beds_per_thousand_location[i] = newData[newData['continent'] == newData[newData['location']==i]['continent'].values[0]]['hospital_beds_per_thousand'].mean()
    if pd.isna(newData[newData['location']==i]['life_expectancy'].values[0]):
        life_expectancy_location[i] = newData[newData['continent'] == newData[newData['location']==i]['continent'].values[0]]['life_expectancy'].median()
    if pd.isna(newData[newData['location']==i]['stringency_index'].values[0]):
        stringency_index_location[i] = newData[newData['continent'] == newData[newData['location']==i]['continent'].values[0]]['stringency_index'].mean()
    if pd.isna(newData[newData['location']==i]['extreme_poverty'].values[0]):
        extreme_poverty_location[i] = newData[newData['continent'] == newData[newData['location']==i]['continent'].values[0]]['extreme_poverty'].mean()

stringency_index_location.pop('World')
stringency_index_location['World'] = sum(stringency_index_location.values())/len(stringency_index_location)

In [18]:
newData['diabetes_prevalence'] = newData['location'].apply(lambda x:  diabetes_prevalence_location[x])  
newData['cardiovasc_death_rate'] = newData['location'].apply(lambda x:  cardiovasc_death_rate_location[x])  
newData['median_age'] = newData['location'].apply(lambda x:  median_age_location[x])  
newData['handwashing_facilities'] = newData['location'].apply(lambda x:  handwashing_facilities_location[x])  
newData['hospital_beds_per_thousand'] = newData['location'].apply(lambda x:  hospital_beds_per_thousand_location[x])  
newData['life_expectancy'] = newData['location'].apply(lambda x:  life_expectancy_location[x])  
newData['stringency_index'] = newData['location'].apply(lambda x:  stringency_index_location[x])  
newData['extreme_poverty'] = newData['location'].apply(lambda x:  extreme_poverty_location[x])
newData['stringency_index'] = newData['location'].apply(lambda x:  stringency_index_location[x])

In [19]:
newData.drop(newData[newData['new_deaths_smoothed'] < 0].index, inplace = True) 

In [20]:
newData.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 56631 entries, 10925 to 54571
Data columns (total 37 columns):
id                                 56631 non-null int64
iso_code                           56631 non-null object
continent                          56631 non-null object
location                           56631 non-null object
population                         56631 non-null float64
human_development_index            56631 non-null float64
gdp_per_capita                     56631 non-null float64
date                               56631 non-null object
total_cases                        56631 non-null float64
total_deaths                       56631 non-null float64
new_cases                          56631 non-null float64
new_deaths                         56631 non-null float64
new_cases_smoothed                 56631 non-null float64
new_deaths_smoothed                56631 non-null float64
total_cases_per_million            56631 non-null float64
total_deaths_per_millio

In [21]:
all_dummies = pd.get_dummies(newData[['location','date_to_integer','train_data','population', 'human_development_index', 'gdp_per_capita',
       'total_cases', 'total_deaths', 'new_cases',
       'new_cases_smoothed', 'total_cases_per_million', 'new_cases_per_million', 'new_cases_smoothed_per_million', 'diabetes_prevalence',
       'cardiovasc_death_rate', 'median_age', 'handwashing_facilities',
       'hospital_beds_per_thousand', 'life_expectancy', 'stringency_index',
       'total_tests', 'new_tests', 'total_tests_per_thousand',
       'new_tests_per_thousand', 'new_tests_smoothed',
       'new_tests_smoothed_per_thousand', 'tests_per_case', 'extreme_poverty']])

X_train = all_dummies[all_dummies['train_data']==1].drop(['train_data'],axis=1).drop(['total_deaths'],axis=1)
X_test = all_dummies[all_dummies['train_data']==0].drop(['train_data'],axis=1).drop(['total_deaths'],axis=1)

Y_train = newData[newData['train_data']==1]['total_deaths']
Y_test = newData[newData['train_data']==0]['total_deaths']
# Y_train = Y_train.astype('int')
# Y_test = Y_test.astype('int')
X_train.shape

(45188, 240)

In [22]:
## Scale the data for our model
from sklearn.preprocessing import StandardScaler
scale = StandardScaler()
all_dummies_scaled = all_dummies.copy()
all_dummies_scaled[['date_to_integer','train_data','population', 'human_development_index', 'gdp_per_capita',
       'total_cases', 'total_deaths', 'new_cases',
       'new_cases_smoothed', 'total_cases_per_million', 'new_cases_per_million', 'new_cases_smoothed_per_million', 'diabetes_prevalence',
       'cardiovasc_death_rate', 'median_age', 'handwashing_facilities',
       'hospital_beds_per_thousand', 'life_expectancy', 'stringency_index',
       'total_tests', 'new_tests', 'total_tests_per_thousand',
       'new_tests_per_thousand', 'new_tests_smoothed',
       'new_tests_smoothed_per_thousand', 'tests_per_case', 'extreme_poverty']]= scale.fit_transform(all_dummies_scaled[['date_to_integer','train_data','population', 'human_development_index', 'gdp_per_capita',
       'total_cases', 'total_deaths', 'new_cases',
       'new_cases_smoothed', 'total_cases_per_million', 'new_cases_per_million', 'new_cases_smoothed_per_million', 'diabetes_prevalence',
       'cardiovasc_death_rate', 'median_age', 'handwashing_facilities',
       'hospital_beds_per_thousand', 'life_expectancy', 'stringency_index',
       'total_tests', 'new_tests', 'total_tests_per_thousand',
       'new_tests_per_thousand', 'new_tests_smoothed',
       'new_tests_smoothed_per_thousand', 'tests_per_case', 'extreme_poverty']])

X_train_scaled = all_dummies_scaled[all_dummies_scaled['train_data'] == 1].drop(['train_data'],axis=1).drop(['total_deaths'],axis=1)
X_test_scaled = all_dummies_scaled[all_dummies_scaled['train_data'] == 0].drop(['train_data'],axis=1).drop(['total_deaths'],axis=1)

Y_train_scaled = all_dummies_scaled[all_dummies_scaled['train_data']==1]['total_deaths']
Y_test_scaled = all_dummies_scaled[all_dummies_scaled['train_data']==0]['total_deaths']
Y_train_scaled = Y_train_scaled.astype('int')
Y_test_scaled = Y_test_scaled.astype('int')

In [23]:
Indonesia_dummies = all_dummies[all_dummies['location_Indonesia']==1]

X_Indonesia = Indonesia_dummies.drop(['train_data'],axis=1).drop(['total_deaths'],axis=1)
Y_Indonesia = newData[newData['location']=='Indonesia']['total_deaths']

X_train_Indonesia = Indonesia_dummies[Indonesia_dummies['train_data']==1].drop(['train_data'],axis=1).drop(['total_deaths'],axis=1)
X_test_Indonesia = Indonesia_dummies[Indonesia_dummies['train_data']==0].drop(['train_data'],axis=1).drop(['total_deaths'],axis=1)

Y_train_Indonesia = newData[newData['train_data']==1][newData[newData['train_data']==1]["location"]=="Indonesia"]['total_deaths']
Y_test_Indonesia = newData[newData['train_data']==0][newData[newData['train_data']==0]["location"]=="Indonesia"]['total_deaths']

X_test_Indonesia.shape

(65, 240)

In [24]:
X_train

Unnamed: 0,date_to_integer,population,human_development_index,gdp_per_capita,total_cases,new_cases,new_cases_smoothed,total_cases_per_million,new_cases_per_million,new_cases_smoothed_per_million,...,location_Vanuatu,location_Vatican,location_Venezuela,location_Vietnam,location_Wallis and Futuna,location_Western Sahara,location_World,location_Yemen,location_Zambia,location_Zimbabwe
10925,20191231.0,1.439324e+09,0.752,15308.712,27.0,27.0,0.0,0.019,0.019,0.0,...,0,0,0,0,0,0,0,0,0,0
10926,20200101.0,1.439324e+09,0.752,15308.712,27.0,0.0,0.0,0.019,0.000,0.0,...,0,0,0,0,0,0,0,0,0,0
10927,20200102.0,1.439324e+09,0.752,15308.712,27.0,0.0,0.0,0.019,0.000,0.0,...,0,0,0,0,0,0,0,0,0,0
10928,20200103.0,1.439324e+09,0.752,15308.712,44.0,17.0,0.0,0.031,0.012,0.0,...,0,0,0,0,0,0,0,0,0,0
10929,20200104.0,1.439324e+09,0.752,15308.712,44.0,0.0,0.0,0.031,0.000,0.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32989,20201111.0,5.919400e+04,0.708,3819.202,1.0,0.0,0.0,16.894,0.000,0.0,...,0,0,0,0,0,0,0,0,0,0
54567,20201111.0,3.071500e+05,0.603,2921.909,1.0,1.0,0.0,3.256,3.256,0.0,...,1,0,0,0,0,0,0,0,0,0
54568,20201112.0,3.071500e+05,0.603,2921.909,1.0,0.0,0.0,3.256,0.000,0.0,...,1,0,0,0,0,0,0,0,0,0
54569,20201113.0,3.071500e+05,0.603,2921.909,1.0,0.0,0.0,3.256,0.000,0.0,...,1,0,0,0,0,0,0,0,0,0


In [25]:
Y_train

10925    0.0
10926    0.0
10927    0.0
10928    0.0
10929    0.0
        ... 
32989    0.0
54567    0.0
54568    0.0
54569    0.0
54570    0.0
Name: total_deaths, Length: 45188, dtype: float64

## Model Building & Evaluation

In [26]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import explained_variance_score

from sklearn.linear_model import Ridge
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso

In [None]:
linear = LinearRegression()

linear.fit(X_train,Y_train)
y_pred_linear = linear.predict(X_test)

explained_variance_score(Y_test, y_pred_linear, multioutput='uniform_average')

In [None]:
gradient = GradientBoostingRegressor(n_estimators = 400, max_depth = 5, min_samples_split = 2,
          learning_rate = 0.1, loss = 'ls')

gradient.fit(X_train,Y_train)
y_pred_gradient = gradient.predict(X_test)

explained_variance_score(Y_test, y_pred_gradient, multioutput='uniform_average')

In [None]:
ridge = Ridge()

ridge.fit(X_train,Y_train)
y_pred_ridge = ridge.predict(X_test)

explained_variance_score(Y_test, y_pred_ridge, multioutput='uniform_average')

In [None]:
lasso = Lasso()

lasso.fit(X_train,Y_train)
y_pred_lasso = lasso.predict(X_test)

explained_variance_score(Y_test, y_pred_lasso, multioutput='uniform_average')

## Model Saving (Without Tuning)

In [None]:
pickle.dump(linear, open('linear_model', 'wb'))
pickle.dump(gradient, open('gradient_model', 'wb'))
pickle.dump(ridge, open('ridge_model', 'wb'))
pickle.dump(lasso, open('lasso_model', 'wb'))

## Model Loading (Without Tuning)

In [None]:
linear = pickle.load(open('linear_model', 'rb'))
gradient = pickle.load(open('gradient_model', 'rb'))
ridge = pickle.load(open('ridge_model', 'rb'))
lasso = pickle.load(open('lasso_model', 'rb'))



## Model Tuning & Evaluation

In [1]:
from sklearn.model_selection import GridSearchCV 
from sklearn.model_selection import RandomizedSearchCV 

In [2]:
def clf_performance(classifier, model_name):
    print(model_name)
    print('Best Parameters: ' + str(classifier.best_params_))

In [57]:
linear = LinearRegression()
param_grid = {'fit_intercept':[True,False],
              'normalize':[True,False],
              'copy_X':[True,False],
}
clf_linear = GridSearchCV(linear, param_grid = param_grid , verbose = True , n_jobs = -1)
best_clf_linear = clf_linear.fit(X_train,Y_train)
clf_performance(best_clf_linear,'Linear Regression')

Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:    8.6s finished


Linear Regression
Best Parameters: {'copy_X': True, 'fit_intercept': True, 'normalize': False}


In [28]:
gbr = GradientBoostingRegressor()
param_grid = {'n_estimators' : [350,400,450,500],
              'max_depth':[5,10,15,20],
              'min_samples_split' :[2,5,7,10],
              'learning_rate' : [0.1,0.5,1,2],
              'loss' : ['ls','lad','huber','quantile']
}
clf_gbr = GridSearchCV(gbr, param_grid = param_grid, verbose = True , n_jobs = -1)
best_clf_gbr  = clf_gbr.fit(X_train,Y_train)
clf_performance(best_clf_gbr,'Gradient Boost Regression')

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


Fitting 3 folds for each of 1024 candidates, totalling 3072 fits


KeyboardInterrupt: 

In [27]:
ridge = Ridge()
param_grid = {'alpha' : [0.1,0,2,0.5,1,2],
              'fit_intercept':[True,False],
              'normalize' :[True,False],
              'copy_X' : [True,False],
              'max_iter' : [2000],
              'solver':['auto']
}
clf_ridge = GridSearchCV(ridge, param_grid = param_grid, verbose = True ,n_jobs= -1)
best_clf_ridge  = clf_ridge.fit(X_train,Y_train)
clf_performance(best_clf_ridge,'Ridge Regression')

Fitting 3 folds for each of 48 candidates, totalling 144 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    9.8s
[Parallel(n_jobs=-1)]: Done 144 out of 144 | elapsed:   21.6s finished


Ridge Regression
Best Parameters: {'alpha': 0.5, 'copy_X': True, 'fit_intercept': True, 'max_iter': 2000, 'normalize': True, 'solver': 'auto'}


In [None]:
# Tuned Models
tuned_linear = best_clf_linear.best_estimator_
tuned_gradient = best_clf_gradient.best_estimator_
tuned_ridge = best_clf_ridge.best_estimator_

## Model Fitting & Saving

In [None]:
tuned_linear.fit(X_train,Y_train)
tuned_gradient.fit(X_train,Y_train)
tuned_ridge.fit(X_train,Y_train)

pickle.dump(tuned_lr, open('tuned_linear_model', 'wb'))
pickle.dump(tuned_gradient, open('tuned_gradient_model', 'wb'))
pickle.dump(tuned_ridge, open('tuned_ridge_model', 'wb'))

## Model Loading & Predictions

In [None]:
tuned_linear = pickle.load(open('tuned_linear_model', 'rb'))
tuned_gradient = pickle.load(open('tuned_gradient_model', 'rb'))
tuned_ridge = pickle.load(open('tuned_ridge_model', 'rb'))

In [None]:
tuned_linear_prediction = tuned_linear.predict(X_test).astype(int)
tuned_gradient_prediction = tuned_gradient.predict(X_test).astype(int)
tuned_ridge_prediction = tuned_ridge.predict(X_test).astype(int)

In [None]:
result_comparison = {'ID':testing['id'],'linear':tuned_linear_prediction,'gradient':tuned_gradient_prediction,'ridge':tuned_ridge_prediction,'Actual':Y_train}
result_comparison = pd.DataFrame(data=result_comparison)

In [None]:
result_comparison['difference_linear_gradient'] = result_comparison.apply(lambda x : 1 if x['linear'] != x['gradient'] else 0, axis = 1)
result_comparison['difference_gradient_ridge'] = result_comparison.apply(lambda x : 1 if x['gradient'] != x['ridge'] else 0, axis = 1)

In [None]:
diff_linear_gradient = result_comparison['difference_linear_gradient'].value_counts()
diff_gradient_ridge = result_comparison['difference_gradient_ridge'].value_counts()

print('Value difference between Tuned Linear Regression and Tuned Gradient Boosting Regression :',diff_linear_gradient.values[1])
print('Value difference between Tuned Gradient Boosting Regression and Tuned Ridge Regression :',diff_gradient_ridge.values[1])