# Outline

## 1. Data Cleaning
## 2. Data Exploration
## 3. Feature Engineering
## 4. Data Preprocessing
## 5. Model Building

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import pickle
import seaborn as sns
import math
from sklearn.model_selection import train_test_split
import random
%matplotlib inline

### Data Cleaning

In [None]:
# Load the OWID COVID-19 export and discard rows whose iso_code is missing.
data = pd.read_csv('owid-covid-data.csv').dropna(subset=['iso_code'])
# Use the fully-qualified option name: the bare "max_rows" pattern matches more
# than one option on pandas versions that also define styler.render.max_rows.
pd.set_option("display.max_rows", 10)

In [None]:
# Inspect the available column names before choosing the subset used below.
data.columns

In [None]:
# Columns kept for the rest of the notebook.
selected_columns = [
    'iso_code', 'continent', 'location',
    'population', 'human_development_index', 'gdp_per_capita',
    'date', 'total_cases', 'total_deaths',
    'new_cases', 'new_deaths', 'new_cases_smoothed',
    'new_deaths_smoothed', 'total_cases_per_million', 'total_deaths_per_million',
    'new_cases_per_million', 'new_deaths_per_million', 'new_cases_smoothed_per_million',
    'new_deaths_smoothed_per_million', 'diabetes_prevalence', 'cardiovasc_death_rate',
    'median_age', 'handwashing_facilities', 'hospital_beds_per_thousand',
    'life_expectancy', 'stringency_index', 'total_tests',
    'new_tests', 'total_tests_per_thousand', 'new_tests_per_thousand',
    'new_tests_smoothed', 'new_tests_smoothed_per_thousand', 'tests_per_case',
    'extreme_poverty',
]

# .copy() detaches the subset from `data`, so adding a column does not write
# into a slice of another frame (SettingWithCopyWarning).
newData = data[selected_columns].copy()

# Sequential 1-based row id, inserted directly as the first column.
newData.insert(0, 'id', range(1, len(newData) + 1))

In [None]:
# Preview the selected subset, including the new leading id column.
newData

In [None]:
# Splitting data into testing and training.
# Every location is split on its own: with shuffle=False the last 20% of that
# location's rows (in their existing order) become the test rows.
train_parts = []
test_parts = []
for i in newData['location'].value_counts().index:
    train, test = train_test_split(newData[newData['location']==i], test_size=0.2, shuffle=False)
    train_parts.append(train)
    test_parts.append(test)

# Concatenate once at the end; growing a frame with concat inside the loop
# re-copies every accumulated row on each iteration.
training = pd.concat(train_parts)
testing = pd.concat(test_parts)


In [None]:
# Tag each partition with a 1/0 flag so the rows can be told apart again once
# both partitions are recombined into newData.
training = training.assign(train_data=1)
testing = testing.assign(train_data=0)
newData = pd.concat([training, testing])

### Data Exploration & Visualization

In [None]:
# Preview the training partition.
training

In [None]:
# Column dtypes and non-null counts of the training partition.
training.info()

In [None]:
# Summary statistics of the training partition.
training.describe()

In [None]:
# Names of the columns that describe() summarised.
training.describe().columns

In [None]:
# Separate the training columns into descriptive (text/date) columns and the
# numeric measurements that are explored below.
categorical_columns = ['iso_code', 'continent', 'location', 'date']
numeric_columns = [
    'population', 'human_development_index', 'gdp_per_capita',
    'total_cases', 'total_deaths', 'new_cases', 'new_deaths',
    'new_cases_smoothed', 'new_deaths_smoothed', 'total_cases_per_million',
    'total_deaths_per_million', 'new_cases_per_million',
    'new_deaths_per_million', 'new_cases_smoothed_per_million',
    'new_deaths_smoothed_per_million', 'diabetes_prevalence',
    'cardiovasc_death_rate', 'median_age', 'handwashing_facilities',
    'hospital_beds_per_thousand', 'life_expectancy', 'stringency_index',
    'total_tests', 'new_tests', 'total_tests_per_thousand',
    'new_tests_per_thousand', 'new_tests_smoothed',
    'new_tests_smoothed_per_thousand', 'tests_per_case', 'extreme_poverty',
]

df_cat = training[categorical_columns]
df_num = training[numeric_columns]

In [None]:
# Palette from which the plotting cells pick a colour at random.
colors = [
    'red', 'orange', 'blue',
    'green', 'yellow', 'purple',
    'darkblue', 'pink', 'lightblue',
]

In [None]:
# One histogram per numeric column, each drawn in a randomly chosen colour.
for i in df_num:
    column = df_num[i]
    # random.choice considers every palette entry, including the last one.
    plt.hist(column, range=(column.min(), column.max()), color=random.choice(colors))
    plt.title("Histogram of "+i)
    plt.ylabel('Frequency of Occurrence')
    plt.xlabel(i)
    plt.show()

In [None]:
# Bar chart of value frequencies. The loop stops after its first iteration, so
# only the first numeric column is drawn.
for i in df_num:
    df_num[i].value_counts().plot(kind='bar')
    plt.title("Bar Chart of "+i)
    plt.ylabel('Frequency of Occurrence')
    plt.xlabel(i)
    # despine has to be called; a bare reference to the function does nothing.
    sns.despine()
    break

In [None]:
# Pairwise correlation matrix of the numeric training columns, printed as text.
print(df_num.corr())

In [None]:
# The same correlation matrix rendered as a heatmap.
sns.heatmap(df_num.corr())

In [None]:
# Frequency of each distinct value, one bar plot per descriptive column.
for i in df_cat.columns:
    counts = df_cat[i].value_counts()  # computed once and reused for both axes
    # x/y are passed by keyword: newer seaborn releases no longer accept them
    # positionally.
    sns.barplot(x=counts.index, y=counts.values).set_title(i)
    plt.show()

In [None]:
# Smoothed new deaths over time, one figure per location.
for i in newData['location'].value_counts().index:
    location_rows = newData[newData['location'] == i]  # filter once, reuse for both axes
    # random.choice considers every palette entry, including the last one.
    plt.plot(location_rows['date'], location_rows['new_deaths_smoothed'], color=random.choice(colors))
    plt.title("Graph of the number of death in "+i)
    plt.ylabel('New Deaths')
    plt.xlabel('Date')
    plt.show()

In [None]:
# One figure per continent, overlaying the smoothed new-death curve of every
# location that belongs to it.
for i in newData['continent'].value_counts().index:
    continent_rows = newData[newData['continent'] == i]
    for j in continent_rows['location'].value_counts().index:
        location_rows = newData[newData['location'] == j]
        plt.plot(location_rows['date'], location_rows['new_deaths_smoothed'], label=j)
    plt.title('New death in '+i+' continent')
    plt.xlabel('Date')
    plt.ylabel('Number of New Death')
    # Legend is anchored outside the axes, to the right of the plot.
    plt.legend(framealpha=1, bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.show()

In [None]:
# Scatter of smoothed new deaths against the stringency index, one figure per
# location.
for i in newData['location'].value_counts().index:
    location_rows = newData[newData['location'] == i]  # filter once, reuse for both axes
    # random.choice considers every palette entry, including the last one.
    plt.scatter(location_rows['stringency_index'], location_rows['new_deaths_smoothed'], color=random.choice(colors))
    plt.title("Correlation of "+ "the number of death and the Stringency Index in "+i)
    plt.ylabel('New Deaths')
    plt.xlabel('Stringency Index')
    plt.show()

In [None]:
# Horizontal bar plot of population by location, one figure per continent.
for i in newData['continent'].value_counts().index:
    # Group by location so that every population figure stays paired with its
    # own location name; two independent value_counts() orderings do not
    # guarantee that pairing.
    population_by_location = (
        newData[newData['continent'] == i]
        .groupby('location')['population']
        .first()
        .dropna()
        .sort_values(ascending=False)
    )
    ax = sns.barplot(x=population_by_location.values, y=population_by_location.index)
    # The title uses this loop's own continent variable.
    ax.set_title("Barplot of population in "+ i +" continent")
    ax.set_xlabel("Population")
    ax.set_ylabel("Location")
    plt.show()

In [None]:
# Scatter of smoothed new deaths against tests per case, one figure per location.
for i in newData['location'].value_counts().index:
    location_rows = newData[newData['location'] == i]  # filter once, reuse for both axes
    # random.choice considers every palette entry, including the last one.
    plt.scatter(location_rows['tests_per_case'], location_rows['new_deaths_smoothed'], color=random.choice(colors))
    plt.title("Correlation of "+ "the number of death and the number of Test per case "+i)
    plt.ylabel('New Deaths')
    plt.xlabel('Test per case')
    plt.show()

In [None]:
# Number of training rows per location, arranged by continent (rows) and
# location (columns).
pd.pivot_table(training, index = 'continent', columns = 'location', values = 'id' ,aggfunc ='count')

In [None]:
# Number of training rows for each distinct new_deaths value, per continent.
pd.pivot_table(training, index = 'continent', columns = 'new_deaths', values = 'id' ,aggfunc ='count')

In [None]:
# Poverty/death pairs restricted to rows that have an extreme_poverty value.
# dropna returns a new frame, so nothing is modified in place on a slice of
# newData.
extremePoverty_newDeaths_corr = (
    newData[['extreme_poverty', 'new_deaths_smoothed']]
    .dropna(subset=['extreme_poverty'])
)

In [None]:
# Smoothed new deaths against extreme poverty; points are coloured by the
# extreme_poverty value itself.
sns.scatterplot(data=extremePoverty_newDeaths_corr ,x = 'extreme_poverty' , y = 'new_deaths_smoothed',hue="extreme_poverty",marker="+")

In [None]:
# Per-continent aggregate (pivot_table's default aggregation) of selected
# indicators. Each column is listed exactly once.
continent_summary_columns = [
    'population', 'new_deaths', 'handwashing_facilities', 'stringency_index',
    'extreme_poverty', 'tests_per_case', 'gdp_per_capita',
]
pd.pivot_table(training, index='continent', values=continent_summary_columns)

In [None]:
# Column names of the recombined frame, which now includes train_data.
newData.columns

### Feature Engineering

In [None]:
# Encode 'YYYY-MM-DD' strings as the number YYYYMMDD (stored as float).
# The vectorised string accessor leaves a missing date as NaN instead of
# failing on it.
# NOTE(review): consecutive days are not evenly spaced in this encoding across
# month/year boundaries — confirm that is acceptable for the models.
newData['date_to_integer'] = newData['date'].str.replace('-', '', regex=False).astype(float)

### Data Preprocessing

In [None]:
# Rows that carry no continent are labelled "World".
newData['continent'] = newData['continent'].fillna("World")

In [None]:
# Lookup tables (location -> value) used to impute human_development_index and
# gdp_per_capita. Grouping once replaces repeated full-frame filtering for every
# location.
first_rows = newData.drop_duplicates(subset='location').set_index('location')
continent_of_location = first_rows['continent']
by_location = newData.groupby('location')
by_continent = newData.groupby('continent')

# HDI: mean over the continent's rows, and mean over each location's rows.
hdi_continent_mean = by_continent['human_development_index'].mean().to_dict()

# A location without any HDI value falls back to its continent's mean.
hdi_location_mean = (
    by_location['human_development_index'].mean()
    .fillna(continent_of_location.map(hdi_continent_mean))
    .to_dict()
)

# GDP: the location's own mean, unless the location's first row has no GDP
# value, in which case the mean over its continent's rows is used.
gdp_continent_mean = by_continent['gdp_per_capita'].mean()
gdp_location_mean = (
    by_location['gdp_per_capita'].mean()
    .where(first_rows['gdp_per_capita'].notna(), continent_of_location.map(gdp_continent_mean))
    .to_dict()
)

# The 'World' entries are replaced by the average of all remaining entries.
# pop(..., None) tolerates a missing 'World' key, and Series.mean() skips NaN
# entries so one unknown value does not turn the average into NaN.
hdi_location_mean.pop('World', None)
hdi_continent_mean.pop('World', None)
hdi_location_mean['World'] = pd.Series(hdi_location_mean, dtype=float).mean()
hdi_continent_mean['World'] = pd.Series(hdi_continent_mean, dtype=float).mean()

In [None]:
# Replace HDI and GDP with the per-location values prepared above.
newData['human_development_index'] = newData['location'].map(hdi_location_mean)
newData['gdp_per_capita'] = newData['location'].map(gdp_location_mean)

# Count-like columns: a missing value is replaced by 0. One vectorised fillna
# replaces a per-element lambda repeated for every column.
zero_fill_columns = [
    'total_cases', 'total_deaths', 'new_deaths', 'new_cases',
    'new_cases_smoothed', 'new_deaths_smoothed',
    'total_cases_per_million', 'total_deaths_per_million',
    'new_cases_per_million', 'new_deaths_per_million',
    'new_cases_smoothed_per_million', 'new_deaths_smoothed_per_million',
    'total_tests', 'new_tests',
    'total_tests_per_thousand', 'new_tests_per_thousand',
    'new_tests_smoothed', 'new_tests_smoothed_per_thousand',
    'tests_per_case',
]
newData[zero_fill_columns] = newData[zero_fill_columns].fillna(0)

In [None]:
# Continent-level statistic used when a location has no value of its own.
fallback_statistic = {
    'diabetes_prevalence': 'median',
    'cardiovasc_death_rate': 'median',
    'median_age': 'median',
    'handwashing_facilities': 'mean',
    'hospital_beds_per_thousand': 'mean',
    'life_expectancy': 'median',
    'stringency_index': 'mean',
    'extreme_poverty': 'mean',
}

# First row of every location, and the per-continent fallback statistics,
# computed once instead of re-filtering the whole frame for each lookup.
first_rows = newData.drop_duplicates(subset='location').set_index('location')
continent_of_location = first_rows['continent']
continent_stats = newData.groupby('continent').agg(fallback_statistic)


def _first_value_or_continent_fallback(column):
    """Return {location: value} for `column`.

    The value is the one found in the location's first row; when that is
    missing, the continent-level statistic from `fallback_statistic` is used.
    """
    fallback = continent_of_location.map(continent_stats[column])
    return first_rows[column].fillna(fallback).to_dict()


# NOTE(review): only the first row's value is kept per location, which also
# applies to stringency_index — confirm a single value per location is intended
# for that column.
diabetes_prevalence_location = _first_value_or_continent_fallback('diabetes_prevalence')
cardiovasc_death_rate_location = _first_value_or_continent_fallback('cardiovasc_death_rate')
median_age_location = _first_value_or_continent_fallback('median_age')
handwashing_facilities_location = _first_value_or_continent_fallback('handwashing_facilities')
hospital_beds_per_thousand_location = _first_value_or_continent_fallback('hospital_beds_per_thousand')
life_expectancy_location = _first_value_or_continent_fallback('life_expectancy')
stringency_index_location = _first_value_or_continent_fallback('stringency_index')
extreme_poverty_location = _first_value_or_continent_fallback('extreme_poverty')

# 'World' gets the average stringency of all other locations. pop(..., None)
# tolerates a missing key and Series.mean() skips NaN entries.
stringency_index_location.pop('World', None)
stringency_index_location['World'] = pd.Series(stringency_index_location, dtype=float).mean()

In [None]:
# Overwrite each location-level indicator with the per-location value prepared
# above. Every column is assigned exactly once.
location_level_values = {
    'diabetes_prevalence': diabetes_prevalence_location,
    'cardiovasc_death_rate': cardiovasc_death_rate_location,
    'median_age': median_age_location,
    'handwashing_facilities': handwashing_facilities_location,
    'hospital_beds_per_thousand': hospital_beds_per_thousand_location,
    'life_expectancy': life_expectancy_location,
    'stringency_index': stringency_index_location,
    'extreme_poverty': extreme_poverty_location,
}
for column, value_by_location in location_level_values.items():
    newData[column] = newData['location'].map(value_by_location)

In [None]:
# Remove the rows that report a negative smoothed death count.
negative_death_rows = newData.index[newData['new_deaths_smoothed'] < 0]
newData = newData.drop(index=negative_death_rows)

In [None]:
# Check dtypes and non-null counts after imputation and row removal.
newData.info()

In [None]:
# Columns that enter the modelling frame: the target, the features and the
# train/test flag.
model_columns = [
    'new_deaths_smoothed', 'population', 'human_development_index', 'gdp_per_capita',
    'date_to_integer', 'new_cases_smoothed', 'diabetes_prevalence', 'cardiovasc_death_rate',
    'median_age', 'handwashing_facilities', 'life_expectancy', 'stringency_index',
    'new_tests_smoothed', 'new_tests_smoothed_per_thousand', 'tests_per_case',
    'extreme_poverty', 'train_data',
]
all_dummies = pd.get_dummies(newData[model_columns])

# The feature matrices must not contain the target itself, so it is dropped
# together with the partition flag.
non_feature_columns = ['train_data', 'new_deaths_smoothed']
X_train = all_dummies[all_dummies['train_data']==1].drop(non_feature_columns, axis=1)
X_test = all_dummies[all_dummies['train_data']==0].drop(non_feature_columns, axis=1)

# Target: smoothed new deaths, truncated to whole numbers by the int cast.
Y_train = newData[newData['train_data']==1]['new_deaths_smoothed'].astype('int')
Y_test = newData[newData['train_data']==0]['new_deaths_smoothed'].astype('int')
Y_train.shape

In [None]:
## Scale the data for our model
from sklearn.preprocessing import StandardScaler

# Columns that are standardised (everything except the train_data flag).
scaled_columns = [
    'new_deaths_smoothed', 'population', 'human_development_index',
    'gdp_per_capita', 'date_to_integer', 'new_cases_smoothed',
    'diabetes_prevalence', 'cardiovasc_death_rate', 'median_age',
    'handwashing_facilities', 'life_expectancy', 'stringency_index',
    'new_tests_smoothed', 'new_tests_smoothed_per_thousand',
    'tests_per_case', 'extreme_poverty',
]

scale = StandardScaler()
# Learn mean/variance from the training rows only, then apply that same
# transform to every row, so test-set statistics do not leak into the scaling.
scale.fit(all_dummies.loc[all_dummies['train_data'] == 1, scaled_columns])

all_dummies_scaled = all_dummies.copy()
all_dummies_scaled[scaled_columns] = scale.transform(all_dummies[scaled_columns])

# Scaled feature matrices: the partition flag and the target are removed.
X_train_scaled = all_dummies_scaled[all_dummies_scaled['train_data'] == 1].drop(['train_data', 'new_deaths_smoothed'], axis=1)
X_test_scaled = all_dummies_scaled[all_dummies_scaled['train_data'] == 0].drop(['train_data', 'new_deaths_smoothed'], axis=1)

# NOTE(review): casting the standardised target to int truncates it to a
# handful of integers; the models in this notebook are fitted on the unscaled
# Y_train / Y_test — confirm whether these scaled targets are still needed.
Y_train_scaled = all_dummies_scaled[all_dummies_scaled['train_data']==1]['new_deaths_smoothed'].astype('int')
Y_test_scaled = all_dummies_scaled[all_dummies_scaled['train_data']==0]['new_deaths_smoothed'].astype('int')

In [None]:
# First rows of the standardised modelling frame.
all_dummies_scaled.head()

In [None]:
# Distinct values present in the integer-cast scaled training target.
Y_train_scaled.value_counts().index

## Model Building & Evaluation

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Ridge
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression

In [None]:
# Baseline: Gaussian Naive Bayes, evaluated with 5-fold cross-validation on the
# scaled training features. Prints the per-fold scores, then their mean.
gnb = GaussianNB()
cv = cross_val_score(estimator=gnb, X=X_train_scaled, y=Y_train, cv=5)
print(cv, cv.mean(), sep='\n')

In [None]:
# Baseline: logistic regression with default settings, fitted on the training
# rows and scored on the held-out rows.
lr = LogisticRegression().fit(X_train_scaled, Y_train)
lr.score(X_test_scaled, Y_test)

In [None]:
# Baseline: ordinary least-squares regression, fitted on the training rows and
# scored on the held-out rows.
lin_reg = LinearRegression().fit(X_train_scaled, Y_train)
lin_reg.score(X_test_scaled, Y_test)

In [None]:
# Baseline: gradient-boosted regression trees with hand-picked settings, fitted
# on the training rows and scored on the held-out rows.
gbr_settings = dict(
    n_estimators=400,
    max_depth=5,
    min_samples_split=2,
    learning_rate=0.1,
    loss='ls',
)
gbr = GradientBoostingRegressor(**gbr_settings).fit(X_train_scaled, Y_train)
gbr.score(X_test_scaled, Y_test)

In [None]:
# Baseline: ridge regression with default settings, fitted on the training rows
# and scored on the held-out rows.
rr = Ridge().fit(X_train_scaled, Y_train)
rr.score(X_test_scaled, Y_test)

In [None]:
# Baseline: support vector classifier with probability estimates enabled,
# fitted on the training rows and scored on the held-out rows.
svc = SVC(probability=True).fit(X_train_scaled, Y_train)
svc.score(X_test_scaled, Y_test)

In [None]:
# Baseline: random forest classifier with default settings (no fixed random
# state), fitted on the training rows and scored on the held-out rows.
rf = RandomForestClassifier().fit(X_train_scaled, Y_train)
rf.score(X_test_scaled, Y_test)

In [None]:
# Soft-voting ensemble of the baseline classifiers. Soft voting averages class
# probabilities, so only estimators that provide predict_proba can take part;
# the regressors (LinearRegression, Ridge, GradientBoostingRegressor) do not
# and are therefore left out.
voting = VotingClassifier(estimators=[('lr', lr), ('rf', rf)], voting='soft')
voting.fit(X_train_scaled, Y_train)
voting.score(X_test_scaled, Y_test)

## Model Tuning & Evaluation

In [None]:
from sklearn.model_selection import GridSearchCV 
from sklearn.model_selection import RandomizedSearchCV 

In [None]:
def clf_performance(classifier, model_name):
    """Print a short report for a fitted hyper-parameter search.

    Parameters
    ----------
    classifier : object exposing ``best_score_`` and ``best_params_``.
    model_name : label printed on the first line of the report.
    """
    report_lines = (
        model_name,
        f'Best Score: {classifier.best_score_!s}',
        f'Best Parameters: {classifier.best_params_!s}',
    )
    for line in report_lines:
        print(line)

In [None]:
# Exhaustive grid search over logistic-regression settings.
lr = LogisticRegression()
param_grid = {'dual':[False],
              'penalty' : ['l2'],
              'C' : np.logspace(-4, 4, 20),
              'fit_intercept':[True,False],
              'intercept_scaling':np.logspace(-4, 4, 20),
              # 'max_iter' appears once; a repeated key in a dict literal keeps
              # only its last value, which was this one.
              'max_iter':[1000000],
              'solver' : ['liblinear','lbfgs','newton-cg','sag','saga'],
              'multi_class' : ['auto','ovr','multinomial']
}
clf_lr = GridSearchCV(lr, param_grid = param_grid, cv = 5 , verbose = True , n_jobs = -1)
# Fit the logistic-regression search defined just above.
best_clf_lr = clf_lr.fit(X_train_scaled,Y_train)
clf_performance(best_clf_lr,'Logistic Regression')

In [None]:
# Grid search over the boolean switches of LinearRegression.
lin_reg = LinearRegression()
on_off = [True, False]
param_grid = dict(fit_intercept=on_off, normalize=on_off, copy_X=on_off)
clf_lin_reg = GridSearchCV(estimator=lin_reg, param_grid=param_grid, cv=5, verbose=True, n_jobs=-1)
best_clf_lin_reg = clf_lin_reg.fit(X_train_scaled, Y_train)
clf_performance(best_clf_lin_reg, 'Linear Regression')

In [None]:
# Grid search over tree count, depth, split size, learning rate and loss of the
# gradient-boosting regressor.
gbr = GradientBoostingRegressor()
param_grid = dict(
    n_estimators=[350, 400, 450, 500],
    max_depth=[5, 10, 15, 20],
    min_samples_split=[2, 5, 7, 10],
    learning_rate=[0.1, 0.5, 1, 2],
    loss=['ls', 'lad', 'huber', 'quantile'],
)
clf_gbr = GridSearchCV(estimator=gbr, param_grid=param_grid, cv=5, verbose=True, n_jobs=-1)
best_clf_gbr = clf_gbr.fit(X_train_scaled, Y_train)
clf_performance(best_clf_gbr, 'Gradient Boost Regression')

In [None]:
# Grid search over regularisation strength, intercept/normalisation switches
# and solver of ridge regression.
rr = Ridge()
on_off = [True, False]
param_grid = dict(
    alpha=[1.0, 2.0, 5.0, 10.0],
    fit_intercept=on_off,
    normalize=on_off,
    copy_X=on_off,
    max_iter=[10000],
    solver=['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg'],
)
clf_rr = GridSearchCV(estimator=rr, param_grid=param_grid, cv=5, verbose=True, n_jobs=-1)
best_clf_rr = clf_rr.fit(X_train_scaled, Y_train)
clf_performance(best_clf_rr, 'Ridge Regression')

In [None]:
# Grid search over three kernel families of the support vector classifier; each
# dictionary is a separate sub-grid.
svc = SVC(probability=True)
penalty_values = [.1, 1, 10, 100, 1000]
tuned_parameters = [
    {'kernel': ['rbf'], 'gamma': [.1, .5, 1, 2, 5, 10], 'C': penalty_values},
    {'kernel': ['linear'], 'C': penalty_values},
    {'kernel': ['poly'], 'degree': [2, 3, 4, 5], 'C': penalty_values},
]
param_grid = tuned_parameters
clf_svc = GridSearchCV(estimator=svc, param_grid=param_grid, cv=5, verbose=True, n_jobs=-1)
best_clf_svc = clf_svc.fit(X_train_scaled, Y_train)
clf_performance(best_clf_svc, 'SVC')

In [None]:
# Use Randomized Search to narrow down the hyperparameter ranges before the
# exhaustive grid search.
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
max_features = ['auto', 'sqrt']
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)  # None lets the trees grow without a depth limit
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]

random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

# RandomForestClassifier is the forest this notebook imports and the one the
# follow-up grid search refines; RandomForestRegressor is never imported.
rf = RandomForestClassifier()
clf_random_rf = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 5, verbose=True, random_state=42, n_jobs = -1)
# fit() returns the fitted search object, which is what gets reported.
best_clf_random_rf = clf_random_rf.fit(X_train_scaled, Y_train)
clf_performance(best_clf_random_rf,'Random Forest')

In [None]:
# Finer grid search around the region suggested by the randomized search.
rf = RandomForestClassifier(random_state=1)
param_grid = dict(
    n_estimators=[400, 450, 500, 550],
    criterion=['gini', 'entropy'],
    bootstrap=[True],
    max_depth=[15, 20, 25],
    max_features=['auto', 'sqrt', 10],
    min_samples_leaf=[2, 3],
    min_samples_split=[2, 3],
)

clf_rf = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, verbose=True, n_jobs=-1)
best_clf_rf = clf_rf.fit(X_train_scaled, Y_train)
clf_performance(best_clf_rf, 'Random Forest')

In [None]:
# Tuned Models: take the best estimator found by each completed search.
(tuned_lr, tuned_svc, tuned_rf,
 tuned_rr, tuned_lin_reg, tuned_gbr) = (
    search.best_estimator_
    for search in (best_clf_lr, best_clf_svc, best_clf_rf,
                   best_clf_rr, best_clf_lin_reg, best_clf_gbr)
)

In [None]:
# Soft-voting ensemble of the tuned classifiers, evaluated with 5-fold
# cross-validation. The estimators are the tuned_* objects taken from the
# searches; the tuned regressors are left out because soft voting needs
# predict_proba, which regressors do not provide.
voting = VotingClassifier(estimators=[('lr', tuned_lr), ('rf', tuned_rf)], voting='soft')
cv = cross_val_score(voting, X_train_scaled, Y_train, cv=5)
print(cv)
print(cv.mean())

In [None]:
# Adjusting the weights for each model in the voting classifier.
# The candidate weight vectors are derived from the number of estimators in
# `voting`, so they always have the right length: equal weights, plus one
# vector per estimator in which that estimator counts double.
n_voters = len(voting.estimators)
equal_weights = [1] * n_voters
one_doubled = [
    [2 if position == doubled else 1 for position in range(n_voters)]
    for doubled in range(n_voters)
]
params = {'weights': [equal_weights] + one_doubled}
# The search wraps the `voting` ensemble defined earlier in the notebook.
clf_voting = GridSearchCV(voting, param_grid = params, cv = 5, verbose = True, n_jobs = -1)
best_clf_voting = clf_voting.fit(X_train_scaled,Y_train)
clf_performance(best_clf_voting,'Voting Classifier')

In [None]:
# Voting ensemble with the best weight vector found by the search above.
tuned_voting = best_clf_voting.best_estimator_

In [None]:
# Best cross-validated score of each tuned classifier.
print('Tuned Logistic Regression:',best_clf_lr.best_score_)
print('Tuned Support Vector Machine:',best_clf_svc.best_score_)
print('Tuned Random Forest:',best_clf_rf.best_score_)
print('Tuned Voting Classifier:',best_clf_voting.best_score_)

## Model Fitting & Saving

In [None]:
# File name -> tuned model that is fitted and then written to that file.
models_to_save = {
    'lr_model': tuned_lr,
    'svc_model': tuned_svc,
    'rf_model': tuned_rf,
    'voting_model': tuned_voting,
}

# Fit every model on the training rows first...
for model in models_to_save.values():
    model.fit(X_train_scaled, Y_train)

# ...then pickle each one. The with-block closes every file handle, which
# guarantees the data is flushed to disk before the file is read back.
for model_path, model in models_to_save.items():
    with open(model_path, 'wb') as model_file:
        pickle.dump(model, model_file)

## Model Loading & Predictions

In [None]:
def _load_pickled_model(model_path):
    """Read one pickled model from `model_path`, closing the file afterwards."""
    # NOTE: pickle.load executes code embedded in the file; only use it on
    # files this notebook wrote itself.
    with open(model_path, 'rb') as model_file:
        return pickle.load(model_file)


loaded_lr = _load_pickled_model('lr_model')
loaded_svc = _load_pickled_model('svc_model')
loaded_rf = _load_pickled_model('rf_model')
loaded_voting = _load_pickled_model('voting_model')

In [None]:
# Integer predictions of every reloaded model for the held-out rows.
lr_prediction, svc_prediction, rf_prediction, voting_prediction = (
    model.predict(X_test_scaled).astype(int)
    for model in (loaded_lr, loaded_svc, loaded_rf, loaded_voting)
)

In [None]:
# Side-by-side table of the test-set predictions and the true values.
# Everything is keyed on the index of X_test_scaled, i.e. exactly the rows the
# predictions were made for: ids are looked up in newData for those rows, and
# the actual values come from the test target (Y_test).
test_index = X_test_scaled.index
result_comparison = pd.DataFrame(
    data={
        'ID': newData.loc[test_index, 'id'],
        'lr': lr_prediction,
        'svc': svc_prediction,
        'rf': rf_prediction,
        'voting': voting_prediction,
        'actual': Y_test,
    },
    index=test_index,
)

In [None]:
# 1 where two models disagree on a row, 0 where they agree — computed as
# whole-column comparisons.
model_pairs = {
    'difference_lr_svc': ('lr', 'svc'),
    'difference_svc_rf': ('svc', 'rf'),
    'differences_rf_voting': ('rf', 'voting'),
    'differences_voting_lr': ('voting', 'lr'),
}
for flag_column, (first_model, second_model) in model_pairs.items():
    disagreement = result_comparison[first_model] != result_comparison[second_model]
    result_comparison[flag_column] = disagreement.astype(int)

In [None]:
# Frequency of the 0/1 disagreement flag for each pair of models.
diff_lr_svc = result_comparison['difference_lr_svc'].value_counts()
diff_svc_rf = result_comparison['difference_svc_rf'].value_counts()
diff_rf_voting = result_comparison['differences_rf_voting'].value_counts()
diff_voting_lr = result_comparison['differences_voting_lr'].value_counts()

# value_counts() orders by frequency, so the count of disagreements is looked
# up by its label (1); .get falls back to 0 when two models never disagree.
print('Value difference between Tuned Logistic Regression and Tuned Support Vector Classifier :',diff_lr_svc.get(1, 0))
print('Value difference between Tuned Support Vector Classifier and Tuned Random Forest :',diff_svc_rf.get(1, 0))
print('Value difference between Tuned Random Forest and Tuned Voting Classifier:',diff_rf_voting.get(1, 0))
print('Value difference between Tuned Voting Classifier and Tuned Logistic Regression:',diff_voting_lr.get(1, 0))