In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_validate
from scipy import stats
import doubleml as dml
from tqdm.notebook import tqdm

In [2]:
data = pd.read_csv("C:\\_Dissertation\\Data\\Story data n=5000.csv", index_col=0)
X = data.drop('y', axis=1).drop('D', axis=1)

In [3]:
m_RF = RandomForestRegressor()
m_SVM = SVR()
m_NN = MLPRegressor()
g_RF = RandomForestRegressor()
g_SVM = SVR()
g_NN = MLPRegressor()

In [4]:
RF_grid = {'n_estimators': [100],
               'max_features': ['auto', 'sqrt', 'log2'],
               'max_depth': [5, 10, 20, 40, 100, None],
               'min_samples_split': [2, 5, 10],
               'min_samples_leaf': [1, 2, 4]}

In [5]:
m_RF_tuning = GridSearchCV(estimator=m_RF, param_grid=RF_grid, cv=5, verbose=2, n_jobs=-1)

In [6]:
m_RF_tuning.fit(X, data['D'])

Fitting 5 folds for each of 162 candidates, totalling 810 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   15.1s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:   34.9s
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 640 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 810 out of 810 | elapsed:  2.9min finished


GridSearchCV(cv=5, error_score=nan,
             estimator=RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,
                                             criterion='mse', max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             max_samples=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators=100, n_jobs=None,
                                             oob_score=False, random_state=None,
                                             verbose=0, warm_start=False),
             iid='deprecated', n_jo

In [7]:
m_RF_results = (m_RF_tuning.best_score_, m_RF_tuning.best_estimator_)

In [8]:
m_RF_results

(0.5352948187032898,
 RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                       max_depth=10, max_features='sqrt', max_leaf_nodes=None,
                       max_samples=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=4,
                       min_samples_split=5, min_weight_fraction_leaf=0.0,
                       n_estimators=100, n_jobs=None, oob_score=False,
                       random_state=None, verbose=0, warm_start=False))

In [9]:
g_RF_tuning = GridSearchCV(estimator=g_RF, param_grid=RF_grid, cv=5, verbose=2, n_jobs=-1)

In [10]:
g_RF_tuning.fit(X, data['y'])

Fitting 5 folds for each of 162 candidates, totalling 810 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    7.6s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:   26.6s
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 640 tasks      | elapsed:  3.2min
[Parallel(n_jobs=-1)]: Done 810 out of 810 | elapsed:  4.1min finished


GridSearchCV(cv=5, error_score=nan,
             estimator=RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,
                                             criterion='mse', max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             max_samples=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators=100, n_jobs=None,
                                             oob_score=False, random_state=None,
                                             verbose=0, warm_start=False),
             iid='deprecated', n_jo

In [11]:
g_RF_results = (g_RF_tuning.best_score_, g_RF_tuning.best_estimator_)

In [12]:
g_RF_results

(0.9826356652652961,
 RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                       max_depth=40, max_features='auto', max_leaf_nodes=None,
                       max_samples=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=1,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       n_estimators=100, n_jobs=None, oob_score=False,
                       random_state=None, verbose=0, warm_start=False))

In [13]:
SVM_grid = {'C': [0.01, 0.03, 0.1, 0.3, 1.0, 3.0, 10, 30, 50, 100, 300, 1000],
            'gamma': ['scale', 'auto'],
            'kernel': ['rbf']}

In [14]:
m_SVM_tuning = GridSearchCV(m_SVM, SVM_grid, cv=5, n_jobs=-1, verbose=2)

In [15]:
m_SVM_tuning.fit(X, data['D'])

Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    5.0s
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed:  4.4min finished


GridSearchCV(cv=5, error_score=nan,
             estimator=SVR(C=1.0, cache_size=200, coef0=0.0, degree=3,
                           epsilon=0.1, gamma='scale', kernel='rbf',
                           max_iter=-1, shrinking=True, tol=0.001,
                           verbose=False),
             iid='deprecated', n_jobs=-1,
             param_grid={'C': [0.01, 0.03, 0.1, 0.3, 1.0, 3.0, 10, 30, 50, 100,
                               300, 1000],
                         'gamma': ['scale', 'auto'], 'kernel': ['rbf']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=2)

In [16]:
m_SVM_results = (m_SVM_tuning.best_score_, m_SVM_tuning.best_estimator_)

In [17]:
m_SVM_results

(0.5387693487045127,
 SVR(C=0.03, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
     kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False))

In [18]:
g_SVM_tuning = GridSearchCV(g_SVM, SVM_grid, cv=5, n_jobs=-1, verbose=2)

In [19]:
g_SVM_tuning.fit(X, data['y'])

Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   10.5s
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed:  2.8min finished


GridSearchCV(cv=5, error_score=nan,
             estimator=SVR(C=1.0, cache_size=200, coef0=0.0, degree=3,
                           epsilon=0.1, gamma='scale', kernel='rbf',
                           max_iter=-1, shrinking=True, tol=0.001,
                           verbose=False),
             iid='deprecated', n_jobs=-1,
             param_grid={'C': [0.01, 0.03, 0.1, 0.3, 1.0, 3.0, 10, 30, 50, 100,
                               300, 1000],
                         'gamma': ['scale', 'auto'], 'kernel': ['rbf']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=2)

In [20]:
g_SVM_results = (g_SVM_tuning.best_score_, g_SVM_tuning.best_estimator_)

In [21]:
g_SVM_results

(0.9787228505147217,
 SVR(C=1000, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
     kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False))

In [22]:
NN_grid = {'hidden_layer_sizes': [(10,), (30,), (100,), (300,), (10,10), (20,20), (50,50)],
           'activation': ['relu'],
           'alpha': [0.00001, 0.0001, 0.001],
           'max_iter': [200]}

In [23]:
m_NN_tuning = GridSearchCV(m_NN, NN_grid, cv=5, n_jobs=-1, verbose=2)

In [24]:
m_NN_tuning.fit(X, data['D'])

Fitting 5 folds for each of 21 candidates, totalling 105 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   13.4s
[Parallel(n_jobs=-1)]: Done 105 out of 105 | elapsed:   39.6s finished


GridSearchCV(cv=5, error_score=nan,
             estimator=MLPRegressor(activation='relu', alpha=0.0001,
                                    batch_size='auto', beta_1=0.9, beta_2=0.999,
                                    early_stopping=False, epsilon=1e-08,
                                    hidden_layer_sizes=(100,),
                                    learning_rate='constant',
                                    learning_rate_init=0.001, max_fun=15000,
                                    max_iter=200, momentum=0.9,
                                    n_iter_no_change=10,
                                    nesterovs_momentum=True, power_t=0.5,
                                    random_state=...True,
                                    solver='adam', tol=0.0001,
                                    validation_fraction=0.1, verbose=False,
                                    warm_start=False),
             iid='deprecated', n_jobs=-1,
             param_grid={'activation': ['relu'],
 

In [25]:
m_NN_results = (m_NN_tuning.best_score_, m_NN_tuning.best_estimator_)

In [26]:
m_NN_results

(0.5586133920149454,
 MLPRegressor(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(100,), learning_rate='constant',
              learning_rate_init=0.001, max_fun=15000, max_iter=200,
              momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
              power_t=0.5, random_state=None, shuffle=True, solver='adam',
              tol=0.0001, validation_fraction=0.1, verbose=False,
              warm_start=False))

In [27]:
g_NN_tuning = GridSearchCV(g_NN, NN_grid, cv=5, n_jobs=-1, verbose=2)

In [28]:
g_NN_tuning.fit(X, data['y'])

Fitting 5 folds for each of 21 candidates, totalling 105 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 105 out of 105 | elapsed:  3.1min finished


GridSearchCV(cv=5, error_score=nan,
             estimator=MLPRegressor(activation='relu', alpha=0.0001,
                                    batch_size='auto', beta_1=0.9, beta_2=0.999,
                                    early_stopping=False, epsilon=1e-08,
                                    hidden_layer_sizes=(100,),
                                    learning_rate='constant',
                                    learning_rate_init=0.001, max_fun=15000,
                                    max_iter=200, momentum=0.9,
                                    n_iter_no_change=10,
                                    nesterovs_momentum=True, power_t=0.5,
                                    random_state=...True,
                                    solver='adam', tol=0.0001,
                                    validation_fraction=0.1, verbose=False,
                                    warm_start=False),
             iid='deprecated', n_jobs=-1,
             param_grid={'activation': ['relu'],
 

In [29]:
g_NN_results = (g_NN_tuning.best_score_, g_NN_tuning.best_estimator_)

In [30]:
g_NN_results

(0.9671997425603218,
 MLPRegressor(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(50, 50), learning_rate='constant',
              learning_rate_init=0.001, max_fun=15000, max_iter=200,
              momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
              power_t=0.5, random_state=None, shuffle=True, solver='adam',
              tol=0.0001, validation_fraction=0.1, verbose=False,
              warm_start=False))

In [32]:
m_agg_est = VotingRegressor([('RF', m_RF_results[1]), ('SVM', m_SVM_results[1]), ('NN', m_NN_results[1])])
g_agg_est = VotingRegressor([('RF', g_RF_results[1]), ('SVM', g_SVM_results[1]), ('NN', g_NN_results[1])])

In [33]:
cross_validate(m_agg_est, X, data['D'], cv=5, n_jobs=-1)['test_score'].mean()

0.5549286538013469

In [34]:
(m_RF_results[0], m_SVM_results[0], m_NN_results[0])

(0.5352948187032898, 0.5387693487045127, 0.5586133920149454)

In [35]:
cross_validate(g_agg_est, X, data['y'], cv=5, n_jobs=-1)['test_score'].mean()

0.9767802231893119

In [36]:
(g_RF_results[0], g_SVM_results[0], g_NN_results[0])

(0.9826356652652961, 0.9787228505147217, 0.9671997425603218)