# Exploration of Socioeconomic Influences on Cancer Mortality:
# Machine Learning with Unscaled Data

In [264]:
import csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
from tensorflow import keras
from sklearn.preprocessing import MinMaxScaler #StandardScaler?
from sklearn import linear_model
from sklearn.linear_model import Ridge
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import SGDRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import mean_squared_error
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential
import pickle

In [2]:
df = pd.read_csv('cancer_ml6_ml.csv', index_col=['Geography'])

In [3]:
df.head()

Unnamed: 0_level_0,avgAnnCount,TARGET_deathRate,incidenceRate,medIncome,popEst2015,povertyPercent,studyPerCap,MedianAge,MedianAgeMale,MedianAgeFemale,...,city_min_distsl1_sqrd,sc_min_dists_l1_log,PCT_LACCESS_CHILD10_sqrd,PCT_LACCESS_HHNV10_sqrd,PC_DIRSALES07_sqrd,FMRKT13_sqrd,PCH_FMRKT_09_13_sqrd,PCT_OBESE_ADULTS13_log,PCT_OBESE_ADULTS13_sqrd,CHILDPOVRATE10_log
Geography,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"Abbeville County, South Carolina",143.0,183.7,430.9,35525,24932,21.4,0.0,43.3,40.7,44.9,...,5.827314,-0.674641,49.425404,36.601854,13.9129,4,0.0,3.456317,1004.89,3.280911
"Acadia Parish, Louisiana",323.0,230.5,492.7,40269,62577,22.0,0.0,35.7,34.7,37.2,...,47.212922,-1.386678,0.243122,3.229274,58.9824,0,0.0,3.499533,1095.61,3.387774
"Accomack County, Virginia",221.0,216.2,479.4,38390,32973,19.4,0.0,45.3,42.7,47.3,...,22.077434,0.153911,0.516719,59.388869,2.9584,4,10000.0,3.303217,739.84,3.356897
"Ada County, Idaho",1757.0,151.6,469.0,57908,434211,11.6,414.545002,35.8,35.0,36.6,...,104.907721,-0.244491,24.459579,0.336674,5.2441,100,123.45679,3.387774,876.16,2.778819
"Adair County, Iowa",51.0,178.9,440.7,48216,7228,10.3,138.350858,45.9,45.0,47.7,...,54.729457,-0.522917,3.281391,3.52072,48.3025,4,0.0,3.443618,979.69,2.646175


In [4]:
df.columns

Index(['avgAnnCount', 'TARGET_deathRate', 'incidenceRate', 'medIncome',
       'popEst2015', 'povertyPercent', 'studyPerCap', 'MedianAge',
       'MedianAgeMale', 'MedianAgeFemale',
       ...
       'city_min_distsl1_sqrd', 'sc_min_dists_l1_log',
       'PCT_LACCESS_CHILD10_sqrd', 'PCT_LACCESS_HHNV10_sqrd',
       'PC_DIRSALES07_sqrd', 'FMRKT13_sqrd', 'PCH_FMRKT_09_13_sqrd',
       'PCT_OBESE_ADULTS13_log', 'PCT_OBESE_ADULTS13_sqrd',
       'CHILDPOVRATE10_log'],
      dtype='object', length=329)

In [5]:
len(df.index.unique())

3047

In [6]:
df.shape

(3047, 329)

In [7]:
df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Index: 3047 entries, Abbeville County, South Carolina to Zavala County, Texas
Data columns (total 329 columns):
avgAnnCount                       float64
TARGET_deathRate                  float64
incidenceRate                     float64
medIncome                         int64
popEst2015                        int64
povertyPercent                    float64
studyPerCap                       float64
MedianAge                         float64
MedianAgeMale                     float64
MedianAgeFemale                   float64
AvgHouseholdSize                  float64
PercentMarried                    float64
PctNoHS18_24                      float64
PctHS18_24                        float64
PctSomeCol18_24                   float64
PctBachDeg18_24                   float64
PctHS25_Over                      float64
PctBachDeg25_Over                 float64
PctEmployed16_Over                float64
PctUnemployed16_Over              float64
PctPrivateCove

# Machine Learning with Unscaled Data

To find the best performing algorithm, unscaled data is first experimented with and then scaled data is used. A simple 1-fold cross-validation of a train-test split will be used; multifold cross-validation may be explored later in the Machine Learning pipeline.

## Linear Regression: Basic OLS with no Hyperparameter Tuning or Train-Test Split

The target variable is set as 'TARGET_deathRate', the per capita cancer mortality rate (per 100,000 people).

In [8]:
y = df['TARGET_deathRate']

The predictive feature set X is defined as the rest of the columns in the DataFrame.

In [9]:
target_name = ['TARGET_deathRate']
X = df[[cn for cn in df.columns if cn not in target_name]]

First, a simple Linear Regression algorithm is run with no scaling, no hyperparameter tuning and no train/test split.

In [10]:
lr_1 = linear_model.LinearRegression()
lr_1

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [11]:
lr_1.fit(X, y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [12]:
lr_1.score(X, y)

0.6528892459212396

In [13]:
y_pred_1 = lr_1.predict(X)
y_pred_1[0:20]

array([188.49306902, 204.21317523, 208.04168356, 144.3527763 ,
       172.08713342, 208.03640582, 177.36422251, 211.13721016,
       157.36076551, 164.04873457, 178.13226564, 188.25829087,
       180.11453527, 209.17514377, 178.23192636, 154.9184619 ,
       222.78217363, 166.29058502, 148.85989629, 178.30619184])

In [14]:
print("R^2: {}".format(lr_1.score(X, y)))
rmse = np.sqrt(mean_squared_error(y, y_pred_1))
print("Root Mean Squared Error: {}".format(rmse))

R^2: 0.6528892459212396
Root Mean Squared Error: 16.347426607269345


In [15]:
filename = 'cancer_lr_1.sav'
pickle.dump(lr_1, open(filename, 'wb'))

## Linear Regression: Basic OLS with no Normalization and 80/20 Train-Test Split

In [16]:
lr_2 = linear_model.LinearRegression()
lr_2

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

Next, a train/test split is created and the OLS linear regression algorithm is re-run.

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

The algorithm is fitted on the training set, and the accuracy is returned on the test set.

In [18]:
lr_2.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

An accuracy of 64.1% is returned.

In [19]:
lr_2.score(X_test, y_test)

0.6406253860729231

In [20]:
y_pred_2 = lr_2.predict(X_test)
y_pred_2[0:20]

array([177.32330972, 174.71390582, 162.20675859, 175.78673742,
       178.88472738, 195.88785682, 173.28360858, 164.24285477,
       175.00771197, 171.91171726, 177.00965023, 206.91234944,
       158.15670133, 157.5426522 , 219.88087468, 108.69273994,
       188.34900034, 205.93368446, 208.31387692, 183.87287744])

An RMSE of 16.2 is returned.

In [21]:
print("R^2: {}".format(lr_2.score(X_test, y_test)))
rmse = np.sqrt(mean_squared_error(y_test, y_pred_2))
print("Root Mean Squared Error: {}".format(rmse))

R^2: 0.6406253860729231
Root Mean Squared Error: 16.199560046349408


In [22]:
filename = 'cancer_lr_2.sav'
pickle.dump(lr_2, open(filename, 'wb'))

## Linear Regression: Basic OLS with Normalization and 80/20 Train-Test Split

In [23]:
lr_2n = linear_model.LinearRegression(normalize=True)
lr_2n

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=True)

The algorithm is fitted on the training set, and the accuracy is returned on the test set.

In [24]:
lr_2n.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=True)

An accuracy of -4.86% is returned. This path of normalization should obviously not be followed.

In [25]:
lr_2n.score(X_test, y_test)

-4.485371328856633e+22

In [26]:
y_pred_2n = lr_2n.predict(X_test)
y_pred_2n[0:20]

array([176.8984375, 174.3125   , 160.9609375, 175.953125 , 179.3515625,
       196.3984375, 173.1875   , 163.75     , 174.7734375, 173.03125  ,
       177.015625 , 206.9453125, 158.375    , 157.25     , 220.046875 ,
       110.421875 , 188.90625  , 206.8515625, 207.8359375, 183.8125   ])

An RMSE of 5723065594363.49 is returned.

In [27]:
print("R^2: {}".format(lr_2n.score(X_test, y_test)))
rmse = np.sqrt(mean_squared_error(y_test, y_pred_2n))
print("Root Mean Squared Error: {}".format(rmse))

R^2: -4.485371328856633e+22
Root Mean Squared Error: 5723065594363.49


In [28]:
filename = 'cancer_lr_2n.sav'
pickle.dump(lr_2n, open(filename, 'wb'))

## Ridge Regression

In [29]:
lr_3 = linear_model.Ridge(alpha=0.001)

In [30]:
lr_3.fit(X_train, y_train)

  overwrite_a=True).T


Ridge(alpha=0.001, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [31]:
lr_3.score(X_test, y_test)

0.640764496169676

In [32]:
y_pred_3 = lr_3.predict(X_test)
y_pred_3[0:20]

array([177.35025528, 175.37003441, 162.28118062, 175.61564994,
       178.79949237, 195.82574326, 173.0892533 , 164.0994861 ,
       174.86180419, 172.19905287, 177.03470202, 206.91320063,
       157.80007646, 157.37575809, 220.00185219, 108.90253059,
       188.29362249, 205.92327747, 208.22259907, 183.78457325])

In [33]:
print("R^2: {}".format(lr_3.score(X_test, y_test)))
rmse = np.sqrt(mean_squared_error(y_test, y_pred_3))
print("Root Mean Squared Error: {}".format(rmse))

R^2: 0.640764496169676
Root Mean Squared Error: 16.196424404043412


In [34]:
filename = 'cancer_lr_3.sav'
pickle.dump(lr_3, open(filename, 'wb'))

In [35]:
lr_4 = linear_model.Ridge(alpha=0.01)

In [36]:
lr_4.fit(X_train, y_train)

  overwrite_a=True).T


Ridge(alpha=0.01, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [37]:
lr_4.score(X_test, y_test)

0.6398034038943229

In [38]:
y_pred_4 = lr_4.predict(X_test)
y_pred_4[0:20]

array([177.27737461, 175.84086151, 162.39717802, 175.15845425,
       178.70525048, 195.52244817, 173.3364408 , 164.07576811,
       174.89145599, 172.53565544, 176.99916744, 206.934674  ,
       157.34917026, 157.17733021, 220.12558738, 109.72125012,
       188.33461623, 205.93000184, 208.11227026, 183.61566338])

In [39]:
print("R^2: {}".format(lr_4.score(X_test, y_test)))
rmse = np.sqrt(mean_squared_error(y_test, y_pred_4))
print("Root Mean Squared Error: {}".format(rmse))

R^2: 0.6398034038943229
Root Mean Squared Error: 16.218075745153282


In [40]:
filename = 'cancer_lr_4.sav'
pickle.dump(lr_4, open(filename, 'wb'))

In [41]:
lr_5 = linear_model.Ridge(alpha=0.1)

In [42]:
lr_5.fit(X_train, y_train)

Ridge(alpha=0.1, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [43]:
lr_5.score(X_test, y_test)

0.6342414810354621

In [44]:
y_pred_5 = lr_5.predict(X_test)
y_pred_5[0:20]

array([177.13405645, 175.00340625, 162.34699436, 173.79786501,
       178.48061028, 194.9497753 , 173.09927587, 164.42108045,
       175.15798399, 173.22456357, 176.72338185, 206.77115793,
       156.94780335, 157.19234894, 220.28363922, 113.02093137,
       188.878909  , 206.64952077, 208.33444119, 182.93581045])

In [45]:
print("R^2: {}".format(lr_5.score(X_test, y_test)))
rmse = np.sqrt(mean_squared_error(y_test, y_pred_5))
print("Root Mean Squared Error: {}".format(rmse))

R^2: 0.6342414810354621
Root Mean Squared Error: 16.34281058890365


In [46]:
filename = 'cancer_lr_5.sav'
pickle.dump(lr_5, open(filename, 'wb'))

In [47]:
lr_6 = linear_model.Ridge(alpha=1)

In [48]:
lr_6.fit(X_train, y_train)

Ridge(alpha=1, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [49]:
lr_6.score(X_test, y_test)

0.621875687695347

In [50]:
y_pred_6 = lr_6.predict(X_test)
y_pred_6[0:20]

array([178.28647303, 174.17563067, 162.27984663, 172.30645936,
       177.01143381, 194.83043131, 172.5284356 , 164.25192961,
       174.2756491 , 174.77172973, 176.31234177, 205.97603669,
       156.19165277, 157.61862647, 221.01656062, 118.96910465,
       190.12973799, 208.2054883 , 209.28291173, 181.50484775])

In [51]:
print("R^2: {}".format(lr_6.score(X_test, y_test)))
rmse = np.sqrt(mean_squared_error(y_test, y_pred_6))
print("Root Mean Squared Error: {}".format(rmse))

R^2: 0.621875687695347
Root Mean Squared Error: 16.616778213212243


In [52]:
filename = 'cancer_lr_6.sav'
pickle.dump(lr_6, open(filename, 'wb'))

In [53]:
lr_7 = linear_model.Ridge(alpha=10)

In [54]:
lr_7.fit(X_train, y_train)

Ridge(alpha=10, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [55]:
lr_7.score(X_test, y_test)

0.6130069769511672

In [56]:
y_pred_7 = lr_7.predict(X_test)
y_pred_7[0:20]

array([181.51286025, 174.10250058, 162.20529353, 172.25515681,
       174.55663576, 195.29457997, 174.05345022, 164.98582162,
       173.29383255, 175.64665633, 177.42640552, 204.45822472,
       155.87860822, 157.57432719, 221.611936  , 123.83752835,
       192.35713456, 208.18793317, 208.72564655, 181.32054181])

In [57]:
print("R^2: {}".format(lr_7.score(X_test, y_test)))
rmse = np.sqrt(mean_squared_error(y_test, y_pred_7))
print("Root Mean Squared Error: {}".format(rmse))

R^2: 0.6130069769511672
Root Mean Squared Error: 16.810517763883496


In [58]:
filename = 'cancer_lr_7.sav'
pickle.dump(lr_7, open(filename, 'wb'))

In [59]:
lr_8 = linear_model.Ridge(alpha=100)

In [60]:
lr_8.fit(X_train, y_train)

Ridge(alpha=100, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [61]:
lr_8.score(X_test, y_test)

0.6077184879304566

In [62]:
y_pred_8 = lr_8.predict(X_test)
y_pred_8[0:20]

array([183.68141199, 173.60746417, 162.40412631, 173.34668138,
       173.94359086, 196.84462871, 174.8321663 , 167.2311882 ,
       172.32978501, 175.49194848, 181.82877995, 199.60967788,
       155.46650297, 157.4052525 , 220.99979024, 126.20640224,
       193.99067578, 207.39410635, 204.84855801, 182.33624219])

In [63]:
print("R^2: {}".format(lr_8.score(X_test, y_test)))
rmse = np.sqrt(mean_squared_error(y_test, y_pred_8))
print("Root Mean Squared Error: {}".format(rmse))

R^2: 0.6077184879304566
Root Mean Squared Error: 16.92499084949676


In [64]:
filename = 'cancer_lr_8.sav'
pickle.dump(lr_8, open(filename, 'wb'))

## LASSO

LASSO will never converge without normalization. Although the MinMax Scaler will be experimented later in the notebook in conjunction with LASSO, the LASSO function with internal normalization is first tried below.

In [65]:
lr_9 = linear_model.Lasso(alpha=0.001, normalize=True, max_iter=2000)

In [66]:
lr_9.fit(X_train, y_train)

Lasso(alpha=0.001, copy_X=True, fit_intercept=True, max_iter=2000,
   normalize=True, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [67]:
lr_9.score(X_test, y_test)

0.5998320401737283

In [68]:
y_pred_9 = lr_9.predict(X_test)
y_pred_9[0:20]

array([179.08404154, 176.19810178, 159.42083274, 175.09287333,
       177.27253783, 196.87050709, 173.28475524, 165.38649937,
       170.85577602, 171.49963145, 177.28394713, 203.97845261,
       159.40360776, 156.60794932, 221.34930732, 115.38069907,
       187.57933553, 205.12641525, 206.61495516, 183.56065355])

In [69]:
print("R^2: {}".format(lr_9.score(X_test, y_test)))
rmse = np.sqrt(mean_squared_error(y_test, y_pred_9))
print("Root Mean Squared Error: {}".format(rmse))

R^2: 0.5998320401737283
Root Mean Squared Error: 17.094274705054904


In [70]:
filename = 'cancer_lr_9.sav'
pickle.dump(lr_9, open(filename, 'wb'))

In [71]:
lr_10 = linear_model.Lasso(alpha=0.01, normalize=True, max_iter=2000)

In [72]:
lr_10.fit(X_train, y_train)

Lasso(alpha=0.01, copy_X=True, fit_intercept=True, max_iter=2000,
   normalize=True, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [73]:
lr_10.score(X_test, y_test)

0.594475255322545

In [74]:
y_pred_10 = lr_10.predict(X_test)
y_pred_10[0:20]

array([185.11889985, 176.30771271, 166.3214211 , 176.98537907,
       170.28123545, 200.57791348, 173.72497032, 172.01673999,
       172.08606202, 170.06650011, 183.86795521, 194.1600771 ,
       163.7421692 , 161.67062688, 220.46487863, 122.25124147,
       190.42182788, 199.79208002, 201.43295303, 183.92432696])

In [75]:
print("R^2: {}".format(lr_10.score(X_test, y_test)))
rmse = np.sqrt(mean_squared_error(y_test, y_pred_10))
print("Root Mean Squared Error: {}".format(rmse))

R^2: 0.594475255322545
Root Mean Squared Error: 17.208309244371353


In [76]:
filename = 'cancer_lr_10.sav'
pickle.dump(lr_10, open(filename, 'wb'))

In [77]:
lr_11 = linear_model.Lasso(alpha=0.1, normalize=True, max_iter=2000)

In [78]:
lr_11.fit(X_train, y_train)

Lasso(alpha=0.1, copy_X=True, fit_intercept=True, max_iter=2000,
   normalize=True, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [79]:
lr_11.score(X_test, y_test)

0.45548542937157255

In [80]:
y_pred_11 = lr_11.predict(X_test)
y_pred_11[0:20]

array([187.81731941, 176.3460354 , 171.92094489, 176.59754821,
       172.80994976, 191.12387984, 177.23686082, 181.80709607,
       172.2819364 , 177.99533327, 183.23656609, 185.27581793,
       174.67196715, 170.54334155, 201.03917248, 136.80641452,
       188.87892173, 191.15520664, 188.31907096, 183.94689769])

In [81]:
print("R^2: {}".format(lr_11.score(X_test, y_test)))
rmse = np.sqrt(mean_squared_error(y_test, y_pred_11))
print("Root Mean Squared Error: {}".format(rmse))

R^2: 0.45548542937157255
Root Mean Squared Error: 19.940418782365576


In [82]:
filename = 'cancer_lr_11.sav'
pickle.dump(lr_11, open(filename, 'wb'))

In [83]:
lr_12 = linear_model.Lasso(alpha=1, normalize=True, max_iter=2000)

In [84]:
lr_12.fit(X_train, y_train)

Lasso(alpha=1, copy_X=True, fit_intercept=True, max_iter=2000, normalize=True,
   positive=False, precompute=False, random_state=None, selection='cyclic',
   tol=0.0001, warm_start=False)

In [85]:
lr_12.score(X_test, y_test)

-0.00798750229846834

In [86]:
y_pred_12 = lr_12.predict(X_test)
y_pred_12[0:20]

array([179.14755847, 179.14755847, 179.14755847, 179.14755847,
       179.14755847, 179.14755847, 179.14755847, 179.14755847,
       179.14755847, 179.14755847, 179.14755847, 179.14755847,
       179.14755847, 179.14755847, 179.14755847, 179.14755847,
       179.14755847, 179.14755847, 179.14755847, 179.14755847])

In [87]:
print("R^2: {}".format(lr_12.score(X_test, y_test)))
rmse = np.sqrt(mean_squared_error(y_test, y_pred_12))
print("Root Mean Squared Error: {}".format(rmse))

R^2: -0.00798750229846834
Root Mean Squared Error: 27.130456166458586


In [88]:
filename = 'cancer_lr_12.sav'
pickle.dump(lr_12, open(filename, 'wb'))

In [89]:
lr_13 = linear_model.Lasso(alpha=10, normalize=True, max_iter=2000)

In [90]:
lr_13.fit(X_train, y_train)

Lasso(alpha=10, copy_X=True, fit_intercept=True, max_iter=2000,
   normalize=True, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [91]:
lr_13.score(X_test, y_test)

-0.00798750229846834

In [92]:
y_pred_13 = lr_13.predict(X_test)
y_pred_13[0:20]

array([179.14755847, 179.14755847, 179.14755847, 179.14755847,
       179.14755847, 179.14755847, 179.14755847, 179.14755847,
       179.14755847, 179.14755847, 179.14755847, 179.14755847,
       179.14755847, 179.14755847, 179.14755847, 179.14755847,
       179.14755847, 179.14755847, 179.14755847, 179.14755847])

In [93]:
print("R^2: {}".format(lr_13.score(X_test, y_test)))
rmse = np.sqrt(mean_squared_error(y_test, y_pred_13))
print("Root Mean Squared Error: {}".format(rmse))

R^2: -0.00798750229846834
Root Mean Squared Error: 27.130456166458586


In [94]:
filename = 'cancer_lr_13.sav'
pickle.dump(lr_13, open(filename, 'wb'))

In [95]:
lr_14 = linear_model.Lasso(alpha=100, normalize=True, max_iter=2000)

In [96]:
lr_14.fit(X_train, y_train)

Lasso(alpha=100, copy_X=True, fit_intercept=True, max_iter=2000,
   normalize=True, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [97]:
lr_14.score(X_test, y_test)

-0.00798750229846834

In [98]:
y_pred_14 = lr_14.predict(X_test)
y_pred_14[0:20]

array([179.14755847, 179.14755847, 179.14755847, 179.14755847,
       179.14755847, 179.14755847, 179.14755847, 179.14755847,
       179.14755847, 179.14755847, 179.14755847, 179.14755847,
       179.14755847, 179.14755847, 179.14755847, 179.14755847,
       179.14755847, 179.14755847, 179.14755847, 179.14755847])

In [99]:
print("R^2: {}".format(lr_14.score(X_test, y_test)))
rmse = np.sqrt(mean_squared_error(y_test, y_pred_14))
print("Root Mean Squared Error: {}".format(rmse))

R^2: -0.00798750229846834
Root Mean Squared Error: 27.130456166458586


In [100]:
filename = 'cancer_lr_14.sav'
pickle.dump(lr_14, open(filename, 'wb'))

## Elastic Net with L1 Ratio of 0.25

In [101]:
lr_15 = linear_model.ElasticNet(alpha=0.001, l1_ratio=0.25, normalize=True)

In [102]:
lr_15.fit(X_train, y_train)

ElasticNet(alpha=0.001, copy_X=True, fit_intercept=True, l1_ratio=0.25,
      max_iter=1000, normalize=True, positive=False, precompute=False,
      random_state=None, selection='cyclic', tol=0.0001, warm_start=False)

In [103]:
lr_15.score(X_test, y_test)

0.49190155235439337

In [104]:
y_pred_15 = lr_15.predict(X_test)
y_pred_15[0:20]

array([170.95437434, 176.36094391, 164.45840772, 175.63669806,
       178.08345981, 193.98510616, 175.32038492, 167.86477987,
       165.50403983, 174.36013448, 181.79767063, 203.00044749,
       162.33978453, 161.07682502, 213.08867172, 127.14256212,
       189.16329271, 201.59158519, 195.99667923, 187.37673869])

In [105]:
print("R^2: {}".format(lr_15.score(X_test, y_test)))
rmse = np.sqrt(mean_squared_error(y_test, y_pred_15))
print("Root Mean Squared Error: {}".format(rmse))

R^2: 0.49190155235439337
Root Mean Squared Error: 19.26209215267006


In [106]:
lr_16 = linear_model.ElasticNet(alpha=0.01, l1_ratio=0.25, normalize=True)

In [107]:
lr_16.fit(X_train, y_train)

ElasticNet(alpha=0.01, copy_X=True, fit_intercept=True, l1_ratio=0.25,
      max_iter=1000, normalize=True, positive=False, precompute=False,
      random_state=None, selection='cyclic', tol=0.0001, warm_start=False)

In [108]:
lr_16.score(X_test, y_test)

0.30787577694259743

In [109]:
y_pred_16 = lr_16.predict(X_test)
y_pred_16[0:20]

array([170.76237575, 177.20255587, 171.01026219, 177.40422787,
       179.15725048, 188.30935096, 178.11517384, 173.8346168 ,
       168.40437921, 176.21044743, 183.77945312, 194.91755311,
       170.63556204, 169.14381897, 195.87889423, 151.55066664,
       187.90977896, 191.60971669, 188.02412753, 187.55979514])

In [110]:
print("R^2: {}".format(lr_16.score(X_test, y_test)))
rmse = np.sqrt(mean_squared_error(y_test, y_pred_16))
print("Root Mean Squared Error: {}".format(rmse))

R^2: 0.30787577694259743
Root Mean Squared Error: 22.481306505397043


In [111]:
lr_17 = linear_model.ElasticNet(alpha=0.1, l1_ratio=0.25, normalize=True)

In [112]:
lr_17.fit(X_train, y_train)

ElasticNet(alpha=0.1, copy_X=True, fit_intercept=True, l1_ratio=0.25,
      max_iter=1000, normalize=True, positive=False, precompute=False,
      random_state=None, selection='cyclic', tol=0.0001, warm_start=False)

In [113]:
lr_17.score(X_test, y_test)

0.0725357091321811

In [114]:
y_pred_17 = lr_17.predict(X_test)
y_pred_17[0:20]

array([176.81419944, 178.87146455, 177.56952794, 178.71108992,
       178.94841071, 181.06972995, 179.28864234, 178.03382554,
       176.96131869, 178.53663968, 180.39169398, 182.19726188,
       177.57181621, 177.22082932, 182.37613486, 173.81807071,
       181.0517028 , 181.35501076, 180.69809804, 181.11019085])

In [115]:
print("R^2: {}".format(lr_17.score(X_test, y_test)))
rmse = np.sqrt(mean_squared_error(y_test, y_pred_17))
print("Root Mean Squared Error: {}".format(rmse))

R^2: 0.0725357091321811
Root Mean Squared Error: 26.024243926681066


In [116]:
lr_18 = linear_model.ElasticNet(alpha=1, l1_ratio=0.25, normalize=True)

In [117]:
lr_18.fit(X_train, y_train)

ElasticNet(alpha=1, copy_X=True, fit_intercept=True, l1_ratio=0.25,
      max_iter=1000, normalize=True, positive=False, precompute=False,
      random_state=None, selection='cyclic', tol=0.0001, warm_start=False)

In [118]:
lr_18.score(X_test, y_test)

-0.007738882107724887

In [119]:
y_pred_18 = lr_18.predict(X_test)
y_pred_18[0:20]

array([179.1480978 , 179.14316307, 179.1467997 , 179.14705374,
       179.1476209 , 179.15174564, 179.14472912, 179.15098806,
       179.1397567 , 179.14746986, 179.14925033, 179.15845061,
       179.14442068, 179.14286662, 179.15587489, 179.1244432 ,
       179.1518528 , 179.15487349, 179.14990126, 179.15109355])

In [120]:
print("R^2: {}".format(lr_18.score(X_test, y_test)))
rmse = np.sqrt(mean_squared_error(y_test, y_pred_18))
print("Root Mean Squared Error: {}".format(rmse))

R^2: -0.007738882107724887
Root Mean Squared Error: 27.127110095625508


In [121]:
filename = 'cancer_lr_18.sav'
pickle.dump(lr_18, open(filename, 'wb'))

In [122]:
lr_19 = linear_model.ElasticNet(alpha=10, l1_ratio=0.25, normalize=True)

In [123]:
lr_19.fit(X_train, y_train)

ElasticNet(alpha=10, copy_X=True, fit_intercept=True, l1_ratio=0.25,
      max_iter=1000, normalize=True, positive=False, precompute=False,
      random_state=None, selection='cyclic', tol=0.0001, warm_start=False)

In [124]:
lr_19.score(X_test, y_test)

-0.00798750229846834

In [125]:
y_pred_19 = lr_19.predict(X_test)
y_pred_19[0:20]

array([179.14755847, 179.14755847, 179.14755847, 179.14755847,
       179.14755847, 179.14755847, 179.14755847, 179.14755847,
       179.14755847, 179.14755847, 179.14755847, 179.14755847,
       179.14755847, 179.14755847, 179.14755847, 179.14755847,
       179.14755847, 179.14755847, 179.14755847, 179.14755847])

In [126]:
print("R^2: {}".format(lr_19.score(X_test, y_test)))
rmse = np.sqrt(mean_squared_error(y_test, y_pred_19))
print("Root Mean Squared Error: {}".format(rmse))

R^2: -0.00798750229846834
Root Mean Squared Error: 27.130456166458586


In [127]:
lr_20 = linear_model.ElasticNet(alpha=100, l1_ratio=0.25, normalize=True)

In [128]:
lr_20.fit(X_train, y_train)

ElasticNet(alpha=100, copy_X=True, fit_intercept=True, l1_ratio=0.25,
      max_iter=1000, normalize=True, positive=False, precompute=False,
      random_state=None, selection='cyclic', tol=0.0001, warm_start=False)

In [129]:
lr_20.score(X_test, y_test)

-0.00798750229846834

In [130]:
y_pred_20 = lr_20.predict(X_test)
y_pred_20[0:20]

array([179.14755847, 179.14755847, 179.14755847, 179.14755847,
       179.14755847, 179.14755847, 179.14755847, 179.14755847,
       179.14755847, 179.14755847, 179.14755847, 179.14755847,
       179.14755847, 179.14755847, 179.14755847, 179.14755847,
       179.14755847, 179.14755847, 179.14755847, 179.14755847])

In [131]:
print("R^2: {}".format(lr_20.score(X_test, y_test)))
rmse = np.sqrt(mean_squared_error(y_test, y_pred_20))
print("Root Mean Squared Error: {}".format(rmse))

R^2: -0.00798750229846834
Root Mean Squared Error: 27.130456166458586


## Elastic Net with L1 Ratio of 0.5

In [132]:
lr_21 = linear_model.ElasticNet(alpha=0.001, l1_ratio=0.5, normalize=True)

In [133]:
lr_21.fit(X_train, y_train)

ElasticNet(alpha=0.001, copy_X=True, fit_intercept=True, l1_ratio=0.5,
      max_iter=1000, normalize=True, positive=False, precompute=False,
      random_state=None, selection='cyclic', tol=0.0001, warm_start=False)

In [134]:
lr_21.score(X_test, y_test)

0.5171383101166998

In [135]:
y_pred_21 = lr_21.predict(X_test)
y_pred_21[0:20]

array([171.66842242, 176.42945061, 163.30730721, 175.69270898,
       177.84493978, 194.36741685, 175.14652324, 166.88145197,
       165.9978823 , 173.82642174, 181.10044387, 203.15344217,
       161.44612749, 160.2450128 , 215.53400894, 124.29084818,
       188.89226082, 202.33228133, 197.22356577, 186.65856415])

In [136]:
print("R^2: {}".format(lr_21.score(X_test, y_test)))
rmse = np.sqrt(mean_squared_error(y_test, y_pred_21))
print("Root Mean Squared Error: {}".format(rmse))

R^2: 0.5171383101166998
Root Mean Squared Error: 18.777635183012883


In [137]:
lr_22 = linear_model.ElasticNet(alpha=0.01, l1_ratio=0.5, normalize=True)

In [138]:
lr_22.fit(X_train, y_train)

ElasticNet(alpha=0.01, copy_X=True, fit_intercept=True, l1_ratio=0.5,
      max_iter=1000, normalize=True, positive=False, precompute=False,
      random_state=None, selection='cyclic', tol=0.0001, warm_start=False)

In [139]:
lr_22.score(X_test, y_test)

0.3404507974746751

In [140]:
y_pred_22 = lr_22.predict(X_test)
y_pred_22[0:20]

array([170.61636164, 176.86268322, 170.04027523, 176.89932787,
       178.93232248, 189.40310438, 177.51211655, 173.52147116,
       167.22292419, 175.887877  , 184.04899179, 196.69059728,
       169.22150972, 167.71674436, 198.73571331, 147.25453885,
       188.59718556, 193.30814215, 189.15295386, 188.28880949])

In [141]:
print("R^2: {}".format(lr_22.score(X_test, y_test)))
rmse = np.sqrt(mean_squared_error(y_test, y_pred_22))
print("Root Mean Squared Error: {}".format(rmse))

R^2: 0.3404507974746751
Root Mean Squared Error: 21.945886158164377


In [142]:
lr_23 = linear_model.ElasticNet(alpha=0.1, l1_ratio=0.5, normalize=True)

In [143]:
lr_23.fit(X_train, y_train)

ElasticNet(alpha=0.1, copy_X=True, fit_intercept=True, l1_ratio=0.5,
      max_iter=1000, normalize=True, positive=False, precompute=False,
      random_state=None, selection='cyclic', tol=0.0001, warm_start=False)

In [144]:
lr_23.score(X_test, y_test)

0.08364511786129704

In [145]:
y_pred_23 = lr_23.predict(X_test)
y_pred_23[0:20]

array([176.74512687, 178.7977992 , 177.30246787, 178.54119787,
       178.89101848, 181.28248327, 179.16331977, 178.25666609,
       176.53706307, 178.44269821, 180.51507686, 182.58406651,
       177.34045126, 176.94545028, 182.91069337, 172.89178476,
       181.27084026, 181.56904081, 180.8195803 , 181.3809751 ])

In [146]:
print("R^2: {}".format(lr_23.score(X_test, y_test)))
rmse = np.sqrt(mean_squared_error(y_test, y_pred_23))
print("Root Mean Squared Error: {}".format(rmse))

R^2: 0.08364511786129704
Root Mean Squared Error: 25.86791178514166


In [147]:
lr_24 = linear_model.ElasticNet(alpha=1, l1_ratio=0.5, normalize=True)

In [148]:
lr_24.fit(X_train, y_train)

ElasticNet(alpha=1, copy_X=True, fit_intercept=True, l1_ratio=0.5,
      max_iter=1000, normalize=True, positive=False, precompute=False,
      random_state=None, selection='cyclic', tol=0.0001, warm_start=False)

In [149]:
lr_24.score(X_test, y_test)

-0.00798750229846834

In [150]:
y_pred_24 = lr_24.predict(X_test)
y_pred_24[0:20]

array([179.14755847, 179.14755847, 179.14755847, 179.14755847,
       179.14755847, 179.14755847, 179.14755847, 179.14755847,
       179.14755847, 179.14755847, 179.14755847, 179.14755847,
       179.14755847, 179.14755847, 179.14755847, 179.14755847,
       179.14755847, 179.14755847, 179.14755847, 179.14755847])

In [151]:
print("R^2: {}".format(lr_24.score(X_test, y_test)))
rmse = np.sqrt(mean_squared_error(y_test, y_pred_24))
print("Root Mean Squared Error: {}".format(rmse))

R^2: -0.00798750229846834
Root Mean Squared Error: 27.130456166458586


In [152]:
lr_25 = linear_model.ElasticNet(alpha=10, l1_ratio=0.5, normalize=True)

In [153]:
lr_25.fit(X_train, y_train)

ElasticNet(alpha=10, copy_X=True, fit_intercept=True, l1_ratio=0.5,
      max_iter=1000, normalize=True, positive=False, precompute=False,
      random_state=None, selection='cyclic', tol=0.0001, warm_start=False)

In [154]:
lr_25.score(X_test, y_test)

-0.00798750229846834

In [155]:
y_pred_25 = lr_25.predict(X_test)
y_pred_25[0:20]

array([179.14755847, 179.14755847, 179.14755847, 179.14755847,
       179.14755847, 179.14755847, 179.14755847, 179.14755847,
       179.14755847, 179.14755847, 179.14755847, 179.14755847,
       179.14755847, 179.14755847, 179.14755847, 179.14755847,
       179.14755847, 179.14755847, 179.14755847, 179.14755847])

In [156]:
print("R^2: {}".format(lr_25.score(X_test, y_test)))
rmse = np.sqrt(mean_squared_error(y_test, y_pred_25))
print("Root Mean Squared Error: {}".format(rmse))

R^2: -0.00798750229846834
Root Mean Squared Error: 27.130456166458586


In [157]:
lr_26 = linear_model.ElasticNet(alpha=100, l1_ratio=0.5, normalize=True)

In [158]:
lr_26.fit(X_train, y_train)

ElasticNet(alpha=100, copy_X=True, fit_intercept=True, l1_ratio=0.5,
      max_iter=1000, normalize=True, positive=False, precompute=False,
      random_state=None, selection='cyclic', tol=0.0001, warm_start=False)

In [159]:
lr_26.score(X_test, y_test)

-0.00798750229846834

In [160]:
y_pred_26 = lr_26.predict(X_test)
y_pred_26[0:20]

array([179.14755847, 179.14755847, 179.14755847, 179.14755847,
       179.14755847, 179.14755847, 179.14755847, 179.14755847,
       179.14755847, 179.14755847, 179.14755847, 179.14755847,
       179.14755847, 179.14755847, 179.14755847, 179.14755847,
       179.14755847, 179.14755847, 179.14755847, 179.14755847])

In [161]:
print("R^2: {}".format(lr_26.score(X_test, y_test)))
rmse = np.sqrt(mean_squared_error(y_test, y_pred_26))
print("Root Mean Squared Error: {}".format(rmse))

R^2: -0.00798750229846834
Root Mean Squared Error: 27.130456166458586


## Elastic Net with L1 Ratio of 0.75

In [162]:
lr_27 = linear_model.ElasticNet(alpha=0.001, l1_ratio=0.75, normalize=True)

In [163]:
lr_27.fit(X_train, y_train)

ElasticNet(alpha=0.001, copy_X=True, fit_intercept=True, l1_ratio=0.75,
      max_iter=1000, normalize=True, positive=False, precompute=False,
      random_state=None, selection='cyclic', tol=0.0001, warm_start=False)

In [164]:
lr_27.score(X_test, y_test)

0.5517773798702275

In [165]:
y_pred_27 = lr_27.predict(X_test)
y_pred_27[0:20]

array([173.05814565, 176.3736161 , 161.55932882, 175.86873184,
       177.33470983, 195.06204289, 174.89435439, 165.50164657,
       167.08055001, 172.92321233, 180.1114842 , 202.85071936,
       160.35898262, 159.03549412, 218.80713232, 120.58610834,
       188.49627093, 202.86697182, 199.41163971, 185.61934068])

In [166]:
print("R^2: {}".format(lr_27.score(X_test, y_test)))
rmse = np.sqrt(mean_squared_error(y_test, y_pred_27))
print("Root Mean Squared Error: {}".format(rmse))

R^2: 0.5517773798702275
Root Mean Squared Error: 18.09157626232452


In [167]:
lr_28 = linear_model.ElasticNet(alpha=0.01, l1_ratio=0.75, normalize=True)

In [168]:
lr_28.fit(X_train, y_train)

ElasticNet(alpha=0.01, copy_X=True, fit_intercept=True, l1_ratio=0.75,
      max_iter=1000, normalize=True, positive=False, precompute=False,
      random_state=None, selection='cyclic', tol=0.0001, warm_start=False)

In [169]:
lr_28.score(X_test, y_test)

0.39299117790692584

In [170]:
y_pred_28 = lr_28.predict(X_test)
y_pred_28[0:20]

array([171.0469776 , 176.3500255 , 168.83794284, 176.35551687,
       178.63255147, 191.31705295, 176.30740184, 173.01952326,
       165.95534085, 175.53181274, 183.9738743 , 198.8669463 ,
       167.19036601, 165.86671323, 203.5200834 , 140.13127039,
       189.48343678, 195.96556675, 191.17258393, 188.73709384])

In [171]:
print("R^2: {}".format(lr_28.score(X_test, y_test)))
rmse = np.sqrt(mean_squared_error(y_test, y_pred_28))
print("Root Mean Squared Error: {}".format(rmse))

R^2: 0.39299117790692584
Root Mean Squared Error: 21.05363181543672


In [172]:
lr_29 = linear_model.ElasticNet(alpha=0.1, l1_ratio=0.75, normalize=True)

In [173]:
lr_29.fit(X_train, y_train)

ElasticNet(alpha=0.1, copy_X=True, fit_intercept=True, l1_ratio=0.75,
      max_iter=1000, normalize=True, positive=False, precompute=False,
      random_state=None, selection='cyclic', tol=0.0001, warm_start=False)

In [174]:
lr_29.score(X_test, y_test)

0.11405651856112609

In [175]:
y_pred_29 = lr_29.predict(X_test)
y_pred_29[0:20]

array([176.56514742, 178.54369489, 176.56874677, 178.37242339,
       179.01528181, 181.94004916, 178.75331567, 178.51974704,
       175.470963  , 178.22901419, 180.73520631, 183.76332844,
       176.70625696, 176.17858516, 184.31529576, 170.37338747,
       181.91524665, 182.46060596, 181.35170544, 182.06965917])

In [176]:
print("R^2: {}".format(lr_29.score(X_test, y_test)))
rmse = np.sqrt(mean_squared_error(y_test, y_pred_29))
print("Root Mean Squared Error: {}".format(rmse))

R^2: 0.11405651856112609
Root Mean Squared Error: 25.435046196771765


In [177]:
lr_30 = linear_model.ElasticNet(alpha=1, l1_ratio=0.75, normalize=True)

In [178]:
lr_30.fit(X_train, y_train)

ElasticNet(alpha=1, copy_X=True, fit_intercept=True, l1_ratio=0.75,
      max_iter=1000, normalize=True, positive=False, precompute=False,
      random_state=None, selection='cyclic', tol=0.0001, warm_start=False)

In [179]:
lr_30.score(X_test, y_test)

-0.00798750229846834

In [180]:
y_pred_30 = lr_30.predict(X_test)
y_pred_30[0:20]

array([179.14755847, 179.14755847, 179.14755847, 179.14755847,
       179.14755847, 179.14755847, 179.14755847, 179.14755847,
       179.14755847, 179.14755847, 179.14755847, 179.14755847,
       179.14755847, 179.14755847, 179.14755847, 179.14755847,
       179.14755847, 179.14755847, 179.14755847, 179.14755847])

In [181]:
print("R^2: {}".format(lr_30.score(X_test, y_test)))
rmse = np.sqrt(mean_squared_error(y_test, y_pred_30))
print("Root Mean Squared Error: {}".format(rmse))

R^2: -0.00798750229846834
Root Mean Squared Error: 27.130456166458586


In [182]:
lr_31 = linear_model.ElasticNet(alpha=10, l1_ratio=0.75, normalize=True)

In [183]:
lr_31.fit(X_train, y_train)

ElasticNet(alpha=10, copy_X=True, fit_intercept=True, l1_ratio=0.75,
      max_iter=1000, normalize=True, positive=False, precompute=False,
      random_state=None, selection='cyclic', tol=0.0001, warm_start=False)

In [184]:
lr_31.score(X_test, y_test)

-0.00798750229846834

In [185]:
y_pred_31 = lr_31.predict(X_test)
y_pred_31[0:20]

array([179.14755847, 179.14755847, 179.14755847, 179.14755847,
       179.14755847, 179.14755847, 179.14755847, 179.14755847,
       179.14755847, 179.14755847, 179.14755847, 179.14755847,
       179.14755847, 179.14755847, 179.14755847, 179.14755847,
       179.14755847, 179.14755847, 179.14755847, 179.14755847])

In [186]:
print("R^2: {}".format(lr_31.score(X_test, y_test)))
rmse = np.sqrt(mean_squared_error(y_test, y_pred_31))
print("Root Mean Squared Error: {}".format(rmse))

R^2: -0.00798750229846834
Root Mean Squared Error: 27.130456166458586


In [187]:
lr_32 = linear_model.ElasticNet(alpha=100, l1_ratio=0.75, normalize=True)

In [188]:
lr_32.fit(X_train, y_train)

ElasticNet(alpha=100, copy_X=True, fit_intercept=True, l1_ratio=0.75,
      max_iter=1000, normalize=True, positive=False, precompute=False,
      random_state=None, selection='cyclic', tol=0.0001, warm_start=False)

In [189]:
lr_32.score(X_test, y_test)

-0.00798750229846834

In [190]:
y_pred_32 = lr_32.predict(X_test)
y_pred_32[0:20]

array([179.14755847, 179.14755847, 179.14755847, 179.14755847,
       179.14755847, 179.14755847, 179.14755847, 179.14755847,
       179.14755847, 179.14755847, 179.14755847, 179.14755847,
       179.14755847, 179.14755847, 179.14755847, 179.14755847,
       179.14755847, 179.14755847, 179.14755847, 179.14755847])

In [191]:
print("R^2: {}".format(lr_32.score(X_test, y_test)))
rmse = np.sqrt(mean_squared_error(y_test, y_pred_32))
print("Root Mean Squared Error: {}".format(rmse))

R^2: -0.00798750229846834
Root Mean Squared Error: 27.130456166458586


## Stochastic Gradient Descent with L1 Penalty

In [192]:
lr_33 = linear_model.SGDRegressor(loss='huber', penalty='l1', alpha=0.00001)

In [193]:
lr_33.fit(X_train, y_train)



SGDRegressor(alpha=1e-05, average=False, early_stopping=False, epsilon=0.1,
       eta0=0.01, fit_intercept=True, l1_ratio=0.15,
       learning_rate='invscaling', loss='huber', max_iter=None,
       n_iter=None, n_iter_no_change=5, penalty='l1', power_t=0.25,
       random_state=None, shuffle=True, tol=None, validation_fraction=0.1,
       verbose=0, warm_start=False)

In [194]:
lr_33.score(X_test, y_test)

-1684463487429.632

In [195]:
y_pred_33 = lr_33.predict(X_test)
y_pred_33[0:20]

array([  4805606.61378396,  -6572477.25158047,   5478359.45157729,
         4164701.96138232,   4003051.02384702,   -256800.0470064 ,
        -6104005.56785677,   2634814.65929894,  -8838996.13992465,
         4734173.05091088,    588810.59713576,   3615827.47463434,
         4806855.69008278,   -127472.60359337,    470508.60934598,
       -12388153.33974496,   3325830.34914112,   1747439.01116739,
         1596844.778505  ,  -9112723.31760527])

In [196]:
print("R^2: {}".format(lr_33.score(X_test, y_test)))
rmse = np.sqrt(mean_squared_error(y_test, y_pred_33))
print("Root Mean Squared Error: {}".format(rmse))

R^2: -1684463487429.632
Root Mean Squared Error: 35071983.08759262


In [197]:
lr_34 = linear_model.SGDRegressor(loss='huber', penalty='l1', alpha=0.0001)

In [198]:
lr_34.fit(X_train, y_train)



SGDRegressor(alpha=0.0001, average=False, early_stopping=False, epsilon=0.1,
       eta0=0.01, fit_intercept=True, l1_ratio=0.15,
       learning_rate='invscaling', loss='huber', max_iter=None,
       n_iter=None, n_iter_no_change=5, penalty='l1', power_t=0.25,
       random_state=None, shuffle=True, tol=None, validation_fraction=0.1,
       verbose=0, warm_start=False)

In [199]:
lr_34.score(X_test, y_test)

-431124369467.1897

In [200]:
y_pred_34 = lr_34.predict(X_test)
y_pred_34[0:20]

array([ 1755127.94108787,  3740342.74554397,  2051469.76198468,
        2078366.97108213,  1768382.86482292,  2510904.96136763,
        3974829.56041152,  2210672.98910004,  4310080.72671576,
        1958573.10570216,  2613476.44658265,   963985.61589711,
        1655771.73263497,  3511479.22203542,  1186869.86817024,
       19791968.60735921,  1980650.82150231,  2717229.32267018,
        2202965.49583436,  8999389.46216077])

In [201]:
print("R^2: {}".format(lr_34.score(X_test, y_test)))
rmse = np.sqrt(mean_squared_error(y_test, y_pred_34))
print("Root Mean Squared Error: {}".format(rmse))

R^2: -431124369467.1897
Root Mean Squared Error: 17743153.442649473


In [202]:
lr_35 = linear_model.SGDRegressor(loss='huber', penalty='l1', alpha=0.001)

In [203]:
lr_35.fit(X_train, y_train)



SGDRegressor(alpha=0.001, average=False, early_stopping=False, epsilon=0.1,
       eta0=0.01, fit_intercept=True, l1_ratio=0.15,
       learning_rate='invscaling', loss='huber', max_iter=None,
       n_iter=None, n_iter_no_change=5, penalty='l1', power_t=0.25,
       random_state=None, shuffle=True, tol=None, validation_fraction=0.1,
       verbose=0, warm_start=False)

In [204]:
lr_35.score(X_test, y_test)

-571632155748.7104

In [205]:
y_pred_35 = lr_35.predict(X_test)
y_pred_35[0:20]

array([  -554757.80116258,   1262602.21043634,   -239322.62936354,
         -245940.68884345,   -625498.01981573,   4452615.08110651,
         1594450.81342583,    -54740.25141891,   1856571.54483308,
         -374769.32495607,    269778.98434479,    191126.50182935,
         -275348.51014361,  -1527416.25486249,   1589307.78718789,
       -23545078.81675937,   -409297.67113839,  -1543494.37566134,
         -150941.00044507,  -8456985.04201551])

In [206]:
print("R^2: {}".format(lr_35.score(X_test, y_test)))
rmse = np.sqrt(mean_squared_error(y_test, y_pred_35))
print("Root Mean Squared Error: {}".format(rmse))

R^2: -571632155748.7104
Root Mean Squared Error: 20430916.446123432


In [207]:
lr_36 = linear_model.SGDRegressor(loss='huber', penalty='l1', alpha=0.01)

In [208]:
lr_36.fit(X_train, y_train)



SGDRegressor(alpha=0.01, average=False, early_stopping=False, epsilon=0.1,
       eta0=0.01, fit_intercept=True, l1_ratio=0.15,
       learning_rate='invscaling', loss='huber', max_iter=None,
       n_iter=None, n_iter_no_change=5, penalty='l1', power_t=0.25,
       random_state=None, shuffle=True, tol=None, validation_fraction=0.1,
       verbose=0, warm_start=False)

In [209]:
lr_36.score(X_test, y_test)

-234585135211.05222

In [210]:
y_pred_36 = lr_36.predict(X_test)
y_pred_36[0:20]

array([ -418723.57165135, -3775513.82952902,  -687974.21696748,
        -763962.26492945,  -331018.53828296, -4119440.7295293 ,
       -4010636.97682366, -1291529.86398461, -4870114.89230471,
        -536180.25965856, -1657530.36193911,  -623541.12588027,
        -865689.66954556, -1037765.3156918 , -2366997.18119827,
       11975979.56559417,  -541123.58764514,    43396.38265694,
       -1087254.80610882,  2940294.27969031])

In [211]:
print("R^2: {}".format(lr_36.score(X_test, y_test)))
rmse = np.sqrt(mean_squared_error(y_test, y_pred_36))
print("Root Mean Squared Error: {}".format(rmse))

R^2: -234585135211.05222
Root Mean Squared Error: 13088195.296902977


In [212]:
lr_37 = linear_model.SGDRegressor(loss='huber', penalty='l1', alpha=0.1)

In [213]:
lr_37.fit(X_train, y_train)



SGDRegressor(alpha=0.1, average=False, early_stopping=False, epsilon=0.1,
       eta0=0.01, fit_intercept=True, l1_ratio=0.15,
       learning_rate='invscaling', loss='huber', max_iter=None,
       n_iter=None, n_iter_no_change=5, penalty='l1', power_t=0.25,
       random_state=None, shuffle=True, tol=None, validation_fraction=0.1,
       verbose=0, warm_start=False)

In [214]:
lr_37.score(X_test, y_test)

-5325723711077.835

In [215]:
y_pred_37 = lr_37.predict(X_test)
y_pred_37[0:20]

array([ 4534213.52161916, 11555492.67100715,  5178173.00430864,
        5500538.27757471,  4800747.41111056, -5382513.99715459,
       11904725.05736755,  6241265.07986538, 13516589.0929867 ,
        4768967.51632361,  7704840.25191826,  2314034.52100915,
        5190652.33565062, 12044553.49898134,  2592250.15582983,
       69303158.2405696 ,  5438731.36855177,  8521117.75558183,
        6501565.16345504, 30924676.27314443])

In [216]:
print("R^2: {}".format(lr_37.score(X_test, y_test)))
rmse = np.sqrt(mean_squared_error(y_test, y_pred_37))
print("Root Mean Squared Error: {}".format(rmse))

R^2: -5325723711077.835
Root Mean Squared Error: 62361828.183784515


In [217]:
lr_38 = linear_model.SGDRegressor(loss='huber', penalty='l1', alpha=1)

In [218]:
lr_38.fit(X_train, y_train)



SGDRegressor(alpha=1, average=False, early_stopping=False, epsilon=0.1,
       eta0=0.01, fit_intercept=True, l1_ratio=0.15,
       learning_rate='invscaling', loss='huber', max_iter=None,
       n_iter=None, n_iter_no_change=5, penalty='l1', power_t=0.25,
       random_state=None, shuffle=True, tol=None, validation_fraction=0.1,
       verbose=0, warm_start=False)

In [219]:
lr_38.score(X_test, y_test)

-4252598995555.149

In [220]:
y_pred_38 = lr_38.predict(X_test)
y_pred_38[0:20]

array([ 4477482.68868022,  9605163.06196534,  4855629.67009252,
        5073054.95024541,  4587014.37504087, 20227933.11141673,
       10152559.58106465,  5656551.09786826, 10750135.68763793,
        5218704.64262902,  6523589.4496639 ,  2240506.28345486,
        4858724.85388658, 10593966.50784375,  2084357.16455002,
       62903412.98869274,  4945754.01166057,  7687461.40759845,
        5725362.40916836, 27943627.72483522])

In [221]:
print("R^2: {}".format(lr_38.score(X_test, y_test)))
rmse = np.sqrt(mean_squared_error(y_test, y_pred_38))
print("Root Mean Squared Error: {}".format(rmse))

R^2: -4252598995555.149
Root Mean Squared Error: 55725854.717884585


In [222]:
lr_39 = linear_model.SGDRegressor(loss='huber', penalty='l1', alpha=10)

In [223]:
lr_39.fit(X_train, y_train)



SGDRegressor(alpha=10, average=False, early_stopping=False, epsilon=0.1,
       eta0=0.01, fit_intercept=True, l1_ratio=0.15,
       learning_rate='invscaling', loss='huber', max_iter=None,
       n_iter=None, n_iter_no_change=5, penalty='l1', power_t=0.25,
       random_state=None, shuffle=True, tol=None, validation_fraction=0.1,
       verbose=0, warm_start=False)

In [224]:
lr_39.score(X_test, y_test)

-42.77282312174619

In [225]:
y_pred_39 = lr_39.predict(X_test)
y_pred_39[0:20]

array([0.00115572, 0.00115572, 0.00115572, 0.00115572, 0.00115572,
       0.00115572, 0.00115572, 0.00115572, 0.00115572, 0.00115572,
       0.00115572, 0.00115572, 0.00115572, 0.00115572, 0.00115572,
       0.00115572, 0.00115572, 0.00115572, 0.00115572, 0.00115572])

In [226]:
print("R^2: {}".format(lr_39.score(X_test, y_test)))
rmse = np.sqrt(mean_squared_error(y_test, y_pred_39))
print("Root Mean Squared Error: {}".format(rmse))

R^2: -42.77282312174619
Root Mean Squared Error: 178.7852972413635


In [227]:
lr_40 = linear_model.SGDRegressor(loss='huber', penalty='l1', alpha=100)

In [228]:
lr_40.fit(X_train, y_train)



SGDRegressor(alpha=100, average=False, early_stopping=False, epsilon=0.1,
       eta0=0.01, fit_intercept=True, l1_ratio=0.15,
       learning_rate='invscaling', loss='huber', max_iter=None,
       n_iter=None, n_iter_no_change=5, penalty='l1', power_t=0.25,
       random_state=None, shuffle=True, tol=None, validation_fraction=0.1,
       verbose=0, warm_start=False)

In [229]:
lr_40.score(X_test, y_test)

-1564098059.8320208

In [230]:
y_pred_40 = lr_40.predict(X_test)
y_pred_40[0:20]

array([ 11792.9130716 , 454824.98485076,  49748.68300463,  68146.10457692,
        23445.75461562, 213865.75955801, 481190.16132494, 117231.83213853,
       576407.94480679,  38808.58203582, 198291.28864823,  25465.70497942,
        48152.01961999, 179635.53752247, 187036.83484931,  24522.64648908,
        59456.27188705,  63875.88593419, 124944.36989122, 213205.9298552 ])

In [231]:
print("R^2: {}".format(lr_40.score(X_test, y_test)))
rmse = np.sqrt(mean_squared_error(y_test, y_pred_40))
print("Root Mean Squared Error: {}".format(rmse))

R^2: -1564098059.8320208
Root Mean Squared Error: 1068714.030527726


In [232]:
lr_41 = linear_model.SGDRegressor(loss='huber', penalty='l2', alpha=0.00001)

In [233]:
lr_41.fit(X_train, y_train)



SGDRegressor(alpha=1e-05, average=False, early_stopping=False, epsilon=0.1,
       eta0=0.01, fit_intercept=True, l1_ratio=0.15,
       learning_rate='invscaling', loss='huber', max_iter=None,
       n_iter=None, n_iter_no_change=5, penalty='l2', power_t=0.25,
       random_state=None, shuffle=True, tol=None, validation_fraction=0.1,
       verbose=0, warm_start=False)

In [234]:
lr_41.score(X_test, y_test)

-10738269233975.428

In [235]:
y_pred_41 = lr_41.predict(X_test)
y_pred_41[0:20]

array([ 7.88115879e+06, -1.91277204e+04,  7.66292317e+06,  7.14312069e+06,
        7.70585203e+06, -1.89389605e+06, -1.85990606e+05,  6.11678231e+06,
       -2.15129363e+06,  7.71437835e+06,  4.85409403e+06,  3.45891821e+06,
        7.63774271e+06,  1.26165598e+07, -2.78753041e+06,  1.08701069e+08,
        7.28643507e+06,  1.16017312e+07,  5.96459791e+06,  4.02664850e+07])

In [236]:
print("R^2: {}".format(lr_41.score(X_test, y_test)))
rmse = np.sqrt(mean_squared_error(y_test, y_pred_41))
print("Root Mean Squared Error: {}".format(rmse))

R^2: -10738269233975.428
Root Mean Squared Error: 88551651.7577043


In [237]:
lr_42 = linear_model.SGDRegressor(loss='huber', penalty='l2', alpha=0.0001)

In [238]:
lr_42.fit(X_train, y_train)



SGDRegressor(alpha=0.0001, average=False, early_stopping=False, epsilon=0.1,
       eta0=0.01, fit_intercept=True, l1_ratio=0.15,
       learning_rate='invscaling', loss='huber', max_iter=None,
       n_iter=None, n_iter_no_change=5, penalty='l2', power_t=0.25,
       random_state=None, shuffle=True, tol=None, validation_fraction=0.1,
       verbose=0, warm_start=False)

In [239]:
lr_42.score(X_test, y_test)

-1418778365667.9895

In [240]:
y_pred_42 = lr_42.predict(X_test)
y_pred_42[0:20]

array([ 2141679.51774515,  -697778.23749604,  1963462.47789632,
        1862955.87525944,  2165225.12278237,  1290059.45590844,
        -741115.63074814,  1558351.39742491, -1607434.53185459,
        2152275.6285099 ,  1099707.53626138,   772111.06114145,
        1737285.1876826 ,  3541742.71409441, -1591465.81681921,
       38927300.04218837,  2085204.79204478,  3675108.21955998,
        1565033.67447   , 14296071.45768733])

In [241]:
print("R^2: {}".format(lr_42.score(X_test, y_test)))
rmse = np.sqrt(mean_squared_error(y_test, y_pred_42))
print("Root Mean Squared Error: {}".format(rmse))

R^2: -1418778365667.9895
Root Mean Squared Error: 32187466.988798093


In [242]:
lr_43 = linear_model.SGDRegressor(loss='huber', penalty='elasticnet', alpha=0.0001)

In [243]:
lr_43.fit(X_train, y_train)



SGDRegressor(alpha=0.0001, average=False, early_stopping=False, epsilon=0.1,
       eta0=0.01, fit_intercept=True, l1_ratio=0.15,
       learning_rate='invscaling', loss='huber', max_iter=None,
       n_iter=None, n_iter_no_change=5, penalty='elasticnet', power_t=0.25,
       random_state=None, shuffle=True, tol=None, validation_fraction=0.1,
       verbose=0, warm_start=False)

In [244]:
lr_43.score(X_test, y_test)

-211099765173.92786

In [245]:
y_pred_43 = lr_43.predict(X_test)
y_pred_43[0:20]

array([ -740728.12149812,  -294394.27324116, -1248119.89835627,
        -974260.15121306,  -634011.80901428, -4408810.72326653,
        -721363.36414488,  -920018.1979965 ,  -331214.17444253,
       -1023864.27786275,  -849445.40152353,  -935626.15482859,
       -1320548.46483373,   -97753.65350883, -1185886.53067186,
       12365808.37859571,  -805700.88886046,    72322.41391468,
        -639672.64276608,  4922161.03726372])

In [246]:
print("R^2: {}".format(lr_43.score(X_test, y_test)))
rmse = np.sqrt(mean_squared_error(y_test, y_pred_43))
print("Root Mean Squared Error: {}".format(rmse))

R^2: -211099765173.92786
Root Mean Squared Error: 12415762.52638403


## Kernel Ridge

In [265]:
lr_44 = KernelRidge(alpha=0.001)

In [266]:
lr_44.fit(X_train, y_train)



KernelRidge(alpha=0.001, coef0=1, degree=3, gamma=None, kernel='linear',
      kernel_params=None)

In [267]:
lr_44.score(X_test, y_test)

0.6166500004431256

In [268]:
y_pred_44 = lr_44.predict(X_test)
y_pred_44[0:20]

array([171.31933594, 182.15234375, 165.21557617, 177.56420898,
       173.06542969, 195.51171875, 169.60546875, 167.9296875 ,
       178.1953125 , 174.59619141, 178.9296875 , 196.88708496,
       157.62866211, 154.1875    , 219.109375  , 109.7890625 ,
       181.87353516, 200.62060547, 202.02734375, 183.99023438])

In [269]:
print("R^2: {}".format(lr_44.score(X_test, y_test)))
rmse = np.sqrt(mean_squared_error(y_test, y_pred_44))
print("Root Mean Squared Error: {}".format(rmse))

R^2: 0.6166500004431256
Root Mean Squared Error: 16.73120636067689


In [270]:
lr_45 = KernelRidge(alpha=0.01)

In [271]:
lr_45.fit(X_train, y_train)



KernelRidge(alpha=0.01, coef0=1, degree=3, gamma=None, kernel='linear',
      kernel_params=None)

In [272]:
lr_45.score(X_test, y_test)

-1.1531403237383362

In [273]:
y_pred_45 = lr_45.predict(X_test)
y_pred_45[0:20]

array([124.04882812, 241.125     , 191.0546875 , 203.03515625,
       123.63476562, 201.84912109, 140.53710938, 200.07617188,
       186.22851562, 193.0546875 , 197.54492188, 106.73144531,
       154.07421875, 138.91992188, 210.07226562, 113.53125   ,
       120.10449219, 162.73046875, 158.59570312, 188.6171875 ])

In [274]:
print("R^2: {}".format(lr_45.score(X_test, y_test)))
rmse = np.sqrt(mean_squared_error(y_test, y_pred_45))
print("Root Mean Squared Error: {}".format(rmse))

R^2: -1.1531403237383362
Root Mean Squared Error: 39.65205404618008


In [275]:
lr_46 = KernelRidge(alpha=0.1)

In [276]:
lr_46.fit(X_train, y_train)

  overwrite_a=False)


KernelRidge(alpha=0.1, coef0=1, degree=3, gamma=None, kernel='linear',
      kernel_params=None)

In [277]:
lr_46.score(X_test, y_test)

0.6327295325360558

In [278]:
y_pred_46 = lr_46.predict(X_test)
y_pred_46[0:20]

array([177.5       , 175.484375  , 162.5234375 , 172.84375   ,
       177.4375    , 195.3046875 , 173.796875  , 163.6640625 ,
       175.1875    , 173.6328125 , 176.390625  , 208.10546875,
       155.453125  , 158.390625  , 220.51757812, 118.75      ,
       188.21875   , 207.65625   , 208.203125  , 182.8125    ])

In [279]:
print("R^2: {}".format(lr_46.score(X_test, y_test)))
rmse = np.sqrt(mean_squared_error(y_test, y_pred_46))
print("Root Mean Squared Error: {}".format(rmse))

R^2: 0.6327295325360558
Root Mean Squared Error: 16.37655417065911


In [280]:
lr_47 = KernelRidge(alpha=1)

In [281]:
lr_47.fit(X_train, y_train)

  overwrite_a=False)


KernelRidge(alpha=1, coef0=1, degree=3, gamma=None, kernel='linear',
      kernel_params=None)

In [282]:
lr_47.score(X_test, y_test)

0.6215538051580854

In [283]:
y_pred_47 = lr_47.predict(X_test)
y_pred_47[0:20]

array([178.39648438, 174.34765625, 162.29980469, 172.21679688,
       176.91015625, 194.85498047, 172.4765625 , 164.20703125,
       174.26757812, 174.81640625, 176.296875  , 205.90185547,
       156.203125  , 157.6015625 , 221.03222656, 118.875     ,
       190.15722656, 208.20117188, 209.2265625 , 181.4453125 ])

In [284]:
print("R^2: {}".format(lr_47.score(X_test, y_test)))
rmse = np.sqrt(mean_squared_error(y_test, y_pred_47))
print("Root Mean Squared Error: {}".format(rmse))

R^2: 0.6215538051580854
Root Mean Squared Error: 16.62384931756596


In [285]:
lr_48 = KernelRidge(alpha=10)

In [286]:
lr_48.fit(X_train, y_train)

KernelRidge(alpha=10, coef0=1, degree=3, gamma=None, kernel='linear',
      kernel_params=None)

In [287]:
lr_48.score(X_test, y_test)

0.6130167553240591

In [288]:
y_pred_48 = lr_48.predict(X_test)
y_pred_48[0:20]

array([181.58984375, 174.14208984, 162.21936035, 172.21887207,
       174.53308105, 195.28686523, 174.00268555, 165.01391602,
       173.32568359, 175.67236328, 177.41259766, 204.43432617,
       155.89025879, 157.5637207 , 221.60331726, 123.80859375,
       192.36755371, 208.20532227, 208.69787598, 181.29199219])

In [289]:
print("R^2: {}".format(lr_48.score(X_test, y_test)))
rmse = np.sqrt(mean_squared_error(y_test, y_pred_48))
print("Root Mean Squared Error: {}".format(rmse))

R^2: 0.6130167553240591
Root Mean Squared Error: 16.810305382083577


In [290]:
lr_49 = KernelRidge(alpha=100)

In [291]:
lr_49.fit(X_train, y_train)

KernelRidge(alpha=100, coef0=1, degree=3, gamma=None, kernel='linear',
      kernel_params=None)

In [292]:
lr_49.score(X_test, y_test)

0.6075483807778729

In [293]:
y_pred_49 = lr_49.predict(X_test)
y_pred_49[0:20]

array([183.81863403, 173.63423157, 162.44210815, 173.29025269,
       173.89501953, 196.8480835 , 174.81037903, 167.3421936 ,
       172.41014099, 175.53692627, 181.84005737, 199.61955261,
       155.51078796, 157.33041382, 220.95731831, 126.10913086,
       193.97167969, 207.45880127, 204.79071045, 182.30224609])

In [294]:
print("R^2: {}".format(lr_49.score(X_test, y_test)))
rmse = np.sqrt(mean_squared_error(y_test, y_pred_49))
print("Root Mean Squared Error: {}".format(rmse))

R^2: 0.6075483807778729
Root Mean Squared Error: 16.92866008939742


## Random Forest

In [247]:
rfr_1 = RandomForestRegressor(n_estimators=10, random_state=0)

In [248]:
rfr_1.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
           oob_score=False, random_state=0, verbose=0, warm_start=False)

In [249]:
rfr_pred_1 = rfr_1.predict(X_test)

In [250]:
rfr_1.score(X_test, y_test)

0.5165718288315223

In [251]:
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, rfr_pred_1))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, rfr_pred_1))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, rfr_pred_1)))

Mean Absolute Error: 14.058147540983606
Mean Squared Error: 353.0132440983606
Root Mean Squared Error: 18.7886466808645


In [252]:
rfr_2 = RandomForestRegressor(n_estimators=100, random_state=0)

In [253]:
rfr_2.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
           oob_score=False, random_state=0, verbose=0, warm_start=False)

In [254]:
rfr_pred_2 = rfr_2.predict(X_test)

In [255]:
rfr_2.score(X_test, y_test)

0.567818289853375

In [256]:
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, rfr_pred_2))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, rfr_pred_2))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, rfr_pred_2)))

Mean Absolute Error: 13.18401639344262
Mean Squared Error: 315.59159485901637
Root Mean Squared Error: 17.76489782855551


In [257]:
rfr_3 = RandomForestRegressor(n_estimators=1000, random_state=0)

In [258]:
rfr_3.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=None,
           oob_score=False, random_state=0, verbose=0, warm_start=False)

In [259]:
rfr_pred_3 = rfr_3.predict(X_test)

In [260]:
rfr_3.score(X_test, y_test)

0.5713508272227407

In [261]:
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, rfr_pred_3))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, rfr_pred_3))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, rfr_pred_3)))

Mean Absolute Error: 13.116432295081951
Mean Squared Error: 313.01203381762264
Root Mean Squared Error: 17.692146105479196


## Deep Learning

In [303]:
df_for_dl = pd.read_csv('cancer_ml6_ml.csv', index_col=['Geography'])

In [304]:
df_for_dl.shape

(3047, 329)

In [305]:
boolean_cols = ['PctSomeCol18_24_isnull', 'PctEmployed16_Over_isnull', 'PctPrivateCoverageAlone_isnull', 
               'age_gt_100', 'household_lt_1', 'PCT_LACCESS_POP10_isnull', 'PCT_LACCESS_LOWI10_isnull', 
               'PCT_LACCESS_CHILD10_isnull', 'PCT_LACCESS_SENIORS10_isnull', 'PCT_LACCESS_HHNV10_isnull', 
               'FOODINSEC_00_02_isnull', 'FOODINSEC_07_09_isnull', 'FOODINSEC_10_12_isnull', 
               'CH_FOODINSEC_02_12_isnull', 'CH_FOODINSEC_09_12_isnull', 'VLFOODSEC_00_02_isnull', 
               'VLFOODSEC_07_09_isnull', 'VLFOODSEC_10_12_isnull', 'CH_VLFOODSEC_02_12_isnull', 
               'CH_VLFOODSEC_09_12_isnull', 'FOODINSEC_CHILD_01_07_isnull', 'FOODINSEC_CHILD_03_11_isnull', 
               'PCT_LOCLFARM07_isnull', 'PCT_LOCLSALE07_isnull', 'PC_DIRSALES07_isnull', 'FMRKT09_isnull', 
               'FMRKT13_isnull', 'PCH_FMRKT_09_13_isnull', 'FMRKTPTH09_isnull', 'FMRKTPTH13_isnull', 
               'PCH_FMRKTPTH_09_13_isnull', 'PCT_FMRKT_SNAP13_isnull', 'PCT_FMRKT_WIC13_isnull', 
               'PCT_FMRKT_WICCASH13_isnull', 'PCT_FMRKT_SFMNP13_isnull', 'PCT_FRMKT_FRVEG13_isnull', 
               'PCT_FRMKT_ANMLPROD13_isnull', 'PCT_FMRKT_OTHER13_isnull', 'VEG_FARMS07_isnull', 
               'VEG_ACRES07_isnull', 'VEG_ACRESPTH07_isnull', 'FRESHVEG_FARMS07_isnull', 'FRESHVEG_ACRES07_isnull', 
               'FRESHVEG_ACRESPTH07_isnull', 'ORCHARD_FARMS07_isnull', 'ORCHARD_ACRES07_isnull', 
               'ORCHARD_ACRESPTH07_isnull', 'BERRY_FARMS07_isnull', 'BERRY_ACRES07_isnull', 
               'BERRY_ACRESPTH07_isnull', 'SLHOUSE07_isnull', 'GHVEG_FARMS07_isnull', 'GHVEG_SQFT07_isnull', 
               'GHVEG_SQFTPTH07_isnull', 'FOODHUB12_isnull', 'CSA07_isnull', 'AGRITRSM_OPS07_isnull', 
               'AGRITRSM_RCT07_isnull', 'FARM_TO_SCHOOL_isnull', 'PCT_OBESE_CHILD08_isnull', 
               'PCT_OBESE_CHILD11_isnull', 'PCH_OBESE_CHILD_08_11_isnull', 'PCT_HSPA09_isnull', 
               'PCH_RECFAC_07_12_isnull', 'PCH_RECFACPTH_07_12_isnull', 'NATAMEN_isnull']

In [306]:
for col in boolean_cols:
    df_for_dl[col] = df_for_dl[col].astype(int)

In [307]:
df_for_dl.to_csv('cancer_ml6_dl.csv')

In [321]:
del df_for_dl

In [322]:
df_for_dl = pd.read_csv('cancer_ml6_dl.csv', index_col=['Geography'])

In [323]:
predictors = np.loadtxt('cancer_ml6_dl.csv', skiprows=1, usecols=np.arange(3,331), delimiter=',')
predictors

array([[1.43000000e+02, 4.30900000e+02, 3.55250000e+04, ...,
        3.45631668e+00, 1.00489000e+03, 3.28091122e+00],
       [3.23000000e+02, 4.92700000e+02, 4.02690000e+04, ...,
        3.49953328e+00, 1.09561000e+03, 3.38777436e+00],
       [2.21000000e+02, 4.79400000e+02, 3.83900000e+04, ...,
        3.30321697e+00, 7.39840000e+02, 3.35689712e+00],
       ...,
       [5.40000000e+01, 4.27200000e+02, 4.68380000e+04, ...,
        3.05870707e+00, 4.53690000e+02, 2.85070650e+00],
       [4.10000000e+01, 3.40900000e+02, 3.32910000e+04, ...,
        3.43075618e+00, 9.54810000e+02, 3.84160054e+00],
       [3.70000000e+01, 3.21000000e+02, 2.63360000e+04, ...,
        3.43075618e+00, 9.54810000e+02, 3.93573953e+00]])

In [324]:
n_cols = predictors.shape[1]
n_cols

328

In [325]:
target = df_for_dl['TARGET_deathRate']

In [326]:
model = Sequential()
model.add(Dense(100, activation='relu', input_shape=(n_cols,)))
model.add(Dense(100, activation='relu'))
model.add(Dense(1))
model.compile(optimizer='adam', loss='mean_squared_error')
model.fit(predictors, target)



<tensorflow.python.keras.callbacks.History at 0x1a54a5978>