We perform a paired t-test for repeated measurements due to the fact that our test and training sets remain the same over multiple experiments. See https://en.wikipedia.org/wiki/T-test#Dependent_t-test_for_paired_samples.

In [1]:
%cd ..

/Users/Matthew/Repos/modeling


In [2]:
import utility

In [3]:
X_train, X_test, y_train, y_test = utility.load()
X_train, X_test = utility.standardize(X_train, X_test)

Loading data...
Standardizing data...


In [4]:
import numpy
from scipy.stats import ttest_rel

In [5]:
baseline = numpy.repeat(numpy.mean(y_train), len(y_test))
baseline_err = y_test - baseline

In [6]:
from sklearn.linear_model import LinearRegression

In [7]:
ols = LinearRegression()
ols.fit(X_train, y_train)
ols_err = y_test - ols.predict(X_test)
ttest_rel(baseline_err, ols_err)

Ttest_relResult(statistic=-0.28086285691366863, pvalue=0.7788164623287861)

In [8]:
from sklearn.linear_model import Ridge

This `alpha` value was obtained via cross-validation (see `../linear.py` and `../outputs/linear.txt`)

In [9]:
ridge = Ridge(alpha=1000.0)
ridge.fit(X_train, y_train)
ridge_err = y_test - ridge.predict(X_test)
ttest_rel(baseline_err, ridge_err)

Ttest_relResult(statistic=-0.28229249066311085, pvalue=0.7777201199332316)

In [10]:
from sklearn.linear_model import Lasso

This `alpha` value was obtained via cross-validation (see `../linear.py` and `../outputs/linear.txt`)

In [11]:
lasso = Lasso(alpha=0.1)
lasso.fit(X_train, y_train)
lasso_err = y_test - lasso.predict(X_test)
ttest_rel(baseline_err, lasso_err)

Ttest_relResult(statistic=-0.284643300625411, pvalue=0.7759183184032874)

In [12]:
from sklearn.tree import DecisionTreeRegressor

This `max_leaf_nodes` value was obtained via cross-validation (see `../decision_tree.py` and `../ouputs/decision_tree.txt`

In [13]:
dtree = DecisionTreeRegressor(random_state=1, max_leaf_nodes=5)
dtree.fit(X_train, y_train)
dtree_err = y_test - dtree.predict(X_test)
ttest_rel(baseline_err, dtree_err)

Ttest_relResult(statistic=-0.4585430209759148, pvalue=0.6465638575176489)

In [14]:
from sklearn.neighbors import KNeighborsRegressor

This `n_neighbors` value was obtained via cross-validation (see `../instance_based.py` and `../outputs/instance_based.txt`)

In [15]:
knn = KNeighborsRegressor(n_neighbors=10, n_jobs=-1, algorithm='auto', weights='uniform')
knn.fit(X_train, y_train)
knn_err = y_test - knn.predict(X_test)
ttest_rel(baseline_err, knn_err)

Ttest_relResult(statistic=-12.822847379068683, pvalue=1.353146458659577e-37)

In [16]:
from sklearn.neural_network import MLPRegressor

In [17]:
nnet = MLPRegressor(hidden_layer_sizes=(100, 100, 100), random_state=1, early_stopping=True)
nnet.fit(X_train, y_train)
nnet_err = y_test - nnet.predict(X_test)

In [18]:
ttest_rel(baseline_err, nnet_err)

Ttest_relResult(statistic=-25.179257459734313, pvalue=3.0399385109528293e-139)

Now let's compare our two best methods...

In [19]:
ttest_rel(knn_err, nnet_err)

Ttest_relResult(statistic=-9.196038647632946, pvalue=3.8172179087580504e-20)