# Resampling
This notebook explores resampling. It uses the Boston house price dataset (also bundled with older versions of scikit-learn), which is loaded here from a local `boston.csv` file.

## Imports

In [1]:
# Core libraries
import pandas as pd

# Sklearn processing
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict

# Sklearn regression algorithms
from sklearn.linear_model import LinearRegression

# Sklearn regression model evaluation functions
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.metrics import r2_score

## Load data, split into X and y and scale data

In [2]:
# Read the Boston housing data from the CSV file in the working directory
boston = pd.read_csv("boston.csv")

# MEDV is the target column; every remaining column is an input feature
y = boston["MEDV"]
X = boston.drop(labels=["MEDV"], axis="columns")

# Squash every input feature into the [0, 1] interval.
# Note: X stops being a DataFrame here; transform() hands back a NumPy array.
scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(X)
X = scaler.transform(X)

  return self.partial_fit(X, y)


## Resample using train / test split method

In [3]:
# Hold out 33% of the rows for testing; the fixed seed keeps the split repeatable
seed = 7
test_size = 0.33
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=test_size,
    random_state=seed,
)

# Build the regressor and train it on the training portion only
model = LinearRegression()
model.fit(X_train, y_train)

# Predict the held-out rows and report R^2 as the cell's output
predictions = model.predict(X_test)
r2_score(y_true=y_test, y_pred=predictions)

0.6590081405512094

## Resample using k-fold cross-validation method

In [4]:
# Five shuffled folds; the fixed seed makes the fold assignment repeatable
seed = 7
kfold = KFold(n_splits=5, shuffle=True, random_state=seed)

# Start from a fresh, unfitted estimator
model = LinearRegression()

# Cross-validate: one R^2 score is produced for each of the folds
results = cross_val_score(estimator=model, X=X, y=y, scoring='r2', cv=kfold)
print(results)

# Summarise the per-fold scores
for label, value in (("Mean:", results.mean()), ("Std:", results.std())):
    print(label, value)

[0.57790144 0.76990344 0.64138006 0.73139225 0.80395154]
Mean: 0.7049057438479572
Std: 0.08354868173256094


### Finalise model

Note that `cross_val_score()` trains its own copies of the estimator on each fold, so the `model` object passed in is still unfitted when it returns. Inspecting its coefficients therefore fails with an `AttributeError`:

In [5]:
# `model` was only handed to cross_val_score() and was never fitted directly,
# so it has no learned coefficients yet. The attribute lookup is expected to
# fail; catch that one exception and show its message so the demonstration
# does not stop a top-to-bottom run of the notebook.
try:
    model.coef_
except AttributeError as err:
    print(type(err).__name__ + ":", err)

AttributeError: 'LinearRegression' object has no attribute 'coef_'

If we want to proceed to build our final model we can fit it using all the data:

In [6]:
# Train the final model on every available row (no hold-out set this time).
# fit() is left as the last expression so the fitted estimator is displayed.
model.fit(X=X, y=y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

Get the cross-validation predictions:

In [7]:
# Out-of-fold predictions: one value per row of X, produced with the same
# `kfold` splitter that was used for scoring above.
cross_val_predict(estimator=model, X=X, y=y, cv=kfold)

array([29.78263869, 25.18759152, 31.32588939, 28.58704763, 27.62169827,
       25.8584904 , 22.75120309, 18.59873664, 10.76099096, 18.08494152,
       18.51605175, 20.76908112, 20.48931998, 19.22031474, 19.44404476,
       19.29017101, 20.39068546, 16.95335226, 17.63863428, 18.41757429,
       12.3666236 , 17.60176948, 16.40430934, 13.5872312 , 15.17196734,
       14.1307111 , 15.58348197, 15.05117627, 19.36015011, 20.77348932,
       12.24408724, 18.15630888, 11.61538763, 14.45729109, 14.64402186,
       24.14145594, 22.70873851, 23.37379119, 22.64951455, 31.74193298,
       34.54204849, 27.58611291, 25.17867436, 24.18151264, 22.90523928,
       22.15572472, 20.26191386, 17.85234857,  8.5071921 , 17.14163933,
       21.12611181, 23.71587266, 27.75289469, 23.93146756, 15.05032112,
       30.64239719, 24.3953698 , 33.5261546 , 21.62289099, 21.16278011,
       17.41641328, 17.57480033, 24.06501729, 22.5502131 , 22.17089456,
       29.16834785, 26.19995125, 21.13689766, 17.40285449, 20.61

### Alternative evaluation metrics

Get a list of alternative evaluation metrics that can be used in the call to cross_val_score():

In [8]:
# List every scoring name accepted by the `scoring=` argument.
# Newer scikit-learn releases expose get_scorer_names() and no longer ship the
# SCORERS dict, so prefer the function and fall back to the dict on old
# installations where the function does not exist yet.
try:
    from sklearn.metrics import get_scorer_names
except ImportError:
    from sklearn.metrics import SCORERS
    scorer_names = SCORERS.keys()
else:
    scorer_names = get_scorer_names()

sorted(scorer_names)

['accuracy',
 'adjusted_mutual_info_score',
 'adjusted_rand_score',
 'average_precision',
 'balanced_accuracy',
 'brier_score_loss',
 'completeness_score',
 'explained_variance',
 'f1',
 'f1_macro',
 'f1_micro',
 'f1_samples',
 'f1_weighted',
 'fowlkes_mallows_score',
 'homogeneity_score',
 'mutual_info_score',
 'neg_log_loss',
 'neg_mean_absolute_error',
 'neg_mean_squared_error',
 'neg_mean_squared_log_error',
 'neg_median_absolute_error',
 'normalized_mutual_info_score',
 'precision',
 'precision_macro',
 'precision_micro',
 'precision_samples',
 'precision_weighted',
 'r2',
 'recall',
 'recall_macro',
 'recall_micro',
 'recall_samples',
 'recall_weighted',
 'roc_auc',
 'v_measure_score']