In [1]:
# work in progress

# last updated: 2022-05-18

# Train-Test Split, Cross-Validation & K-Fold Validation Python Cookbook

---

## 0. Prepare the Workspace 

In [1]:
# import data analysis packages
import numpy as np
import pandas as pd

# import functions from sci-kit learn 
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score, train_test_split , KFold 
from sklearn import metrics

# import functions from statsmodels
import statsmodels.formula.api as smf

# import data visualization packages
import seaborn as sns
import matplotlib.pyplot as plt

# allow graphs to display in the notebook inline
%matplotlib inline

In [5]:
from sklearn.datasets import load_boston

boston = load_boston()

In [6]:
print(boston.DESCR)

.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pu

In [7]:
X = pd.DataFrame(boston.data, columns=boston.feature_names)

In [8]:
y = pd.DataFrame(boston.target, columns=['MEDV'] )

In [12]:
boston = pd.concat([y, X], axis=1)

In [13]:
boston.head()

Unnamed: 0,MEDV,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,24.0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,21.6,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,34.7,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,33.4,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,36.2,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


## Train-Test Split

>**The Holdout Method: Train/Test Split**
>- **Training set**: Used to train the classifier.
>- **Testing set**: Used to estimate the error rate of the trained classifier.
>- **Advantages**: Fast, simple, computationally inexpensive.
>- **Disadvantages** Eliminates data, imperfectly splits.

---

In [18]:
from sklearn.linear_model import LinearRegression

lr = LinearRegression()

In [20]:
lr.fit(X,y)

LinearRegression()

In [21]:
y_pred = lr.predict(X)

In [22]:
from sklearn import metrics

In [24]:
print(metrics.mean_squared_error(y, y_pred))

21.894831181729206


---

In [14]:
from sklearn.model_selection import train_test_split

In [30]:
X_train, X_split, y_train, y_split = train_test_split(X, y, test_size=0.33, random_state=42)

In [31]:
cross_val_score(lr, X_train, y_train, scoring='neg_mean_squared_error', cv=3)

array([-29.93705035, -25.62338829, -21.56673485])

In [None]:
# Step 2: Train the model on the training set

lr = LinearRegression()
lr.fit(X_train, y_train)

In [None]:
# Step 3: Test the model on the testing set and check the accuracy.

y_pred = lr.predict(X_test)

print(metrics.mean_squared_error(y_train, lr.predict(X_train)))
print(metrics.mean_squared_error(y_test, y_pred))

In [None]:
# Go back to Step 1 and try adding new variables and transformations.
Training error: Decreases as model complexity increases (lower value of k).
Testing error: Is minimized at the optimum model complexity.

### Cross-Validation

In [23]:
# Create a cross-valiation with five folds.

In [None]:
# step 6B.1

from sklearn import model_selection

In [None]:
kf = model_selection.KFold(n_splits=5, shuffle=True)

In [None]:
mse_values = []
scores = []
n = 0

print("~~~~ CROSS VALIDATION each fold ~~~~")
for train_index, test_index in kf.split(X, y):
    lr = LinearRegression().fit(X.iloc[train_index], y.iloc[train_index])
    
    mse_values.append(metrics.mean_squared_error(y.iloc[test_index], lr.predict(X.iloc[test_index])))
    scores.append(lr.score(X, y))
    
    n += 1
    
    print('Model {}'.format(n))
    print('MSE: {}'.format(mse_values[n-1]))
    print('R2: {}\n'.format(scores[n-1]))


print("~~~~ SUMMARY OF CROSS VALIDATION ~~~~")
print('Mean of MSE for all folds: {}'.format(np.mean(mse_values)))
print('Mean of R2 for all folds: {}'.format(np.mean(scores)))

In [None]:
from sklearn.model_selection import cross_val_score

# Note the results will vary each run since we take a different
#   subset of the data each time (since shuffle=True)
kf = model_selection.KFold(n_splits=5, shuffle=True)

print(np.mean(-cross_val_score(lr, X, y, cv=kf, scoring='neg_mean_squared_error')))
print(np.mean(cross_val_score(lr, X, y, cv=kf)))

In [None]:
# Comparing Test Performance With a Null Baseline

# Use .apply() to broadcast a mean for every prediction.

print(metrics.mean_squared_error(y_test, y_test.apply(np.mean, broadcast=True)))

### Three-Way Split

#### Create a cross-valiation with five folds.

In [58]:
from sklearn import model_selection

In [59]:
kf = model_selection.KFold(n_splits=5, shuffle=True)

In [62]:
mse_values = []
scores = []
n = 0

print("~~~~ CROSS VALIDATION each fold ~~~~")
for train_index, test_index in kf.split(X, y):
    lr = LinearRegression().fit(X.iloc[train_index], y.iloc[train_index])
    
    mse_values.append(metrics.mean_squared_error(y.iloc[test_index], lr.predict(X.iloc[test_index])))
    scores.append(lr.score(X, y))
    
    n += 1
    
    print('Model {}'.format(n))
    print('MSE: {}'.format(mse_values[n-1]))
    print('R2: {}\n'.format(scores[n-1]))


print("~~~~ SUMMARY OF CROSS VALIDATION ~~~~")
print('Mean of MSE for all folds: {}'.format(np.mean(mse_values)))
print('Mean of R2 for all folds: {}'.format(np.mean(scores)))

~~~~ CROSS VALIDATION each fold ~~~~
Model 1
MSE: 31.18394377505681
R2: 0.5295983154008959

Model 2
MSE: 33.17999548239591
R2: 0.5301957632849592

Model 3
MSE: 55.51400945679381
R2: 0.5265627748310775

Model 4
MSE: 43.508568066461216
R2: 0.5247978536144398

Model 5
MSE: 43.166183708408894
R2: 0.52926223445908

~~~~ SUMMARY OF CROSS VALIDATION ~~~~
Mean of MSE for all folds: 41.310540097823335
Mean of R2 for all folds: 0.5280833883180904


In [64]:
from sklearn.model_selection import cross_val_score

# Note the results will vary each run since we take a different
#   subset of the data each time (since shuffle=True)
kf = model_selection.KFold(n_splits=5, shuffle=True)

print(np.mean(-cross_val_score(lr, X, y, cv=kf, scoring='neg_mean_squared_error')))
print(np.mean(cross_val_score(lr, X, y, cv=kf)))

39.9426122793
0.529757772933


### Three-Way Data Split

- If model selection and true error estimates are to be computed simultaneously, three disjointed data sets are best.
    - **Training set**: A set of examples used for learning – what parameters of the classifier?
    - **Validation set**: A set of examples used to tune the parameters of the classifier.
    - **Testing set**: A set of examples used ONLY to assess the performance of the fully trained classifier.
- Validation and testing must be separate data sets. Once you have the final model set, you cannot do any additional tuning after testing.

1. Divide data into training, validation, and testing sets.
2. Select architecture (model type) and training parameters (k).
3. Train the model using the training set.
4. Evaluate the model using the training set.
5. Repeat 2–4 times, selecting different architectures (models) and tuning parameters.
6. Select the best model.
7. Assess the model with the final testing set.

---

---