
## Train | Validation | Test Split Procedure 

The final test set is often also called a "hold-out" set, since you should not adjust parameters based on it; use it *only* for reporting final expected performance.

0. Clean and adjust data as necessary for X and y
1. Split Data into Train/Validation/Test for both X and y
2. Fit/Train Scaler on Training X Data
3. Scale X Train, X Eval, and X Test Data (using the scaler fitted on Training X Data)
4. Create Model
5. Fit/Train Model on X Train Data
6. Evaluate Model on X Evaluation Data (by creating predictions and comparing to y_eval)
7. Adjust Parameters as Necessary and repeat steps 5 and 6
8. Get final metrics on Test set (not allowed to go back and adjust after this!)

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [23]:
# Load the advertising data; the 'sales' column is split off as the target in the next cells.
df = pd.read_csv("../DATA/Advertising.csv")

In [24]:
# Features: every column except the 'sales' target.
X = df.drop('sales', axis=1)

In [25]:
# Target: the 'sales' column.
y = df['sales']

In [26]:
from sklearn.model_selection import train_test_split

In [6]:
# Plain two-way split: 33% of the rows are set aside as the test set.
# random_state is fixed so the same rows land in each split on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=101)

In [7]:
from sklearn.preprocessing import StandardScaler

In [8]:
# Unfitted scaler; it is fitted on the training features in the next cell.
scaler = StandardScaler()

In [9]:
# Fit on X_train only, so nothing about X_test influences the scaling.
scaler.fit(X_train)

StandardScaler()

In [10]:
# Overwrites X_train with its scaled version. Not idempotent: re-running this cell
# without re-running the split would scale the already-scaled data again.
X_train = scaler.transform(X_train)

In [11]:
# Scale X_test with the scaler fitted on X_train (the scaler is not re-fitted here).
X_test = scaler.transform(X_test)

In [12]:
from sklearn.linear_model import Ridge

In [13]:
# First candidate: Ridge regression with alpha=100.
model = Ridge(alpha=100)

In [14]:
# Train on the scaled training features.
model.fit(X_train, y_train)

Ridge(alpha=100)

In [15]:
# Predictions for the scaled test features.
y_pred = model.predict(X_test)

In [16]:
from sklearn.metrics import mean_squared_error

In [17]:
# Test-set MSE for the alpha=100 model.
mean_squared_error(y_test, y_pred)

7.952592037243845

In [18]:
# Second candidate: same model with a smaller alpha (1 instead of 100).
model_two = Ridge(alpha=1)

In [19]:
# Train the second candidate on the same scaled training data.
model_two.fit(X_train, y_train)

Ridge(alpha=1)

In [20]:
# Predictions of the alpha=1 model on the same test features.
y_pred_two = model_two.predict(X_test)

In [22]:
# Test-set MSE for the alpha=1 model.
# NOTE: both alphas have now been compared on the test set, so X_test is no longer an
# untouched hold-out set; the next section adds a separate validation set for this comparison.
mean_squared_error(y_test,y_pred_two)

2.37316582827988

# Train / Validation / Test Split

In [58]:
# Reload the data so this section starts again from the raw, unscaled columns.
df = pd.read_csv("../DATA/Advertising.csv")
# Target first, then the features (every column except 'sales').
y = df['sales']
X = df.drop(columns='sales')

In [59]:
# 70% of the rows go to training; the remaining 30% (X_other / y_other) is split
# again in the next cell into validation and test sets.
X_train, X_other, y_train, y_other = train_test_split(X, y, test_size=0.3, random_state=101)

In [60]:
# Split the remaining 30% in half (test_size=0.5), so the validation set (X_eval)
# and the test set (X_test) each hold 15% of the original rows.
X_eval, X_test, y_eval, y_test = train_test_split(X_other, y_other, test_size=0.5, random_state=101)

In [61]:
from sklearn.preprocessing import StandardScaler

In [62]:
# Fresh scaler for this section; replaces the one fitted on the earlier two-way split.
scaler = StandardScaler()

In [63]:
# Fit on the training features only; validation and test rows are not seen by the scaler.
scaler.fit(X_train)

StandardScaler()

In [64]:
# Overwrites X_train with its scaled version (re-running this cell alone would scale it twice).
X_train = scaler.transform(X_train)

In [65]:
# Scale the test features with the training-fitted scaler.
X_test = scaler.transform(X_test)

In [66]:
# Scale the validation features with the same training-fitted scaler.
X_eval = scaler.transform(X_eval)

In [67]:
from sklearn.linear_model import Ridge

In [68]:
# First candidate: Ridge regression with alpha=100.
model_one = Ridge(alpha=100)

In [69]:
# Train on the scaled training split only.
model_one.fit(X_train, y_train)

Ridge(alpha=100)

In [71]:
# Predict on the validation split; the test split stays untouched while comparing models.
y_eval_pred = model_one.predict(X_eval)

In [73]:
# Validation MSE for the alpha=100 model.
mean_squared_error(y_eval, y_eval_pred)

7.320101458823872

In [74]:
# Adjusted hyperparameter: retry with alpha=1.
model_two = Ridge(alpha=1)

In [75]:
# Train the alpha=1 model on the same scaled training split.
model_two.fit(X_train, y_train)

Ridge(alpha=1)

In [76]:
# Validation predictions for the alpha=1 model.
new_pred_eval = model_two.predict(X_eval)

In [77]:
# Validation MSE for the alpha=1 model.
# Arguments are passed as (y_true, y_pred), matching sklearn's signature and the other
# mean_squared_error calls in this notebook; MSE is symmetric, so the value is unchanged.
mean_squared_error(y_eval, new_pred_eval)

2.3837830750569866

In [79]:
# Final step: run the alpha=1 model (model_two) on the held-out test split.
y_final_test_pred = model_two.predict(X_test)

In [80]:
# Final reported test MSE; per the procedure above, no further tuning after this point.
mean_squared_error(y_test,y_final_test_pred)

2.254260083800517

## Cross Validation

In [104]:
# Start again from the raw file so earlier scaling does not carry over into this section.
df = pd.read_csv("../DATA/Advertising.csv")
# Features are every column except 'sales'; 'sales' is the target.
X, y = df.drop(columns='sales'), df['sales']

In [105]:
from sklearn.model_selection import train_test_split

In [116]:
# Only a train/test split is made here: the validation folds are carved out of
# X_train by the cross-validation calls below.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)

In [117]:
from sklearn.preprocessing import StandardScaler

In [118]:
# Fresh scaler for the cross-validation section.
scaler = StandardScaler()

In [119]:
# Fit on all of X_train (the test rows are excluded).
scaler.fit(X_train)

StandardScaler()

In [120]:
# Overwrites X_train with its scaled version (re-running this cell alone would scale it twice).
X_train = scaler.transform(X_train)

In [121]:
# Scale the test features with the training-fitted scaler.
X_test = scaler.transform(X_test)

In [122]:
# First candidate for cross-validation: alpha=100.
model = Ridge(alpha=100)

In [123]:
from sklearn.model_selection import cross_val_score

In [124]:
# 5-fold cross-validation on the training data; the scorer returns negated MSE,
# so values closer to zero are better.
# NOTE(review): X_train was scaled above with a scaler fitted on all training rows, so each
# fold's validation rows contributed to the scaling statistics. Passing a Pipeline of
# StandardScaler + Ridge on unscaled data would re-fit the scaler inside every fold.
scores = cross_val_score(model, X_train, y_train, cv=5, scoring='neg_mean_squared_error')

In [125]:
# One negated-MSE value per fold.
scores

array([ -9.32552967,  -4.9449624 , -11.39665242,  -7.0242106 ,
        -8.38562723])

In [126]:
# The fold scores are negated MSEs, so abs() turns their mean back into a positive MSE.
abs(scores.mean())

8.215396464543607

In [127]:
# Second candidate: alpha=1. Rebinds `model`, replacing the alpha=100 estimator.
model = Ridge(alpha=1)

In [128]:
# Repeat the 5-fold CV for the alpha=1 model and show the mean fold MSE as a positive number.
scores = cross_val_score(
    estimator=model,
    X=X_train,
    y=y_train,
    scoring='neg_mean_squared_error',
    cv=5,
)
abs(scores.mean())

3.344839296530695

In [129]:
# Per-fold negated MSE values for the alpha=1 model.
scores

array([-3.15513238, -1.58086982, -5.40455562, -2.21654481, -4.36709384])

In [130]:
# Fit the chosen alpha=1 model on the whole training set before the final test;
# cross_val_score works on clones of the estimator and leaves `model` itself unfitted.
model.fit(X_train, y_train)

Ridge(alpha=1)

In [131]:
# Predictions on the test split, which was not used during cross-validation.
y_final_test_pred = model.predict(X_test)

In [132]:
# Final test-set MSE for the alpha=1 model.
mean_squared_error(y_test, y_final_test_pred)

2.3190215794287514

# cross_validate

In [134]:
# Imports used by this cell
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Features and target, taken from the `df` loaded in the Cross Validation section
y = df['sales']
X = df.drop(columns='sales')

# 70/30 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)

# Fit the scaler on the training features only, then scale both splits with it
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [135]:
from sklearn.model_selection import cross_validate

In [136]:
# First candidate for cross_validate: alpha=100.
model = Ridge(alpha=100)

In [137]:
# 10-fold cross-validation scored with two metrics at once (negated MSE and negated MAE);
# the result also carries per-fold fit and score timings.
scores = cross_validate(model, X_train, y_train, scoring=['neg_mean_squared_error', 'neg_mean_absolute_error'], cv=10)

In [139]:
# Rebind `scores` to a DataFrame so the per-fold values can be aggregated column-wise.
scores = pd.DataFrame(scores)

In [141]:
# Column-wise mean over the folds; the metric columns are negated, so closer to zero is better.
scores.mean()

fit_time                        0.002749
score_time                      0.000648
test_neg_mean_squared_error    -7.565121
test_neg_mean_absolute_error   -2.083773
dtype: float64

In [142]:
# Second candidate: alpha=1. Rebinds `model`, replacing the alpha=100 estimator.
model = Ridge(alpha=1)

In [143]:
# Same 10-fold, two-metric cross-validation, now for the alpha=1 model.
scores = cross_validate(model, X_train, y_train, scoring=['neg_mean_squared_error', 'neg_mean_absolute_error'], cv=10)

In [144]:
# Rebind `scores` to a DataFrame and display it: one row per fold,
# one column per timing/metric entry returned by cross_validate.
scores = pd.DataFrame(scores)
scores

Unnamed: 0,fit_time,score_time,test_neg_mean_squared_error,test_neg_mean_absolute_error
0,0.0,0.0,-2.962508,-1.457174
1,0.009959,0.0,-3.057378,-1.555308
2,0.0,0.0,-2.17374,-1.23877
3,0.0,0.008527,-0.833034,-0.768938
4,0.0,0.0,-3.464018,-1.434489
5,0.0,0.0,-8.232647,-1.494316
6,0.0,0.0,-1.905864,-1.081362
7,0.010163,0.000866,-2.765048,-1.250011
8,0.0,0.0,-4.989505,-1.580971
9,0.0,0.0,-2.846438,-1.223326


In [145]:
# Column-wise mean over the folds for the alpha=1 model (negated metrics: closer to zero is better).
scores.mean()

fit_time                        0.002012
score_time                      0.000939
test_neg_mean_squared_error    -3.323018
test_neg_mean_absolute_error   -1.308467
dtype: float64

In [147]:
# Fit the chosen alpha=1 model on the whole training set before the final test.
model.fit(X_train, y_train)

Ridge(alpha=1)

In [149]:
# Predictions on the test split, which was not used during cross-validation.
y_final_pred = model.predict(X_test)

In [150]:
# Final test-set MSE for the alpha=1 model.
mean_squared_error(y_test, y_final_pred)

2.3190215794287514