--------------------
#### Cross-validation
- Cross-validation is a crucial technique in machine learning and statistics for evaluating the performance of predictive models.
-------------------------

In [3]:
import pandas as pd
import numpy as np

# import plotting libraries
import matplotlib.pyplot as plt

import seaborn as sns
sns.set(style="white", color_codes=True)
sns.set(font_scale=1.5)

# import the ML algorithm
from sklearn.linear_model import LinearRegression

# import libraries for model validation
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

# import libraries for metrics and reporting
from sklearn import metrics

In [4]:
# URL of the Advertising.csv dataset (hosted on GitHub) that the notebook reads
location = (
    "https://github.com/gridflowai/gridflowAI-datasets-icons/raw/master/"
    "AI-DATASETS/01-MISC/Advertising.csv"
)

In [5]:
# Load the Advertising dataset from `location` and discard the CSV's
# 'Unnamed: 0' column (presumably a saved row index -- confirm), which is
# not used as a feature or as the target.
# The drop is chained rather than done with inplace=True, so the frame is
# built in a single expression from the freshly read CSV.
df_training = pd.read_csv(location).drop(columns=['Unnamed: 0'])

In [6]:
# Preview the first five rows of the loaded frame
df_training.head(n=5)

Unnamed: 0,TV,radio,newspaper,sales
0,230.1,37.8,69.2,22.1
1,44.5,39.3,45.1,10.4
2,17.2,45.9,69.3,9.3
3,151.5,41.3,58.5,18.5
4,180.8,10.8,58.4,12.9


In [9]:
# Names of the columns used as model inputs
feature_cols = ["TV", "radio", "newspaper"]

# Feature matrix: every row, restricted to the feature columns
X = df_training.loc[:, feature_cols]

# Target: the 'sales' column as a Series
y = df_training.loc[:, "sales"]

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [11]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# split reproducible across runs
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [13]:
# Fit a linear regression on the training split
# (fit() returns the estimator itself, so construction and fitting chain)
model = LinearRegression().fit(X_train, y_train)

# Score on the held-out split: predict, then compare with the true targets
y_pred = model.predict(X_test)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print("Mean Squared Error:", mse)

Mean Squared Error: 3.1740973539761024


#### K-Fold Cross-Validation:
- Divide the dataset into K folds.
- For each fold, use K-1 folds for training and the remaining fold for testing.
- Average the performance metric across all folds.

In [14]:
from sklearn.model_selection import cross_val_score, KFold

# 5-fold splitter; rows are shuffled with a fixed seed so the folds are
# reproducible
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

# Score the model on each fold. The 'neg_mean_squared_error' scorer returns
# the MSE negated, so every score here is <= 0.
mse_scores = cross_val_score(
    estimator=model,
    X=X,
    y=y,
    scoring='neg_mean_squared_error',
    cv=kfold,
)

# Flip the sign back to get the average MSE across the folds
mse_mean = -mse_scores.mean()
print("Mean Squared Error (K-Fold CV):", mse_mean)

Mean Squared Error (K-Fold CV): 2.965087804268161


#### Leave-One-Out Cross-Validation (LOOCV):
- Similar to K-Fold CV, but with K equal to the number of samples in the dataset.
- For each fold, use all samples except one for training and the remaining sample for testing.

In [15]:
from sklearn.model_selection import LeaveOneOut

# Leave-one-out splitter: each sample is the test set exactly once
loo = LeaveOneOut()

# Score the model on each single-sample fold. The 'neg_mean_squared_error'
# scorer returns the MSE negated, so every score here is <= 0.
mse_scores = cross_val_score(
    estimator=model,
    X=X,
    y=y,
    scoring='neg_mean_squared_error',
    cv=loo,
)

# Flip the sign back to get the average MSE across all held-out samples
mse_mean = -mse_scores.mean()
print("Mean Squared Error (LOOCV):", mse_mean)

Mean Squared Error (LOOCV): 2.9468998005695037
