In [1]:
from sklearn.model_selection import cross_val_score
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier

# Load the Iris dataset
iris = load_iris()
X = iris.data
y = iris.target

# Initialize the model (Random Forest Classifier in this example)
clf = RandomForestClassifier(random_state=42)

# Perform 10-fold cross-validation
scores = cross_val_score(clf, X, y, cv=10)

# Output the scores
print("Cross-Validation Scores:", scores)

# Calculate and output the average score
average_score = scores.mean()
print("Average Cross-Validation Score:", average_score)

Cross-Validation Scores: [1.         0.93333333 1.         0.93333333 0.93333333 0.93333333
 0.93333333 0.93333333 1.         1.        ]
Average Cross-Validation Score: 0.96


In [7]:
from sklearn.model_selection import cross_val_score, KFold
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
import numpy as np

# Generate a dataset
X, y = make_classification(n_samples=100, n_features=20, random_state=42)

# Create a model
model = LogisticRegression()

# Perform k-fold cross-validation
k = 5
kf = KFold(n_splits=k, shuffle=True, random_state=42)
out_of_sample_scores = cross_val_score(model, X, y, cv=kf)

# Calculate in-sample error and out-of-sample error for each fold
in_sample_errors = []
for train_index, test_index in kf.split(X):
    print("train_index:",train_index)
    print("test_index:",test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    # Fit the model on the training data
    model.fit(X_train, y_train)
    
    # Calculate in-sample error
    in_sample_error = np.mean((y_train - model.predict(X_train)) ** 2)
    in_sample_errors.append(in_sample_error)

# Calculate average in-sample error and out-of-sample error
avg_in_sample_error = np.mean(in_sample_errors)
avg_out_of_sample_error = 1 - np.mean(out_of_sample_scores)

# Print the results
print(f"Average In-Sample Error: {avg_in_sample_error}")
print(f"Average Out-Of-Sample Error: {avg_out_of_sample_error}")

# Examine the difference
difference = avg_out_of_sample_error - avg_in_sample_error
print(f"Difference between Out-Of-Sample Error and In-Sample Error: {difference}")


train_index: [ 1  2  3  5  6  7  8  9 11 13 14 15 16 17 19 20 21 23 24 25 26 27 28 29
 32 34 35 36 37 38 40 41 42 43 46 47 48 49 50 51 52 54 55 56 57 58 59 60
 61 62 63 64 65 66 67 68 69 71 72 74 75 78 79 81 82 84 85 86 87 88 89 91
 92 93 94 95 96 97 98 99]
test_index: [ 0  4 10 12 18 22 30 31 33 39 44 45 53 70 73 76 77 80 83 90]
train_index: [ 0  1  2  3  4  6  7  8 10 12 13 14 17 18 19 20 21 22 23 24 25 27 29 30
 31 32 33 34 36 37 38 39 41 43 44 45 46 48 49 50 51 52 53 54 56 57 58 59
 60 61 62 63 64 67 68 70 71 73 74 75 76 77 78 79 80 81 82 83 84 86 87 89
 90 91 92 94 95 97 98 99]
test_index: [ 5  9 11 15 16 26 28 35 40 42 47 55 65 66 69 72 85 88 93 96]
train_index: [ 0  1  2  4  5  9 10 11 12 14 15 16 18 20 21 22 23 26 28 29 30 31 32 33
 35 37 39 40 41 42 43 44 45 46 47 48 50 51 52 53 54 55 56 57 58 59 60 61
 63 65 66 67 68 69 70 71 72 73 74 75 76 77 79 80 82 83 84 85 86 87 88 90
 91 92 93 94 96 97 98 99]
test_index: [ 3  6  7  8 13 17 19 24 25 27 34 36 38 49 62 64 78 81 89 95]
trai

In [5]:
X.shape

(100, 20)

### Comparing in-sample vs out-of-sample error

In [8]:
from sklearn.model_selection import KFold
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import numpy as np

# Load the Iris dataset
iris = load_iris()
X = iris.data
y = iris.target

# Initialize the model (Random Forest Classifier in this example)
clf = RandomForestClassifier(random_state=42)

# Number of folds
num_folds = 10

# Initialize KFold cross-validation
kf = KFold(n_splits=num_folds)

# Store training and validation scores
training_scores = []
validation_scores = []

# Perform KFold cross-validation
for train_index, val_index in kf.split(X):
    # Split data into training and validation sets
    X_train, X_val = X[train_index], X[val_index]
    y_train, y_val = y[train_index], y[val_index]
    
    # Train the model on the training data
    clf.fit(X_train, y_train)
    
    # Calculate training accuracy
    train_accuracy = accuracy_score(y_train, clf.predict(X_train))
    training_scores.append(train_accuracy)
    
    # Calculate validation accuracy
    val_accuracy = accuracy_score(y_val, clf.predict(X_val))
    validation_scores.append(val_accuracy)

# Output the training and validation scores
print("Training Scores:", training_scores)
print("Validation Scores:", validation_scores)

# Calculate and output the average scores
average_train_score = np.mean(training_scores)
average_val_score = np.mean(validation_scores)
print("Average Training Score:", average_train_score)
print("Average Validation Score:", average_val_score)

# Check for overfitting
if average_train_score > average_val_score:
    print("The model may be overfitting")
else:
    print("The model is not overfitting")


Training Scores: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
Validation Scores: [1.0, 1.0, 1.0, 1.0, 0.9333333333333333, 0.8666666666666667, 1.0, 0.8666666666666667, 0.8, 1.0]
Average Training Score: 1.0
Average Validation Score: 0.9466666666666667
The model may be overfitting
