<a href="https://colab.research.google.com/github/dajebbar/FreeCodeCamp-python-data-analysis/blob/main/data_leakage.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Train-test split: naive approach

## Test classification dataset

In [1]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
from sklearn.datasets import make_classification

# Build a synthetic classification dataset: 10,000 samples with 200 features,
# 140 of them informative and 15 redundant. The fixed random_state keeps the
# generated data the same on every run.
X, y = make_classification(
    n_samples=10000,
    n_features=200,
    n_informative=140,
    n_redundant=15,
    random_state=42,
)

# Show the shapes of the feature matrix and the label vector
print(X.shape, y.shape)

(10000, 200) (10000,)


## Naive approach: normalizing the data before splitting it and evaluating the model

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Naive approach (intentionally leaky): the scaler is fitted on the FULL
# dataset, so the min/max of the rows that later form the test set influence
# the transformation applied to the training rows.
#
# The scaled copy gets its own name so that the raw `X` is left untouched:
# the next cell fits its own scaler on the training split and should start
# from unscaled data, and this cell can be re-run without re-scaling values
# that were already scaled.
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

# split the (already normalized) data into train and test sets (70% / 30%)
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y,
    test_size=0.3,
    random_state=42,
)

# fit the model
model = LogisticRegression(max_iter=500)
model.fit(X_train, y_train)

# predict the held-out rows
preds = model.predict(X_test)

# evaluate the predictions
score = accuracy_score(y_test, preds)

print(f'Score: {score* 100:.3f}%')

Score: 81.500%


# Train-test split: correct approach

In [4]:
# Hold out 30% of the rows BEFORE any preprocessing is fitted
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Learn the scaling parameters from the training rows only, then apply that
# same fitted transformation to both partitions
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

# Train the classifier on the scaled training rows
model = LogisticRegression(max_iter=500)
model.fit(X_train, y_train)

# Predict the held-out rows and measure accuracy against the true labels
preds = model.predict(X_test)
score = accuracy_score(y_test, preds)

print(f'Score: {score* 100:.3f}%')

Score: 81.367%


# Data leakage with k-fold cross-validation: naive approach

In [5]:
from sklearn.model_selection import (
    RepeatedStratifiedKFold,
    cross_val_score,
)


# Naive approach (intentionally leaky): the scaler is fitted on ALL rows once,
# before cross-validation, so every fold's held-out rows have already
# contributed to the scaling parameters.
#
# The result is stored under its own name instead of overwriting `X`, so the
# pipeline-based cell further down still receives the data exactly as it was
# before this cell ran, and re-running this cell does not re-scale its input.
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

# define the model
model = LogisticRegression(max_iter=500)

# define the evaluation procedure
# NOTE: 10 splits x 100 repeats = 1000 model fits, so this cell is slow.
cv = RepeatedStratifiedKFold(
    n_splits=10,
    n_repeats=100,
    random_state=42
)

# evaluate the model on the pre-scaled data using cross-validation
score = cross_val_score(model,
                        X_scaled,
                        y,
                        scoring='accuracy',
                        cv=cv,
                        n_jobs=-1,
                        verbose=1)



[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:   14.9s
[Parallel(n_jobs=-1)]: Done 196 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 446 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 796 tasks      | elapsed:  4.0min


Accuracy: 0.808 +/- 0.012


[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:  5.1min finished


In [6]:
# Summarize the cross-validation accuracies as mean +/- standard deviation
summary = f'Accuracy: {score.mean():.3f} +/- {score.std():.3f}'
print(summary)

Accuracy: 0.808 +/- 0.012


## Correct data preparation for model evaluation with k-fold cross-validation

In [7]:
from sklearn.pipeline import Pipeline

# Chain scaling and classification into a single estimator. When this
# pipeline is handed to cross_val_score, the scaler is fitted together with
# the model on each training fold rather than once on the whole dataset.
processor = Pipeline(
    steps=[
        ('scaler', MinMaxScaler()),
        ('model', LogisticRegression(max_iter=500)),
    ]
)

# Repeated stratified k-fold: 10 splits, repeated 100 times, fixed seed
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=100, random_state=42)

# Score the whole pipeline (scaler + model) with cross-validation
score = cross_val_score(
    processor,
    X,
    y,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
    verbose=0,
)

# Summarize the accuracies as mean +/- standard deviation
summary = f'Accuracy: {score.mean():.3f} +/- {score.std():.3f}'
print(summary)

Accuracy: 0.808 +/- 0.012
