<a href="https://colab.research.google.com/github/chiaramarzi/ML-pipeline/blob/main/pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# (The importance of) pipelines for machine learning analysis

In [1]:
# My repo cloning
# Clone the course repository (it ships the CSV read by the later cells)
# into the current working directory.
! git clone https://github.com/chiaramarzi/ML-pipeline.git

# Make the clone the working directory so relative file paths resolve.
# NOTE(review): the absolute /content prefix assumes the notebook is started
# from /content (as on Colab) -- a relative `%cd ML-pipeline` would be more
# portable; confirm before running elsewhere.
%cd /content/ML-pipeline
# Bring an already-existing clone up to date.
! git pull

Cloning into 'ML-pipeline'...
remote: Enumerating objects: 25, done.[K
remote: Counting objects: 100% (25/25), done.[K
remote: Compressing objects: 100% (22/22), done.[K
remote: Total 25 (delta 8), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (25/25), 877.98 KiB | 7.20 MiB/s, done.
Resolving deltas: 100% (8/8), done.
/content/ML-pipeline
Already up to date.


# Radiomics data
![Overview figure](https://raw.githubusercontent.com/chiaramarzi/ML-pipeline/main/Figures-Overview.png)

# What is a Machine Learning framework?
![Machine learning framework figure](https://raw.githubusercontent.com/chiaramarzi/ML-pipeline/main/Figures-ML%20framework.png)

In [21]:
# Libraries importation

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, KFold, cross_validate, GridSearchCV
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
    classification_report,
    roc_auc_score,
    roc_curve,
    make_scorer
)
from sklearn.feature_selection import SelectKBest
from sklearn.pipeline import Pipeline

In [3]:
# Data loading
# Read the dataset from the working directory (relative path).
data = pd.read_csv("simulated_data_MV.csv")

# Target vector: the 'Gene_mutation' column.
y = data.loc[:, 'Gene_mutation']

# Feature matrix: every column from positional index 2 onward.
# NOTE(review): the recorded `data.head(10)` output lists 'Gene_mutation' as
# column 0 and 'T1w_shape_Elongation' as column 1, so this slice leaves
# 'T1w_shape_Elongation' out of X together with the target. If that is not
# intended, `data.drop(columns=['Gene_mutation'])` keeps every feature --
# confirm against the CSV layout.
X = data.iloc[:, 2:]

In [7]:
# Data exploration
# Show the first ten rows of the raw table.
print(data.head(10))

# Relative frequency of each target label (fractions sum to 1).
# `y` is already a pandas Series, so value_counts can be called on it directly.
freq_relative = y.value_counts(normalize=True)
print(freq_relative)

   Gene_mutation  T1w_shape_Elongation  T1w_shape_Flatness  \
0              1              1.497686            1.347577   
1              1              1.377417            1.227308   
2              1              1.088981            0.938873   
3              1                   NaN                 NaN   
4              1                   NaN                 NaN   
5              0              0.922665            0.772557   
6              1                   NaN                 NaN   
7              1                   NaN                 NaN   
8              0                   NaN                 NaN   
9              0              1.169147            1.019039   

   T1w_shape_LeastAxisLength  T1w_shape_MajorAxisLength  \
0                  33.548088                  55.370788   
1                  33.427819                  55.250519   
2                  33.139384                  54.962084   
3                  33.718013                  55.540713   
4                  33.

In [8]:
# Single seed passed as `random_state` to train_test_split and to the KFold
# splitters in the cells below, so the data partitions are repeatable.
seed = 1

In [12]:
# Baseline: preprocess the WHOLE dataset first, then split into train/test.
# NOTE(review): the imputer, scaler and feature selector are fitted on all
# rows (SelectKBest also sees every label) before train_test_split, so the
# test rows influence the preprocessing. This looks like the deliberate
# "leaky" example that the following cells correct by fitting on the training
# split only -- confirm, and do not read these metrics as an unbiased estimate.
imputer = SimpleImputer()
scaler = StandardScaler()
feat_selector = SelectKBest(k=10)

# Impute -> scale -> keep the 10 best-scoring features, all fitted on full X.
X_imputed = imputer.fit_transform(X)
X_scaled = scaler.fit_transform(X_imputed)
X_preprocessed = feat_selector.fit_transform(X_scaled, y)

# random_state pins the data shuffling done by the stochastic 'saga' solver,
# so re-running the cell reproduces the same fitted model.
clf = LogisticRegression(penalty='elasticnet', solver='saga', l1_ratio=0.5, C=1.0, max_iter=1000, random_state=seed)

# 85% / 15% holdout split of the already-preprocessed matrix.
X_train, X_test, y_train, y_test = train_test_split(X_preprocessed, y, test_size=0.15, random_state=seed)

clf.fit(X_train, y_train)

# Predictions
y_pred = clf.predict(X_test)
y_prob = clf.predict_proba(X_test)[:, 1]  # probability for positive class

# Metrics (label 1 is treated as the positive class)
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred, pos_label=1)
rec = recall_score(y_test, y_pred, pos_label=1)
f1 = f1_score(y_test, y_pred, pos_label=1)
auc = roc_auc_score(y_test, y_prob)  # AUC needs scores, not hard labels
cm = confusion_matrix(y_test, y_pred)

print(f"Accuracy:  {acc:.2f}")
print(f"Precision: {prec:.2f}")
print(f"Recall:    {rec:.2f}")
print(f"F1-score:  {f1:.2f}")
print(f"AUC:       {auc:.2f}")
print("\nConfusion Matrix:")
print(cm)

Accuracy:  0.77
Precision: 0.78
Recall:    0.92
F1-score:  0.84
AUC:       0.91

Confusion Matrix:
[[12 13]
 [ 4 46]]


In [13]:
### HOLDOUT validation ###
# Manual holdout: split first, then fit every preprocessing step on the
# training rows only and merely apply it to the test rows.
data = pd.read_csv("simulated_data_MV.csv")

# NOTE(review): slicing from positional column 2 also skips column 1, not
# just the target column -- confirm this is intended.
X = data.iloc[:, 2::]
y = data['Gene_mutation']

scaler = StandardScaler()
imputer = SimpleImputer()
feat_selector = SelectKBest(k=10)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=seed)

# Preprocessing on training set (fit and transform)
X_train_imputed = imputer.fit_transform(X_train)
X_train_imputed_scaled = scaler.fit_transform(X_train_imputed)
X_train_imputed_scaled_feat_selected = feat_selector.fit_transform(X_train_imputed_scaled, y_train)
# Preprocessing on test set (only transform)
X_test_imputed = imputer.transform(X_test)
X_test_imputed_scaled = scaler.transform(X_test_imputed)
X_test_imputed_scaled_feat_selected = feat_selector.transform(X_test_imputed_scaled)

# random_state pins the data shuffling done by the stochastic 'saga' solver,
# so re-running the cell reproduces the same fitted model.
clf = LogisticRegression(penalty='elasticnet', solver='saga', l1_ratio=0.5, C=1.0, max_iter=1000, random_state=seed)
clf.fit(X_train_imputed_scaled_feat_selected, y_train)

# Predictions
y_pred = clf.predict(X_test_imputed_scaled_feat_selected)
y_prob = clf.predict_proba(X_test_imputed_scaled_feat_selected)[:, 1]  # probability for positive class

# Metrics (label 1 is treated as the positive class)
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred, pos_label=1)
rec = recall_score(y_test, y_pred, pos_label=1)
f1 = f1_score(y_test, y_pred, pos_label=1)
auc = roc_auc_score(y_test, y_prob)  # AUC needs scores, not hard labels
cm = confusion_matrix(y_test, y_pred)

print(f"Accuracy:  {acc:.2f}")
print(f"Precision: {prec:.2f}")
print(f"Recall:    {rec:.2f}")
print(f"F1-score:  {f1:.2f}")
print(f"AUC:       {auc:.2f}")
print("\nConfusion Matrix:")
print(cm)

Accuracy:  0.75
Precision: 0.77
Recall:    0.88
F1-score:  0.82
AUC:       0.86

Confusion Matrix:
[[12 13]
 [ 6 44]]


In [16]:
### kfold CV validation ###
# Manual 5-fold cross-validation: inside every fold the preprocessing steps
# are re-fitted on that fold's training rows and only applied to its test rows.
data = pd.read_csv("simulated_data_MV.csv")

# NOTE(review): slicing from positional column 2 also skips column 1, not
# just the target column -- confirm this is intended.
X = data.iloc[:, 2::]
y = data['Gene_mutation']

scaler = StandardScaler()
imputer = SimpleImputer()
feat_selector = SelectKBest(k=10)
kf = KFold(n_splits=5, shuffle = True, random_state = seed)

# Metrics storage (one entry per fold)
acc_list = []
prec_list = []
rec_list = []
f1_list = []
auc_list = []
for train_index, test_index in kf.split(X):
  # kf.split yields POSITIONAL row indices, so both X and y are indexed with
  # .iloc; plain y[train_index] would be a label lookup that is only correct
  # while the Series carries the default 0..n-1 index.
  X_train = X.iloc[train_index, :]
  X_test = X.iloc[test_index, :]
  y_train = y.iloc[train_index]
  y_test = y.iloc[test_index]

  # Preprocessing on training set (fit and transform)
  X_train_imputed = imputer.fit_transform(X_train)
  X_train_imputed_scaled = scaler.fit_transform(X_train_imputed)
  X_train_imputed_scaled_feat_selected = feat_selector.fit_transform(X_train_imputed_scaled, y_train)
  # Preprocessing on test set (only transform)
  X_test_imputed = imputer.transform(X_test)
  X_test_imputed_scaled = scaler.transform(X_test_imputed)
  X_test_imputed_scaled_feat_selected = feat_selector.transform(X_test_imputed_scaled)

  # A fresh classifier per fold; random_state pins the data shuffling done by
  # the stochastic 'saga' solver so the fold results are repeatable.
  clf = LogisticRegression(penalty='elasticnet', solver='saga', l1_ratio=0.5, C=1.0, max_iter=1000, random_state=seed)
  clf.fit(X_train_imputed_scaled_feat_selected, y_train)

  # Predictions
  y_pred = clf.predict(X_test_imputed_scaled_feat_selected)
  y_prob = clf.predict_proba(X_test_imputed_scaled_feat_selected)[:, 1]  # probability for positive class

  # Metrics (label 1 is treated as the positive class)
  acc = accuracy_score(y_test, y_pred)
  prec = precision_score(y_test, y_pred, pos_label=1)
  rec = recall_score(y_test, y_pred, pos_label=1)
  f1 = f1_score(y_test, y_pred, pos_label=1)
  auc = roc_auc_score(y_test, y_prob)  # AUC needs scores, not hard labels
  cm = confusion_matrix(y_test, y_pred)

  print(f"Accuracy:  {acc:.2f}")
  print(f"Precision: {prec:.2f}")
  print(f"Recall:    {rec:.2f}")
  print(f"F1-score:  {f1:.2f}")
  print(f"AUC:       {auc:.2f}")
  print("\nConfusion Matrix:")
  print(cm)

  # Store
  acc_list.append(acc)
  prec_list.append(prec)
  rec_list.append(rec)
  f1_list.append(f1)
  auc_list.append(auc)

# Unweighted mean of the per-fold metrics
print("\n--- Cross-Validation Results ---")
print(f"Mean Accuracy:  {np.mean(acc_list):.2f}")
print(f"Mean Precision: {np.mean(prec_list):.2f}")
print(f"Mean Recall:    {np.mean(rec_list):.2f}")
print(f"Mean F1-score:  {np.mean(f1_list):.2f}")
print(f"Mean AUC:       {np.mean(auc_list):.2f}")

Accuracy:  0.72
Precision: 0.74
Recall:    0.88
F1-score:  0.80
AUC:       0.84

Confusion Matrix:
[[16 20]
 [ 8 56]]
Accuracy:  0.72
Precision: 0.75
Recall:    0.82
F1-score:  0.78
AUC:       0.88

Confusion Matrix:
[[22 17]
 [11 50]]
Accuracy:  0.71
Precision: 0.72
Recall:    0.85
F1-score:  0.78
AUC:       0.86

Confusion Matrix:
[[20 20]
 [ 9 51]]
Accuracy:  0.75
Precision: 0.87
Recall:    0.76
F1-score:  0.81
AUC:       0.82

Confusion Matrix:
[[22  8]
 [17 53]]
Accuracy:  0.72
Precision: 0.77
Recall:    0.79
F1-score:  0.78
AUC:       0.86

Confusion Matrix:
[[22 15]
 [13 50]]

--- Cross-Validation Results ---
Mean Accuracy:  0.72
Mean Precision: 0.77
Mean Recall:    0.82
Mean F1-score:  0.79
Mean AUC:       0.85


In [17]:
### The pipeline ###
# Bundle imputation, scaling, feature selection and the classifier into one
# estimator, so that a single fit/predict call runs the steps in order.
data = pd.read_csv("simulated_data_MV.csv")

# NOTE(review): slicing from positional column 2 also skips column 1, not
# just the target column -- confirm this is intended.
X = data.iloc[:, 2::]
y = data['Gene_mutation']

scaler = StandardScaler()
imputer = SimpleImputer()
feat_selector = SelectKBest(k=10)
# random_state pins the data shuffling done by the stochastic 'saga' solver,
# so fitting the pipeline twice on the same data gives the same model.
clf = LogisticRegression(penalty='elasticnet', solver='saga', l1_ratio=0.5, C=1.0, max_iter=1000, random_state=seed)
# Step order is the execution order: impute -> scale -> select -> classify.
pipe = Pipeline([('imputer', imputer), ('scaler', scaler), ('feat_selector', feat_selector), ('clf', clf)])

# Bare expression: the notebook renders the pipeline as the cell output.
pipe

In [19]:
## HOLDOUT validation ##
# Hold out 15% of the rows; the split is repeatable through `seed`.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=seed,
)

# One call fits every pipeline step on the training rows only.
pipe.fit(X_train, y_train)

# Predictions
y_prob = pipe.predict_proba(X_test)[:, 1]  # probability for positive class
y_pred = pipe.predict(X_test)

# Metrics (label 1 is treated as the positive class)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
auc = roc_auc_score(y_true=y_test, y_score=y_prob)
f1 = f1_score(y_true=y_test, y_pred=y_pred, pos_label=1)
rec = recall_score(y_true=y_test, y_pred=y_pred, pos_label=1)
prec = precision_score(y_true=y_test, y_pred=y_pred, pos_label=1)
acc = accuracy_score(y_true=y_test, y_pred=y_pred)

# Report
print(f"Accuracy:  {acc:.2f}")
print(f"Precision: {prec:.2f}")
print(f"Recall:    {rec:.2f}")
print(f"F1-score:  {f1:.2f}")
print(f"AUC:       {auc:.2f}")
print("\nConfusion Matrix:")
print(cm)

Accuracy:  0.75
Precision: 0.77
Recall:    0.88
F1-score:  0.82
AUC:       0.86

Confusion Matrix:
[[12 13]
 [ 6 44]]


In [25]:
## kfold CV validation ##
# 5-fold cross-validation of the whole pipeline: cross_validate clones and
# re-fits the pipeline on each fold's training rows, then scores it on the
# fold's held-out rows.
data = pd.read_csv("simulated_data_MV.csv")

# NOTE(review): slicing from positional column 2 also skips column 1, not
# just the target column -- confirm this is intended.
X = data.iloc[:, 2::]
y = data['Gene_mutation']

scaler = StandardScaler()
imputer = SimpleImputer()
feat_selector = SelectKBest(k=10)
# random_state pins the data shuffling done by the stochastic 'saga' solver,
# so the cross-validation scores are repeatable.
clf = LogisticRegression(penalty='elasticnet', solver='saga', l1_ratio=0.5, C=1.0, max_iter=1000, random_state=seed)
pipe = Pipeline([('imputer', imputer), ('scaler', scaler), ('feat_selector', feat_selector), ('clf', clf)])

# Scoring dictionary: result-key suffix -> scikit-learn scorer name
scoring = {
    'accuracy': 'accuracy',
    'precision': 'precision',
    'recall': 'recall',
    'f1': 'f1',
    'roc_auc': 'roc_auc'
}

kf = KFold(n_splits=5, shuffle = True, random_state = seed)

# return_estimator=True also keeps the pipeline fitted in each fold
# under scores['estimator'].
scores = cross_validate(pipe, X, y=y, scoring=scoring, cv=kf, return_estimator=True)

# Display mean results only (per-fold values live in scores['test_<metric>'])
print("\n--- Average Cross-Validation Metrics ---")
for metric in scoring.keys():
    mean = np.mean(scores[f'test_{metric}'])
    print(f"{metric.capitalize():<10}: {mean:.2f}")


--- Average Cross-Validation Metrics ---
Accuracy  : 0.72
Precision : 0.77
Recall    : 0.82
F1        : 0.79
Roc_auc   : 0.85


In [26]:
## Nested kfold CV validation with hyperparameters optimization##
# Outer loop (5 folds) estimates performance; inside each outer training set
# an inner 10-fold grid search picks the hyper-parameters by ROC AUC.
data = pd.read_csv("simulated_data_MV.csv")

# NOTE(review): slicing from positional column 2 also skips column 1, not
# just the target column -- confirm this is intended.
X = data.iloc[:, 2::]
y = data['Gene_mutation']

scaler = StandardScaler()
imputer = SimpleImputer()
feat_selector = SelectKBest(k=10)
# random_state pins the data shuffling done by the stochastic 'saga' solver,
# so the grid-search ranking and the outer scores are repeatable.
clf = LogisticRegression(penalty='elasticnet', solver='saga', l1_ratio=0.5, C=1.0, max_iter=1000, random_state=seed)
pipe = Pipeline([('imputer', imputer), ('scaler', scaler), ('feat_selector', feat_selector), ('clf', clf)])

# Keys follow the '<step name>__<parameter>' convention of the pipeline.
# 2 * 3 * 4 * 5 * 2 = 240 candidate settings are tried per outer fold.
param_grid = {
    "imputer__strategy": ["mean", "median"],
    "feat_selector__k": [1, 10, 20],
    "clf__C": [0.01, 0.1, 1, 10],
    "clf__l1_ratio": [0.0, 0.25, 0.5, 0.75, 1.0],  # from Ridge to Lasso
    "clf__class_weight": [None, 'balanced']
}

# Scoring dictionary for the outer loop, defined here so the cell does not
# depend on a variable left behind by an earlier cell.
scoring = {
    'accuracy': 'accuracy',
    'precision': 'precision',
    'recall': 'recall',
    'f1': 'f1',
    'roc_auc': 'roc_auc'
}

kf_inner = KFold(n_splits=10, shuffle = True, random_state = seed)
kf_outer = KFold(n_splits=5, shuffle = True, random_state = seed)

# refit=True: after the search, the best setting is re-fitted on the whole
# outer-training set, and that model is what the outer loop scores.
gs = GridSearchCV(pipe, param_grid, scoring='roc_auc', refit=True, cv=kf_inner, verbose=0, return_train_score=True)
# return_estimator=True keeps the fitted grid search of every outer fold
# under scores['estimator'] (e.g. to inspect best_params_ afterwards).
scores = cross_validate(gs, X, y=y, scoring=scoring, cv=kf_outer, return_estimator=True)

# Display mean results only (per-fold values live in scores['test_<metric>'])
print("\n--- Average Nested-Cross-Validation Metrics ---")
for metric in scoring.keys():
    mean = np.mean(scores[f'test_{metric}'])
    print(f"{metric.capitalize():<10}: {mean:.2f}")


--- Average Nested-Cross-Validation Metrics ---
Accuracy  : 0.77
Precision : 0.82
Recall    : 0.82
F1        : 0.82
Roc_auc   : 0.86


In [27]:
# Print the hyper-parameters selected by the inner grid search in each outer
# fold. Iterating over the returned estimators (one per outer fold) avoids
# hard-coding the number of folds.
for fold_estimator in scores['estimator']:
    print(fold_estimator.best_params_)

{'clf__C': 0.1, 'clf__class_weight': None, 'clf__l1_ratio': 0.75, 'feat_selector__k': 20, 'imputer__strategy': 'mean'}
{'clf__C': 0.01, 'clf__class_weight': 'balanced', 'clf__l1_ratio': 0.0, 'feat_selector__k': 20, 'imputer__strategy': 'median'}
{'clf__C': 0.1, 'clf__class_weight': 'balanced', 'clf__l1_ratio': 1.0, 'feat_selector__k': 20, 'imputer__strategy': 'mean'}
{'clf__C': 0.01, 'clf__class_weight': None, 'clf__l1_ratio': 0.0, 'feat_selector__k': 20, 'imputer__strategy': 'mean'}
{'clf__C': 0.01, 'clf__class_weight': None, 'clf__l1_ratio': 0.0, 'feat_selector__k': 20, 'imputer__strategy': 'mean'}
