<a href="https://colab.research.google.com/github/chiaramarzi/ML-pipeline-2024/blob/main/pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# (The importance of) pipelines for machine learning analysis

In [1]:
# Libraries importation

import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, KFold, cross_validate, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import SelectKBest
from sklearn.pipeline import Pipeline

In [2]:
# Clone the tutorial repository into the current working directory.
# (Presumably this is where simulated_data_MV.csv, read below via a relative
# path, comes from — confirm against the repo contents.)
! git clone https://github.com/chiaramarzi/ML-pipeline-2024

# Make the cloned repo the working directory; the absolute /content path
# assumes a Google Colab runtime.
%cd /content/ML-pipeline-2024
# Bring the checkout up to date (useful when the directory already existed
# and the clone above did not fetch anything new).
! git pull

Cloning into 'ML-pipeline-2024'...
remote: Enumerating objects: 9, done.[K
remote: Counting objects: 100% (9/9), done.[K
remote: Compressing objects: 100% (7/7), done.[K
remote: Total 9 (delta 1), reused 0 (delta 0), pack-reused 0[K
Receiving objects: 100% (9/9), 466.21 KiB | 8.04 MiB/s, done.
Resolving deltas: 100% (1/1), done.
/content/ML-pipeline-2024
Already up to date.


In [3]:
# Load the dataset from the CSV in the current working directory.
data = pd.read_csv("simulated_data_MV.csv")

# Feature matrix: every column from position 2 onward.
X = data.iloc[:, 2:]
# Target vector: the 'Gene_mutation' column.
y = data['Gene_mutation']

In [4]:
# Shared random seed passed as `random_state` to train_test_split and KFold
# in the cells below, so the data splits are reproducible across runs.
seed = 1

In [5]:
# NOTE: in this cell every preprocessing step is fitted on the FULL dataset,
# i.e. before the train/test split further down. The held-out rows therefore
# take part in imputation, scaling and feature selection.
imputer = SimpleImputer()
scaler = StandardScaler()
feat_selector = SelectKBest(k=10)

# Impute -> scale -> select the 10 best features, all on the whole of X.
X_imputed = imputer.fit_transform(X)
X_scaled = scaler.fit_transform(X_imputed)
X_preprocessed = feat_selector.fit_transform(X_scaled, y)

# Hold out 15% of the already-preprocessed rows for testing.
X_train, X_test, y_train, y_test = train_test_split(
    X_preprocessed, y, test_size=0.15, random_state=seed
)

# Train the SVM on the training part and score it on the held-out part.
clf = SVC(random_state=0)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {round(acc, 2)}\n")

Test Accuracy: 0.89



In [6]:
### HOLDOUT validation ###
# Reload the data so that this cell does not depend on earlier transformations.
data = pd.read_csv("simulated_data_MV.csv")
X = data.iloc[:, 2:]
y = data['Gene_mutation']

# Split first: 15% of the rows are held out before any preprocessing is fitted.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.15, random_state=seed
)

imputer = SimpleImputer()
scaler = StandardScaler()
feat_selector = SelectKBest(k=10)

# Training set: each step is fitted here and then applied.
X_train_imputed = imputer.fit_transform(X_train)
X_train_imputed_scaled = scaler.fit_transform(X_train_imputed)
X_train_imputed_scaled_feat_selected = feat_selector.fit_transform(
    X_train_imputed_scaled, y_train
)

# Test set: the already-fitted steps are only applied, never re-fitted.
X_test_imputed = imputer.transform(X_test)
X_test_imputed_scaled = scaler.transform(X_test_imputed)
X_test_imputed_scaled_feat_selected = feat_selector.transform(X_test_imputed_scaled)

# Fit the classifier on the processed training data and evaluate on the test data.
clf = SVC(random_state=0)
clf.fit(X_train_imputed_scaled_feat_selected, y_train)
y_pred = clf.predict(X_test_imputed_scaled_feat_selected)
acc = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {round(acc, 2)}\n")

Test Accuracy: 0.85



In [7]:
### kfold CV validation ###
# Manual 5-fold cross-validation: for every fold the preprocessing steps are
# fitted on the training rows only and then applied to the test rows.
data = pd.read_csv("simulated_data_MV.csv")

X = data.iloc[:, 2::]
y = data['Gene_mutation']

scaler = StandardScaler()
imputer = SimpleImputer()
feat_selector = SelectKBest(k=10)
kf = KFold(n_splits=5, shuffle = True, random_state = seed)

acc_list = []
for train_index, test_index in kf.split(X):
  # KFold.split yields POSITIONAL indices, so both X and y are sliced with
  # .iloc. Plain y[train_index] would be a label lookup, which only happens to
  # coincide with positions when the Series carries a default RangeIndex.
  X_train = X.iloc[train_index, :]
  X_test = X.iloc[test_index, :]
  y_train = y.iloc[train_index]
  y_test = y.iloc[test_index]

  # Preprocessing on training set (fit and transform)
  X_train_imputed = imputer.fit_transform(X_train)
  X_train_imputed_scaled = scaler.fit_transform(X_train_imputed)
  X_train_imputed_scaled_feat_selected = feat_selector.fit_transform(X_train_imputed_scaled, y_train)
  # Preprocessing on test set (only transform)
  X_test_imputed = imputer.transform(X_test)
  X_test_imputed_scaled = scaler.transform(X_test_imputed)
  X_test_imputed_scaled_feat_selected = feat_selector.transform(X_test_imputed_scaled)

  # A fresh classifier per fold, trained and scored on this fold's data only.
  clf = SVC(random_state=0)
  clf.fit(X_train_imputed_scaled_feat_selected, y_train)
  y_pred = clf.predict(X_test_imputed_scaled_feat_selected)
  acc = accuracy_score(y_test, y_pred)
  print(f"Test Accuracy: {round(acc, 2)}\n")
  acc_list.append(acc)

# Mean accuracy over the five folds.
print(f"Average Test Accuracy: {round(np.mean(acc_list), 2)}\n")

Test Accuracy: 0.83

Test Accuracy: 0.8

Test Accuracy: 0.79

Test Accuracy: 0.81

Test Accuracy: 0.79

Average Test Accuracy: 0.8



In [8]:
### The pipeline ###
# Reload the data and rebuild the feature matrix / target vector.
data = pd.read_csv("simulated_data_MV.csv")
X = data.iloc[:, 2:]
y = data['Gene_mutation']

# The individual stages, created in the order in which they run.
imputer = SimpleImputer()
scaler = StandardScaler()
feat_selector = SelectKBest(k=10)
clf = SVC(random_state=0)

# Chain the stages into one estimator: impute -> scale -> select -> classify.
steps = [
    ('imputer', imputer),
    ('scaler', scaler),
    ('feat_selector', feat_selector),
    ('clf', clf),
]
pipe = Pipeline(steps)

# Last expression of the cell: shows the pipeline's rich representation.
pipe

In [9]:
## HOLDOUT validation ##
# Hold out 15% of the raw rows; the pipeline takes care of all preprocessing.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.15, random_state=seed
)

# fit() returns the fitted pipeline itself, so prediction can be chained.
y_pred = pipe.fit(X_train, y_train).predict(X_test)
acc = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {round(acc, 2)}\n")

Test Accuracy: 0.85



In [10]:
## kfold CV validation ##
# Reload the data and rebuild the feature matrix / target vector.
data = pd.read_csv("simulated_data_MV.csv")
X = data.iloc[:, 2:]
y = data['Gene_mutation']

# Fresh, unfitted stages and the pipeline that chains them.
imputer = SimpleImputer()
scaler = StandardScaler()
feat_selector = SelectKBest(k=10)
clf = SVC(random_state=0)
pipe = Pipeline([
    ('imputer', imputer),
    ('scaler', scaler),
    ('feat_selector', feat_selector),
    ('clf', clf),
])

# 5 shuffled folds with a fixed seed.
kf = KFold(n_splits=5, shuffle=True, random_state=seed)

# cross_validate fits the pipeline once per fold and scores it on that fold's
# test rows; the fitted pipelines are kept in scores['estimator'].
scores = cross_validate(
    pipe,
    X,
    y=y,
    scoring='accuracy',
    cv=kf,
    return_estimator=True,
)
print(f"Test Accuracy: {scores['test_score']}\n")
print(f"Average Test Accuracy: {round(np.mean(scores['test_score']),2)}\n")

Test Accuracy: [0.83 0.8  0.79 0.81 0.79]

Average Test Accuracy: 0.8



In [11]:
## Nested kfold CV validation with hyperparameters optimization##
# Outer loop (5 folds): estimates test accuracy.
# Inner loop (10 folds, run by GridSearchCV on each outer training set):
# selects the hyperparameters, then refits the best pipeline on that set.
data = pd.read_csv("simulated_data_MV.csv")

X = data.iloc[:, 2::]
y = data['Gene_mutation']

scaler = StandardScaler()
imputer = SimpleImputer()
feat_selector = SelectKBest(k=10)
clf = SVC(random_state=0)
pipe = Pipeline([('imputer', imputer), ('scaler', scaler), ('feat_selector', feat_selector), ('clf', clf)])

# Keys follow the "<step name>__<parameter>" convention of Pipeline.
param_grid = {
    "imputer__strategy": ["mean", "most_frequent"],
    # The original list was [1, 20, 20]: the repeated 20 made the search fit
    # every k=20 candidate twice without changing the outcome, so it is listed
    # once here. NOTE(review): the middle value may have been meant to be 10
    # (the k used elsewhere in this notebook) — confirm with the author.
    "feat_selector__k": [1, 20],
    "clf__C": [0.1, 1],
    "clf__kernel": ["poly", "rbf"],
    "clf__degree": [2]
}

kf_inner = KFold(n_splits=10, shuffle = True, random_state = seed)
kf_outer = KFold(n_splits=5, shuffle = True, random_state = seed)

# refit=True makes each fitted GridSearchCV usable as an estimator by the
# outer cross_validate call; return_estimator=True keeps those fitted searches
# so their best_params_ can be inspected afterwards.
gs = GridSearchCV(pipe, param_grid, scoring='accuracy', refit=True, cv=kf_inner, verbose=0, return_train_score=True)
scores = cross_validate(gs, X, y=y, scoring='accuracy', cv=kf_outer, return_estimator=True)
print(f"Test Accuracy: {scores['test_score']}\n")
print(f"Average Test Accuracy: {round(np.mean(scores['test_score']),2)}\n")

Test Accuracy: [0.8  0.82 0.78 0.79 0.78]

Average Test Accuracy: 0.79



In [12]:
# Show the hyperparameters selected by the inner grid search of each outer
# fold, in fold order (one fitted search per outer fold).
for fold_search in scores['estimator']:
    print(fold_search.best_params_)

{'clf__C': 1, 'clf__degree': 2, 'clf__kernel': 'rbf', 'feat_selector__k': 1, 'imputer__strategy': 'mean'}
{'clf__C': 1, 'clf__degree': 2, 'clf__kernel': 'rbf', 'feat_selector__k': 20, 'imputer__strategy': 'mean'}
{'clf__C': 1, 'clf__degree': 2, 'clf__kernel': 'rbf', 'feat_selector__k': 20, 'imputer__strategy': 'mean'}
{'clf__C': 1, 'clf__degree': 2, 'clf__kernel': 'rbf', 'feat_selector__k': 20, 'imputer__strategy': 'mean'}
{'clf__C': 1, 'clf__degree': 2, 'clf__kernel': 'rbf', 'feat_selector__k': 20, 'imputer__strategy': 'mean'}
