# Pipeline for clinical and proteomic data

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from scipy.stats import reciprocal, uniform
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, chi2, RFE, SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, roc_auc_score, roc_curve, f1_score
from sklearn.metrics import mean_squared_error, accuracy_score, classification_report, confusion_matrix, roc_auc_score, roc_curve
from sklearn.metrics import auc
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
# Render matplotlib figures inline in the notebook output
%matplotlib inline
from matplotlib.pylab import rcParams
# Default figure size as (width, height), in inches
rcParams['figure.figsize']= 18, 8

import warnings
# Silence every Python warning for the rest of the session
# NOTE(review): this also hides pandas / scikit-learn deprecation warnings - consider a narrower filter
warnings.filterwarnings("ignore")

# Seed NumPy's global random generator
np.random.seed(123456)

In [None]:
# Mount Google Drive under /content/drive (only works inside Google Colab)
from google.colab import drive
drive.mount('/content/drive')

Import the biological dataset containing peptide data. Data will be normalized and analyzed through the pipeline phases.


In [None]:
# Read one worksheet of the Excel workbook stored on Drive
# (fill in the workbook path and the sheet name)
data = pd.read_excel('/content/drive/...', sheet_name='...')
# Discard the columns whose label starts with "Unnamed"
data = data.drop(columns=data.columns[data.columns.str.contains('^Unnamed')])
# Show the loaded table
data

## Data preprocessing




In [None]:
# Display the column labels of the loaded table
data.columns

In [None]:
# Remove the columns that are not needed by the rest of the pipeline
data = data.drop(columns=['Name', 'Gleason'])

Optionally replace missing (NaN) and infinite values with zeroes; they are converted back to NaN and imputed in a later step



In [None]:
# Replace missing (NaN) and infinite values with zeroes, so that zero becomes
# the single placeholder for "no value".
# Both +inf and -inf are covered: replacing np.inf alone leaves -inf untouched.
data = data.replace([np.inf, -np.inf], np.nan).fillna(0.0)
# Report the remaining missing values per column
data.isna().sum()

Optionally delete the rows that have no value for the class variable

In [None]:
# Name of the column holding the class label (edit to match the dataset)
class_column = 'COLUMN COINTAING CLASS'
# Keep only the rows that have a target value (a zero marks a missing one) and
# renumber the rows from 0. reset_index(drop=True) discards the old index
# directly, so it cannot clash with a data column that is itself named 'index'.
data = data[~(data[class_column] == 0.0)].reset_index(drop=True)
data.head()

Now convert all zeroes back to "not a number" (NaN) values before imputing the missing values

In [None]:
# Turn every zero back into NaN so that it is treated as a missing value
# NOTE(review): genuine zero measurements, if any, become missing as well - confirm this is intended
data = data.replace(0.0, np.nan)
# Report the number of missing values per column
data.isna().sum()

## Missing values imputation

In [None]:
# Fill the missing values of every numeric column with that column's mean.
# Non-numeric columns (such as a class column holding text labels) are skipped,
# because taking the mean of strings raises a TypeError.
for column in data.select_dtypes(include='number'):
  data[column] = data[column].fillna(data[column].mean())

In [None]:
# count how many rows are in each target
# Count the rows for each distinct value of the class column
data[class_column].value_counts()

Let's binarize the class variable: 0 for the first class label, 1 for the second class label

In [None]:
# Binarize the class column: first label -> 0, second label -> 1.
# Row mask and column are given in a single .loc call so the assignment is
# guaranteed to write into `data` itself; the chained form
# data[col].loc[mask] = value may write into a temporary copy instead.
data.loc[data[class_column] == 'FIRST_CLASS_LABEL', class_column] = 0
data.loc[data[class_column] == 'SECOND_CLASS_LABEL', class_column] = 1

In [None]:
# output variable
Y = data[class_column]
# training data
X_1 = data.drop([class_column], axis=1)

# Feature selection

Select a subset of variables significant for the analysis phases, by using several (and optional) Machine Learning tools:

* **Pearson correlation**
* **Chi-square test**
* **RFE**
* **Logistic regression**
* **Random Forest**




In [None]:
# convert output to float type
Y = Y.astype(float)
# max number of features
num_feats = 20

In [None]:
# normalization
X_scaled = MinMaxScaler().fit_transform(X_1)
X_scaled = pd.DataFrame(X_scaled, columns = X_1.columns)

#  Pearson correlation


In [None]:
def cor_selector(X, Y, num_feats):
  """Select the features most correlated (Pearson) with the target.

  Args:
    X: DataFrame with one column per candidate feature.
    Y: Target values, aligned row by row with X.
    num_feats: Maximum number of features to select; values <= 0 select none.

  Returns:
    A tuple (cor_support, cor_feature). cor_support is a list with one
    boolean per column of X, True where the column was selected.
    cor_feature is the list of selected column names, ordered by increasing
    absolute correlation.
  """
  feature_name = X.columns.tolist()

  # One coefficient per feature; an undefined (NaN) coefficient counts as zero
  cor_list = []
  for name in feature_name:
    cor = np.corrcoef(X[name], Y)[0, 1]
    cor_list.append(0 if np.isnan(cor) else cor)

  # Rank once, after all coefficients are known
  ranking = np.argsort(np.abs(cor_list))
  # Guard num_feats <= 0: a plain [-0:] slice would keep every feature
  top = ranking[-num_feats:] if num_feats > 0 else ranking[:0]

  cor_feature = [feature_name[idx] for idx in top]
  selected = set(cor_feature)
  cor_support = [name in selected for name in feature_name]

  return cor_support, cor_feature

# Run the Pearson selector on the scaled features and show what it picked
cor_support, cor_feature = cor_selector(X_scaled, Y, num_feats)
print(f'{len(cor_feature)} selected features')
print(cor_feature)

# Chi square


In [None]:
# Keep the num_feats features with the highest chi-square score against the target
chi_selector = SelectKBest(score_func=chi2, k=num_feats).fit(X_scaled, Y)
# Boolean mask over the columns of X_scaled, True for the kept features
chi_support = chi_selector.get_support()
chi_feature = X_scaled.columns[chi_support].tolist()
print(f'{len(chi_feature)} selected features')
print(chi_feature)

# RFE - Recursive Feature Elimination


In [None]:
# Recursive feature elimination driven by a logistic regression:
# 5 features are removed per round until num_feats remain
rfe_selector = RFE(estimator=LogisticRegression(),
                   n_features_to_select=num_feats,
                   step=5,
                   verbose=5).fit(X_scaled, Y)
# Boolean mask over the columns of X_scaled, True for the kept features
rfe_support = rfe_selector.get_support()
rfe_feature = X_scaled.columns[rfe_support].tolist()
print(f'{len(rfe_feature)} selected features')
print(rfe_feature)

# Logistic Regression


In [None]:
# Embedded selection with an L1-penalised logistic regression,
# capped at num_feats selected features
embedded_lr_selector = SelectFromModel(
    LogisticRegression(penalty="l1", solver='liblinear'),
    max_features=num_feats,
).fit(X_scaled, Y)
# Boolean mask over the columns of X_scaled, True for the kept features
embedded_lr_support = embedded_lr_selector.get_support()
embedded_lr_feature = X_scaled.columns[embedded_lr_support].tolist()
print(f'{len(embedded_lr_feature)} selected features')
print(embedded_lr_feature)

# Random Forest



In [None]:
# Embedded selection with a 30-tree random forest,
# capped at num_feats selected features
embedded_rf_selector = SelectFromModel(
    RandomForestClassifier(n_estimators=30),
    max_features=num_feats,
).fit(X_scaled, Y)
# Boolean mask over the columns of X_scaled, True for the kept features
embedded_rf_support = embedded_rf_selector.get_support()
embedded_rf_feature = X_scaled.columns[embedded_rf_support].tolist()
print(f'{len(embedded_rf_feature)} selected features')
print(embedded_rf_feature)

# Feature selection table


In [None]:
feature_name = X_scaled.columns

# One row per feature, one boolean column per selector (True = selected)
feature_selection_df = pd.DataFrame({'Feature': feature_name,
                                     'Pearson': cor_support,
                                     'Chi-2': chi_support,
                                     'RFE': rfe_support,
                                     'Logistic regression': embedded_lr_support,
                                     'Random Forest': embedded_rf_support})

# Number of selectors that picked each feature. Only the boolean selector
# columns are summed: including the 'Feature' names would mix strings with
# booleans, which raises a TypeError on recent pandas versions.
feature_selection_df['Total'] = feature_selection_df.drop(columns='Feature').sum(axis=1)
# Most-voted features first; ties are ordered by descending feature name
feature_selection_df = feature_selection_df.sort_values(['Total', 'Feature'], ascending=False)
# Number the rows from 1 so the index reads as a rank
feature_selection_df.index = range(1, len(feature_selection_df) + 1)
feature_selection_df.head(20)

Now select the features that at least `min_score` feature selection tools agree are relevant


In [None]:
# Minimum number of selectors that must agree on a feature
min_score = 4

# Names of the features whose vote total reaches min_score, in table order
features = feature_selection_df.loc[feature_selection_df['Total'] >= min_score, 'Feature'].tolist()

features

# Machine Learning models training

We will adopt a 10-fold cross validation approach and build the training and test sets through the StratifiedKFold class

In [None]:
# definition of the ML models
lr = LogisticRegression()
dtree = DecisionTreeClassifier()
neigh = KNeighborsClassifier(n_neighbors=3)
svclassifier = SVC(C=1.2058449429580245, gamma=0.0870602087830485, probability=True)
# max_features='sqrt' is what the former 'auto' setting meant for classifiers;
# 'auto' itself is rejected by recent scikit-learn releases
rf_fit = RandomForestClassifier(n_estimators=8, criterion="gini", min_samples_split=2, bootstrap=True,
                                 max_features='sqrt', random_state=42, min_samples_leaf=1)

models = [lr, dtree, neigh, svclassifier, rf_fit]

# creation of the StratifiedKFold object for the 10-fold cross-validation strategy
num_folds = 10
stratified_kfold = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=42)

# common false-positive-rate grid on which every fold's ROC curve is resampled,
# so that curves of different lengths can be averaged point by point
mean_fpr = np.linspace(0, 1, 100)

# mean F1 score of each model, stored in the same order as `models`
f1_scores = []

# NOTE(review): the models are trained on every column of X_scaled; the
# `features` list built by the feature-selection step is not applied here -
# confirm whether X_scaled[features] was intended

# ML models training
for model in models:
    all_tpr = []
    all_f1 = []
    all_acc = []
    all_sens = []
    all_spec = []

    model_name = model.__class__.__name__

    for train_index, test_index in stratified_kfold.split(X_scaled, Y):
        X_train, X_test = X_scaled.iloc[train_index], X_scaled.iloc[test_index]
        y_train, y_test = Y.iloc[train_index], Y.iloc[test_index]

        model.fit(X_train, y_train)

        # predicted labels and probability of the positive class (label 1)
        predicted = model.predict(X_test)
        y_prob = model.predict_proba(X_test)[:, 1]

        # ROC curve of this fold, resampled on the common grid
        fpr, tpr, _ = roc_curve(y_test, y_prob)
        interp_tpr = np.interp(mean_fpr, fpr, tpr)
        interp_tpr[0] = 0.0
        all_tpr.append(interp_tpr)

        # other metrics; labels=[0, 1] keeps the matrix 2x2 even if a fold
        # happens to miss one class. Rows are true labels, columns predictions.
        cm = confusion_matrix(y_test, predicted, labels=[0, 1])
        tn, fp, fn, tp = cm.ravel()
        f1 = f1_score(y_test, predicted)
        acc = accuracy_score(y_test, predicted)
        # sensitivity = recall on the positive class, specificity = recall on the negative class
        sens = tp / (tp + fn)
        spec = tn / (tn + fp)

        all_f1.append(f1)
        all_acc.append(acc)
        all_sens.append(sens)
        all_spec.append(spec)

    # calculate the average for the ROC curve
    mean_tpr = np.mean(all_tpr, axis=0)

    # calculate the average for the F1
    mean_f1 = np.mean(all_f1)
    # add the F1 score to the list
    f1_scores.append(mean_f1)

    # calculate the area under the averaged ROC curve (AUC)
    roc_auc = auc(mean_fpr, mean_tpr)

    # plot the averaged ROC curve for the model
    plt.plot(mean_fpr, mean_tpr, lw=2, label=f'{model_name} (AUC = {roc_auc:.2f})')

    # print all of the other metrics
    print(f'{model_name}:')
    print(f'  AUC: {roc_auc:.2f}')
    print(f'  F1: {mean_f1:.2f}')
    print(f'  Accuracy: {np.mean(all_acc):.2f}')
    print(f'  Sensitivity: {np.mean(all_sens):.2f}')
    print(f'  Specificity: {np.mean(all_spec):.2f}')
    print()

# chance-level diagonal and figure decorations
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Mean ROC Curve - All Models')
plt.legend(loc="lower right")
plt.show()

# Voting

We will use soft voting (i.e. averaging the class probabilities predicted by the ML models, weighted by their mean cross-validated F1 scores) and hard voting (i.e. a majority vote over the labels predicted by the models)

In [None]:
from sklearn.ensemble import VotingClassifier

# define validation dataset (please edit the code)
# NOTE(review): presumably this must hold the same feature columns, scaled the
# same way, as the data the voting classifiers are fitted on - confirm
validation_data = ...

In [None]:
# soft voting
s_voting = VotingClassifier(estimators=[('Rf', rf_fit),
                                        ('Lr', lr),
                                        ('Knn', neigh),
                                        ('Svc', svclassifier),
                                        ('Dtc', dtree)],
                            voting='soft',
                            weights=f1_scores)
s_voting = s_voting.fit(data, Y)
soft_voting = s_voting.predict(validation_data)

In [None]:
# hard voting
h_voting = VotingClassifier( estimators=[ ('Rf', rf_fit),
                                          ('Lr', lr),
                                          ('Knn', neigh),
                                          ('Svc', svclassifier),
                                          ('Dtc', dtree)],
                             voting='hard')
h_voting = h_voting.fit(data, Y)
hard_voting = h_voting.predict(validation_data)

## Voting tables




In [None]:
# Put the validation samples side by side with both ensemble predictions.
# The predictions are plain arrays, so they are attached as new columns
# (assigned by position) instead of being passed to pd.concat.
# NOTE(review): assumes validation_data is the table the predictions refer to - confirm
tab_voting = pd.DataFrame(validation_data).copy()
tab_voting['soft_voting'] = soft_voting
tab_voting['hard_voting'] = hard_voting
tab_voting