In [2]:
import os
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import StackingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.ensemble import VotingClassifier, AdaBoostClassifier, RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, matthews_corrcoef, confusion_matrix
from sklearn.model_selection import cross_val_score

In [3]:
def print_metrics(y_true, predictions, dataset_name):
    precision, recall, f1, _ = precision_recall_fscore_support(y_true, predictions, average='binary')
    accuracy = accuracy_score(y_true, predictions)
    mcc = matthews_corrcoef(y_true, predictions)
    cm = confusion_matrix(y_true, predictions)
    
    print(f"{dataset_name} Precision: {precision:.2f}, Recall: {recall:.2f}, F1 Score: {f1:.2f}")
    print(f"{dataset_name} Accuracy: {accuracy:.2f}, MCC: {mcc:.2f}")
    print(f"{dataset_name} Confusion Matrix:")
    print(cm)

def get_cv_metrics(algorithm, X_train, y_train):
    acc = cross_val_score(algorithm, X_train, y_train, cv=10, scoring="accuracy")
    precision = cross_val_score(algorithm, X_train, y_train, cv=10, scoring="precision")
    recall = cross_val_score(algorithm, X_train, y_train, cv=10, scoring="recall")
    f1 = cross_val_score(algorithm, X_train, y_train, cv=10, scoring="f1")
    mcc = cross_val_score(algorithm, X_train, y_train, cv=10, scoring="roc_auc")
    
    return pd.DataFrame([[acc.mean(), precision.mean(), recall.mean(), f1.mean(), mcc.mean()]], columns=['Mean Accuracy', 'Mean Precision', 'Mean Recall', 'Mean F1', 'Mean MCC'])

In [4]:
clinical_attributes = pd.read_csv('step_04/clinical_attributes.csv')
z_score = pd.read_csv('step_04/z_score.csv')
mutation = pd.read_csv('step_04/mutation.csv')
response = pd.read_csv('step_04/response.csv')

In [5]:
class Datasets():
    def __init__(self, seed):
        X_clinical_train_val, self.X_clinical_test, y_train_val, self.y_test = train_test_split(clinical_attributes, response, test_size=0.1, random_state=seed)
        self.X_clinical_train, self.X_clinical_val, self.y_train, self.y_val = train_test_split(X_clinical_train_val, y_train_val, test_size=0.2, random_state=seed)

        X_mutation_train_val, self.X_mutation_test, _, _ = train_test_split(mutation, response, test_size=0.1, random_state=seed)
        self.X_mutation_train, self.X_mutation_val, _, _ = train_test_split(X_mutation_train_val, y_train_val, test_size=0.2, random_state=seed)

        X_z_score_train_val, self.X_z_score_test, _, _ = train_test_split(z_score, response, test_size=0.1, random_state=seed)
        self.X_z_score_train, self.X_z_score_val, _, _ = train_test_split(X_z_score_train_val, y_train_val, test_size=0.2, random_state=seed)

    def get_dataset(self, datasets):
        X_train, X_val, X_test = None, None, None

        if 'clinical' in datasets:
            dataset = {
                'X_train': self.X_clinical_train,
                'X_val': self.X_clinical_val,
                'X_test': self.X_clinical_test
            }

        if 'mutation' in datasets:
            dataset = {
                'X_train': self.X_mutation_train,
                'X_val': self.X_mutation_val,
                'X_test': self.X_mutation_test
            }

        if 'z_score' in datasets:
            dataset = {
                'X_train': self.X_z_score_train,
                'X_val': self.X_z_score_val,
                'X_test': self.X_z_score_test
            }
        
        dataset.update({
            'y_train': np.squeeze(self.y_train),
            'y_val': np.squeeze(self.y_val),
            'y_test': np.squeeze(self.y_test)
        })

        return dataset

In [6]:
data = Datasets(42)

# Halving Search

### Model definition

In [7]:
clinical_algorithm = AdaBoostClassifier(
    learning_rate=0.6, 
    n_estimators=30, 
    algorithm='SAMME'
    )
z_score_algorithm = DecisionTreeClassifier(
    ccp_alpha=0.0, 
    criterion='log_loss', 
    max_depth=12, 
    max_features='log2', 
    min_samples_leaf=1, 
    min_samples_split=2,
    splitter='random'
    )
mutation_algorithm = RandomForestClassifier(
    ccp_alpha=0.03333333333333333, 
    class_weight='balanced_subsample', 
    criterion='log_loss', 
    max_depth=67, 
    max_features='log2', 
    n_estimators=144
    )

In [8]:
clinical_algorithm.fit(data.get_dataset(['clinical'])['X_train'], data.get_dataset(['clinical'])['y_train'])
z_score_algorithm.fit(data.get_dataset(['z_score'])['X_train'], data.get_dataset(['z_score'])['y_train'])
mutation_algorithm.fit(data.get_dataset(['mutation'])['X_train'], data.get_dataset(['mutation'])['y_train'])

### Test models separated

In [11]:
from sklearn.model_selection import cross_val_score
lst = []

algorithms = {
    'clinical': clinical_algorithm,
    'z_score': z_score_algorithm,
    'mutation': mutation_algorithm

}

for name, algorithm in algorithms.items():
    acc = cross_val_score(algorithm, data.get_dataset([name.lower()])['X_train'], data.get_dataset([name.lower()])['y_train'], cv=10, scoring="accuracy")
    precision = cross_val_score(algorithm, data.get_dataset([name.lower()])['X_train'], data.get_dataset([name.lower()])['y_train'], cv=10, scoring="precision")
    recall = cross_val_score(algorithm, data.get_dataset([name.lower()])['X_train'], data.get_dataset([name.lower()])['y_train'], cv=10, scoring="recall")
    f1 = cross_val_score(algorithm, data.get_dataset([name.lower()])['X_train'], data.get_dataset([name.lower()])['y_train'], cv=10, scoring="f1")
    mcc = cross_val_score(algorithm, data.get_dataset([name.lower()])['X_train'], data.get_dataset([name.lower()])['y_train'], cv=10, scoring="roc_auc")

    lst.append([name, acc.mean(), precision.mean(), recall.mean(), f1.mean(), mcc.mean()])

pd.DataFrame(lst, columns=['Algorithm', 'Mean Accuracy', 'Mean Precision', 'Mean Recall', 'Mean F1', 'Mean MCC'])

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Unnamed: 0,Algorithm,Mean Accuracy,Mean Precision,Mean Recall,Mean F1,Mean MCC
0,clinical,0.650588,0.626479,0.678537,0.649847,0.724585
1,z_score,0.537647,0.505351,0.502622,0.500936,0.551893
2,mutation,0.487059,0.143529,0.5,0.408254,0.503659


In [62]:
clinical_algorithm_pred = clinical_algorithm.predict(data.get_dataset(['clinical'])['X_val'])
mutation_algorithm_pred = mutation_algorithm.predict(data.get_dataset(['mutation'])['X_val'])
z_score_algorithm_pred = z_score_algorithm.predict(data.get_dataset(['z_score'])['X_val'])

clinical_algorithm_pred_test = clinical_algorithm.predict(data.get_dataset(['clinical'])['X_test'])
mutation_algorithm_pred_test = mutation_algorithm.predict(data.get_dataset(['mutation'])['X_test'])
z_score_algorithm_pred_test = z_score_algorithm.predict(data.get_dataset(['z_score'])['X_test'])

print_metrics(data.get_dataset(['clinical'])['y_val'], clinical_algorithm_pred, 'Clinical Validation')
print_metrics(data.get_dataset(['mutation'])['y_val'], mutation_algorithm_pred, 'Mutation Validation')
print_metrics(data.get_dataset(['z_score'])['y_val'], z_score_algorithm_pred, 'Z-Score Validation')

Clinical Validation Precision: 0.68, Recall: 0.70, F1 Score: 0.69
Clinical Validation Accuracy: 0.66, MCC: 0.31
Clinical Validation Confusion Matrix:
[[59 39]
 [34 81]]
Mutation Validation Precision: 0.54, Recall: 1.00, F1 Score: 0.70
Mutation Validation Accuracy: 0.54, MCC: 0.00
Mutation Validation Confusion Matrix:
[[  0  98]
 [  0 115]]
Z-Score Validation Precision: 0.57, Recall: 0.48, F1 Score: 0.52
Z-Score Validation Accuracy: 0.53, MCC: 0.06
Z-Score Validation Confusion Matrix:
[[57 41]
 [60 55]]


### Merge models in one

In [9]:
meta_X_train = np.column_stack([
    data.get_dataset('clinical')['X_train'].to_numpy(),
    data.get_dataset('mutation')['X_train'].to_numpy(),
    data.get_dataset('z_score')['X_train'].to_numpy() 
])
meta_y_train = data.get_dataset('clinical')['y_train']

meta_X_val = np.column_stack([
    data.get_dataset('clinical')['X_val'].to_numpy(),
    data.get_dataset('mutation')['X_val'].to_numpy(),
    data.get_dataset('z_score')['X_val'].to_numpy() 
])
meta_y_val = data.get_dataset('clinical')['y_val']

meta_X_test = np.column_stack([
    data.get_dataset('clinical')['X_test'].to_numpy(),
    data.get_dataset('mutation')['X_test'].to_numpy(),
    data.get_dataset('z_score')['X_test'].to_numpy() 
])
meta_y_test = data.get_dataset('clinical')['y_test']

In [10]:
voting_clf = VotingClassifier([
    ('clinical', clinical_algorithm),
    ('z_score', z_score_algorithm),
    ('mutation', mutation_algorithm)
], voting='hard')

voting_clf.fit(meta_X_train, meta_y_train)

In [67]:
val_predictions = voting_clf.predict(meta_X_val)
test_predictions = voting_clf.predict(meta_X_test)

print_metrics(meta_y_val, val_predictions, 'Voting Validation')
print_metrics(meta_y_test, test_predictions, 'Voting Test')

Voting Validation Precision: 0.57, Recall: 0.78, F1 Score: 0.66
Voting Validation Accuracy: 0.56, MCC: 0.09
Voting Validation Confusion Matrix:
[[29 69]
 [25 90]]
Voting Test Precision: 0.69, Recall: 0.90, F1 Score: 0.78
Voting Test Accuracy: 0.71, MCC: 0.40
Voting Test Confusion Matrix:
[[23 28]
 [ 7 61]]


In [11]:
lst = []
acc = cross_val_score(voting_clf, meta_X_train, meta_y_train, cv=10, scoring="accuracy")
precision = cross_val_score(voting_clf, meta_X_train, meta_y_train, cv=10, scoring="precision")
recall = cross_val_score(voting_clf, meta_X_train, meta_y_train, cv=10, scoring="recall")
f1 = cross_val_score(voting_clf, meta_X_train, meta_y_train, cv=10, scoring="f1")
mcc = cross_val_score(voting_clf, meta_X_train, meta_y_train, cv=10, scoring="roc_auc")

lst.append(['Voting', acc.mean(), precision.mean(), recall.mean(), f1.mean(), mcc.mean()])
pd.DataFrame(lst, columns=['Algorithm', 'Mean Accuracy', 'Mean Precision', 'Mean Recall', 'Mean F1', 'Mean MCC'])

Traceback (most recent call last):
  File "/home/diego/miniconda3/envs/pandas-env/lib/python3.12/site-packages/sklearn/metrics/_scorer.py", line 139, in __call__
    score = scorer._score(
            ^^^^^^^^^^^^^^
  File "/home/diego/miniconda3/envs/pandas-env/lib/python3.12/site-packages/sklearn/metrics/_scorer.py", line 370, in _score
    response_method = _check_response_method(estimator, self._response_method)
                      ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/diego/miniconda3/envs/pandas-env/lib/python3.12/site-packages/sklearn/utils/validation.py", line 2145, in _check_response_method
    raise AttributeError(
AttributeError: VotingClassifier has none of the following attributes: decision_function, predict_proba.

Traceback (most recent call last):
  File "/home/diego/miniconda3/envs/pandas-env/lib/python3.12/site-packages/sklearn/metrics/_scorer.py", line 139, in __call__
    score = scorer._score(
            ^^^^^^^^^^^^^^
  File "/h

Unnamed: 0,Algorithm,Mean Accuracy,Mean Precision,Mean Recall,Mean F1,Mean MCC
0,Voting,0.647059,0.640674,0.602805,0.621349,


# Teapot

In [12]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from tpot.builtins import ZeroCount
from xgboost import XGBClassifier
from tpot.export_utils import set_param_recursive



In [13]:
clinical_tpot = GradientBoostingClassifier(
    learning_rate=0.1, 
    max_depth=4, 
    max_features=0.8, 
    min_samples_leaf=11, 
    min_samples_split=8,
    n_estimators=100, 
    subsample=0.7000000000000001
    )
mutation_tpot = make_pipeline(
    ZeroCount(),
    XGBClassifier(
        learning_rate=0.1, 
        max_depth=2, 
        min_child_weight=13, 
        n_estimators=100, 
        n_jobs=1, 
        subsample=0.6000000000000001, 
        verbosity=0
        )
)
zscore_tpot = MLPClassifier(
    alpha=0.0001, 
    learning_rate_init=0.001
    )


### Clinical

In [14]:
training_val_features, testing_features, training_val_target, testing_target = \
            train_test_split(clinical_attributes.to_numpy(), response.to_numpy(), random_state=42)
training_features, validation_features, training_target, validation_target = \
            train_test_split(training_val_features, training_val_target, random_state=42)


#clinical_tpot = RandomForestClassifier(bootstrap=True, criterion="gini", max_features=0.5, min_samples_leaf=8, min_samples_split=11, n_estimators=100)
clinical_tpot = GradientBoostingClassifier(learning_rate=0.1, max_depth=4, max_features=0.8, min_samples_leaf=11, min_samples_split=8, n_estimators=100, subsample=0.7000000000000001)

if hasattr(clinical_tpot, 'random_state'):
    setattr(clinical_tpot
, 'random_state', 42)

clinical_tpot.fit(training_features, training_target)

  y = column_or_1d(y, warn=True)


In [27]:
results = clinical_tpot.predict(validation_features)
print_metrics(validation_target, results, "TPOT Clinical")

TPOT Clinical Precision: 0.66, Recall: 0.70, F1 Score: 0.68
TPOT Clinical Accuracy: 0.70, MCC: 0.40
TPOT Clinical Confusion Matrix:
[[87 36]
 [30 69]]


In [30]:
get_cv_metrics(clinical_tpot, training_features, np.squeeze(training_target))

Unnamed: 0,Mean Accuracy,Mean Precision,Mean Recall,Mean F1,Mean MCC
0,0.668725,0.669042,0.668717,0.66788,0.737665


In [31]:
meta_X_train = training_features
meta_X_val = validation_features
meta_X_test = testing_features
meta_y_train = training_target
meta_y_val = validation_target
meta_y_test = testing_target

### Mutation

In [15]:
training_validation_features, testing_features, training_validation_target, testing_target = \
            train_test_split(mutation.to_numpy(), response.to_numpy(), random_state=42)
training_features, validation_features, training_target, validation_target = \
            train_test_split(training_validation_features, training_validation_target, random_state=42)

#mutation_tpot = GradientBoostingClassifier(learning_rate=0.01, max_depth=8, max_features=0.55, min_samples_leaf=13, min_samples_split=19, n_estimators=100, subsample=0.15000000000000002)
mutation_tpot = make_pipeline(
    ZeroCount(),
    XGBClassifier(learning_rate=0.1, max_depth=2, min_child_weight=13, n_estimators=100, n_jobs=1, subsample=0.6000000000000001, verbosity=0)
)

if hasattr(mutation_tpot, 'random_state'):
    setattr(mutation_tpot, 'random_state', 42)

mutation_tpot.fit(training_features, training_target)

In [33]:
results = mutation_tpot.predict(validation_features)
print_metrics(validation_target, results, "Mutation")


Mutation Precision: 0.50, Recall: 0.56, F1 Score: 0.53
Mutation Accuracy: 0.56, MCC: 0.12
Mutation Confusion Matrix:
[[69 54]
 [44 55]]


In [34]:
get_cv_metrics(mutation_tpot, training_features, np.squeeze(training_target))

Unnamed: 0,Mean Accuracy,Mean Precision,Mean Recall,Mean F1,Mean MCC
0,0.536092,0.532791,0.5541,0.5416,0.525754


In [35]:
meta_X_train = np.column_stack((meta_X_train, training_features))
meta_X_val = np.column_stack((meta_X_val, validation_features))
meta_X_test = np.column_stack((meta_X_test, testing_features))

### Z-Score

In [16]:
training_validation_features, testing_features, training_validation_target, testing_target = \
            train_test_split(z_score.to_numpy(), response.to_numpy(), random_state=42)
training_features, validation_features, training_target, validation_target = \
            train_test_split(training_validation_features, training_validation_target, random_state=42)

#zscore_tpot = MLPClassifier(alpha=0.0001, learning_rate_init=0.001)
zscore_tpot =  MLPClassifier(alpha=0.0001, learning_rate_init=0.001)

if hasattr(zscore_tpot, 'random_state'):
    setattr(zscore_tpot, 'random_state', 42)

zscore_tpot.fit(training_features, training_target)

  y = column_or_1d(y, warn=True)


In [42]:
results = zscore_tpot.predict(validation_features)
print_metrics(validation_target, results, "Z-Score")

Z-Score Precision: 0.59, Recall: 0.67, F1 Score: 0.63
Z-Score Accuracy: 0.64, MCC: 0.29
Z-Score Confusion Matrix:
[[77 46]
 [33 66]]


In [43]:
get_cv_metrics(zscore_tpot, training_features, np.squeeze(training_target))

Unnamed: 0,Mean Accuracy,Mean Precision,Mean Recall,Mean F1,Mean MCC
0,0.607169,0.608752,0.599465,0.600908,0.666013


In [44]:
meta_X_train = np.column_stack((meta_X_train, training_features))
meta_X_val = np.column_stack((meta_X_val, validation_features))
meta_X_test = np.column_stack((meta_X_test, testing_features))

### Meta Model

In [17]:
meta_model = StackingClassifier(
    estimators=[
        ('clinical', clinical_tpot),
        ('mutation', mutation_tpot),
        ('zscore', zscore_tpot)
    ],
    final_estimator=LogisticRegression()
)

meta_model.fit(meta_X_train, np.squeeze(meta_y_train))

In [46]:
meta_model_predictions = meta_model.predict(meta_X_val)
print_metrics(validation_target, meta_model_predictions, "Meta Model")

meta_model_predictions = meta_model.predict(meta_X_test)
print_metrics(testing_target, meta_model_predictions, "Meta Model Test")

Meta Model Precision: 0.62, Recall: 0.68, F1 Score: 0.65
Meta Model Accuracy: 0.67, MCC: 0.34
Meta Model Confusion Matrix:
[[82 41]
 [32 67]]
Meta Model Test Precision: 0.71, Recall: 0.66, F1 Score: 0.69
Meta Model Test Accuracy: 0.67, MCC: 0.35
Meta Model Test Confusion Matrix:
[[ 93  43]
 [ 54 106]]


In [18]:
lst = []

acc = cross_val_score(meta_model, meta_X_train, meta_y_train, cv=10, scoring="accuracy")
precision = cross_val_score(meta_model, meta_X_train, meta_y_train, cv=10, scoring="precision")
recall = cross_val_score(meta_model, meta_X_train, meta_y_train, cv=10, scoring="recall")
f1 = cross_val_score(meta_model, meta_X_train, meta_y_train, cv=10, scoring="f1")
mcc = cross_val_score(meta_model, meta_X_train, meta_y_train, cv=10, scoring="roc_auc")

lst.append(['Meta Model', acc.mean(), precision.mean(), recall.mean(), f1.mean(), mcc.mean()])
pd.DataFrame(lst, columns=['Algorithm', 'Mean Accuracy', 'Mean Precision', 'Mean Recall', 'Mean F1', 'Mean MCC'])

Unnamed: 0,Algorithm,Mean Accuracy,Mean Precision,Mean Recall,Mean F1,Mean MCC
0,Meta Model,0.654118,0.65084,0.605183,0.62573,0.717493
