In [14]:
import os
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import StackingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.ensemble import VotingClassifier, AdaBoostClassifier, RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, matthews_corrcoef, confusion_matrix
from sklearn.model_selection import cross_val_score

In [2]:
def print_metrics(y_true, predictions, dataset_name):
    precision, recall, f1, _ = precision_recall_fscore_support(y_true, predictions, average='binary')
    accuracy = accuracy_score(y_true, predictions)
    mcc = matthews_corrcoef(y_true, predictions)
    cm = confusion_matrix(y_true, predictions)
    
    print(f"{dataset_name} Precision: {precision:.2f}, Recall: {recall:.2f}, F1 Score: {f1:.2f}")
    print(f"{dataset_name} Accuracy: {accuracy:.2f}, MCC: {mcc:.2f}")
    print(f"{dataset_name} Confusion Matrix:")
    print(cm)

In [3]:
clinical_attributes = pd.read_csv('step_04/clinical_attributes.csv')
z_score = pd.read_csv('step_04/z_score.csv')
mutation = pd.read_csv('step_04/mutation.csv')
response = pd.read_csv('step_04/response.csv')

In [4]:
class Datasets():
    def __init__(self, seed):
        X_clinical_train_val, self.X_clinical_test, y_train_val, self.y_test = train_test_split(clinical_attributes, response, test_size=0.1, random_state=seed)
        self.X_clinical_train, self.X_clinical_val, self.y_train, self.y_val = train_test_split(X_clinical_train_val, y_train_val, test_size=0.2, random_state=seed)

        X_mutation_train_val, self.X_mutation_test, _, _ = train_test_split(mutation, response, test_size=0.1, random_state=seed)
        self.X_mutation_train, self.X_mutation_val, _, _ = train_test_split(X_mutation_train_val, y_train_val, test_size=0.2, random_state=seed)

        X_z_score_train_val, self.X_z_score_test, _, _ = train_test_split(z_score, response, test_size=0.1, random_state=seed)
        self.X_z_score_train, self.X_z_score_val, _, _ = train_test_split(X_z_score_train_val, y_train_val, test_size=0.2, random_state=seed)

    def get_dataset(self, datasets):
        X_train, X_val, X_test = None, None, None

        if 'clinical' in datasets:
            dataset = {
                'X_train': self.X_clinical_train,
                'X_val': self.X_clinical_val,
                'X_test': self.X_clinical_test
            }

        if 'mutation' in datasets:
            dataset = {
                'X_train': self.X_mutation_train,
                'X_val': self.X_mutation_val,
                'X_test': self.X_mutation_test
            }

        if 'z_score' in datasets:
            dataset = {
                'X_train': self.X_z_score_train,
                'X_val': self.X_z_score_val,
                'X_test': self.X_z_score_test
            }
        
        dataset.update({
            'y_train': np.squeeze(self.y_train),
            'y_val': np.squeeze(self.y_val),
            'y_test': np.squeeze(self.y_test)
        })

        return dataset

In [5]:
data = Datasets(42)

# Halving Search

### Model definition

In [9]:
clinical_algorithm = AdaBoostClassifier(learning_rate=0.6, n_estimators=30, algorithm='SAMME')
mutation_algorithm = z_score_algorithm = RandomForestClassifier(ccp_alpha=0.07777777777777778, class_weight='balanced_subsample', criterion='entropy', max_depth=78, max_features='log2', n_estimators=133)
z_score_algorithm = DecisionTreeClassifier(ccp_alpha=0.0, criterion='log_loss', max_depth=12, max_features='log2', min_samples_leaf=1, min_samples_split=2, splitter='random')

In [10]:
clinical_algorithm.fit(data.get_dataset(['clinical'])['X_train'], data.get_dataset(['clinical'])['y_train'])
mutation_algorithm.fit(data.get_dataset(['mutation'])['X_train'], data.get_dataset(['mutation'])['y_train'])
z_score_algorithm.fit(data.get_dataset(['z_score'])['X_train'], data.get_dataset(['z_score'])['y_train'])

### Test models separated

In [30]:
from sklearn.model_selection import cross_val_score
lst = []

algorithms = {
    'clinical': clinical_algorithm,
    'mutation': mutation_algorithm,
    'z_score': z_score_algorithm
}

for name, algorithm in algorithms.items():
    acc = cross_val_score(algorithm, data.get_dataset([name.lower()])['X_train'], data.get_dataset([name.lower()])['y_train'], cv=10, scoring="accuracy")
    precision = cross_val_score(algorithm, data.get_dataset([name.lower()])['X_train'], data.get_dataset([name.lower()])['y_train'], cv=10, scoring="precision")
    recall = cross_val_score(algorithm, data.get_dataset([name.lower()])['X_train'], data.get_dataset([name.lower()])['y_train'], cv=10, scoring="recall")
    f1 = cross_val_score(algorithm, data.get_dataset([name.lower()])['X_train'], data.get_dataset([name.lower()])['y_train'], cv=10, scoring="f1")
    mcc = cross_val_score(algorithm, data.get_dataset([name.lower()])['X_train'], data.get_dataset([name.lower()])['y_train'], cv=10, scoring="roc_auc")

    lst.append([name, acc.mean(), precision.mean(), recall.mean(), f1.mean(), mcc.mean()])

pd.DataFrame(lst, columns=['Algorithm', 'Mean Accuracy', 'Mean Precision', 'Mean Recall', 'Mean F1', 'Mean MCC'])

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Unnamed: 0,Algorithm,Mean Accuracy,Mean Precision,Mean Recall,Mean F1,Mean MCC
0,clinical,0.650588,0.626479,0.678537,0.649847,0.724585
1,mutation,0.5,0.24,0.8,0.584635,0.5
2,z_score,0.535294,0.53597,0.487927,0.534013,0.546629


In [13]:
clinical_algorithm_pred = clinical_algorithm.predict(data.get_dataset(['clinical'])['X_val'])
mutation_algorithm_pred = mutation_algorithm.predict(data.get_dataset(['mutation'])['X_val'])
z_score_algorithm_pred = z_score_algorithm.predict(data.get_dataset(['z_score'])['X_val'])

clinical_algorithm_pred_test = clinical_algorithm.predict(data.get_dataset(['clinical'])['X_test'])
mutation_algorithm_pred_test = mutation_algorithm.predict(data.get_dataset(['mutation'])['X_test'])
z_score_algorithm_pred_test = z_score_algorithm.predict(data.get_dataset(['z_score'])['X_test'])

print_metrics(data.get_dataset(['clinical'])['y_val'], clinical_algorithm_pred, 'Clinical Validation')
print_metrics(data.get_dataset(['mutation'])['y_val'], mutation_algorithm_pred, 'Mutation Validation')
print_metrics(data.get_dataset(['z_score'])['y_val'], z_score_algorithm_pred, 'Z-Score Validation')

Clinical Validation Precision: 0.68, Recall: 0.70, F1 Score: 0.69
Clinical Validation Accuracy: 0.66, MCC: 0.31
Clinical Validation Confusion Matrix:
[[59 39]
 [34 81]]
Mutation Validation Precision: 0.54, Recall: 1.00, F1 Score: 0.70
Mutation Validation Accuracy: 0.54, MCC: 0.00
Mutation Validation Confusion Matrix:
[[  0  98]
 [  0 115]]
Z-Score Validation Precision: 0.66, Recall: 0.54, F1 Score: 0.59
Z-Score Validation Accuracy: 0.60, MCC: 0.21
Z-Score Validation Confusion Matrix:
[[66 32]
 [53 62]]


### Merge models in one

In [33]:
voting_clf = VotingClassifier([
    ('clinical', clinical_algorithm),
    ('z_score', z_score_algorithm),
    ('mutation', mutation_algorithm)
], voting='hard')

voting_clf.fit(data.get_dataset(['clinical', 'z_score', 'mutation'])['X_train'], data.get_dataset(['clinical', 'z_score', 'mutation'])['y_train'])

In [35]:
val_predictions = voting_clf.predict(data.get_dataset(['clinical', 'z_score', 'mutation'])['X_val'])
test_predictions = voting_clf.predict(data.get_dataset(['clinical', 'z_score', 'mutation'])['X_test'])

print_metrics(data.get_dataset(['clinical', 'z_score', 'mutation'])['y_val'], val_predictions, "Validation")
print_metrics(data.get_dataset(['clinical', 'z_score', 'mutation'])['y_test'], test_predictions, "Test")

Validation Precision: 0.78, Recall: 0.22, F1 Score: 0.34
Validation Accuracy: 0.54, MCC: 0.20
Validation Confusion Matrix:
[[91  7]
 [90 25]]
Test Precision: 0.82, Recall: 0.26, F1 Score: 0.40
Test Accuracy: 0.55, MCC: 0.24
Test Confusion Matrix:
[[47  4]
 [50 18]]


# Teapot

In [81]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split

### Clinical

In [36]:
training_features, testing_features, training_target, testing_target = \
            train_test_split(clinical_attributes.to_numpy(), response.to_numpy(), random_state=42)

clinical_tpot = RandomForestClassifier(bootstrap=True, criterion="gini", max_features=0.5, min_samples_leaf=8, min_samples_split=11, n_estimators=100)

if hasattr(clinical_tpot, 'random_state'):
    setattr(clinical_tpot
, 'random_state', 42)

clinical_tpot.fit(training_features, training_target)
results = clinical_tpot.predict(testing_features)

  return fit_method(estimator, *args, **kwargs)


In [37]:
print_metrics(testing_target, results, "Clinical")


Clinical Precision: 0.69, Recall: 0.68, F1 Score: 0.69
Clinical Accuracy: 0.67, MCC: 0.33
Clinical Confusion Matrix:
[[ 88  48]
 [ 51 109]]


### Mutation

In [38]:
training_features, testing_features, training_target, testing_target = \
            train_test_split(mutation.to_numpy(), response.to_numpy(), random_state=42)

mutation_tpot = GradientBoostingClassifier(learning_rate=0.01, max_depth=8, max_features=0.55, min_samples_leaf=13, min_samples_split=19, n_estimators=100, subsample=0.15000000000000002)

if hasattr(mutation_tpot, 'random_state'):
    setattr(mutation_tpot, 'random_state', 42)

mutation_tpot.fit(training_features, training_target)
results = mutation_tpot.predict(testing_features)


NameError: name 'GradientBoostingClassifier' is not defined

In [80]:
print_metrics(testing_target, results, "Mutation")


Mutation Precision: 0.60, Recall: 0.42, F1 Score: 0.50
Mutation Accuracy: 0.53, MCC: 0.09
Mutation Confusion Matrix:
[[90 46]
 [92 68]]


### Z-Score

In [82]:
training_features, testing_features, training_target, testing_target = \
            train_test_split(z_score.to_numpy(), response.to_numpy(), random_state=42)

zscore_tpot = MLPClassifier(alpha=0.0001, learning_rate_init=0.001)

if hasattr(zscore_tpot, 'random_state'):
    setattr(zscore_tpot, 'random_state', 42)

zscore_tpot.fit(training_features, training_target)
results = zscore_tpot.predict(testing_features)

  y = column_or_1d(y, warn=True)


In [None]:
meta_model = StackingClassifier(
    estimators=[
        ('clinical', clinical_tpot),
        ('mutation', mutation_tpot),
        ('zscore', zscore_tpot)
    ],
    final_estimator=LogisticRegression()
)

meta_model.fit(training_features, training_target.ravel())

meta_model_predictions = meta_model.predict(testing_features)

print_metrics(testing_target, meta_model_predictions, "Meta Model")