In [61]:
import os
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import StackingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.ensemble import VotingClassifier, AdaBoostClassifier, RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, matthews_corrcoef, confusion_matrix

In [37]:
def print_metrics(y_true, predictions, dataset_name):
    """Print a short binary-classification report for one set of predictions.

    Args:
        y_true: Ground-truth labels.
        predictions: Predicted labels, aligned element-wise with ``y_true``.
        dataset_name: Label prefixed to every printed line (e.g. "Validation").

    Returns:
        None. The report is written to stdout: precision / recall / F1
        (``average='binary'``), accuracy, Matthews correlation coefficient,
        and the confusion matrix.
    """
    # All metrics are independent of each other; compute them first so the
    # remainder of the function is pure formatting.
    cm = confusion_matrix(y_true, predictions)
    mcc = matthews_corrcoef(y_true, predictions)
    accuracy = accuracy_score(y_true, predictions)
    # The fourth return value (support) is not reported.
    precision, recall, f1, _support = precision_recall_fscore_support(
        y_true,
        predictions,
        average='binary',
    )

    print(f"{dataset_name} Precision: {precision:.2f}, Recall: {recall:.2f}, F1 Score: {f1:.2f}")
    print(f"{dataset_name} Accuracy: {accuracy:.2f}, MCC: {mcc:.2f}")
    print(f"{dataset_name} Confusion Matrix:")
    print(cm)

In [3]:
# Load the three feature tables and the response table produced by step 04.
# The files are read in the order listed; the tuple on the left names them
# in that same order.
clinical_attributes, z_score, mutation, response = map(
    pd.read_csv,
    (
        'step_04/clinical_attributes.csv',
        'step_04/z_score.csv',
        'step_04/mutation.csv',
        'step_04/response.csv',
    ),
)

In [47]:
class Datasets():
    """Row-aligned train/validation/test splits of the three feature tables.

    Reads the module-level ``clinical_attributes``, ``mutation``, ``z_score``
    and ``response`` tables and splits them in two stages:

    1. 10% of the rows are held out as the test set.
    2. 20% of the remaining rows are held out as the validation set.

    All tables are split by the same ``train_test_split`` call at each stage,
    so row *i* of every ``X_*_train`` / ``X_*_val`` / ``X_*_test`` attribute
    and of the matching ``y_*`` attribute refers to the same sample.
    """

    def __init__(self, seed):
        """Create the splits.

        Args:
            seed: ``random_state`` passed to both ``train_test_split`` calls.
        """
        # Stage 1: hold out the test set. Passing every table to a single
        # call guarantees they are all cut along the same rows.
        (X_clinical_train_val, self.X_clinical_test,
         X_mutation_train_val, self.X_mutation_test,
         X_z_score_train_val, self.X_z_score_test,
         y_train_val, self.y_test) = train_test_split(
            clinical_attributes, mutation, z_score, response,
            test_size=0.1, random_state=seed,
        )

        # Stage 2: carve the validation set out of what is left.
        (self.X_clinical_train, self.X_clinical_val,
         self.X_mutation_train, self.X_mutation_val,
         self.X_z_score_train, self.X_z_score_val,
         self.y_train, self.y_val) = train_test_split(
            X_clinical_train_val, X_mutation_train_val, X_z_score_train_val, y_train_val,
            test_size=0.2, random_state=seed,
        )

    def get_dataset(self, datasets):
        """Return the splits of one feature table together with the labels.

        Args:
            datasets: Container of modality names; recognised names are
                ``'clinical'``, ``'mutation'`` and ``'z_score'``. Modalities
                are NOT combined: if several recognised names are present,
                the last one in the order clinical < mutation < z_score is
                returned.

        Returns:
            dict with keys ``X_train``, ``X_val``, ``X_test`` (features of
            the selected modality) and ``y_train``, ``y_val``, ``y_test``
            (labels passed through ``np.squeeze``).

        Raises:
            ValueError: if ``datasets`` contains none of the recognised names.
        """
        selected = None
        # Keep scanning after a hit so that later names take precedence.
        for name in ('clinical', 'mutation', 'z_score'):
            if name in datasets:
                selected = name

        if selected is None:
            raise ValueError(
                "datasets must contain at least one of 'clinical', 'mutation' "
                f"or 'z_score'; got {datasets!r}"
            )

        dataset = {
            'X_train': getattr(self, f'X_{selected}_train'),
            'X_val': getattr(self, f'X_{selected}_val'),
            'X_test': getattr(self, f'X_{selected}_test'),
        }

        dataset.update({
            'y_train': np.squeeze(self.y_train),
            'y_val': np.squeeze(self.y_val),
            'y_test': np.squeeze(self.y_test)
        })

        return dataset

In [48]:
# Build the train/validation/test splits once; every later cell reads them from `data`.
data = Datasets(seed=42)

# Halving Search

### Model definition

In [70]:
# Random forests and AdaBoost are stochastic. Without a fixed random_state the
# metrics printed below change on every Restart & Run All. The value matches
# the seed used for the data split.
MODEL_SEED = 42

# One estimator per feature table.
# NOTE(review): the hyperparameters are presumably the ones selected by the
# halving search named in the section heading - confirm against that search.
clinical_algorithm = RandomForestClassifier(
    ccp_alpha=0.08,
    class_weight='balanced_subsample',
    criterion='entropy',
    max_depth=1,
    max_features='log2',
    n_estimators=122,
    random_state=MODEL_SEED,
)
mutation_algorithm = RandomForestClassifier(
    ccp_alpha=0.1,
    class_weight='balanced',
    criterion='gini',
    max_depth=78,
    max_features='log2',
    n_estimators=177,
    random_state=MODEL_SEED,
)
z_score_algorithm = AdaBoostClassifier(
    algorithm='SAMME',
    learning_rate=0.7,
    n_estimators=10,
    random_state=MODEL_SEED,
)

In [71]:
def _training_pair(modality):
    """Return ``(X_train, y_train)`` for one modality, fetching the split once."""
    split = data.get_dataset([modality])
    return split['X_train'], split['y_train']


# Fit each model on the training rows of its own feature table.
clinical_algorithm.fit(*_training_pair('clinical'))
mutation_algorithm.fit(*_training_pair('mutation'))
z_score_algorithm.fit(*_training_pair('z_score'))

In [72]:
# Fetch each modality's splits once instead of rebuilding the dict per call.
splits = {
    modality: data.get_dataset([modality])
    for modality in ('clinical', 'mutation', 'z_score')
}

# Validation-set predictions, one per single-modality model.
clinical_algorithm_pred = clinical_algorithm.predict(splits['clinical']['X_val'])
mutation_algorithm_pred = mutation_algorithm.predict(splits['mutation']['X_val'])
z_score_algorithm_pred = z_score_algorithm.predict(splits['z_score']['X_val'])

# Test-set predictions are computed here but not reported in this cell.
clinical_algorithm_pred_test = clinical_algorithm.predict(splits['clinical']['X_test'])
mutation_algorithm_pred_test = mutation_algorithm.predict(splits['mutation']['X_test'])
z_score_algorithm_pred_test = z_score_algorithm.predict(splits['z_score']['X_test'])

# Report validation metrics for each model.
print_metrics(splits['clinical']['y_val'], clinical_algorithm_pred, 'Clinical Validation')
print_metrics(splits['mutation']['y_val'], mutation_algorithm_pred, 'Mutation Validation')
print_metrics(splits['z_score']['y_val'], z_score_algorithm_pred, 'Z-Score Validation')

Clinical Validation Precision: 0.65, Recall: 0.71, F1 Score: 0.68
Clinical Validation Accuracy: 0.63, MCC: 0.26
Clinical Validation Confusion Matrix:
[[53 45]
 [33 82]]
Mutation Validation Precision: 0.57, Recall: 0.47, F1 Score: 0.52
Mutation Validation Accuracy: 0.53, MCC: 0.06
Mutation Validation Confusion Matrix:
[[58 40]
 [61 54]]
Z-Score Validation Precision: 0.61, Recall: 0.43, F1 Score: 0.51
Z-Score Validation Accuracy: 0.54, MCC: 0.11
Z-Score Validation Confusion Matrix:
[[66 32]
 [65 50]]


In [44]:
voting_clf = VotingClassifier([
    ('clinical', clinical_algorithm),
    ('z_score', z_score_algorithm),
    ('mutation', mutation_algorithm)
], voting='hard')

voting_clf.fit(data.get_dataset(['clinical'])['X_train'], data.get_dataset(['clinical'])['y_train'])

In [45]:
val_predictions = voting_clf.predict(data.get_dataset(['clinical'])['X_val'])
test_predictions = voting_clf.predict(data.get_dataset(['clinical'])['X_test'])

print_metrics(data.get_dataset(['clinical'])['y_val'], val_predictions, "Validation")

print_metrics(data.get_dataset(['clinical'])['y_test'], test_predictions, "Test")


Validation Precision: 0.73, Recall: 0.26, F1 Score: 0.38
Validation Accuracy: 0.55, MCC: 0.19
Validation Confusion Matrix:
[[87 11]
 [85 30]]
Test Precision: 0.80, Recall: 0.35, F1 Score: 0.49
Test Accuracy: 0.58, MCC: 0.27
Test Confusion Matrix:
[[45  6]
 [44 24]]


# Teapot