# Import

In [None]:
import pandas as pd
from tqdm.notebook import tqdm
import warnings

In [None]:
from sklearn.model_selection import KFold

from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import accuracy_score, balanced_accuracy_score, f1_score, precision_score, recall_score, roc_auc_score, roc_curve, auc, confusion_matrix, precision_recall_curve
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.feature_selection import RFECV
from sklearn.preprocessing import MinMaxScaler

from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression, LinearRegression
# from lightgbm import LGBMClassifier
from sklearn.ensemble import ExtraTreesClassifier

In [None]:
def warn(*args, **kwargs):
    """Accept any arguments and do nothing; used as a silent stand-in for ``warnings.warn``."""
    return None


# Monkeypatch the warnings module so every later ``warnings.warn`` call is discarded.
warnings.warn = warn

# Load

In [None]:
# Read the dataset from a CSV file; the path is relative to the current working directory.
data: pd.DataFrame = pd.read_csv('../5_general_data/5_5_final.csv')
# Bare expression: in a notebook this displays the frame as the cell output.
data

# Helper

In [None]:
# Names of the target columns to analyse; the main loop evaluates each one separately.
targets: list[str] = ['target_90_day_mortality', 'target_30_day_mortality']

# Map each system name to the code matched against the 'meta_system' column.
# The value -1 ('all') is a sentinel meaning that no filtering by system is applied.
systems: dict[str, int] = {'all': -1, 'esophagus': 0, 'stomach': 1, 'intestine': 2, 'liver': 3, 'pancreas': 4}

# Number of folds (n_splits) used by the stratified k-fold cross-validation.
k: int = 5

In [None]:
# Candidate classifiers, keyed by the name written to the 'estimator' column of the results.
# The same instances are re-fitted for every target, system and cross-validation fold.
estimators = {
    'logistic_sgd': SGDClassifier(loss='log_loss', class_weight='balanced', n_jobs=-1, random_state=0),
    'logistic_regression': LogisticRegression(class_weight='balanced', n_jobs=-1),
    'modified_huber_sgd': SGDClassifier(loss="modified_huber", class_weight="balanced", n_jobs=-1, random_state=0),
    'squared_hinge_sgd': SGDClassifier(loss="squared_hinge", class_weight="balanced", n_jobs=-1, random_state=0),
    'linear_sgd': SGDClassifier(class_weight="balanced", n_jobs=-1, random_state=0),
    'naive_bayes': GaussianNB(),
    'linear_svc': LinearSVC(class_weight="balanced", random_state=0),
    # k-nearest-neighbours classifiers for every neighbourhood size from 2 to 10.
    'knn_2': KNeighborsClassifier(n_neighbors=2, n_jobs=-1),
    'knn_3': KNeighborsClassifier(n_neighbors=3, n_jobs=-1),
    'knn_4': KNeighborsClassifier(n_neighbors=4, n_jobs=-1),
    'knn_5': KNeighborsClassifier(n_neighbors=5, n_jobs=-1),
    'knn_6': KNeighborsClassifier(n_neighbors=6, n_jobs=-1),
    'knn_7': KNeighborsClassifier(n_neighbors=7, n_jobs=-1),
    'knn_8': KNeighborsClassifier(n_neighbors=8, n_jobs=-1),
    'knn_9': KNeighborsClassifier(n_neighbors=9, n_jobs=-1),
    'knn_10': KNeighborsClassifier(n_neighbors=10, n_jobs=-1),
    'quadratic_discriminant': QuadraticDiscriminantAnalysis(),
    'ada_boost': AdaBoostClassifier(random_state=0),
    'linear_discriminant': LinearDiscriminantAnalysis(),
    'gradient_boosting': GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0),
    'random_forest': RandomForestClassifier(class_weight="balanced", n_jobs=-1, random_state=0),
    'rbf_svc': SVC(kernel="rbf", class_weight="balanced", random_state=0),
    'poly_svc_cubic': SVC(kernel="poly", degree=3, class_weight="balanced", random_state=0),  # NOTE: flagged by the original author as hanging (presumably very slow to fit) - TODO confirm
    'poly_svc_quadratic': SVC(kernel="poly", degree=2, class_weight="balanced", random_state=0),
    # NOTE(review): same constructor arguments as 'linear_sgd' above, so this entry presumably duplicates it.
    'hinge_sgd': SGDClassifier(random_state=0, class_weight="balanced", n_jobs=-1)
}

In [None]:
def _positive_class_scores(model, X_test, y_pred):
    """Return continuous positive-class scores for ROC AUC, falling back to the hard labels."""
    # A one-dimensional decision function is the binary margin for the positive class.
    if hasattr(model, 'decision_function'):
        scores = model.decision_function(X_test)
        if scores.ndim == 1:
            return scores
    # Otherwise use the probability of the second (positive) class when there are exactly two.
    if hasattr(model, 'predict_proba'):
        probabilities = model.predict_proba(X_test)
        if probabilities.shape[1] == 2:
            return probabilities[:, 1]
    # No usable continuous score: rank by the predicted labels.
    return y_pred


def evaluate(X_train: pd.DataFrame, y_train: pd.Series, X_test: pd.DataFrame, y_test: pd.Series, model) -> dict:
    """Fit ``model`` on the training split and score it on the test split.

    Assumes a binary target encoded as 0 (negative) and 1 (positive).

    Args:
        X_train: Training features (any array-like accepted by ``model.fit``).
        y_train: Training labels.
        X_test: Test features.
        y_test: Test labels.
        model: Estimator exposing ``fit`` and ``predict``; it is fitted in place.

    Returns:
        A dict with the keys 'roc_auc', 'accuracy', 'precision', 'recall', 'f1',
        'specificity', 'sensitivity' (floats) and 'true_negative', 'false_positive',
        'false_negative', 'true_positive' (ints). Specificity or sensitivity is NaN
        when the test split has no negatives or no positives respectively.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # ROC AUC is a ranking metric: feed it continuous scores when the model offers
    # them, because thresholded labels reduce the curve to a single operating point.
    roc_auc = roc_auc_score(y_test, _positive_class_scores(model, X_test, y_pred))
    accuracy = accuracy_score(y_test, y_pred)
    # zero_division=0 keeps the value used for undefined scores explicit.
    f1 = f1_score(y_test, y_pred, zero_division=0)
    precision = precision_score(y_test, y_pred, zero_division=0)
    recall = recall_score(y_test, y_pred, zero_division=0)

    # Fixing the label order guarantees a 2x2 matrix even if a class is absent
    # from both y_test and y_pred, so the four-way unpacking cannot fail.
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred, labels=[0, 1]).ravel()
    specificity = tn / (tn + fp) if (tn + fp) else float('nan')
    sensitivity = tp / (tp + fn) if (tp + fn) else float('nan')

    # float()/int() work for NumPy scalars and plain Python numbers alike.
    metrics = {
        'roc_auc': float(roc_auc),
        'accuracy': float(accuracy),
        'precision': float(precision),
        'recall': float(recall),
        'f1': float(f1),
        'specificity': float(specificity),
        'sensitivity': float(sensitivity),
        'true_negative': int(tn),
        'false_positive': int(fp),
        'false_negative': int(fn),
        'true_positive': int(tp)
    }
    return metrics

In [None]:
def evaluate_k_fold(set: pd.DataFrame, model, model_name) -> pd.DataFrame:
    """Cross-validate ``model`` on ``set`` and return the fold-averaged metrics.

    Args:
        set: Frame holding the feature columns plus a 'target' column.
        model: Estimator passed to ``evaluate``; it is re-fitted on every fold.
        model_name: Label stored in the 'estimator' column of the result.

    Returns:
        A single-row DataFrame whose first column is 'estimator', followed by the
        mean of every metric returned by ``evaluate`` across the ``k`` folds.
    """
    features = set.drop('target', axis=1)
    labels = set['target']

    # Shuffled, stratified folds with a fixed seed so repeated runs split identically.
    splitter = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)

    fold_metrics = []
    for train_rows, test_rows in splitter.split(features, labels):
        # Learn the min-max scaling on the training fold only, then apply it to both folds.
        scaler = MinMaxScaler()
        train_features = scaler.fit_transform(features.iloc[train_rows])
        test_features = scaler.transform(features.iloc[test_rows])

        fold_metrics.append(
            evaluate(train_features, labels.iloc[train_rows], test_features, labels.iloc[test_rows], model)
        )

    # Average each metric over the folds and lay the result out as one labelled row.
    per_fold = pd.DataFrame.from_records(fold_metrics)
    averaged = per_fold.mean().to_frame().T
    averaged.insert(loc=0, column='estimator', value=model_name)
    return averaged

# Main

In [None]:
# Collect one single-row result frame per (target, system, estimator) combination.
results: list[pd.DataFrame] = []

for target in tqdm(targets, desc='Targets'):
    # Promote the current target column to 'target', discard every remaining
    # 'target_*' column and replace missing values with -1.
    target_data = data.rename(columns={target: 'target'})
    other_targets = [col for col in target_data.columns if 'target_' in col]
    target_data = target_data.drop(columns=other_targets).fillna(-1)

    for system_name, system in tqdm(systems.items(), desc='Systems'):
        # -1 stands for 'all' systems; any other code selects the matching rows.
        if system == -1:
            system_data = target_data
        else:
            system_data = target_data[target_data['meta_system'] == system]

        # The 'meta_*' columns are only needed for filtering, not as features.
        meta_columns = [col for col in system_data.columns if 'meta_' in col]
        system_data = system_data.drop(columns=meta_columns)

        # Too few positive samples to evaluate with k folds: skip this system.
        if sum(system_data['target']) < k:
            continue

        for estimator_name, estimator in estimators.items():
            # Evaluate the estimator and label the row with its target and system.
            result = evaluate_k_fold(system_data, estimator, estimator_name)
            result.insert(loc=0, column='target', value=target)
            result.insert(loc=1, column='system', value=system_name)
            results.append(result)

# Combine all rows, persist them as CSV and display them as the cell output.
results = pd.concat(results)
results.to_csv('11_2_results.csv')
results

In [None]:
# For every (target, system) pair keep the single row with the highest roc_auc.
# Grouping by both keys at once visits the pairs in the same sorted order as
# grouping by target first and by system second.
best_results: list[pd.DataFrame] = [
    group.nlargest(1, 'roc_auc')
    for _, group in results.groupby(['target', 'system'])
]

# Combine the winners, persist them as CSV and display them as the cell output.
best_results = pd.concat(best_results)
best_results.to_csv('11_3_best_results.csv')
best_results