In [2]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, f1_score
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from keras.models import Sequential
from keras.layers import Dense



In [3]:
# Define the directory containing the encoded data
encoded_data_dir = 'encoded_data'

# Load the list of encoded data files
files = os.listdir(encoded_data_dir)


In [4]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (roc_auc_score, f1_score, accuracy_score, precision_score,
                             recall_score, matthews_corrcoef)

# Ensure the directory for the output file exists
if not os.path.exists('metrics'):
    os.makedirs('metrics')

# Load the list of encoded data files from a directory
encoded_data_dir = 'encoded_data'
files = os.listdir(encoded_data_dir)

# Prepare to collect metrics
metrics = []

# Define the 5-fold Stratified cross-validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Iterate over each encoding file to process data and perform cross-validation
for encoding_file in files:
    data = pd.read_csv(os.path.join(encoded_data_dir, encoding_file))
    X = data.drop('target', axis=1)
    y = data['target']

    # Define the classifier
    model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)

    # Variables to store performance metrics
    roc_aucs, f1s, accuracies, precisions, recalls, mccs = [], [], [], [], [], []

    # Perform cross-validation
    for train_idx, test_idx in cv.split(X, y):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        y_pred_proba = model.predict_proba(X_test)

        # Compute metrics for multi-class classification
        roc_aucs.append(roc_auc_score(y_test, y_pred_proba, multi_class='ovr'))
        f1s.append(f1_score(y_test, y_pred, average='weighted'))
        accuracies.append(accuracy_score(y_test, y_pred))
        precisions.append(precision_score(y_test, y_pred, average='weighted', zero_division=0))
        recalls.append(recall_score(y_test, y_pred, average='weighted', zero_division=0))
        mccs.append(matthews_corrcoef(y_test, y_pred))

    # Aggregate the metrics
    metrics.append({
        'Encoding': encoding_file,
        'ROC AUC': np.mean(roc_aucs),
        'F1 Score': np.mean(f1s),
        'Accuracy': np.mean(accuracies),
        'Precision': np.mean(precisions),
        'Recall': np.mean(recalls),
        'MCC': np.mean(mccs)
    })

# Output the average scores from 5-fold cross-validation for each encoding method to a file
with open('metrics/scores_RandomForest.txt', 'w') as f:
    for result in metrics:
        f.write(f"Encoding method: {result['Encoding']}\n")
        f.write(f"Average ROC AUC score: {result['ROC AUC']:.2f}\n")
        f.write(f"Average F1 score: {result['F1 Score']:.2f}\n")
        f.write(f"Average Accuracy: {result['Accuracy']:.2f}\n")
        f.write(f"Average Precision: {result['Precision']:.2f}\n")
        f.write(f"Average Recall: {result['Recall']:.2f}\n")
        f.write(f"Average MCC: {result['MCC']:.2f}\n\n")


In [5]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score, f1_score, accuracy_score, precision_score, recall_score, matthews_corrcoef

# Define the directory containing the encoded data
encoded_data_dir = 'encoded_data'

# Load the list of encoded data files
files = os.listdir(encoded_data_dir)

# Ensure the metrics directory exists
if not os.path.exists('metrics'):
    os.makedirs('metrics')

# Prepare to collect metrics
metrics = []

# Define the 5-fold Stratified cross-validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Iterate over each encoding file to process data and perform cross-validation
for encoding_file in files:
    data = pd.read_csv(os.path.join(encoded_data_dir, encoding_file))
    X = data.drop(columns=['target'])
    y = data['target']

    # Variables to store performance metrics
    roc_aucs, f1s, accuracies, precisions, recalls, mccs = [], [], [], [], [], []

    # Perform cross-validation
    for train_idx, test_idx in cv.split(X, y):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        # Train the model
        model = SVC(probability=True, random_state=42)
        model.fit(X_train, y_train)

        # Evaluate the model
        y_pred = model.predict(X_test)
        y_pred_proba = model.predict_proba(X_test)

        # Compute metrics
        roc_aucs.append(roc_auc_score(y_test, y_pred_proba, multi_class='ovo'))
        f1s.append(f1_score(y_test, y_pred, average='weighted'))
        accuracies.append(accuracy_score(y_test, y_pred))
        precisions.append(precision_score(y_test, y_pred, average='weighted', zero_division=0))
        recalls.append(recall_score(y_test, y_pred, average='weighted', zero_division=0))
        mccs.append(matthews_corrcoef(y_test, y_pred))

    # Aggregate the metrics
    metrics.append({
        'Encoding': encoding_file,
        'ROC AUC': np.mean(roc_aucs),
        'F1 Score': np.mean(f1s),
        'Accuracy': np.mean(accuracies),
        'Precision': np.mean(precisions),
        'Recall': np.mean(recalls),
        'MCC': np.mean(mccs)
    })

# Output the average scores from 5-fold cross-validation for each encoding method to a file
with open('metrics/scores_SVM.txt', 'w') as f:
    for result in metrics:
        f.write(f"Encoding method: {result['Encoding']}\n")
        f.write(f"Average ROC AUC score: {result['ROC AUC']:.2f}\n")
        f.write(f"Average F1 score: {result['F1 Score']:.2f}\n")
        f.write(f"Average Accuracy: {result['Accuracy']:.2f}\n")
        f.write(f"Average Precision: {result['Precision']:.2f}\n")
        f.write(f"Average Recall: {result['Recall']:.2f}\n")
        f.write(f"Average MCC: {result['MCC']:.2f}\n\n")


In [6]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import roc_auc_score, f1_score, accuracy_score, precision_score, recall_score, matthews_corrcoef

# Define the directory containing the encoded data
encoded_data_dir = 'encoded_data'

# Load the list of encoded data files
files = os.listdir(encoded_data_dir)

# Ensure the metrics directory exists
if not os.path.exists('metrics'):
    os.makedirs('metrics')

# Prepare to collect metrics
metrics = []

# Define the 5-fold Stratified cross-validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Iterate over each encoding file to process data and perform cross-validation
for encoding_file in files:
    data = pd.read_csv(os.path.join(encoded_data_dir, encoding_file))
    X = data.drop(columns=['target'])
    y = data['target']

    # Variables to store performance metrics
    roc_aucs, f1s, accuracies, precisions, recalls, mccs = [], [], [], [], [], []

    # Perform cross-validation
    for train_idx, test_idx in cv.split(X, y):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        # Train the model
        model = KNeighborsClassifier(n_neighbors=5, n_jobs=-1)
        model.fit(X_train, y_train)

        # Evaluate the model
        y_pred = model.predict(X_test)
        y_pred_proba = model.predict_proba(X_test)

        # Compute metrics
        roc_aucs.append(roc_auc_score(y_test, y_pred_proba, multi_class='ovo'))
        f1s.append(f1_score(y_test, y_pred, average='weighted'))
        accuracies.append(accuracy_score(y_test, y_pred))
        precisions.append(precision_score(y_test, y_pred, average='weighted', zero_division=0))
        recalls.append(recall_score(y_test, y_pred, average='weighted', zero_division=0))
        mccs.append(matthews_corrcoef(y_test, y_pred))

    # Aggregate the metrics
    metrics.append({
        'Encoding': encoding_file,
        'ROC AUC': np.mean(roc_aucs),
        'F1 Score': np.mean(f1s),
        'Accuracy': np.mean(accuracies),
        'Precision': np.mean(precisions),
        'Recall': np.mean(recalls),
        'MCC': np.mean(mccs)
    })

# Output the average scores from 5-fold cross-validation for each encoding method to a file
with open('metrics/scores_KNN.txt', 'w') as f:
    for result in metrics:
        f.write(f"Encoding method: {result['Encoding']}\n")
        f.write(f"Average ROC AUC score: {result['ROC AUC']:.2f}\n")
        f.write(f"Average F1 score: {result['F1 Score']:.2f}\n")
        f.write(f"Average Accuracy: {result['Accuracy']:.2f}\n")
        f.write(f"Average Precision: {result['Precision']:.2f}\n")
        f.write(f"Average Recall: {result['Recall']:.2f}\n")
        f.write(f"Average MCC: {result['MCC']:.2f}\n\n")


In [7]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import roc_auc_score, f1_score, accuracy_score, precision_score, recall_score, matthews_corrcoef

# Define the directory containing the encoded data
encoded_data_dir = 'encoded_data'

# Load the list of encoded data files
files = os.listdir(encoded_data_dir)

# Ensure the metrics directory exists
if not os.path.exists('metrics'):
    os.makedirs('metrics')

# Prepare to collect metrics
metrics = []

# Define the 5-fold Stratified cross-validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Iterate over each encoding file to process data and perform cross-validation
for encoding_file in files:
    data = pd.read_csv(os.path.join(encoded_data_dir, encoding_file))
    X = data.drop(columns=['target'])
    y = data['target']

    # Variables to store performance metrics
    roc_aucs, f1s, accuracies, precisions, recalls, mccs = [], [], [], [], [], []

    # Perform cross-validation
    for train_idx, test_idx in cv.split(X, y):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        # Train the model
        model = MLPClassifier(hidden_layer_sizes=(3,), max_iter=10000, random_state=42)
        model.fit(X_train, y_train)

        # Evaluate the model
        y_pred = model.predict(X_test)
        y_pred_proba = model.predict_proba(X_test)
        roc_auc = roc_auc_score(y_test, y_pred_proba, multi_class='ovo') if len(set(y)) > 2 else roc_auc_score(y_test, y_pred_proba[:, 1])
        f1 = f1_score(y_test, y_pred, average='weighted')
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
        recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
        mcc = matthews_corrcoef(y_test, y_pred)

        # Store the scores for this fold
        roc_aucs.append(roc_auc)
        f1s.append(f1)
        accuracies.append(accuracy)
        precisions.append(precision)
        recalls.append(recall)
        mccs.append(mcc)

    # Aggregate the metrics
    metrics.append({
        'Encoding': encoding_file,
        'ROC AUC': np.mean(roc_aucs),
        'F1 Score': np.mean(f1s),
        'Accuracy': np.mean(accuracies),
        'Precision': np.mean(precisions),
        'Recall': np.mean(recalls),
        'MCC': np.mean(mccs)
    })

# Output the average scores from 5-fold cross-validation for each encoding method to a file
with open('metrics/scores_MLP.txt', 'w') as f:
    for result in metrics:
        f.write(f"Encoding method: {result['Encoding']}\n")
        f.write(f"Average ROC AUC score: {result['ROC AUC']:.2f}\n")
        f.write(f"Average F1 score: {result['F1 Score']:.2f}\n")
        f.write(f"Average Accuracy: {result['Accuracy']:.2f}\n")
        f.write(f"Average Precision: {result['Precision']:.2f}\n")
        f.write(f"Average Recall: {result['Recall']:.2f}\n")
        f.write(f"Average MCC: {result['MCC']:.2f}\n\n")


In [8]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import roc_auc_score, f1_score, accuracy_score, precision_score, recall_score, matthews_corrcoef

# Define the directory containing the encoded data
encoded_data_dir = 'encoded_data'

# Load the list of encoded data files
files = os.listdir(encoded_data_dir)

# Ensure the metrics directory exists
if not os.path.exists('metrics'):
    os.makedirs('metrics')

# Prepare to collect metrics
metrics = []

# Define the 5-fold Stratified cross-validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Iterate over each encoding file to process data and perform cross-validation
for encoding_file in files:
    data = pd.read_csv(os.path.join(encoded_data_dir, encoding_file))
    X = data.drop(columns=['target'])
    y = data['target']

    # Variables to store performance metrics
    roc_aucs, f1s, accuracies, precisions, recalls, mccs = [], [], [], [], [], []

    # Perform cross-validation
    for train_idx, test_idx in cv.split(X, y):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        # Train the model
        model = GaussianNB()
        model.fit(X_train, y_train)

        # Evaluate the model
        y_pred = model.predict(X_test)
        y_pred_proba = model.predict_proba(X_test)

        # Compute metrics
        roc_auc = roc_auc_score(y_test, y_pred_proba, multi_class='ovo')
        f1 = f1_score(y_test, y_pred, average='weighted')
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
        recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
        mcc = matthews_corrcoef(y_test, y_pred)

        # Store the scores for this fold
        roc_aucs.append(roc_auc)
        f1s.append(f1)
        accuracies.append(accuracy)
        precisions.append(precision)
        recalls.append(recall)
        mccs.append(mcc)

    # Aggregate the metrics
    metrics.append({
        'Encoding': encoding_file,
        'ROC AUC': np.mean(roc_aucs),
        'F1 Score': np.mean(f1s),
        'Accuracy': np.mean(accuracies),
        'Precision': np.mean(precisions),
        'Recall': np.mean(recalls),
        'MCC': np.mean(mccs)
    })

# Output the average scores from 5-fold cross-validation for each encoding method to a file
with open('metrics/scores_GaussianNB.txt', 'w') as f:
    for result in metrics:
        f.write(f"Encoding method: {result['Encoding']}\n")
        f.write(f"Average ROC AUC score: {result['ROC AUC']:.2f}\n")
        f.write(f"Average F1 score: {result['F1 Score']:.2f}\n")
        f.write(f"Average Accuracy: {result['Accuracy']:.2f}\n")
        f.write(f"Average Precision: {result['Precision']:.2f}\n")
        f.write(f"Average Recall: {result['Recall']:.2f}\n")
        f.write(f"Average MCC: {result['MCC']:.2f}\n\n")


In [9]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, f1_score, accuracy_score, precision_score, recall_score, matthews_corrcoef

# Define the directory containing the encoded data
encoded_data_dir = 'encoded_data'

# Load the list of encoded data files
files = os.listdir(encoded_data_dir)

# Ensure the metrics directory exists
if not os.path.exists('metrics'):
    os.makedirs('metrics')

# Prepare to collect metrics
metrics = []

# Define the 5-fold Stratified cross-validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Iterate over each encoding file to process data and perform cross-validation
for encoding_file in files:
    data = pd.read_csv(os.path.join(encoded_data_dir, encoding_file))
    X = data.drop(columns=['target'])
    y = data['target']

    # Variables to store performance metrics
    roc_aucs, f1s, accuracies, precisions, recalls, mccs = [], [], [], [], [], []

    # Perform cross-validation
    for train_idx, test_idx in cv.split(X, y):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        # Train the model
        model = LogisticRegression(max_iter=10000, solver='lbfgs')
        model.fit(X_train, y_train)

        # Evaluate the model
        y_pred = model.predict(X_test)
        y_pred_proba = model.predict_proba(X_test)

        # Compute metrics
        roc_auc = roc_auc_score(y_test, y_pred_proba, multi_class='ovo')
        f1 = f1_score(y_test, y_pred, average='weighted')
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
        recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
        mcc = matthews_corrcoef(y_test, y_pred)

        # Store the scores for this fold
        roc_aucs.append(roc_auc)
        f1s.append(f1)
        accuracies.append(accuracy)
        precisions.append(precision)
        recalls.append(recall)
        mccs.append(mcc)

    # Aggregate the metrics
    metrics.append({
        'Encoding': encoding_file,
        'ROC AUC': np.mean(roc_aucs),
        'F1 Score': np.mean(f1s),
        'Accuracy': np.mean(accuracies),
        'Precision': np.mean(precisions),
        'Recall': np.mean(recalls),
        'MCC': np.mean(mccs)
    })

# Output the average scores from 5-fold cross-validation for each encoding method to a file
with open('metrics/scores_LogisticRegression.txt', 'w') as f:
    for result in metrics:
        f.write(f"Encoding method: {result['Encoding']}\n")
        f.write(f"Average ROC AUC score: {result['ROC AUC']:.2f}\n")
        f.write(f"Average F1 score: {result['F1 Score']:.2f}\n")
        f.write(f"Average Accuracy: {result['Accuracy']:.2f}\n")
        f.write(f"Average Precision: {result['Precision']:.2f}\n")
        f.write(f"Average Recall: {result['Recall']:.2f}\n")
        f.write(f"Average MCC: {result['MCC']:.2f}\n\n")


In [10]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score, f1_score, accuracy_score, precision_score, recall_score, matthews_corrcoef

# Define the directory containing the encoded data
encoded_data_dir = 'encoded_data'

# Load the list of encoded data files
files = os.listdir(encoded_data_dir)

# Ensure the metrics directory exists
if not os.path.exists('metrics'):
    os.makedirs('metrics')

# Prepare to collect metrics
metrics = []

# Define the 5-fold Stratified cross-validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Iterate over each encoding file to process data and perform cross-validation
for encoding_file in files:
    data = pd.read_csv(os.path.join(encoded_data_dir, encoding_file))
    X = data.drop(columns=['target'])
    y = data['target']

    # Variables to store performance metrics
    roc_aucs, f1s, accuracies, precisions, recalls, mccs = [], [], [], [], [], []

    # Perform cross-validation
    for train_idx, test_idx in cv.split(X, y):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        # Train the model
        model = DecisionTreeClassifier(random_state=42)
        model.fit(X_train, y_train)

        # Evaluate the model
        y_pred = model.predict(X_test)
        y_pred_proba = model.predict_proba(X_test)

        # Compute metrics
        roc_auc = roc_auc_score(y_test, y_pred_proba, multi_class='ovo')
        f1 = f1_score(y_test, y_pred, average='weighted')
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
        recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
        mcc = matthews_corrcoef(y_test, y_pred)

        # Store the scores for this fold
        roc_aucs.append(roc_auc)
        f1s.append(f1)
        accuracies.append(accuracy)
        precisions.append(precision)
        recalls.append(recall)
        mccs.append(mcc)

    # Aggregate the metrics
    metrics.append({
        'Encoding': encoding_file,
        'ROC AUC': np.mean(roc_aucs),
        'F1 Score': np.mean(f1s),
        'Accuracy': np.mean(accuracies),
        'Precision': np.mean(precisions),
        'Recall': np.mean(recalls),
        'MCC': np.mean(mccs)
    })

# Output the average scores from 5-fold cross-validation for each encoding method to a file
with open('metrics/scores_DecisionTree.txt', 'w') as f:
    for result in metrics:
        f.write(f"Encoding method: {result['Encoding']}\n")
        f.write(f"Average ROC AUC score: {result['ROC AUC']:.2f}\n")
        f.write(f"Average F1 score: {result['F1 Score']:.2f}\n")
        f.write(f"Average Accuracy: {result['Accuracy']:.2f}\n")
        f.write(f"Average Precision: {result['Precision']:.2f}\n")
        f.write(f"Average Recall: {result['Recall']:.2f}\n")
        f.write(f"Average MCC: {result['MCC']:.2f}\n\n")


In [11]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import roc_auc_score, f1_score, accuracy_score, precision_score, recall_score, matthews_corrcoef

# Define the directory containing the encoded data
encoded_data_dir = 'encoded_data'

# Load the list of encoded data files
files = os.listdir(encoded_data_dir)

# Ensure the metrics directory exists
if not os.path.exists('metrics'):
    os.makedirs('metrics')

# Prepare to collect metrics
metrics = []

# Define the 5-fold Stratified cross-validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Iterate over each encoding file to process data and perform cross-validation
for encoding_file in files:
    data = pd.read_csv(os.path.join(encoded_data_dir, encoding_file))
    X = data.drop(columns=['target'])
    y = data['target']

    # Variables to store performance metrics
    roc_aucs, f1s, accuracies, precisions, recalls, mccs = [], [], [], [], [], []

    # Perform cross-validation
    for train_idx, test_idx in cv.split(X, y):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        # Train the model using AdaBoost with the SAMME algorithm
        model = AdaBoostClassifier(algorithm="SAMME", random_state=42)
        model.fit(X_train, y_train)

        # Evaluate the model
        y_pred = model.predict(X_test)
        y_pred_proba = model.predict_proba(X_test)

        # Compute metrics
        roc_auc = roc_auc_score(y_test, y_pred_proba, multi_class='ovo')
        f1 = f1_score(y_test, y_pred, average='weighted')
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
        recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
        mcc = matthews_corrcoef(y_test, y_pred)

        # Store the scores for this fold
        roc_aucs.append(roc_auc)
        f1s.append(f1)
        accuracies.append(accuracy)
        precisions.append(precision)
        recalls.append(recall)
        mccs.append(mcc)

    # Aggregate the metrics
    metrics.append({
        'Encoding': encoding_file,
        'ROC AUC': np.mean(roc_aucs),
        'F1 Score': np.mean(f1s),
        'Accuracy': np.mean(accuracies),
        'Precision': np.mean(precisions),
        'Recall': np.mean(recalls),
        'MCC': np.mean(mccs)
    })

# Output the average scores from 5-fold cross-validation for each encoding method to a file
with open('metrics/scores_AdaBoost_SAMME.txt', 'w') as f:
    for result in metrics:
        f.write(f"Encoding method: {result['Encoding']}\n")
        f.write(f"Average ROC AUC score: {result['ROC AUC']:.2f}\n")
        f.write(f"Average F1 score: {result['F1 Score']:.2f}\n")
        f.write(f"Average Accuracy: {result['Accuracy']:.2f}\n")
        f.write(f"Average Precision: {result['Precision']:.2f}\n")
        f.write(f"Average Recall: {result['Recall']:.2f}\n")
        f.write(f"Average MCC: {result['MCC']:.2f}\n\n")


In [12]:
import os
import numpy as np
import pandas as pd
from keras.models import Sequential
from keras.layers import Input, Dense
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, f1_score, accuracy_score, precision_score, recall_score, matthews_corrcoef

# Define the directory containing the encoded data
encoded_data_dir = 'encoded_data'

# Load the list of encoded data files
files = os.listdir(encoded_data_dir)

# Ensure the metrics directory exists
if not os.path.exists('metrics'):
    os.makedirs('metrics')

# Prepare to collect metrics
metrics = []

# Define the 5-fold Stratified cross-validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Iterate over each encoding file to process data and perform cross-validation
for encoding_file in files:
    data = pd.read_csv(os.path.join(encoded_data_dir, encoding_file))
    X = data.drop(columns=['target']).values
    y = data['target'].values

    # Standardize the input features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Variables to store performance metrics
    roc_aucs, f1s, accuracies, precisions, recalls, mccs = [], [], [], [], [], []

    # Perform cross-validation
    for train_idx, test_idx in cv.split(X_scaled, y):
        X_train, X_test = X_scaled[train_idx], X_scaled[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        # Ensure labels are zero-indexed if they start at 1
        if y_train.min() == 1:
            y_train -= 1
            y_test -= 1

        # Define the neural network model
        model = Sequential([
            Input(shape=(X_train.shape[1],)),
            Dense(64, activation='relu'),
            Dense(32, activation='relu'),
            Dense(len(np.unique(y_train)), activation='softmax')
        ])

        # Compile the model
        model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

        # Train the model
        model.fit(X_train, y_train, epochs=10, batch_size=32, verbose=0)

        # Evaluate the model
        test_probs = model.predict(X_test)
        roc_auc = roc_auc_score(y_test, test_probs, multi_class='ovo')
        f1 = f1_score(y_test, np.argmax(test_probs, axis=1), average='weighted')
        accuracy = accuracy_score(y_test, np.argmax(test_probs, axis=1))
        precision = precision_score(y_test, np.argmax(test_probs, axis=1), average='weighted', zero_division=0)
        recall = recall_score(y_test, np.argmax(test_probs, axis=1), average='weighted', zero_division=0)
        mcc = matthews_corrcoef(y_test, np.argmax(test_probs, axis=1))

        # Store the scores for this fold
        roc_aucs.append(roc_auc)
        f1s.append(f1)
        accuracies.append(accuracy)
        precisions.append(precision)
        recalls.append(recall)
        mccs.append(mcc)

    # Aggregate the metrics
    metrics.append({
        'Encoding': encoding_file,
        'ROC AUC': np.mean(roc_aucs),
        'F1 Score': np.mean(f1s),
        'Accuracy': np.mean(accuracies),
        'Precision': np.mean(precisions),
        'Recall': np.mean(recalls),
        'MCC': np.mean(mccs)
    })

# Output the average scores from 5-fold cross-validation for each encoding method to a file
with open('metrics/scores_NeuralNetwork.txt', 'w') as f:
    for result in metrics:
        f.write(f"Encoding method: {result['Encoding']}\n")
        f.write(f"Average ROC AUC score: {result['ROC AUC']:.2f}\n")
        f.write(f"Average F1 score: {result['F1 Score']:.2f}\n")
        f.write(f"Average Accuracy: {result['Accuracy']:.2f}\n")
        f.write(f"Average Precision: {result['Precision']:.2f}\n")
        f.write(f"Average Recall: {result['Recall']:.2f}\n")
        f.write(f"Average MCC: {result['MCC']:.2f}\n\n")


[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30