In [1]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, roc_auc_score, confusion_matrix, accuracy_score,f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn.svm import SVC
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

In [2]:
# Define the directory containing the encoded data
encoded_data_dir = 'encoded_data'

# Load the list of encoded data files
files = os.listdir(encoded_data_dir)

In [3]:


# Define dictionaries to store the ROC AUC and F1 scores for each encoding method
roc_auc_scores = {}
f1_scores = {}

for encoding_file in files:
    
    # Load the encoded data
    encoded_data = pd.read_csv(os.path.join(encoded_data_dir, encoding_file))
    
    # Split the encoded data into features and labels
    X = encoded_data.drop(columns=['target'])
    y = encoded_data['target']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Train the model
    model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
    model.fit(X_train, y_train)
    
    # Evaluate the model
    roc_auc = roc_auc_score(y_test, model.predict_proba(X_test), multi_class='ovo')
    f1 = f1_score(y_test, model.predict(X_test), average='macro')
    
    # Store the scores for this encoding method
    roc_auc_scores[encoding_file] = roc_auc
    f1_scores[encoding_file] = f1
    
# Output the scores for each encoding method to a separate file
with open('scores_RandomForest.txt', 'w') as f:
    for encoding_file in files:
        f.write(f'Encoding method: {encoding_file}\n')
        f.write(f'ROC AUC score: {roc_auc_scores[encoding_file]:.2f}\n')
        f.write(f'F1 score: {f1_scores[encoding_file]:.2f}\n\n')




In [4]:

from xgboost import XGBClassifier

# Define dictionaries to store the ROC AUC and F1 scores for each encoding method
roc_auc_scores = {}
f1_scores = {}

# Train and evaluate an XGBClassifier model with each encoding method
for encoding_file in files:
    
    # Load the encoded data
    encoded_data = pd.read_csv(os.path.join(encoded_data_dir, encoding_file))
    
    # Split the encoded data into features and labels
    X = encoded_data.drop(columns=['target'])
    y = encoded_data['target']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Adjust target variable if minimum value is 1
    if y_train.min() == 1:
        y_train -= 1
        y_test -= 1
    
    # Train the model
    model = XGBClassifier(random_state=42, n_jobs=-1)
    model.fit(X_train, y_train)
    
    # Evaluate the model
    test_preds = model.predict_proba(X_test)
    roc_auc = roc_auc_score(y_test, test_preds, multi_class='ovo')
    f1 = f1_score(y_test, model.predict(X_test), average='macro')
    
    # Store the scores for this encoding method
    roc_auc_scores[encoding_file] = roc_auc
    f1_scores[encoding_file] = f1
    
# Output the scores for each encoding method to a separate file
with open('scores_XGBoost.txt', 'w') as f:
    for encoding_file in files:
        f.write(f'Encoding method: {encoding_file}\n')
        f.write(f'ROC AUC score: {roc_auc_scores[encoding_file]:.2f}\n')
        f.write(f'F1 score: {f1_scores[encoding_file]:.2f}\n\n')


ValueError: Invalid classes inferred from unique values of `y`.  Expected: [0 1 2 3 4], got [    0    99   999  9999 99999]

In [None]:

# Train and evaluate a SVM model

# Define dictionaries to store the ROC AUC and F1 scores for each encoding method
roc_auc_scores = {}
f1_scores = {}

# Train and evaluate a support vector machine with each encoding method
for encoding_file in files:
    
    # Load the encoded data
    encoded_data = pd.read_csv(os.path.join(encoded_data_dir, encoding_file))
    
    # Split the encoded data into features and labels
    X = encoded_data.drop(columns=['target'])
    y = encoded_data['target']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Train the model
    model = SVC(probability=True, random_state=42)
    model.fit(X_train, y_train)
    
    # Evaluate the model
    test_preds = model.predict_proba(X_test)
    roc_auc = roc_auc_score(y_test, test_preds, multi_class='ovo')
    f1 = f1_score(y_test, model.predict(X_test), average='macro')
    
    # Store the scores for this encoding method
    roc_auc_scores[encoding_file] = roc_auc
    f1_scores[encoding_file] = f1
    
# Output the scores for each encoding method to a separate file
with open('scores_SVM.txt', 'w') as f:
    for encoding_file in files:
        f.write(f'Encoding method: {encoding_file}\n')
        f.write(f'ROC AUC score: {roc_auc_scores[encoding_file]:.2f}\n')
        f.write(f'F1 score: {f1_scores[encoding_file]:.2f}\n\n')


In [None]:

# Train and evaluate a KNeighborsClassifier  model

from sklearn.neighbors import KNeighborsClassifier

# Define dictionaries to store the ROC AUC and F1 scores for each encoding method
roc_auc_scores = {}
f1_scores = {}

# Train and evaluate a KNN model with each encoding method
for encoding_file in files:
    
    # Load the encoded data
    encoded_data = pd.read_csv(os.path.join(encoded_data_dir, encoding_file))
    
    # Split the encoded data into features and labels
    X = encoded_data.drop(columns=['target'])
    y = encoded_data['target']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Train the model
    model = KNeighborsClassifier(n_neighbors=5, n_jobs=-1)
    model.fit(X_train, y_train)
    
    # Evaluate the model
    test_preds = model.predict_proba(X_test)
    roc_auc = roc_auc_score(y_test, test_preds, multi_class='ovo')
    f1 = f1_score(y_test, model.predict(X_test), average='macro')
    
    # Store the scores for this encoding method
    roc_auc_scores[encoding_file] = roc_auc
    f1_scores[encoding_file] = f1
    
# Output the scores for each encoding method to a separate file
with open('scores_KNN.txt', 'w') as f:
    for encoding_file in files:
        f.write(f'Encoding method: {encoding_file}\n')
        f.write(f'ROC AUC score: {roc_auc_scores[encoding_file]:.2f}\n')
        f.write(f'F1 score: {f1_scores[encoding_file]:.2f}\n\n')



In [None]:

from sklearn.neural_network import MLPClassifier


# Define dictionaries to store the ROC AUC and F1 scores for each encoding method
roc_auc_scores = {}
f1_scores = {}

# Assuming 'files' is a list of filenames and 'encoded_data_dir' is defined
for encoding_file in files:
    
   
    # Load the encoded data
    encoded_data = pd.read_csv(os.path.join(encoded_data_dir, encoding_file))
    
    # Split the encoded data into features and labels, removing the target from features
    features = encoded_data.drop(columns=['target'])
    labels = encoded_data['target']
    X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)
    
    # Train the model
    model = MLPClassifier(hidden_layer_sizes=(3,), max_iter=10000, random_state=42)
    model.fit(X_train, y_train)
    
    # Evaluate the model
    test_probs = model.predict_proba(X_test)
    if len(set(labels)) == 2:  # Binary classification
        roc_auc = roc_auc_score(y_test, test_probs[:, 1])
    else:  # Multi-class classification
        roc_auc = roc_auc_score(y_test, test_probs, multi_class='ovo')
    f1 = f1_score(y_test, model.predict(X_test), average='macro')
    
    # Store the scores for this encoding method
    roc_auc_scores[encoding_file] = roc_auc
    f1_scores[encoding_file] = f1
    
# Output the scores for each encoding method to a separate file
with open('scores_MLP.txt', 'w') as f:
    for encoding_file in files:
        f.write(f'Encoding method: {encoding_file}\n')
        f.write(f'ROC AUC score: {roc_auc_scores[encoding_file]:.2f}\n')
        f.write(f'F1 score: {f1_scores[encoding_file]:.2f}\n\n')


In [None]:

# Train and evaluate a GaussianNB  model
from sklearn.naive_bayes import GaussianNB

# Define dictionaries to store the ROC AUC and F1 scores for each encoding method
roc_auc_scores = {}
f1_scores = {}

# Train and evaluate a Naive Bayes model with each encoding method
for encoding_file in files:
    
    # Load the encoded data
    encoded_data = pd.read_csv(os.path.join(encoded_data_dir, encoding_file))
    
    # Split the encoded data into features and labels
    X = encoded_data.drop(columns=['target'])
    y = encoded_data['target']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Train the model
    model = GaussianNB()
    model.fit(X_train, y_train)
    
    # Evaluate the model
    test_preds = model.predict_proba(X_test)
    roc_auc = roc_auc_score(y_test, test_preds, multi_class='ovo')
    f1 = f1_score(y_test, model.predict(X_test), average='macro')
    
    # Store the scores for this encoding method
    roc_auc_scores[encoding_file] = roc_auc
    f1_scores[encoding_file] = f1
    
# Output the scores for each encoding method to a separate file
with open('scores_GaussianNB.txt', 'w') as f:
    for encoding_file in files:
        f.write(f'Encoding method: {encoding_file}\n')
        f.write(f'ROC AUC score: {roc_auc_scores[encoding_file]:.2f}\n')
        f.write(f'F1 score: {f1_scores[encoding_file]:.2f}\n\n')



In [None]:

# Train and evaluate a LogisticRegression model


# Define dictionaries to store the ROC AUC and F1 scores for each encoding method
roc_auc_scores = {}
f1_scores = {}

# Train and evaluate a logistic regression model with each encoding method
for encoding_file in files:
    
    # Load the encoded data
    encoded_data = pd.read_csv(os.path.join(encoded_data_dir, encoding_file))
    
    # Split the encoded data into features and labels
    train_encoded_data, test_encoded_data, train_labels, test_labels = train_test_split(encoded_data.drop(columns=['target']), encoded_data['target'], test_size=0.2, random_state=42)
    
    # Train the model
    model = LogisticRegression(max_iter=10000, solver='lbfgs')
    model.fit(train_encoded_data, train_labels)
    
    # Evaluate the model
    test_preds = model.predict_proba(test_encoded_data)
    roc_auc = roc_auc_score(test_labels, test_preds, multi_class='ovo')
    f1 = f1_score(test_labels, model.predict(test_encoded_data), average='macro')
    
    # Store the scores for this encoding method
    roc_auc_scores[encoding_file] = roc_auc
    f1_scores[encoding_file] = f1
    
# Output the scores for each encoding method to a separate file


with open('scores_LogisticRegression.txt', 'w') as f:
    
    for encoding_file in files:   
        f.write(f'Encoding method: {encoding_file}\n')
        f.write(f'ROC AUC score: {roc_auc_scores[encoding_file]:.2f}\n')
        f.write(f'F1 score: {f1_scores[encoding_file]:.2f}\n\n')



In [None]:
from sklearn.tree import DecisionTreeClassifier


# Define dictionaries to store the ROC AUC and F1 scores for each encoding method
roc_auc_scores = {}
f1_scores = {}

# Train and evaluate a Decision Tree model with each encoding method
for encoding_file in files:
    
    # Load the encoded data
    encoded_data = pd.read_csv(os.path.join(encoded_data_dir, encoding_file))
    
    # Split the encoded data into features and labels
    train_encoded_data, test_encoded_data, train_labels, test_labels = train_test_split(encoded_data.drop(columns=['target']), encoded_data['target'], test_size=0.2, random_state=42)
    
    # Train the model
    model = DecisionTreeClassifier(random_state=42)
    model.fit(train_encoded_data, train_labels)
    
    # Evaluate the model
    test_preds = model.predict_proba(test_encoded_data)
    roc_auc = roc_auc_score(test_labels, test_preds, multi_class='ovo')
    f1 = f1_score(test_labels, model.predict(test_encoded_data), average='macro')
    
    # Store the scores for this encoding method
    roc_auc_scores[encoding_file] = roc_auc
    f1_scores[encoding_file] = f1
    
    # Output the scores for this encoding method to a separate file
with open('scores_DecisionTree.txt', 'w') as f:
    
    for encoding_file in files:   
        f.write(f'Encoding method: {encoding_file}\n')
        f.write(f'ROC AUC score: {roc_auc_scores[encoding_file]:.2f}\n')
        f.write(f'F1 score: {f1_scores[encoding_file]:.2f}\n\n')






In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Train and evaluate a Gradient Boosting model with each encoding method
for encoding_file in files:
    
    # Load the encoded data
    encoded_data = pd.read_csv(os.path.join(encoded_data_dir, encoding_file))
    
    # Split the encoded data into features and labels
    train_encoded_data, test_encoded_data, train_labels, test_labels = train_test_split(encoded_data.drop(columns=['target']), encoded_data['target'], test_size=0.2, random_state=42)
    
    # Train the model
    model = GradientBoostingClassifier(random_state=42)
    model.fit(train_encoded_data, train_labels)
    
    # Evaluate the model
    test_preds = model.predict_proba(test_encoded_data)
    roc_auc = roc_auc_score(test_labels, test_preds, multi_class='ovo')
    f1 = f1_score(test_labels, model.predict(test_encoded_data), average='macro')
    
    # Store the scores for this encoding method
    roc_auc_scores[encoding_file] = roc_auc
    f1_scores[encoding_file] = f1
    
    # Output the scores for this encoding method to a separate file
with open('scores_GradientBoosting.txt', 'w') as f:
    
    for encoding_file in files:   
        f.write(f'Encoding method: {encoding_file}\n')
        f.write(f'ROC AUC score: {roc_auc_scores[encoding_file]:.2f}\n')
        f.write(f'F1 score: {f1_scores[encoding_file]:.2f}\n\n')

In [None]:
import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from sklearn.preprocessing import StandardScaler

# Define dictionaries to store the ROC AUC and F1 scores for each encoding method
roc_auc_scores = {}
f1_scores = {}

# Train and evaluate a neural network model with each encoding method
for encoding_file in files:
    
    # Load the encoded data
    encoded_data = pd.read_csv(os.path.join(encoded_data_dir, encoding_file))
    
    # Split the encoded data into features and labels
    X = encoded_data.drop(columns=['target']).values
    y = encoded_data['target'].values
    
    # Standardize the input features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

    if y_train.min() == 1:
        y_train -= 1
        y_test -= 1
    
    # Define the neural network model
    model = Sequential()
    model.add(Dense(64, activation='relu', input_shape=(X_train.shape[1],)))
    model.add(Dense(32, activation='relu'))
    model.add(Dense(5, activation='softmax'))  # Assuming 8 classes for the target variable
    
    # Compile the model
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    
    # Train the model
    model.fit(X_train, y_train, epochs=10, batch_size=32, verbose=0)
    
    # Evaluate the model
    test_preds = model.predict(X_test)
    roc_auc = roc_auc_score(y_test, test_preds, multi_class='ovo')
    f1 = f1_score(y_test, np.argmax(test_preds, axis=1), average='macro')
    
    # Store the scores for this encoding method
    roc_auc_scores[encoding_file] = roc_auc
    f1_scores[encoding_file] = f1
    
    # Output the scores for this encoding method to a separate file
with open('scores_NeuralNetwork.txt', 'w') as f:
    
    for encoding_file in files: 
        f.write(f'Encoding method: {encoding_file}\n')
        f.write(f'ROC AUC score: {roc_auc_scores[encoding_file]:.2f}\n')
        f.write(f'F1 score: {f1_scores[encoding_file]:.2f}\n\n')


In [None]:

from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import roc_auc_score, f1_score
from sklearn.model_selection import train_test_split
import pandas as pd
import os

# Train and evaluate an AdaBoost model with each encoding method
roc_auc_scores = {}
f1_scores = {}

for encoding_file in files:
    
    # Load the encoded data
    encoded_data = pd.read_csv(os.path.join(encoded_data_dir, encoding_file))
    
    # Split the encoded data into features and labels
    train_encoded_data, test_encoded_data, train_labels, test_labels = train_test_split(
        encoded_data.drop(columns=['target']), encoded_data['target'], test_size=0.2, random_state=42)
    
    # Train the model using AdaBoost with the SAMME algorithm
    model = AdaBoostClassifier(algorithm="SAMME", random_state=42)
    model.fit(train_encoded_data, train_labels)
    
    # Evaluate the model
    test_preds = model.predict_proba(test_encoded_data)
    roc_auc = roc_auc_score(test_labels, test_preds, multi_class='ovo')
    f1 = f1_score(test_labels, model.predict(test_encoded_data), average='macro')
    
    # Store the scores for this encoding method
    roc_auc_scores[encoding_file] = roc_auc
    f1_scores[encoding_file] = f1
    
# Output the scores for this encoding method to a separate file
with open('scores_AdaBoost_SAMME.txt', 'w') as f:
    for encoding_file in files:   
        f.write(f'Encoding method: {encoding_file}\n')
        f.write(f'ROC AUC score: {roc_auc_scores[encoding_file]:.2f}\n')
        f.write(f'F1 score: {f1_scores[encoding_file]:.2f}\n\n')
