In [1]:
import pandas as pd
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score, roc_curve, confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
import matplotlib.pyplot as plt
import pickle
import os
import numpy as np
from sklearn.preprocessing import label_binarize
from sklearn.metrics import roc_curve, auc
from itertools import cycle
import seaborn as sns
import time
import tensorflow as tf
import time
import re
from xgboost import XGBClassifier
import warnings
ignore_warnings = True
if ignore_warnings:
    warnings.filterwarnings("ignore")
class TFNeuralNetwork:
    """Adapter that exposes a Keras model through fit / predict / predict_proba / score.

    The network itself is built by ``create_tf_neural_network``; this class only
    forwards calls to it.
    """

    def __init__(self, input_shape, output_classes):
        # Construction is delegated to the module-level factory function.
        self.model = create_tf_neural_network(input_shape, output_classes)

    def fit(self, X_train, y_train, epochs=10, batch_size=32):
        """Train the wrapped network; whatever ``model.fit`` returns is discarded."""
        self.model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs)

    def predict_proba(self, X_test):
        """Return the raw output of the wrapped model's ``predict`` for ``X_test``."""
        return self.model.predict(X_test)

    def predict(self, X_test):
        """Return, for each row, the index of the largest model output."""
        class_scores = self.model.predict(X_test)
        return class_scores.argmax(axis=1)

    def score(self, X_test, y_test):
        """Return the accuracy reported by the wrapped model's ``evaluate``."""
        # evaluate() is expected to yield exactly (loss, accuracy).
        _loss, test_accuracy = self.model.evaluate(X_test, y_test)
        return test_accuracy
def create_tf_neural_network(input_shape, output_classes):
    """Build and compile a small fully connected classifier.

    Architecture: input -> Dense(128, relu) -> Dropout(0.5) -> Dense(64, relu)
    -> Dense(output_classes, softmax). The model is compiled with the Adam
    optimizer, sparse categorical cross-entropy loss and an accuracy metric.

    Args:
        input_shape: shape passed to the Keras ``InputLayer``.
        output_classes: number of units in the softmax output layer.

    Returns:
        The compiled ``tf.keras`` Sequential model.
    """
    layers = tf.keras.layers

    network = tf.keras.models.Sequential()
    network.add(layers.InputLayer(input_shape=input_shape))
    network.add(layers.Dense(128, activation='relu'))
    network.add(layers.Dropout(0.5))
    network.add(layers.Dense(64, activation='relu'))
    network.add(layers.Dense(output_classes, activation='softmax'))

    network.compile(
        loss='sparse_categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return network
class MLModel:
    """Wrapper around a classifier adding timing, evaluation, plots and persistence.

    The wrapped ``model`` must provide ``fit`` and ``predict``; ``evaluate`` and
    ``plot_roc`` additionally call ``predict_proba``. Reports and plots are
    hard-wired to a three-class problem with integer labels 0, 1, 2 that are
    displayed as Red, Yellow, Green.
    """

    # Integer class labels and their display names, in matching order.
    CLASS_LABELS = [0, 1, 2]
    CLASS_NAMES = ['Red', 'Yellow', 'Green']

    def __init__(self, model):
        self.model = model
        self.training_time = None  # seconds spent in the last train() call
        self.y_pred = None         # predictions from the last predict() call

    @staticmethod
    def _ensure_dir(experiment_name):
        """Create the experiment output directory if it does not exist yet."""
        os.makedirs(experiment_name, exist_ok=True)

    def train(self, X_train, y_train):
        """Fit the wrapped model and record the wall-clock duration in ``training_time``."""
        start_time = time.time()
        self.model.fit(X_train, y_train)
        self.training_time = time.time() - start_time

    def predict(self, X_test):
        """Return the model's predictions for ``X_test`` and cache them in ``y_pred``."""
        self.y_pred = self.model.predict(X_test)
        return self.y_pred

    def evaluate(self, X_test, y_test):
        """Compute classification metrics on a test set.

        Returns:
            dict with the keys ``predictions``, ``predict_proba``, ``report``
            (text classification report), ``accuracy``, macro-averaged
            ``recall`` / ``precision`` / ``f1_score``, one-vs-one ``auc``,
            ``training_time`` and ``model_object`` (this wrapper instance).
        """
        predictions = self.predict(X_test)
        predict_proba = self.model.predict_proba(X_test)

        report = classification_report(y_test, predictions, target_names=self.CLASS_NAMES)
        acc = accuracy_score(y_test, predictions)
        rec = recall_score(y_test, predictions, average='macro')
        prec = precision_score(y_test, predictions, average='macro')
        f1 = f1_score(y_test, predictions, average='macro')
        # Named auc_score so the imported sklearn ``auc`` function is not shadowed.
        auc_score = roc_auc_score(y_test, predict_proba, multi_class='ovo')
        return {
            'predictions': predictions,
            'predict_proba': predict_proba,
            'report': report,
            'accuracy': acc,
            'recall': rec,
            'precision': prec,
            'f1_score': f1,
            'auc': auc_score,
            'training_time': self.training_time,
            'model_object': self
        }

    def plot_confusion_matrix(self, X_test, y_test, experiment_name):
        """Plot the confusion matrix for ``X_test`` and save it as ``confusion_matrix.png``.

        Predictions are recomputed from ``X_test`` so the plot always matches the
        data passed in, instead of relying on a previous ``predict`` call.
        """
        predictions = self.predict(X_test)
        # Fixing the label set keeps the matrix 3x3 even if a class is absent.
        cm = confusion_matrix(y_test, predictions, labels=self.CLASS_LABELS)

        # Draw on a fresh figure so earlier plots are not overdrawn.
        fig, ax = plt.subplots()
        # Passing the names to heatmap centres the tick labels on the cells.
        sns.heatmap(cm, annot=True, fmt='d',
                    xticklabels=self.CLASS_NAMES, yticklabels=self.CLASS_NAMES, ax=ax)
        ax.set_title('Confusion Matrix')
        ax.tick_params(axis='x', labelrotation=45)
        ax.tick_params(axis='y', labelrotation=0)
        ax.set_xlabel('Predicted Label')
        ax.set_ylabel('True Label')

        self._ensure_dir(experiment_name)
        fig.savefig(f"{experiment_name}/confusion_matrix.png")

    def plot_roc(self, X_test, y_test, experiment_name):
        """Plot one-vs-rest ROC curves per class and save them as ``roc.png``."""
        # Binarize the true labels: one indicator column per class.
        y = label_binarize(y_test, classes=self.CLASS_LABELS)
        n_classes = y.shape[1]

        # Compute ROC curve and ROC area for each class.
        y_score = self.model.predict_proba(X_test)
        fpr = dict()
        tpr = dict()
        roc_auc = dict()
        for i in range(n_classes):
            fpr[i], tpr[i], _ = roc_curve(y[:, i], y_score[:, i])
            roc_auc[i] = auc(fpr[i], tpr[i])

        # Plot all ROC curves on a new figure.
        plt.figure()
        colors = cycle(['aqua', 'darkorange', 'cornflowerblue'])
        for i, color in zip(range(n_classes), colors):
            plt.plot(fpr[i], tpr[i], color=color, lw=2,
                     label='ROC curve of class {0} (area = {1:0.2f})'.format(self.CLASS_NAMES[i], roc_auc[i]))

        plt.plot([0, 1], [0, 1], 'k--', lw=2)  # chance-level diagonal
        plt.xlim([-0.05, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate (FPR)')
        plt.ylabel('True Positive Rate (TPR)')
        plt.title('Receiver Operating Characteristic (ROC)')
        plt.legend(loc="lower right")

        self._ensure_dir(experiment_name)
        plt.savefig(f"{experiment_name}/roc.png")

    def save_results(self, results, experiment_name):
        """Pickle ``results`` to ``<experiment_name>/results.pkl``.

        The object is serialised in memory first, so a pickling failure raises
        before anything is written and no truncated ``results.pkl`` is left behind.
        """
        payload = pickle.dumps(results)
        self._ensure_dir(experiment_name)
        with open(f"{experiment_name}/results.pkl", 'wb') as file:
            file.write(payload)

    def feature_importance(self, feature_names, experiment_name, top_n=15):
        """Plot the ``top_n`` most important features and save ``feature_importance.png``.

        Uses ``feature_importances_`` when the model has it, otherwise the
        absolute values of the first row of ``coef_``. Prints a message and
        returns ``None`` when the model exposes neither attribute.
        """
        if hasattr(self.model, 'feature_importances_'):
            # For models with feature_importances_ attribute (e.g., RandomForest)
            importances = self.model.feature_importances_
        elif hasattr(self.model, 'coef_'):
            # For models with coef_ attribute (e.g., LogisticRegression)
            # NOTE(review): only coef_[0] is used; for a multi-class model this
            # presumably reflects a single class only - confirm this is intended.
            importances = np.abs(self.model.coef_[0])
        else:
            print("Model does not have feature_importances_ or coef_ attribute")
            return
        feature_names_length = min(top_n, len(feature_names))
        indices = np.argsort(importances)[::-1][:feature_names_length]

        # Plot the feature importances, largest first.
        plt.figure(figsize=(12, 6))
        plt.title("Feature Importances")
        plt.bar(range(feature_names_length), importances[indices], color="b", align="center")
        # Rotate feature names for better visibility.
        plt.xticks(range(feature_names_length), [feature_names[i] for i in indices], rotation=45, ha="right")
        plt.tick_params(axis='x', which='major', labelsize=9)
        plt.tight_layout()

        # Save before show(): after show() the figure may already be released,
        # in which case savefig would write an empty image.
        self._ensure_dir(experiment_name)
        plt.savefig(f"{experiment_name}/feature_importance.png")
        plt.show()




In [None]:
import glob
# Paths of the experiment result directories to analyse (each is expected to contain a results.pkl):
list_of_experiments = ## **


In [None]:
import re

# List of filenames
list_of_experiments = ## **


# Function to extract information from filename
def extract_info(filename):
    """Parse feature set, algorithm and embedding method out of an experiment path.

    Expected naming scheme:
    ``experiment_<feature_set>_<algorithm>_n_whole_data_text_embd_<embedding>``,
    where ``<feature_set>`` may itself contain underscores and ``<embedding>``
    runs up to (not including) a closing parenthesis.

    Returns:
        dict with keys ``feature_set``, ``algorithm`` and
        ``text_embedding_method``, or ``None`` when the name does not match.
    """
    found = re.search(
        r'experiment_([^_]+(?:_[^_]+)*)_([^_]+)_n_whole_data_text_embd_([^)]+)',
        filename,
    )
    if found is None:
        return None

    # Capture groups appear in the same order as these field names.
    field_names = ('feature_set', 'algorithm', 'text_embedding_method')
    return dict(zip(field_names, found.groups()))

# Dictionaries keyed by a synthetic id ("experiment_id_<n>"):
#   experiments -> metadata parsed from the directory name
#   results     -> object unpickled from <directory>/results.pkl
experiments = {}
results = {}

# Process each entry; ids are the 1-based position in list_of_experiments.
for idx, filename in enumerate(list_of_experiments, start=1):
    info = extract_info(filename)
    if info:
        experiments[f'experiment_id_{idx}'] = info
    try:
        # NOTE: pickle.load can execute arbitrary code - only load files
        # produced by trusted runs of this project.
        with open(f"{filename}/results.pkl", 'rb') as file:
            results[f'experiment_id_{idx}'] = pickle.load(file)
    except Exception as exc:
        # Best effort: keep going when one experiment cannot be loaded, but
        # report why. Unlike a bare except, this does not swallow
        # KeyboardInterrupt / SystemExit.
        print(f"Error with {filename}: {type(exc).__name__}: {exc}")


# Print the parsed metadata of every recognised experiment
for key, value in experiments.items():
    print(f"{key}: {value}")

Error with /Users/meyildirim/Desktop/Notebooks/Learning/master-thesis/etl/code/experiments/results/experiment_categorical_TFNeuralNetwork_n_whole_data_text_embd_BioBERT)
Error with /Users/meyildirim/Desktop/Notebooks/Learning/master-thesis/etl/code/experiments/results/experiment_text_embeddings_TFNeuralNetwork_n_whole_data_text_embd_BioBERT)
Error with /Users/meyildirim/Desktop/Notebooks/Learning/master-thesis/etl/code/experiments/results/experiment_numerical_TFNeuralNetwork_n_whole_data_text_embd_BioBERT)
experiment_id_1: {'feature_set': 'text_embeddings', 'algorithm': 'LogisticRegression', 'text_embedding_method': 'BioBERT'}
experiment_id_2: {'feature_set': 'categorical', 'algorithm': 'LogisticRegression', 'text_embedding_method': 'BioBERT'}
experiment_id_3: {'feature_set': 'categorical', 'algorithm': 'TFNeuralNetwork', 'text_embedding_method': 'BioBERT'}
experiment_id_4: {'feature_set': 'text_embeddings', 'algorithm': 'TFNeuralNetwork', 'text_embedding_method': 'BioBERT'}
experiment

In [39]:
# Build one table with a row per experiment id: the parsed metadata columns
# first, then the metric columns copied from the loaded results.
experiment_df = pd.DataFrame.from_dict(experiments, orient='index')
for key, value in results.items():
    # Same columns, in the same order, for every experiment.
    for metric in ('accuracy', 'recall', 'precision', 'f1_score', 'auc', 'training_time'):
        experiment_df.loc[key, metric] = value[metric]

In [40]:
# Show the combined metadata and metrics table as this cell's output.
experiment_df

Unnamed: 0,feature_set,algorithm,text_embedding_method,accuracy,recall,precision,f1_score,auc,training_time
experiment_id_1,text_embeddings,LogisticRegression,BioBERT,0.750473,0.493956,0.579187,0.493469,0.797833,48.644255
experiment_id_2,categorical,LogisticRegression,BioBERT,0.626468,0.413242,0.567896,0.408861,0.657623,20.617074
experiment_id_3,categorical,TFNeuralNetwork,BioBERT,,,,,,
experiment_id_4,text_embeddings,TFNeuralNetwork,BioBERT,,,,,,
experiment_id_5,all_features_except_categorical,LogisticRegression,BERT-multilingual,0.66123,0.414378,0.428579,0.410473,0.644333,82.021492
experiment_id_6,numerical,TFNeuralNetwork,BioBERT,,,,,,
experiment_id_7,numerical,LogisticRegression,BioBERT,0.660938,0.413941,0.428464,0.409955,0.64561,8.70255
experiment_id_8,text_embeddings,RandomForestClassifier,BioBERT,0.755051,0.522298,0.631265,0.54152,0.779925,669.116558
experiment_id_9,categorical,GradientBoostingClassifier,BioBERT,0.625772,0.431581,0.576212,0.427168,0.659377,78.368037
experiment_id_10,numerical,GradientBoostingClassifier,BioBERT,0.689048,0.468128,0.612808,0.488399,0.737237,99.347659
