In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Read the per-epoch result tables of the normal model and of the LTN model.
cf1_results = pd.read_csv('/content/cf1_results.csv')
cf1_results_ltn = pd.read_csv('/content/cf1_results_ltn.csv')

# Highest epoch number logged for the normal model. It is both the offset
# applied to the LTN epochs and the x position of the divider on the plot.
last_normal_epoch = cf1_results['Epoch'].max()

# Shift the LTN epochs by that offset so both runs share one epoch axis.
cf1_results_ltn['Adjusted Epoch'] = cf1_results_ltn['Epoch'].add(last_normal_epoch)

# Normal model: mean validation accuracy over all rows sharing an epoch.
avg_cf1_results = (
    cf1_results
    .groupby('Epoch')[['Validation Accuracy']]
    .mean()
    .reset_index()
)

# LTN model, step 1: highest test accuracy within each
# (Fold, Speed, Adjusted Epoch) group.
max_accuracy_per_fold_speed = (
    cf1_results_ltn
    .groupby(['Fold', 'Speed', 'Adjusted Epoch'])[['test_accuracy']]
    .max()
    .reset_index()
)

# LTN model, step 2: average those maxima over everything sharing an
# adjusted epoch.
avg_max_accuracy_per_epoch = (
    max_accuracy_per_fold_speed
    .groupby('Adjusted Epoch')[['test_accuracy']]
    .mean()
    .reset_index()
)

# Give both summaries the same column names and stack them (normal model
# first) so they can be drawn as a single line.
normal_segment = avg_cf1_results.rename(
    columns={'Validation Accuracy': 'Accuracy', 'Epoch': 'Extended Epoch'}
)
ltn_segment = avg_max_accuracy_per_epoch.rename(
    columns={'test_accuracy': 'Accuracy', 'Adjusted Epoch': 'Extended Epoch'}
)
combined_avg_results = pd.concat([normal_segment, ltn_segment])

# Draw the combined curve with a dashed vertical line at the last epoch of
# the normal model.
fig, ax = plt.subplots(figsize=(14, 8))
ax.plot(
    combined_avg_results['Extended Epoch'],
    combined_avg_results['Accuracy'],
    label='Average Model Accuracy',
    marker='o',
)
ax.set_title('Average Model Accuracy Across Extended Epochs')
ax.set_xlabel('Extended Epoch (Normal + LTN)')
ax.set_ylabel('Average Test Accuracy')
ax.axvline(x=last_normal_epoch, color='r', linestyle='--', label='Start of LTN Model')
ax.legend()
ax.grid(True)
plt.show()


In [None]:

# Standard library imports
import argparse
import csv
import math
import os
import pickle
import random
import re
import sys
import csv


# Append config directory to sys.path
script_dir = os.path.dirname(os.path.abspath(__file__))  # Absolute dir the script is in
sys.path.append(os.path.join(script_dir, '..', 'config'))

# Third-party library imports
import joblib
import ltn
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
from rich.console import Console
from rich.table import Table
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from tensorflow.keras import layers, models, optimizers, callbacks, regularizers
from tensorflow.keras.callbacks import EarlyStopping, LearningRateScheduler, ModelCheckpoint, TensorBoard
from tensorflow.keras.layers import Dense, Dropout, LSTM, Input
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.losses import CategoricalCrossentropy, SparseCategoricalCrossentropy
from tqdm import tqdm
from numpy import mean

# Local module imports
import config as config
from model_creation import LSTMModel, lr_schedule
from sequence_generation import load_sequences, save_sequences
from model_evaluation import kfold_cross_validation, normalize_importances, permutation_importance_per_class
from pgb_data_processing import overview_csv_files, process_pgb_data
from data_scaling import load_and_scale_data
from util import concatenate_and_delete_ltn_csv_files
import commons as commons
from tensorflow.keras.callbacks import Callback


# Cross-validation splitter. shuffle=False means KFold itself does not
# reorder the samples it is given.
kf = KFold(n_splits=n_splits, shuffle=False)

# Number of base names handled so far in this run.
counter = 0

# Rich console used for the coloured progress output.
console = Console()

# Per-run metric accumulator (starts empty).
metrics_summary = []

# Base names listed in the tracker file, one per line; stays empty when the
# tracker file does not exist yet.
processed_bases = set()
if os.path.exists(processed_file_tracker):
    with open(processed_file_tracker, "r") as tracker:
        processed_bases.update(tracker.read().splitlines())

# Suffix identifying a training-sequence file; the text before it is the
# base name shared with the matching label and test files.
train_sequences_suffix = "_train_scaled_sequences.npy"

# For every not-yet-processed base name (at most S per run): load its
# sequences, evaluate the saved model on each cross-validation fold, derive
# per-feature importances, then continue training the model as an LTN
# predicate on the importance-weighted inputs.
# NOTE(review): processed_bases is extended here but never written back to
# processed_file_tracker in the code visible in this cell - confirm it is
# persisted elsewhere.
for file in sorted(os.listdir(sequences_directory)):
    # Match on the end of the name: a file that merely contains the suffix
    # would yield a base name whose companion files do not exist.
    if not file.endswith(train_sequences_suffix):
        continue

    if counter >= S:
        break

    base_name = file[:-len(train_sequences_suffix)]
    if base_name in processed_bases:
        continue
    processed_bases.add(base_name)
    counter += 1

    # Load sequences and labels
    train_sequence_file_path = os.path.join(sequences_directory, f"{base_name}_train_scaled_sequences.npy")
    train_label_file_path = os.path.join(sequences_directory, f"{base_name}_train_scaled_labels.npy")
    X_train, y_train = load_sequences(train_sequence_file_path, train_label_file_path)

    test_sequence_file_path = os.path.join(sequences_directory, f"{base_name}_test_scaled_sequences.npy")
    test_label_file_path = os.path.join(sequences_directory, f"{base_name}_test_scaled_labels.npy")
    X_test, y_test = load_sequences(test_sequence_file_path, test_label_file_path)

    # Shuffle the sequences and corresponding labels. Before this they were
    # kept ordered. One permutation per split keeps X and y aligned.
    train_indices = np.random.permutation(len(X_train))
    X_train, y_train = X_train[train_indices], y_train[train_indices]

    test_indices = np.random.permutation(len(X_test))
    X_test, y_test = X_test[test_indices], y_test[test_indices]

    # Merge for cross-validation
    X = np.concatenate((X_train, X_test), axis=0)
    y = np.concatenate((y_train, y_test), axis=0)

    input_shape = (sequence_length, num_features)
    fold_metrics = []

    for fold, (train_idx, val_idx) in enumerate(kf.split(X, y)):
        # NOTE(review): metrics_logger is not used in the visible code; it is
        # kept because its constructor may have side effects - confirm.
        metrics_logger = MetricsLogger(results_path, fold_number=fold+1, base_name=base_name)
        console.print(f"[bold green]Training fold {fold + 1}/{n_splits} for {base_name}[/]")
        X_train_fold, X_val_fold = X[train_idx], X[val_idx]
        y_train_fold, y_val_fold = y[train_idx], y[val_idx]

        # Load the model afresh for every fold.
        # NOTE(review): 'path_to_saved_model' looks like a placeholder path -
        # confirm the real location before running.
        model = tf.keras.models.load_model('path_to_saved_model')

        # Predicted class = index of the largest model output per sample.
        y_val_pred_scores = model.predict(X_val_fold, batch_size=batch_size)
        y_val_pred_classes = np.argmax(y_val_pred_scores, axis=1)

        # y_val_fold is compared directly against the argmax indices, so it
        # is used as-is (integer labels).
        y_val_true_classes = y_val_fold

        # Calculate and store metrics for this fold. zero_division=0 is passed
        # to all three macro-averaged scores so a class absent from the fold
        # contributes 0 instead of raising an UndefinedMetricWarning.
        accuracy = accuracy_score(y_val_true_classes, y_val_pred_classes)
        precision = precision_score(y_val_true_classes, y_val_pred_classes, average='macro', zero_division=0)
        recall = recall_score(y_val_true_classes, y_val_pred_classes, average='macro', zero_division=0)
        f1 = f1_score(y_val_true_classes, y_val_pred_classes, average='macro', zero_division=0)
        fold_metrics.append((accuracy, precision, recall, f1))

        console.print(f"Fold {fold+1} Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}")

        class_importances = permutation_importance_per_class(model, X_val_fold, y_val_fold, n_repeats=4, n_samples=n_samples)
        for class_id, importances in class_importances.items():
            print(f"{class_id} Feature Importances:", importances)

        # Average the per-class importances, then normalise the result.
        importances_matrix = np.array(list(class_importances.values()))
        average_importances = np.mean(importances_matrix, axis=0)
        normalized_average_importances = normalize_importances(average_importances)

        print("Normalized Average Feature Importances:", normalized_average_importances)

        # Wrap the classifier as an LTN predicate p(x, class).
        p = ltn.Predicate.FromLogits(model, activation_function="softmax", with_class_indexing=True)

        # One class constant per label index (position k pairs with label k)
        # and the letter used in the matching variable name x_A ... x_I.
        class_constants = (
            class_0, class_1, class_2, class_3, class_4,
            class_5, class_6, class_7, class_8,
        )
        class_letters = "ABCDEFGHI"
        num_classes = len(class_constants)

        @tf.function
        def axioms(features, labels, training=False):
            """Return the aggregated satisfaction level of the class axioms.

            For each label index k, the samples with ``labels == k`` become
            an LTN variable and the axiom ``Forall x_k: p(x_k, class_k)`` is
            built. The axioms are combined with ``formula_aggregator`` and
            the resulting ``.tensor`` is returned.

            Args:
                features: Batch of model inputs.
                labels: Label index for each sample in ``features``.
                training: Forwarded to the predicate call.
            """
            class_variables = [
                ltn.Variable(f"x_{letter}", features[labels == class_index])
                for class_index, letter in enumerate(class_letters)
            ]
            formulas = [
                Forall(x_k, p([x_k, class_k], training=training))
                for x_k, class_k in zip(class_variables, class_constants)
            ]
            sat_level = formula_aggregator(formulas).tensor
            return sat_level

        optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

        @tf.function
        def train_step(features, labels):
            """Apply one optimizer update on a batch and record train metrics.

            The loss is ``1 - sat`` with ``sat`` computed in training mode.
            Afterwards ``sat`` is recomputed with ``training=False`` and
            logged together with the accuracy of the model's predictions.
            """
            # sat and update
            with tf.GradientTape() as tape:
                sat = axioms(features, labels, training=True)
                loss = 1.-sat
            gradients = tape.gradient(loss, p.trainable_variables)
            optimizer.apply_gradients(zip(gradients, p.trainable_variables))
            sat = axioms(features, labels) # compute sat without dropout
            metrics_dict['train_sat_kb'](sat)
            # accuracy
            predictions = model([features])
            metrics_dict['train_accuracy'](tf.one_hot(labels, num_classes), predictions)

        @tf.function
        def test_step(features, labels):
            """Record satisfaction level and accuracy for one evaluation batch."""
            # sat
            sat = axioms(features, labels)
            metrics_dict['test_sat_kb'](sat)
            # accuracy
            predictions = model([features])
            metrics_dict['test_accuracy'](tf.one_hot(labels, num_classes), predictions)

        # Scale the inputs by the normalised importances.
        # Assumes the importances broadcast against the feature axis of the
        # sequences - TODO confirm the shape returned by normalize_importances.
        importance_weights = np.array(normalized_average_importances)
        X_train_fold_weighted = X_train_fold * importance_weights
        X_val_fold_weighted = X_val_fold * importance_weights

        ds_train_fold = tf.data.Dataset.from_tensor_slices((X_train_fold_weighted, y_train_fold)).batch(batch_size)
        ds_val_fold = tf.data.Dataset.from_tensor_slices((X_val_fold_weighted, y_val_fold)).batch(batch_size)

        # Report the satisfaction level on the first validation batch only,
        # as a quick check before training starts.
        for batch_features, batch_labels in ds_val_fold.take(1):
            batch_satisfaction_level = axioms(batch_features, batch_labels, training=False)
            print(f"Batch Satisfaction Level: {batch_satisfaction_level.numpy():.4f}")

        # One CSV of LTN metrics per (base name, fold).
        results_path_ltn_fold = f"{results_path_ltn}{base_name}_fold{fold + 1}_ltn.csv"
        commons.train(
            epochs,
            metrics_dict,
            ds_train_fold,
            ds_val_fold,
            train_step,
            test_step,
            csv_path=results_path_ltn_fold,
            track_metrics=1
        )