## TRAIN THE MODEL AND EVALUATE IT 


In [7]:
# Import the libraries used throughout this notebook
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import seaborn as sns
import json
import pickle

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier  
from sklearn.metrics import make_scorer, accuracy_score, classification_report
from sklearn.decomposition import PCA 
from xgboost import XGBClassifier
from sklearn.model_selection import ParameterGrid, cross_val_score, KFold, GridSearchCV, train_test_split, TimeSeriesSplit
from sklearn.preprocessing import StandardScaler, LabelEncoder, PolynomialFeatures
from sklearn.datasets import make_classification
import numpy as np
from tqdm import tqdm
import os


## DEFINE FUNCTIONS

In [53]:
import itertools
import warnings
from sklearn.exceptions import ConvergenceWarning

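# Define the train/test split based on the track selected by test_track.
# The test_track scenario is used for testing and the other two scenarios for training.
# --> returns train_ev_dfs_combined, X_train, y_train, X_test, y_test
#     (X/y are numpy arrays when numpy_conversion=True, pandas objects otherwise)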

def train_test_with_partial_ciruits_split(dfs,test_track,numpy_conversion=True):
    """Build per-scenario evaluation frames plus a train/test split.

    The first nine keys of ``dfs`` are grouped by position into three
    scenarios: keys 0/3/6 form scenario 1, keys 1/4/7 scenario 2 and keys
    2/5/8 scenario 3.

    Args:
        dfs: Mapping from key to DataFrame. The last column of every frame is
            treated as the target and all preceding columns as features.
            Targets are cast to int and shifted by -1, so they are presumably
            encoded as 1, 2, 3 -- TODO confirm against the data files.
        test_track: Scenario held out for testing; must be 1, 2 or 3.
        numpy_conversion: When True, X/y are returned as numpy arrays,
            otherwise as pandas DataFrame/Series.

    Returns:
        Tuple ``(train_ev_dfs_combined, X_train, y_train, X_test, y_test)``:

        * ``train_ev_dfs_combined`` maps "scenario1".."scenario3" to the
          shuffled concatenation of the first ``eval_size`` rows of each frame
          in that scenario (``eval_size`` is 40% of the shortest frame in
          ``dfs``). Their targets are NOT shifted by -1.
        * ``X_train``/``y_train`` come from the complete frames of the two
          non-test scenarios, shuffled (these frames include the rows that
          also appear in the evaluation frames).
        * ``X_test``/``y_test`` come from the rows after ``eval_size`` of the
          test scenario's frames, shuffled.

    Raises:
        ValueError: If ``dfs`` has fewer than nine entries or ``test_track``
            is not 1, 2 or 3.
    """
    keys = list(dfs.keys())
    if len(keys) < 9:
        raise ValueError(
            f"expected at least 9 dataframes (3 per scenario), got {len(keys)}"
        )

    # Positional grouping: every third key belongs to the same scenario.
    scenario_keys = {
        1: keys[0:9:3],
        2: keys[1:9:3],
        3: keys[2:9:3],
    }
    if test_track not in scenario_keys:
        raise ValueError(f"test_track must be 1, 2 or 3, got {test_track!r}")

    test_keys = scenario_keys[test_track]
    # Remaining scenarios, in ascending scenario order, form the training set.
    train_keys = [
        key
        for track, track_keys in scenario_keys.items()
        if track != test_track
        for key in track_keys
    ]

    # The evaluation slice is sized on the shortest frame so that every frame
    # contributes the same number of rows.
    min_length = min(len(df) for df in dfs.values())
    eval_size = int(0.4 * min_length)

    # Concatenate the evaluation slices of all frames in the same scenario
    # into one frame, then shuffle it.
    train_ev_dfs_combined = {}
    for track, track_keys in scenario_keys.items():
        combined = pd.concat(
            [dfs[key].iloc[:eval_size] for key in track_keys],
            axis=0,
            ignore_index=True,
        )
        train_ev_dfs_combined[f"scenario{track}"] = (
            combined.sample(frac=1, random_state=42).reset_index(drop=True)
        )

    for key in train_keys:
        print(key)

    # Training data: complete frames of the training scenarios, taken in the
    # iteration order of ``dfs`` and then shuffled.
    train_key_set = set(train_keys)
    train_df = pd.concat(
        [df for key, df in dfs.items() if key in train_key_set],
        axis=0,
        ignore_index=True,
    )
    train_df = train_df.sample(frac=1, random_state=42).reset_index(drop=True)

    # Test data: only the rows of the test scenario that were not used for
    # the evaluation slices.
    test_key_set = set(test_keys)
    test_df = pd.concat(
        [df.iloc[eval_size:] for key, df in dfs.items() if key in test_key_set],
        axis=0,
        ignore_index=True,
    )
    test_df = test_df.sample(frac=1, random_state=42).reset_index(drop=True)

    # Shift the targets so they start at 0 (the original note says SVM and
    # XGBoost expect labels 0, 1, 2).
    train_df.iloc[:, -1] = train_df.iloc[:, -1].astype(int) - 1
    test_df.iloc[:, -1] = test_df.iloc[:, -1].astype(int) - 1

    # Split features (all but the last column) from targets (last column).
    X_train = train_df.iloc[:, :-1]
    y_train = train_df.iloc[:, -1]
    X_test = test_df.iloc[:, :-1]
    y_test = test_df.iloc[:, -1]

    if numpy_conversion:
        X_train, y_train = X_train.to_numpy(), y_train.to_numpy()
        X_test, y_test = X_test.to_numpy(), y_test.to_numpy()

    return train_ev_dfs_combined,X_train,y_train,X_test,y_test



#function for creating the model based on the parameter type
#--> returns the model
def create_model(type):
    """Create an unfitted classifier for the requested model name.

    Args:
        type: Model name, matched case-insensitively. Accepted values are
            "RandomForest", "SVM", "lr" and "XGBoost", so the lowercase
            spellings "svm" and "xgboost" work as well. The parameter name
            shadows the builtin ``type``; it is kept so existing keyword
            callers keep working.

    Returns:
        A new RandomForestClassifier, SVC, LogisticRegression or
        XGBClassifier configured with the fixed settings below.

    Raises:
        ValueError: If the name is not one of the supported models
            (previously this silently returned None).
    """
    # Factories are wrapped in lambdas so only the requested estimator is built.
    builders = {
        "randomforest": lambda: RandomForestClassifier(n_estimators=100, random_state=42),
        "svm": lambda: SVC(kernel="rbf", C=1.0, random_state=42),
        "lr": lambda: LogisticRegression(random_state=42, max_iter=1000),
        "xgboost": lambda: XGBClassifier(
            n_estimators=100,        # Number of trees
            max_depth=3,             # Maximum depth of trees
            learning_rate=0.1,       # Learning rate (eta)
            subsample=0.8,           # Subsample ratio of the training set
            colsample_bytree=0.8,    # Subsample ratio of columns
            random_state=42          # Seed for reproducibility
        ),
    }

    key = str(type).lower()
    if key not in builders:
        raise ValueError(
            f"Unknown model type {type!r}; expected one of "
            "'RandomForest', 'SVM', 'lr', 'XGBoost'"
        )
    return builders[key]()

#test the model on the TEST set, take as input the NON-WINDOWED datasets
#-->returns the accuracy on the test set

'''
def test_model(X_train,y_train,X_test,y_test,model_type,test_track=3):
    #create the  model 

    #print("Test the model")
    #print(X_train.shape)
    #print(X_test.shape)

    model=create_model(model_type)
    
    model.fit(X_train, y_train)

    # Test set evaluation
    y_test_pred = model.predict(X_test)


    test_accuracy = accuracy_score(y_test, y_test_pred)
    print(f"Test set accuracy: {test_accuracy:.4f}")

    return test_accuracy
'''    

# function for cross-fold evaluation, with num_folds folds, taken as a parameter
#--> returns average accuracy for the specific hyperparameters configuration defined as input

'''
def evaluate_model(X_train,y_train,model_type,num_folds,test_track=3):
    
    #create the  model 
    model=create_model(model_type)

    #APPLY CROSS-FOLDER EVALUATION

    kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

    fold_accuracies = []
    
    for i,(train_index, val_index) in enumerate(kf.split(X_train)):
        X_ttrain, X_val = X_train[train_index], X_train[val_index] 
        y_ttrain, y_val = y_train[train_index], y_train[val_index]
            
        model.fit(X_ttrain, y_ttrain) 
        y_pred = model.predict(X_val) 
        
        accuracy = accuracy_score(y_val, y_pred) 
        #print((y_val != y_pred).sum())
        print(f'fold {i} accuracy:', accuracy)
        fold_accuracies.append(accuracy)

    average_accuracy = sum(fold_accuracies) / num_folds
    print('average of folds',average_accuracy)

    return average_accuracy
'''
    
# Intermediate function:
# loads the pre-windowed datasets stored for the given ratio and splits them
# --> returns train_ev_dfs, X_train, y_train, X_test, y_test based on the track defined in test_track

def window_and_split_LOCO(dfs,window_size,ratio,test_track=3,numpy_conversion=True):
    """Load the pre-windowed datasets for ``ratio`` and split them.

    The pickle ``datasets/dfs_windowed_{ratio}.pkl`` is read and handed to
    ``train_test_with_partial_ciruits_split``. ``dfs`` and ``window_size`` are
    accepted but not used by this function.

    Returns:
        The tuple produced by ``train_test_with_partial_ciruits_split``:
        ``(train_ev_dfs, X_train, y_train, X_test, y_test)``.
    """
    print(ratio)

    data_dir = 'datasets'
    os.makedirs(data_dir, exist_ok=True)
    pickle_path = os.path.join(data_dir, f'dfs_windowed_{ratio}.pkl')
    # Alternative naming scheme that also encodes the window size:
    # pickle_path = os.path.join(data_dir, f'dfs_windowed_{window_size}_rd{ratio}.pkl')
    with open(pickle_path, "rb") as handle:
        windowed_frames = pickle.load(handle)

    # Report the number of rows of every windowed frame.
    for frame in windowed_frames.values():
        print(len(frame))

    return train_test_with_partial_ciruits_split(
        windowed_frames, test_track, numpy_conversion
    )

#receives X_train and X_test ALREADY SCALED  and returns pca datasets, as numpy arrays.
'''
def apply_PCA(X_train,X_test,threshold):

    X_train_scaled = pd.DataFrame(X_train)
    X_test_scaled = pd.DataFrame(X_test)

    pca=PCA()

    pca=PCA(n_components=threshold, random_state=29)
    X_train_pca=pca.fit_transform(X_train_scaled)
    X_test_pca=pca.transform(X_test_scaled)

    return X_train_pca,X_test_pca
'''

'''    
def save_results_to_csv(results, best_params, test_accuracy,model_type,pca_threshold,window_size,reduction_factor=2):
    # Convert the results to a DataFrame
    results_df = pd.DataFrame(results, columns=['Parameters', 'Mean Accuracy'])
    
    # Add the best parameters and test accuracy as new columns
    test_accuracy_df = pd.DataFrame([test_accuracy], columns=[f'Test on {best_params}'])

    # Combine the results, best parameters, and test accuracy into one DataFrame
    final_df = pd.concat([results_df, test_accuracy_df], axis=1)
    # Ensure the 'results' directory exists
    os.makedirs('results', exist_ok=True)

    # Define the file path within the 'results' directory
    file_path = os.path.join('results', f'grid_search_results_{model_type}_{pca_threshold}_red{reduction_factor}_ws{window_size}.csv')

    # Save the combined DataFrame to a CSV file
    final_df.to_csv(file_path, index=False, sep =";")

    print(f"Results saved to 'grid_search_results_{model_type}_{pca_threshold}.csv'")
'''


def _build_pipeline_and_grid(model_type, PCA_thresholds):
    """Return the (pipeline, param_grid) pair for ``model_type``; raise ValueError if unknown."""
    kind = str(model_type).lower()

    if kind == "lr":
        warnings.filterwarnings("ignore", category=ConvergenceWarning)
        pipeline = Pipeline([
            ('poly', PolynomialFeatures()),  # Add polynomial features
            ('scaler', StandardScaler()),    # Standardize features
            ('pca', PCA()),                  # Apply PCA
            ('logreg', LogisticRegression()) # Logistic Regression model
        ])
        param_grid = {
            'poly__degree': [1, 2],                # Degrees of polynomial features
            'logreg__C': [0.1, 1, 10, 100],        # Regularization strength (inverse of lambda)
            'logreg__max_iter': [100, 200, 500],   # Maximum iterations
            'pca__n_components': PCA_thresholds
        }

    elif kind == "randomforest":
        pipeline = Pipeline([
            ('scaler', StandardScaler()),    # Standardize features
            ('pca', PCA()),                  # Apply PCA
            ('rf', RandomForestClassifier(random_state=42))  # Random Forest Classifier
        ])
        param_grid = {
            'rf__n_estimators': [100, 200],      # Number of trees in the forest
            'rf__max_depth': [None, 30],         # Maximum depth of the trees
            'rf__min_samples_split': [2, 10],    # Minimum samples required to split an internal node
            'rf__bootstrap': [True, False],      # Whether bootstrap samples are used when building trees
            'rf__max_features': ['sqrt'],        # Features considered when looking for the best split
            'pca__n_components': PCA_thresholds
        }

    elif kind == "svm":
        pipeline = Pipeline([
            ('scaler', StandardScaler()),    # Standardize features
            ('pca', PCA()),                  # Apply PCA
            ('svm', SVC(random_state=42))    # Support Vector Machine model
        ])
        # The grid is deliberately narrow; wider ranges (C in 1/10/100, linear
        # kernel, gamma 'scale'/'auto') were disabled in the original cell.
        param_grid = {
            'svm__C': [0.1],                     # Regularization parameter
            'svm__kernel': ['rbf'],              # Kernel type
            'svm__gamma': [0.1],                 # Kernel coefficient for RBF
            'pca__n_components': PCA_thresholds  # Number of PCA components
        }

    elif kind == "xgboost":
        pipeline = Pipeline([
            ('scaler', StandardScaler()),    # Standardize features
            ('pca', PCA()),                  # Apply PCA
            ('xgb', XGBClassifier(random_state=42))  # XGBoost Classifier
        ])
        param_grid = {
            'xgb__n_estimators': [100, 200],         # Number of boosting rounds (trees)
            'xgb__max_depth': [3, 6, 9],             # Maximum depth of the trees
            'xgb__learning_rate': [0.01, 0.1, 0.2],  # Learning rate (eta)
            'xgb__subsample': [0.8, 1.0],            # Fraction of samples used for each tree
            'xgb__colsample_bytree': [0.8, 1.0],     # Fraction of features used for each tree
            'pca__n_components': PCA_thresholds      # Number of PCA components
        }

    else:
        raise ValueError(
            f"Unknown model_type {model_type!r}; expected one of "
            "'lr', 'RandomForest', 'svm', 'xgboost'"
        )

    return pipeline, param_grid


def _build_loco_folds(train_ev_dfs):
    """Return one (eval_key, X_train, y_train, X_eval, y_eval) tuple per held-out scenario."""
    folds = []
    for eval_key in train_ev_dfs:
        eval_df = train_ev_dfs[eval_key].sample(frac=1, random_state=42).reset_index(drop=True)

        # All other scenarios form the training part of this fold.
        fold_train_df = pd.concat(
            [df for key, df in train_ev_dfs.items() if key != eval_key],
            axis=0,
            ignore_index=True,
        )
        fold_train_df = fold_train_df.sample(frac=1, random_state=42).reset_index(drop=True)

        # One encoder per fold, fitted on the training labels and reused for
        # the evaluation labels, so both sides share the same class mapping
        # even when the held-out scenario lacks one of the classes.
        encoder = LabelEncoder()
        y_fold_train = encoder.fit_transform(fold_train_df.iloc[:, -1].to_numpy())
        y_fold_eval = encoder.transform(eval_df.iloc[:, -1].to_numpy())

        folds.append((
            eval_key,
            fold_train_df.iloc[:, :-1].to_numpy(),
            y_fold_train,
            eval_df.iloc[:, :-1].to_numpy(),
            y_fold_eval,
        ))
    return folds


def LOCO_cross_validation(dfs,model_type,PCA_thresholds,window_size,ratio,test_track=3):
    """Grid-search a model by holding out one scenario at a time, then test the best setting.

    Every hyperparameter combination is scored by holding out each evaluation
    scenario in turn, training on the remaining ones and averaging the
    accuracies. The best combination is then refitted on the full training
    split and scored on the test split.

    Args:
        dfs: Forwarded to ``window_and_split_LOCO``.
        model_type: "lr", "RandomForest", "svm" or "xgboost" (case-insensitive).
        PCA_thresholds: Candidate values for the PCA ``n_components`` setting.
        window_size: Forwarded to ``window_and_split_LOCO``.
        ratio: Forwarded to ``window_and_split_LOCO``.
        test_track: Scenario (1, 2 or 3) held out as the final test set.

    Returns:
        ``(best_combination, test_accuracy)``. ``best_combination`` is
        ``(values, mean_accuracy)``, with ``values`` being the winning
        hyperparameter values in the key order of the parameter grid.

    Raises:
        ValueError: If ``model_type`` is unknown (checked before any data is
            loaded) or the parameter grid yields no combination.
    """
    # Validate the model type first so a typo fails before the data is loaded.
    pipeline, param_grid = _build_pipeline_and_grid(model_type, PCA_thresholds)

    # Load the windowed data
    train_ev_dfs, X_train, y_train, X_test, y_test = window_and_split_LOCO(dfs, window_size, ratio, test_track, numpy_conversion=True)
    print("done with windowing")

    # The folds do not depend on the hyperparameters, so build them only once.
    folds = _build_loco_folds(train_ev_dfs)

    # Generate all combinations of hyperparameters
    hyperpar_keys = list(param_grid)
    combinations = list(itertools.product(*param_grid.values()))
    if not combinations:
        raise ValueError("the hyperparameter grid is empty; check PCA_thresholds")

    combinations_results = []
    best_combination = (None, 0)

    for combination in combinations:
        param_dict = dict(zip(hyperpar_keys, combination))
        print(f"Combination: {param_dict}")

        # The same pipeline object is reconfigured and refitted for every fold.
        pipeline.set_params(**param_dict)

        fold_accuracies = []
        for _eval_key, X_fold_train, y_fold_train, X_fold_eval, y_fold_eval in folds:
            pipeline.fit(X_fold_train, y_fold_train)
            y_fold_pred = pipeline.predict(X_fold_eval)
            fold_accuracies.append(accuracy_score(y_fold_eval, y_fold_pred))

        # Compute the average accuracy over the held-out scenarios
        average_accuracy = np.mean(fold_accuracies)
        combinations_results.append((param_dict, average_accuracy))

        if best_combination[0] is None or average_accuracy > best_combination[1]:
            best_combination = (combination, average_accuracy)
        print(f"Average Accuracy: {average_accuracy}")

    print(f"Best hyperparameters combination: {best_combination}")

    # Refit the best configuration on the full training split and test it.
    pipeline.set_params(**dict(zip(hyperpar_keys, best_combination[0])))
    pipeline.fit(X_train, y_train)

    y_test_pred = pipeline.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_test_pred)
    print(f"Test set accuracy: {test_accuracy:.4f}")

    #save_results_to_csv(combinations_results, best_combination[0], test_accuracy, model_type, PCA_thresholds,window_size,ratio)

    return best_combination, test_accuracy

## IMPORT THE DATA 

In [44]:
# Load datasets_reduced, which is the temporal data truncated


# Directory that holds the pickled datasets. It is created if missing, but the
# open() below still fails when the pickle file itself is absent.
directory = 'datasets'
os.makedirs(directory, exist_ok=True)
file_path = os.path.join(directory, 'datasets_reduced.pkl')
# NOTE(review): pickle.load can execute arbitrary code -- only load trusted files.
with open(file_path, "rb") as file:
    # Kept as a module-level name; it is passed to LOCO_cross_validation later.
    datasets_reduced = pickle.load(file)

'''
file_path = 'datasets_reduced.pkl'
with open(file_path, "rb") as file:
    datasets_reduced = pickle.load(file)
'''

'\nfile_path = \'datasets_reduced.pkl\'\nwith open(file_path, "rb") as file:\n    datasets_reduced = pickle.load(file)\n'

## TEST THE MODEL 

In [54]:
# Experiment configuration for the LOCO cross-validation run.
model_type="svm"
window_sizes=[20]
test_track=3          # scenario held out as the final test set
thresholds_pca=[0.8]  # candidate values for the PCA n_components setting
ratio=100             # selects the pre-windowed pickle dfs_windowed_{ratio}.pkl
for window_size in window_sizes:
    # Pass the configured test_track instead of a hard-coded 3, so that
    # editing the variable above actually changes the held-out scenario.
    LOCO_cross_validation(datasets_reduced,model_type,thresholds_pca,window_size,ratio,test_track=test_track)

100
4729
4729
4729
4495
4495
4495
4285
4285
4285
pvs1_gps_mpu
pvs4_gps_mpu
pvs7_gps_mpu
pvs2_gps_mpu
pvs5_gps_mpu
pvs8_gps_mpu
done with windowing
Combination: {'svm__C': 0.1, 'svm__kernel': 'rbf', 'svm__gamma': 0.1, 'pca__n_components': 0.8}
Average Accuracy: 0.5047970958122651
Best hyperparameters combination: ((0.1, 'rbf', 0.1, 0.8), 0.5047970958122651)
Test set accuracy: 0.4230
