## TRAIN THE MODEL AND EVALUATE IT 


In [1]:
# Importing libraries for code
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import seaborn as sns
import json
import pickle

from sklearn.model_selection import train_test_split, cross_val_score, TimeSeriesSplit
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier  
from sklearn.metrics import make_scorer, accuracy_score 
from sklearn.svm import SVC 
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA 

## DEFINE FUNCTIONS

In [8]:
#define train_test splits, based on the track we define in test_track. 
#we use test_track for testing and the other two tracks for training
#-->returns X_train,y_train,X_test,y_test as numpy arrays

def train_test_my_split(dfs,test_track,numpy_conversion=True):
    """Build shuffled train/test sets, holding out one track for testing.

    The DataFrames in ``dfs`` are addressed by the position of their key in
    ``dfs.keys()``. Positions 0/3/6 are held out for track 1, 1/4/7 for
    track 2 and 2/5/8 for track 3; the remaining six positions form the
    training set.
    NOTE(review): this assumes ``dfs`` holds nine entries ordered so that
    position ``i`` belongs to track ``i % 3 + 1`` -- confirm against the code
    that builds the dictionary.

    Args:
        dfs: mapping whose values are DataFrames with the target in the last
            column and the features in all preceding columns.
        test_track: track used for testing; must be 1, 2 or 3.
        numpy_conversion: when True return numpy arrays, otherwise return
            pandas objects (DataFrame for features, Series for targets).

    Returns:
        ``(X_train, y_train, X_test, y_test)``.

    Raises:
        ValueError: if ``test_track`` is not 1, 2 or 3.
    """
    test_positions_by_track = {1: [0, 3, 6], 2: [1, 4, 7], 3: [2, 5, 8]}
    if test_track not in test_positions_by_track:
        # Previously an unknown track fell through the if/elif chain and
        # crashed later with an UnboundLocalError on train_indices.
        raise ValueError(f"test_track must be 1, 2 or 3, got {test_track!r}")

    test_indices = test_positions_by_track[test_track]
    train_indices = [i for i in range(9) if i not in test_indices]

    all_keys = list(dfs.keys())

    def _concat_and_shuffle(indices):
        # Stack the selected DataFrames, then shuffle the rows with a fixed
        # seed so the result is reproducible.
        stacked = pd.concat([dfs[all_keys[i]] for i in indices], axis=0, ignore_index=True)
        return stacked.sample(frac=1, random_state=42).reset_index(drop=True)

    def _split_features_target(frame):
        # Last column is the target, everything before it is a feature.
        features, target = frame.iloc[:, :-1], frame.iloc[:, -1]
        if numpy_conversion:
            return features.to_numpy(), target.to_numpy()
        return features, target

    X_train, y_train = _split_features_target(_concat_and_shuffle(train_indices))
    X_test, y_test = _split_features_target(_concat_and_shuffle(test_indices))

    return X_train,y_train,X_test,y_test

#function for creating the model based on the parameter type
#--> returns the model
def create_model(type):
    """Return a new, unfitted classifier for the given model identifier.

    Args:
        type: one of ``"RandomForest"``, ``"SVM"`` or ``"lr"``. The parameter
            name shadows the builtin ``type``; it is kept so existing keyword
            callers keep working.

    Returns:
        A ``RandomForestClassifier``, ``SVC`` or ``LogisticRegression``
        configured with the fixed hyperparameters below.

    Raises:
        ValueError: if ``type`` is not a known identifier. Previously the
            function returned ``None`` and the failure only surfaced later
            when the caller tried to fit the model.
    """
    if type == "RandomForest":
        return RandomForestClassifier(n_estimators=100, random_state=42)
    if type == "SVM":
        return SVC(kernel="rbf", C=1.0)
    if type == "lr":
        return LogisticRegression(random_state=42, max_iter=1000)
    raise ValueError(
        f"Unknown model type {type!r}; expected 'RandomForest', 'SVM' or 'lr'"
    )

#test the model on the TEST set, takes as input the train/test arrays already prepared by the caller
#-->returns the accuracy on the test set

def test_model(X_train,y_train,X_test,y_test,model_type,test_track=3):
    """Fit a fresh model on the training data and score it on the test data.

    Args:
        X_train, y_train: features and targets used for fitting.
        X_test, y_test: features and targets used for scoring.
        model_type: identifier forwarded to ``create_model``.
        test_track: accepted for signature compatibility; not used here.

    Returns:
        The accuracy on the test set, which is also printed.
    """
    classifier = create_model(model_type)
    classifier.fit(X_train, y_train)

    # Score the fitted model on the held-out data.
    predictions = classifier.predict(X_test)
    score = accuracy_score(y_test, predictions)

    print(f"Test set accuracy: {score:.4f}")
    return score
    

# function for cross-fold evaluation, with num_folds folds, taken as a parameter
#--> returns average accuracy for the specific hyperparameters configuration defined as input

def evaluate_model(X_train,y_train,model_type,num_folds,test_track=3):
    """Estimate accuracy with shuffled k-fold cross-validation.

    Args:
        X_train: training features (numpy array or pandas DataFrame).
        y_train: training targets (numpy array or pandas Series).
        model_type: identifier forwarded to ``create_model``.
        num_folds: number of folds passed to ``KFold``.
        test_track: accepted for signature compatibility; not used here.

    Returns:
        The mean validation accuracy over all folds. Each fold's accuracy
        and the mean are also printed.
    """
    # KFold yields positional indices. Indexing a DataFrame with them would be
    # interpreted as column selection, so convert pandas inputs to arrays.
    if hasattr(X_train, "to_numpy"):
        X_train = X_train.to_numpy()
    if hasattr(y_train, "to_numpy"):
        y_train = y_train.to_numpy()

    model = create_model(model_type)

    # Fixed seed keeps the fold assignment reproducible between runs.
    kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

    fold_accuracies = []
    for i, (train_index, val_index) in enumerate(kf.split(X_train)):
        # The same estimator object is refitted on each fold's training part.
        model.fit(X_train[train_index], y_train[train_index])
        y_pred = model.predict(X_train[val_index])

        accuracy = accuracy_score(y_train[val_index], y_pred)
        print(f'fold {i} accuracy:', accuracy)
        fold_accuracies.append(accuracy)

    average_accuracy = sum(fold_accuracies) / len(fold_accuracies)
    print('average of folds',average_accuracy)

    return average_accuracy

#intermediate function, used for:
#loading the pre-windowed data saved for the given window size
#-->returns X_train,y_train,X_test,y_test based on track defined in test_track


def window_and_split(dfs,window_size,test_track=3,numpy_conversion=True):
    """Load the pickled windowed data for ``window_size`` and split it by track.

    The data is read from ``dfs_windowed_<window_size>.pkl`` in the current
    working directory. ``dfs`` is accepted but not used: nothing is windowed
    here, the pickle is expected to exist already.

    Returns:
        ``(X_train, y_train, X_test, y_test)`` as produced by
        ``train_test_my_split``.
    """
    pickle_path = f"dfs_windowed_{window_size}.pkl"
    # NOTE: pickle.load can execute arbitrary code while deserialising, so
    # only files from a trusted source should be placed at this path.
    with open(pickle_path, "rb") as handle:
        windowed = pickle.load(handle)

    return train_test_my_split(windowed, test_track, numpy_conversion)

#receives X_train and X_test ALREADY SCALED  and returns pca datasets, as numpy arrays.
def apply_PCA(X_train,X_test,threshold):
    """Fit PCA on the training features and project both sets with it.

    The inputs are used as given; no scaling is done here.

    Args:
        X_train: training features; PCA is fitted on these only.
        X_test: test features, transformed with the PCA fitted on X_train.
        threshold: forwarded unchanged as ``n_components`` to
            ``sklearn.decomposition.PCA``.

    Returns:
        ``(X_train_pca, X_test_pca)``.
    """
    # The original built a throwaway PCA() before this one and wrapped both
    # inputs in new DataFrames; neither affected the result.
    pca = PCA(n_components=threshold, random_state=29)
    X_train_pca = pca.fit_transform(X_train)
    X_test_pca = pca.transform(X_test)

    return X_train_pca,X_test_pca

    
    

#main function, takes hyperparameters options, model type and num_folds for k-fold
#tries all configurations with k-fold cross-validation on the training set
#tests the best configuration on the test set

def _prepare_split(dfs, window_size, test_track, threshold_pca, stage):
    """Load one window size, standardise it and optionally project it with PCA.

    ``stage`` only labels the progress message ("evaluation" or "test").
    Returns ``(X_train, y_train, X_test, y_test)`` ready to feed a model.
    """
    X_train, y_train, X_test, y_test = window_and_split(dfs, window_size, test_track, True)

    # The scaler is fitted on the training rows only and then applied to the
    # test rows, so no test statistics are used during fitting.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # threshold_pca == 1 is the "no PCA" sentinel: keep the scaled features.
    if threshold_pca != 1:
        print(f"apply pca for {stage}")
        X_train, X_test = apply_PCA(X_train, X_test, threshold_pca)

    return X_train, y_train, X_test, y_test


def tuning_and_evaluation(dfs,window_sizes,model_type,num_folds,test_track=3,threshold_pca=1):
    """Pick the best window size by cross-validation, then score it on the test track.

    For every window size the pre-windowed data is loaded, standardised and,
    unless ``threshold_pca`` is 1, reduced with PCA; the training part is then
    scored with ``evaluate_model``. The window size with the highest mean
    accuracy is finally refitted on the full training set and scored on the
    test set with ``test_model``.

    Args:
        dfs: forwarded to ``window_and_split`` (the original body read the
            global ``datasets_reduced`` instead of this parameter).
        window_sizes: iterable of window sizes to try; must not be empty.
        model_type: identifier forwarded to ``create_model``.
        num_folds: number of cross-validation folds.
        test_track: track held out for testing.
        threshold_pca: forwarded to ``apply_PCA``; the value 1 disables PCA.

    Returns:
        The test-set accuracy of the best window size.

    Raises:
        ValueError: if ``window_sizes`` is empty.
    """
    window_sizes = list(window_sizes)
    if not window_sizes:
        raise ValueError("window_sizes must contain at least one window size")

    # Start below any possible accuracy so the first candidate always wins,
    # even if it scores 0.
    best_accuracy = float("-inf")
    best_window_size = None

    for window_size in window_sizes:
        print(f"--------------------")
        print(f"EVALUATE window_size: {window_size}")

        X_train, y_train, _, _ = _prepare_split(
            dfs, window_size, test_track, threshold_pca, "evaluation"
        )
        if threshold_pca != 1:
            print(f"dataset has {X_train.shape} ")

        accuracy = evaluate_model(X_train, y_train, model_type, num_folds, test_track)

        # Keep the first window size that reaches the highest accuracy.
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_window_size = window_size

    print(f"Best window size: {best_window_size} with accuracy: {best_accuracy}")
    print("test best model on TEST data")

    # The winning configuration is reloaded rather than cached, so only one
    # window size's data is held in memory at a time.
    X_train, y_train, X_test, y_test = _prepare_split(
        dfs, best_window_size, test_track, threshold_pca, "test"
    )

    return test_model(X_train, y_train, X_test, y_test, model_type, test_track)


In [None]:
#define train_test splits, based on the track we define in test_track. 
#we use test_track for testing and the other two tracks for training
#-->returns X_train,y_train,X_test,y_test as numpy arrays

def train_test_my_split(dfs,test_track,numpy_conversion=True):
    """Build shuffled train/test sets, holding out one track for testing.

    The DataFrames in ``dfs`` are addressed by the position of their key in
    ``dfs.keys()``. Positions 0/3/6 are held out for track 1, 1/4/7 for
    track 2 and 2/5/8 for track 3; the remaining six positions form the
    training set.
    NOTE(review): this assumes ``dfs`` holds nine entries ordered so that
    position ``i`` belongs to track ``i % 3 + 1`` -- confirm against the code
    that builds the dictionary.

    Args:
        dfs: mapping whose values are DataFrames with the target in the last
            column and the features in all preceding columns.
        test_track: track used for testing; must be 1, 2 or 3.
        numpy_conversion: when True return numpy arrays, otherwise return
            pandas objects (DataFrame for features, Series for targets).

    Returns:
        ``(X_train, y_train, X_test, y_test)``.

    Raises:
        ValueError: if ``test_track`` is not 1, 2 or 3.
    """
    test_positions_by_track = {1: [0, 3, 6], 2: [1, 4, 7], 3: [2, 5, 8]}
    if test_track not in test_positions_by_track:
        # Previously an unknown track fell through the if/elif chain and
        # crashed later with an UnboundLocalError on train_indices.
        raise ValueError(f"test_track must be 1, 2 or 3, got {test_track!r}")

    test_indices = test_positions_by_track[test_track]
    train_indices = [i for i in range(9) if i not in test_indices]

    all_keys = list(dfs.keys())

    def _concat_and_shuffle(indices):
        # Stack the selected DataFrames, then shuffle the rows with a fixed
        # seed so the result is reproducible.
        stacked = pd.concat([dfs[all_keys[i]] for i in indices], axis=0, ignore_index=True)
        return stacked.sample(frac=1, random_state=42).reset_index(drop=True)

    def _split_features_target(frame):
        # Last column is the target, everything before it is a feature.
        features, target = frame.iloc[:, :-1], frame.iloc[:, -1]
        if numpy_conversion:
            return features.to_numpy(), target.to_numpy()
        return features, target

    X_train, y_train = _split_features_target(_concat_and_shuffle(train_indices))
    X_test, y_test = _split_features_target(_concat_and_shuffle(test_indices))

    return X_train,y_train,X_test,y_test

#function for creating the model based on the parameter type
#--> returns the model
def create_model(type):
    """Return a new, unfitted classifier for the given model identifier.

    Args:
        type: one of ``"RandomForest"``, ``"SVM"`` or ``"lr"``. The parameter
            name shadows the builtin ``type``; it is kept so existing keyword
            callers keep working.

    Returns:
        A ``RandomForestClassifier``, ``SVC`` or ``LogisticRegression``
        configured with the fixed hyperparameters below.

    Raises:
        ValueError: if ``type`` is not a known identifier. Previously the
            function returned ``None`` and the failure only surfaced later
            when the caller tried to fit the model.
    """
    if type == "RandomForest":
        return RandomForestClassifier(n_estimators=100, random_state=42)
    if type == "SVM":
        return SVC(kernel="rbf", C=1.0)
    if type == "lr":
        return LogisticRegression(random_state=42, max_iter=1000)
    raise ValueError(
        f"Unknown model type {type!r}; expected 'RandomForest', 'SVM' or 'lr'"
    )

#test the model on the TEST set, takes as input the train/test arrays already prepared by the caller
#-->returns the accuracy on the test set

def test_model(X_train,y_train,X_test,y_test,model_type,test_track=3):
    """Fit a fresh model on the training data and score it on the test data.

    Args:
        X_train, y_train: features and targets used for fitting.
        X_test, y_test: features and targets used for scoring.
        model_type: identifier forwarded to ``create_model``.
        test_track: accepted for signature compatibility; not used here.

    Returns:
        The accuracy on the test set, which is also printed.
    """
    classifier = create_model(model_type)
    classifier.fit(X_train, y_train)

    # Score the fitted model on the held-out data.
    predictions = classifier.predict(X_test)
    score = accuracy_score(y_test, predictions)

    print(f"Test set accuracy: {score:.4f}")
    return score
    

# function for cross-fold evaluation, with num_folds folds, taken as a parameter
#--> returns average accuracy for the specific hyperparameters configuration defined as input

def evaluate_model(X_train,y_train,model_type,num_folds,test_track=3):
    """Estimate accuracy with shuffled k-fold cross-validation.

    Args:
        X_train: training features (numpy array or pandas DataFrame).
        y_train: training targets (numpy array or pandas Series).
        model_type: identifier forwarded to ``create_model``.
        num_folds: number of folds passed to ``KFold``.
        test_track: accepted for signature compatibility; not used here.

    Returns:
        The mean validation accuracy over all folds. Each fold's accuracy
        and the mean are also printed.
    """
    # KFold yields positional indices. Indexing a DataFrame with them would be
    # interpreted as column selection, so convert pandas inputs to arrays.
    if hasattr(X_train, "to_numpy"):
        X_train = X_train.to_numpy()
    if hasattr(y_train, "to_numpy"):
        y_train = y_train.to_numpy()

    model = create_model(model_type)

    # Fixed seed keeps the fold assignment reproducible between runs.
    kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

    fold_accuracies = []
    for i, (train_index, val_index) in enumerate(kf.split(X_train)):
        # The same estimator object is refitted on each fold's training part.
        model.fit(X_train[train_index], y_train[train_index])
        y_pred = model.predict(X_train[val_index])

        accuracy = accuracy_score(y_train[val_index], y_pred)
        print(f'fold {i} accuracy:', accuracy)
        fold_accuracies.append(accuracy)

    average_accuracy = sum(fold_accuracies) / len(fold_accuracies)
    print('average of folds',average_accuracy)

    return average_accuracy

#intermediate function, used for:
#loading the pre-windowed data saved for the given window size
#-->returns X_train,y_train,X_test,y_test based on track defined in test_track


def window_and_split(dfs,window_size,test_track=3,numpy_conversion=True):
    """Load the pickled windowed data for ``window_size`` and split it by track.

    The data is read from ``dfs_windowed_<window_size>.pkl`` in the current
    working directory. ``dfs`` is accepted but not used: nothing is windowed
    here, the pickle is expected to exist already.

    Returns:
        ``(X_train, y_train, X_test, y_test)`` as produced by
        ``train_test_my_split``.
    """
    pickle_path = f"dfs_windowed_{window_size}.pkl"
    # NOTE: pickle.load can execute arbitrary code while deserialising, so
    # only files from a trusted source should be placed at this path.
    with open(pickle_path, "rb") as handle:
        windowed = pickle.load(handle)

    return train_test_my_split(windowed, test_track, numpy_conversion)

#receives X_train and X_test ALREADY SCALED  and returns pca datasets, as numpy arrays.
def apply_PCA(X_train,X_test,threshold):
    """Fit PCA on the training features and project both sets with it.

    The inputs are used as given; no scaling is done here.

    Args:
        X_train: training features; PCA is fitted on these only.
        X_test: test features, transformed with the PCA fitted on X_train.
        threshold: forwarded unchanged as ``n_components`` to
            ``sklearn.decomposition.PCA``.

    Returns:
        ``(X_train_pca, X_test_pca)``.
    """
    # The original built a throwaway PCA() before this one and wrapped both
    # inputs in new DataFrames; neither affected the result.
    pca = PCA(n_components=threshold, random_state=29)
    X_train_pca = pca.fit_transform(X_train)
    X_test_pca = pca.transform(X_test)

    return X_train_pca,X_test_pca

    
    

#main function, takes hyperparameters options, model type and num_folds for k-fold
#tries all configurations with k-fold cross-validation on the training set
#tests the best configuration on the test set

def _prepare_split(dfs, window_size, test_track, threshold_pca, stage):
    """Load one window size, standardise it and optionally project it with PCA.

    ``stage`` only labels the progress message ("evaluation" or "test").
    Returns ``(X_train, y_train, X_test, y_test)`` ready to feed a model.
    """
    X_train, y_train, X_test, y_test = window_and_split(dfs, window_size, test_track, True)

    # The scaler is fitted on the training rows only and then applied to the
    # test rows, so no test statistics are used during fitting.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # threshold_pca == 1 is the "no PCA" sentinel: keep the scaled features.
    if threshold_pca != 1:
        print(f"apply pca for {stage}")
        X_train, X_test = apply_PCA(X_train, X_test, threshold_pca)

    return X_train, y_train, X_test, y_test


def tuning_and_evaluation(dfs,window_sizes,model_type,num_folds,test_track=3,threshold_pca=1):
    """Pick the best window size by cross-validation, then score it on the test track.

    For every window size the pre-windowed data is loaded, standardised and,
    unless ``threshold_pca`` is 1, reduced with PCA; the training part is then
    scored with ``evaluate_model``. The window size with the highest mean
    accuracy is finally refitted on the full training set and scored on the
    test set with ``test_model``.

    Args:
        dfs: forwarded to ``window_and_split`` (the original body read the
            global ``datasets_reduced`` instead of this parameter).
        window_sizes: iterable of window sizes to try; must not be empty.
        model_type: identifier forwarded to ``create_model``.
        num_folds: number of cross-validation folds.
        test_track: track held out for testing.
        threshold_pca: forwarded to ``apply_PCA``; the value 1 disables PCA.

    Returns:
        The test-set accuracy of the best window size.

    Raises:
        ValueError: if ``window_sizes`` is empty.
    """
    window_sizes = list(window_sizes)
    if not window_sizes:
        raise ValueError("window_sizes must contain at least one window size")

    # Start below any possible accuracy so the first candidate always wins,
    # even if it scores 0.
    best_accuracy = float("-inf")
    best_window_size = None

    for window_size in window_sizes:
        print(f"--------------------")
        print(f"EVALUATE window_size: {window_size}")

        X_train, y_train, _, _ = _prepare_split(
            dfs, window_size, test_track, threshold_pca, "evaluation"
        )
        if threshold_pca != 1:
            print(f"dataset has {X_train.shape} ")

        accuracy = evaluate_model(X_train, y_train, model_type, num_folds, test_track)

        # Keep the first window size that reaches the highest accuracy.
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_window_size = window_size

    print(f"Best window size: {best_window_size} with accuracy: {best_accuracy}")
    print("test best model on TEST data")

    # The winning configuration is reloaded rather than cached, so only one
    # window size's data is held in memory at a time.
    X_train, y_train, X_test, y_test = _prepare_split(
        dfs, best_window_size, test_track, threshold_pca, "test"
    )

    return test_model(X_train, y_train, X_test, y_test, model_type, test_track)


## IMPORT THE DATA 

In [3]:
# Load datasets_reduced, which is the temporal data truncated
# NOTE: pickle.load can execute arbitrary code while deserialising, so
# datasets_reduced.pkl must come from a trusted source.
with open("datasets_reduced.pkl", "rb") as file:
    # Binds the unpickled object to the module-level name that the
    # experiment cell below passes to tuning_and_evaluation.
    datasets_reduced = pickle.load(file)

## TEST THE MODEL 

In [10]:
import sys
import io
from IPython.core.interactiveshell import InteractiveShell
import time

# Experiment driver: for each PCA threshold, tune the window size with
# cross-validation and record the test accuracy of the best configuration.
model_type="RandomForest"
# NOTE(review): this assignment was garbled in the source ("window_sizes500]").
# The list is reconstructed from the recorded output (window sizes 50, 100 and
# 200 were evaluated, in that order) and the surviving "500]" -- confirm that
# no other values belong between 200 and 500.
window_sizes=[50,100,200,500]
num_folds = 5
test_track=3
thresholds_pca=[0.6,0.7,0.8,0.9,0.95]
final_results=[]  # test accuracy of the best window size, one per threshold

for threshold in thresholds_pca:
    # perf_counter is monotonic, so the elapsed time is not affected by
    # system clock adjustments during a long run.
    start_time = time.perf_counter()
    print(f"----------------------")
    print(f"PCA threshold: {threshold}")
    final_results.append(tuning_and_evaluation(datasets_reduced,window_sizes,model_type,num_folds,test_track,threshold))
    end_time = time.perf_counter()
    elapsed_time = end_time - start_time
    print(f"Time taken for PCA threshold {threshold}: {elapsed_time:.2f} seconds")


----------------------
PCA threshold: 0.6
--------------------
EVALUATE window_size: 50
apply pca for evaluation
dataset has (359100, 6) 
fold 0 accuracy: 0.8293233082706767
fold 1 accuracy: 0.8311473127262601
fold 2 accuracy: 0.8304511278195489
fold 3 accuracy: 0.8296157059314954
fold 4 accuracy: 0.8299359509885825
average of folds 0.8300946811473127
--------------------
EVALUATE window_size: 100
apply pca for evaluation
dataset has (358800, 5) 
fold 0 accuracy: 0.9025083612040133
fold 1 accuracy: 0.8997491638795987
fold 2 accuracy: 0.9
fold 3 accuracy: 0.9009197324414716
fold 4 accuracy: 0.9023411371237459
average of folds 0.901103678929766
--------------------
EVALUATE window_size: 200
apply pca for evaluation
dataset has (358200, 5) 
fold 0 accuracy: 0.9449609156895589
fold 1 accuracy: 0.9441792294807371
fold 2 accuracy: 0.9431323283082077
fold 3 accuracy: 0.9439838079285315
fold 4 accuracy: 0.941247906197655
average of folds 0.9435008375209379
--------------------
EVALUATE window_

KeyboardInterrupt: 