## TRAIN THE MODEL AND EVALUATE IT 


In [2]:
# Importing libraries for code
import numpy as np
import json
import pickle
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score, TimeSeriesSplit, KFold, GridSearchCV, ParameterGrid
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier  
from sklearn.metrics import make_scorer, accuracy_score, classification_report 
from sklearn.svm import SVC 
from sklearn.decomposition import PCA 
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.datasets import make_classification

## DEFINE FUNCTIONS

In [5]:
#define train_test splits, based on the track we define in test_track. 
#we use test_track for testing and the other two tracks for training
#-->returns X_train,y_train,X_test,y_test as numpy arrays


def train_test_my_split(dfs,test_track,numpy_conversion=True):
    """Build shuffled train/test sets by holding out one track.

    The first nine entries of ``dfs`` (in key insertion order) are used.
    Positions 0, 3, 6 are treated as track 1, positions 1, 4, 7 as track 2
    and positions 2, 5, 8 as track 3. The frames of ``test_track`` form the
    test set; the remaining six frames form the training set.

    Args:
        dfs: Mapping whose values are DataFrames. In every frame the last
            column is used as the target and all other columns as features.
        test_track: Track to hold out for testing; must be 1, 2 or 3.
        numpy_conversion: If True (default) return numpy arrays, otherwise
            return pandas objects (DataFrame for X, Series for y).

    Returns:
        Tuple ``(X_train, y_train, X_test, y_test)``.

    Raises:
        ValueError: If ``test_track`` is not 1, 2 or 3.
    """
    # Positions (within the ordered keys of dfs) held out for each track.
    holdout_by_track = {1: [0, 3, 6], 2: [1, 4, 7], 3: [2, 5, 8]}
    if test_track not in holdout_by_track:
        raise ValueError(
            f"test_track must be 1, 2 or 3, got {test_track!r}"
        )

    test_indices = holdout_by_track[test_track]
    train_indices = [i for i in range(9) if i not in test_indices]

    all_keys = list(dfs.keys())

    train_df = pd.concat(
        [dfs[all_keys[i]] for i in train_indices], axis=0, ignore_index=True
    )
    test_df = pd.concat(
        [dfs[all_keys[i]] for i in test_indices], axis=0, ignore_index=True
    )

    # Shuffle both sets with a fixed seed so the row order is reproducible.
    train_df = train_df.sample(frac=1, random_state=42).reset_index(drop=True)
    test_df = test_df.sample(frac=1, random_state=42).reset_index(drop=True)

    # Split features (all columns but the last) and target (last column).
    X_train, y_train = train_df.iloc[:, :-1], train_df.iloc[:, -1]
    X_test, y_test = test_df.iloc[:, :-1], test_df.iloc[:, -1]

    if numpy_conversion:
        return (
            X_train.to_numpy(),
            y_train.to_numpy(),
            X_test.to_numpy(),
            y_test.to_numpy(),
        )

    return X_train,y_train,X_test,y_test

#function for creating the model based on the parameter type
#--> returns the model
def create_model(type):
    """Return a new, unfitted classifier for the given model name.

    Args:
        type: Model name; one of ``"RandomForest"``, ``"SVM"`` or ``"lr"``.
            (The parameter name shadows the builtin ``type``; it is kept
            so existing keyword callers keep working.)

    Returns:
        A ``RandomForestClassifier`` (100 trees, seed 42), an RBF-kernel
        ``SVC`` with ``C=1.0``, or a ``LogisticRegression`` (seed 42,
        ``max_iter=1000``).

    Raises:
        ValueError: If ``type`` is not one of the supported names.
    """
    if type == "RandomForest":
        return RandomForestClassifier(n_estimators=100, random_state=42)
    if type == "SVM":
        return SVC(kernel="rbf", C=1.0)
    if type == "lr":
        return LogisticRegression(random_state=42, max_iter=1000)

    # Fail loudly instead of returning None, which would only surface later
    # as an AttributeError on ``model.fit``.
    raise ValueError(
        f"Unknown model type {type!r}; expected 'RandomForest', 'SVM' or 'lr'"
    )

#test the model on the TEST set, take as input the NON-WINDOWED datasets
#-->returns the accuracy on the test set

def test_model(X_train,y_train,X_test,y_test,model_type,test_track=3):
    """Fit a fresh model on the training data and score it on the test data.

    Args:
        X_train, y_train: Training features and targets.
        X_test, y_test: Test features and targets.
        model_type: Model name forwarded to ``create_model``.
        test_track: Accepted for interface compatibility; not used here.

    Returns:
        Accuracy of the fitted model on the test set (also printed).
    """
    classifier = create_model(model_type)
    classifier.fit(X_train, y_train)

    # Score the predictions for the held-out test set.
    predictions = classifier.predict(X_test)
    test_accuracy = accuracy_score(y_test, predictions)

    print(f"Test set accuracy: {test_accuracy:.4f}")
    return test_accuracy
    

# function for cross-fold evaluation, with num_folds folds, taken as a parameter
#--> returns average accuracy for the specific hyperparameters configuration defined as input

def evaluate_model(X_train,y_train,model_type,num_folds,test_track=3):
    """Estimate accuracy with shuffled k-fold cross-validation.

    Args:
        X_train, y_train: Training features and targets; they are indexed
            with integer index arrays, as numpy arrays allow.
        model_type: Model name forwarded to ``create_model``.
        num_folds: Number of folds for ``KFold``.
        test_track: Accepted for interface compatibility; not used here.

    Returns:
        Mean accuracy over the folds. Per-fold accuracies and the mean are
        printed as well.
    """
    estimator = create_model(model_type)

    # Shuffled folds with a fixed seed so the splits are reproducible.
    splitter = KFold(n_splits=num_folds, shuffle=True, random_state=42)

    fold_accuracies = []
    for fold_no, (fit_rows, held_rows) in enumerate(splitter.split(X_train)):
        X_fit, X_held = X_train[fit_rows], X_train[held_rows]
        y_fit, y_held = y_train[fit_rows], y_train[held_rows]

        # The same estimator object is re-fitted on every fold.
        estimator.fit(X_fit, y_fit)
        held_predictions = estimator.predict(X_held)

        fold_score = accuracy_score(y_held, held_predictions)
        print(f'fold {fold_no} accuracy:', fold_score)
        fold_accuracies.append(fold_score)

    average_accuracy = sum(fold_accuracies) / num_folds
    print('average of folds',average_accuracy)

    return average_accuracy

#intermediate function, used for: 
#windowing based on the window size
#-->returns X_train,y_train,X_test,y_test based on track defined in test_track


def window_and_split(dfs,window_size,test_track=3,numpy_conversion=True):
    """Load the pre-windowed datasets for ``window_size`` and split them.

    Reads ``dfs_windowed_{window_size}.pkl`` from the current working
    directory and hands its content to ``train_test_my_split``.

    Args:
        dfs: Accepted for interface compatibility; not used, because the
            windowed data is read from disk instead.
        window_size: Window size used to pick the pickle file to load.
        test_track: Track held out for testing (forwarded).
        numpy_conversion: Whether to return numpy arrays (forwarded).

    Returns:
        ``(X_train, y_train, X_test, y_test)`` as produced by
        ``train_test_my_split``.
    """
    pickle_name = f"dfs_windowed_{window_size}.pkl"

    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(pickle_name, "rb") as handle:
        windowed = pickle.load(handle)

    return train_test_my_split(windowed, test_track, numpy_conversion)


#receives X_train and X_test ALREADY SCALED  and returns pca datasets, as numpy arrays.
def apply_PCA(X_train,X_test,threshold):
    """Fit PCA on the training features and project both sets.

    The PCA is fitted on ``X_train`` only and then applied to ``X_test``,
    so no test-set information enters the projection.

    Args:
        X_train: Training features (expected to be already scaled by the
            caller; no scaling is done here).
        X_test: Test features, scaled with the same scaler as ``X_train``.
        threshold: Passed to ``PCA`` as ``n_components``. Per the
            scikit-learn documentation, a float in (0, 1) keeps as many
            components as needed to explain that fraction of the variance.

    Returns:
        ``(X_train_pca, X_test_pca)`` as returned by ``fit_transform`` /
        ``transform``.
    """
    X_train_scaled = pd.DataFrame(X_train)
    X_test_scaled = pd.DataFrame(X_test)

    # Single PCA instance; the previous throw-away ``PCA()`` construction
    # was overwritten immediately and has been removed.
    pca = PCA(n_components=threshold, random_state=29)
    X_train_pca = pca.fit_transform(X_train_scaled)
    X_test_pca = pca.transform(X_test_scaled)

    return X_train_pca,X_test_pca

def save_results_to_csv(results, best_params, test_accuracy,model_type,pca_threshold):
    """Write the grid-search results and the final test accuracy to a CSV.

    The file is named
    ``grid_search_results_with_test_accuracy_{model_type}_{pca_threshold}.csv``
    and is written to the current working directory with ``;`` as the
    separator.

    Args:
        results: Sequence of ``(params, mean_accuracy)`` pairs.
        best_params: Best parameter set; used in the name of the
            test-accuracy column (``"Test on {best_params}"``).
        test_accuracy: Accuracy of the best configuration on the test set.
        model_type: Model name, used in the output file name.
        pca_threshold: PCA threshold, used in the output file name.
    """
    results_df = pd.DataFrame(results, columns=['Parameters', 'Mean Accuracy'])

    # Single-row frame: after the column-wise concat the test accuracy sits
    # in the first row of its own column and the other rows are empty.
    test_accuracy_df = pd.DataFrame([test_accuracy], columns=[f'Test on {best_params}'])

    final_df = pd.concat([results_df, test_accuracy_df], axis=1)

    output_path = f'grid_search_results_with_test_accuracy_{model_type}_{pca_threshold}.csv'
    final_df.to_csv(output_path, index=False, sep =";")

    # Report the file that was actually written (the old message printed a
    # different, fixed name).
    print(f"Results saved to '{output_path}'")



#main function, takes hyperparameters options, model type and num_folds for k-fold
#tries all configurations on the evaluation set
#tests the best configuration on the test set

def apply_grid_search(X_train,y_train,X_test,y_test,model_type,num_folds):
    """Run a manual grid search with cross-validation, then test the winner.

    Every parameter combination is scored with ``cross_val_score``
    (accuracy, ``num_folds`` folds) on the training data. The best
    combination is refitted on the whole training set and evaluated once
    on the test set.

    Args:
        X_train, y_train: Training features and targets.
        X_test, y_test: Test features and targets.
        model_type: ``"lr"`` (polynomial features + scaler + logistic
            regression) or ``"RandomForest"``.
        num_folds: Number of cross-validation folds, passed as ``cv``.

    Returns:
        Tuple ``(results, best_params, test_accuracy)`` where ``results``
        is a list of ``(params, mean_cv_accuracy)`` pairs sorted by
        accuracy in descending order.

    Raises:
        ValueError: If ``model_type`` is not supported.
    """
    if model_type == "lr":
        pipeline = Pipeline([
            ('poly', PolynomialFeatures()),   # Add polynomial features
            ('scaler', StandardScaler()),     # Standardize features
            ('logreg', LogisticRegression()), # Logistic Regression model
        ])

        param_grid = {
            'poly__degree': [3],                   # Degrees of polynomial features
            'logreg__C': [0.1, 1, 10, 100],        # Regularization strength (inverse of lambda)
            #'logreg__solver': ['lbfgs'],
            'logreg__max_iter': [100, 200, 500],   # Maximum iterations
        }
    elif model_type == "RandomForest":
        pipeline = Pipeline([
            #('scaler', StandardScaler()),   # Feature scaling (optional for Random Forest)
            ('rf', RandomForestClassifier(random_state=42)),
        ])

        param_grid = {
            'rf__n_estimators': [50, 200],         # Number of trees in the forest
            'rf__max_depth': [None, 30],           # Maximum depth of the trees
            'rf__min_samples_split': [2, 10],      # Minimum samples required to split a node
            #'rf__min_samples_leaf': [1, 2, 4],
            'rf__bootstrap': [True, False],        # Whether bootstrap samples are used
            'rf__max_features': ['sqrt', 'log2'],  # Features considered per split
        }
    else:
        # Previously this fell through and crashed on the unbound
        # ``pipeline`` / ``param_grid`` names.
        raise ValueError(
            f"Unsupported model_type {model_type!r}; expected 'lr' or 'RandomForest'"
        )

    # Manual loop (instead of GridSearchCV) so tqdm can show progress.
    results = []
    param_combinations = list(ParameterGrid(param_grid))

    for params in tqdm(param_combinations, desc="Grid Search Progress"):
        pipeline.set_params(**params)
        # Honour the caller's fold count (it used to be hard-coded to 5).
        scores = cross_val_score(
            pipeline, X_train, y_train, cv=num_folds, scoring='accuracy', n_jobs=-1
        )
        results.append((params, np.mean(scores)))

    # Best configuration first.
    results = sorted(results, key=lambda x: x[1], reverse=True)

    best_params = results[0][0]

    print("\nBest Parameters:", results[0][0])
    print("Best Score:", results[0][1])

    # Refit the best configuration on the entire training data.
    pipeline.set_params(**best_params)
    pipeline.fit(X_train, y_train)

    # Single evaluation on the held-out test set.
    y_pred = pipeline.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_pred)
    print("Test Accuracy:", test_accuracy)

    return results,best_params,test_accuracy



## IMPORT THE DATA 

In [6]:
# Load datasets_reduced (described by the author as the truncated temporal
# data) from a pickle file in the current working directory.
# NOTE: unpickling can execute arbitrary code; only load trusted files.
with open("datasets_reduced.pkl", "rb") as file:
    datasets_reduced = pickle.load(file)

## TEST THE MODEL 

In [7]:
import sys
import io
from IPython.core.interactiveshell import InteractiveShell
import time

# Experiment configuration: model to tune, window sizes and PCA thresholds
# to sweep, number of CV folds, and the track held out for testing.
model_type="RandomForest"
window_sizes=[1000]
num_folds = 5
test_track=3
thresholds_pca=[0.6]

# Run one grid search per (window size, PCA threshold) combination.
for window_size in window_sizes:
    for threshold_pca in thresholds_pca:
        print(f"----------------------")
        print(f"PCA threshold: {threshold_pca}")
        print(f"Window size: {window_size}")

        # window_and_split reads dfs_windowed_{window_size}.pkl from disk;
        # the datasets_reduced argument is not used by that function.
        X_train,y_train,X_test,y_test=window_and_split(datasets_reduced,window_size,test_track,True)

        #scale values: the scaler is fitted on the training data only and
        #then applied to the test data
        scaler=StandardScaler()
        X_train_scaled=scaler.fit_transform(X_train)
        X_test_scaled=scaler.transform(X_test)

        #apply PCA (fitted on the scaled training data inside apply_PCA)
        X_train_pca,X_test_pca=apply_PCA(X_train_scaled,X_test_scaled,threshold_pca)


        # Time the whole grid search, including the final test evaluation.
        start_time = time.time()
        final_results,best_params,test_accuracy=apply_grid_search(X_train_pca,y_train,X_test_pca,y_test,model_type,num_folds)
        end_time = time.time()
        elapsed_time = end_time - start_time
        print(f"Time taken for PCA threshold {threshold_pca}: {elapsed_time:.2f} seconds")
        # Call the function to save results to a CSV file
        save_results_to_csv(final_results, best_params, test_accuracy,model_type,threshold_pca)


----------------------
PCA threshold: 0.6
Window size: 1000


Grid Search Progress: 100%|██████████| 32/32 [6:02:26<00:00, 679.57s/it]   



Best Parameters: {'rf__bootstrap': False, 'rf__max_depth': None, 'rf__max_features': 'sqrt', 'rf__min_samples_split': 2, 'rf__n_estimators': 200}
Best Score: 0.9968845500848896
Test Accuracy: 0.6509918319719953
Time taken for PCA threshold 0.6: 22405.91 seconds
Results saved to 'grid_search_results_with_test_accuracy.csv'
