# Installazione delle Librerie

In [1]:
%pip install numpy pandas scikit-learn torch pytorch-tabnet pytorch-tabular




# Import e definizioni delle funzioni

In [2]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
# import torch
# from pytorch_tabnet.tab_model import TabNetRegressor
# from pytorch_tabular.models import TabTransformerModel
# from pytorch_tabular.config import ModelConfig, DataConfig, TrainerConfig
# from pytorch_tabular import TabularModel
import pickle

# Funzioni di utilità
def save_model(model, directory, filename):
    if not os.path.exists(directory):
        os.makedirs(directory)
    with open(os.path.join(directory, filename + '.pkl'), 'wb') as f:
        pickle.dump(model, f)

def cross_val_score_with_preprocessing(model, X, y, cv, scaler_type, use_pca, n_components):
    mse_scores, mae_scores, r2_scores = [], [], []
    for train_idx, val_idx in KFold(n_splits=cv, shuffle=True, random_state=89).split(X):
        X_train, X_val = X[train_idx], X[val_idx]
        y_train, y_val = y[train_idx], y[val_idx]
        
        # Scaling
        if scaler_type == 'Standard':
            scaler = StandardScaler().fit(X_train)
        elif scaler_type == 'MinMax':
            scaler = MinMaxScaler().fit(X_train)
        X_train_scaled = scaler.transform(X_train)
        X_val_scaled = scaler.transform(X_val)

        # PCA
        if use_pca:
            pca = PCA(n_components=n_components).fit(X_train_scaled)
            X_train_scaled = pca.transform(X_train_scaled)
            X_val_scaled = pca.transform(X_val_scaled)
        
        model.fit(X_train_scaled, y_train)
        y_pred = model.predict(X_val_scaled)
        mse_scores.append(mean_squared_error(y_val, y_pred))
        mae_scores.append(mean_absolute_error(y_val, y_pred))
        r2_scores.append(r2_score(y_val, y_pred))
        
    return np.mean(mse_scores), np.mean(mae_scores), np.mean(r2_scores)


## Functions definition

In [3]:
# random_state = 89

# def save_model(model, directory, filename):
#     if not os.path.exists(directory):
#         os.makedirs(directory)
#     joblib.dump(model, os.path.join(directory, f'{filename}.pkl'))

# def preprocess(X, scaler_type='standard', use_pca=False, n_components=None):
#     if scaler_type == 'standard':
#         scaler = StandardScaler()
#     elif scaler_type == 'minmax':
#         scaler = MinMaxScaler()
#     X_scaled = scaler.fit_transform(X)

#     pca = None
#     if use_pca and n_components:
#         pca = PCA(n_components=n_components, random_state=random_state)
#         X_scaled = pca.fit_transform(X_scaled)

#     return X_scaled, scaler, pca

# def cross_val_score_with_preprocessing(model, X, y, cv=5, scaler_type='standard', use_pca=False, n_components=None):
#     kf = KFold(n_splits=cv, shuffle=True, random_state=random_state)
#     mse_scores = []
#     mae_scores = []
#     r2_scores = []

#     for train_index, test_index in kf.split(X):
#         X_train, X_test = X[train_index], X[test_index]
#         y_train, y_test = y[train_index], y[test_index]

#         X_train_scaled, scaler, pca = preprocess(X_train, scaler_type=scaler_type, use_pca=use_pca, n_components=n_components)
#         X_test_scaled = scaler.transform(X_test)
#         if pca:
#             X_test_scaled = pca.transform(X_test_scaled)

#         model.fit(X_train_scaled, y_train)
#         y_pred = model.predict(X_test_scaled)

#         mse_scores.append(mean_squared_error(y_test, y_pred))
#         mae_scores.append(mean_absolute_error(y_test, y_pred))
#         r2_scores.append(r2_score(y_test, y_pred))

#     return np.mean(mse_scores), np.mean(mae_scores), np.mean(r2_scores)



# def grid_search_cv_with_preprocessing(model, param_grid, X, y, cv=5, scaler_type='standard', use_pca=False, n_components=None):
#     # Creazione del pipeline di preprocessing
#     steps = []
#     if scaler_type == 'standard':
#         steps.append(('scaler', StandardScaler()))
#     elif scaler_type == 'minmax':
#         steps.append(('scaler', MinMaxScaler()))
#     if use_pca:
#         steps.append(('pca', PCA(n_components=n_components)))

#     pipeline = Pipeline(steps + [('model', model)])
    
#     # Grid Search CV
#     grid_search = GridSearchCV(pipeline, param_grid, cv=cv, scoring='neg_mean_squared_error', n_jobs=-1)
#     grid_search.fit(X, y)
    
#     best_model = grid_search.best_estimator_
#     best_params = grid_search.best_params_
#     best_score = -grid_search.best_score_
    
#     return best_model, best_params, best_score




# def save_performance(model_name, scaler_type, use_pca, mse, mae, r2, mse_cv, mae_cv, r2_cv, filename='model_performance.csv'):
#     file_exists = os.path.isfile(filename)
#     with open(filename, mode='a', newline='') as file:
#         writer = csv.writer(file)
#         if not file_exists:
#             writer.writerow(['Model', 'Scaler', 'PCA', 'MSE', 'MAE', 'R2', 'CV_MSE', 'CV_MAE', 'CV_R2'])
#         writer.writerow([model_name, scaler_type, use_pca, mse, mae, r2, mse_cv, mae_cv, r2_cv])


# Caricamento dei Dati

In [15]:
# Configurazione dei modelli
preprocessing_options = {
    'LR': {'scaler_type': ['Standard', 'MinMax'], 'use_pca': [True, False]},
    'RF': {'scaler_type': ['Standard', 'MinMax'], 'use_pca': [True, False]},
    'KNR': {'scaler_type': ['Standard', 'MinMax'], 'use_pca': [True, False]},
    'SVR': {'scaler_type': ['Standard', 'MinMax'], 'use_pca': [True, False]},
    'FFNN': {'scaler_type': ['Standard', 'MinMax'], 'use_pca': [True, False]},
    'TabNet': {'scaler_type': ['Standard', 'MinMax'], 'use_pca': [False]},
    'TabTransformer': {'scaler_type': ['Standard', 'MinMax'], 'use_pca': [False]}
}

# CSV zip folder's path
csv_file_name = '../data.zip'
# loading data from csv
df = pd.read_csv(csv_file_name)

X = df.drop('Year', axis=1)
y = df['Year']

# Divisione in training e validation set
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=89)

# Creazione degli scaler e PCA per ogni combinazione di preprocessing
scalers = {}
pcas = {}

for model in preprocessing_options:
    for scaler_type in preprocessing_options[model]['scaler_type']:
        for use_pca in preprocessing_options[model]['use_pca']:
            key = f"{model}_{scaler_type}_{'PCA' if use_pca else 'NoPCA'}"
            if scaler_type == 'Standard':
                scaler = StandardScaler().fit(X_train)
            elif scaler_type == 'MinMax':
                scaler = MinMaxScaler().fit(X_train)
            scalers[key] = scaler
            if use_pca:
                pca = PCA(n_components=52).fit(scaler.transform(X_train))
                pcas[key] = pca
            else:
                pcas[key] = None

# Funzioni di Training

In [16]:
# Funzione per addestrare i modelli
def train_model(X_train_scaled, y_train, model_type):
    if model_type == 'LR':
        model = LinearRegression()
    elif model_type == 'RF':
        model = RandomForestRegressor(random_state=89)
    elif model_type == 'KNR':
        model = KNeighborsRegressor()
    elif model_type == 'SVR':
        model = SVR()
    return model.fit(X_train_scaled, y_train)

# Funzione per addestrare una rete neurale feed-forward
def train_ffnn(X_train_scaled, y_train, input_dim):
    class FFNN(torch.nn.Module):
        def __init__(self, input_dim):
            super(FFNN, self).__init__()
            self.fc1 = torch.nn.Linear(input_dim, 128)
            self.fc2 = torch.nn.Linear(128, 64)
            self.fc3 = torch.nn.Linear(64, 1)

        def forward(self, x):
            x = torch.relu(self.fc1(x))
            x = torch.relu(self.fc2(x))
            x = self.fc3(x)
            return x

    ffnn = FFNN(input_dim=input_dim)
    criterion = torch.nn.MSELoss()
    optimizer = torch.optim.Adam(ffnn.parameters(), lr=0.001)

    X_tensor = torch.tensor(X_train_scaled, dtype=torch.float32)
    y_tensor = torch.tensor(y_train.values, dtype=torch.float32).view(-1, 1)

    ffnn.train()
    for epoch in range(100):  # Adjust the number of epochs as needed
        optimizer.zero_grad()
        outputs = ffnn(X_tensor)
        loss = criterion(outputs, y_tensor)
        loss.backward()
        optimizer.step()

    return ffnn

# Funzione per addestrare TabNet
def train_tabnet(X_train_scaled, y_train, X_val_scaled, y_val):
    tabnet = TabNetRegressor()
    tabnet.fit(X_train_scaled, y_train, eval_set=[(X_val_scaled, y_val)], patience=10, max_epochs=100)
    return tabnet

# Funzione per addestrare TabTransformer
def train_tabtransformer(df_train, df_val):
    data_config = DataConfig(
        target=['target_column'],
        continuous_cols=df_train.columns.difference(['target_column']).tolist(),
    )

    model_config = ModelConfig(
        task="regression",
        metrics=["mean_squared_error", "mean_absolute_error", "r2_score"],
        metrics_params=[{}, {}, {}]
    )

    trainer_config = TrainerConfig(
        max_epochs=100,
        gpus=0
    )

    tabular_model = TabularModel(
        data_config=data_config,
        model_config=model_config,
        trainer_config=trainer_config
    )

    tabular_model.fit(train=df_train, validation=df_val)
    return tabular_model

# Preparazione del dizionario per salvare le performance
performance_dict = {
    'Model': [],
    'Scaler': [],
    'PCA': [],
    'MSE_Val': [],
    'MAE_Val': [],
    'R2_Val': [],
    'MSE_CV': [],
    'MAE_CV': [],
    'R2_CV': []
}


# Preprocessing e salvataggio dei risultati

In [6]:
# Esegui il preprocessing specifico per ogni modello con Standard Scaler e Min-Max Scaler
preprocessing_options = {
    'LR': {'scaler_type': ['standard', 'minmax'], 'use_pca': [True, False], 'n_components': 52},
    'RF': {'scaler_type': ['standard', 'minmax'], 'use_pca': [True, False], 'n_components': 52},
    'KNR': {'scaler_type': ['standard', 'minmax'], 'use_pca': [True, False], 'n_components': 52},
    'SVR': {'scaler_type': ['standard', 'minmax'], 'use_pca': [True, False], 'n_components': 52},
    'FFNN': {'scaler_type': ['standard', 'minmax'], 'use_pca': [True, False], 'n_components': 52},
    'TabNet': {'scaler_type': ['standard', 'minmax'], 'use_pca': [False], 'n_components': None},
    'TabTransformer': {'scaler_type': ['standard', 'minmax'], 'use_pca': [False], 'n_components': None},
}

# Funzione per eseguire il preprocessing e salvare i risultati
def preprocess_and_save(X_train, X_val, preprocessing_options):
    scalers = {}
    pcas = {}
    for clfName, options in preprocessing_options.items():
        for scaler_type in options['scaler_type']:
            for use_pca in options['use_pca']:
                key = f"{clfName}_{scaler_type}_{'PCA' if use_pca else 'NoPCA'}"
                X_train_scaled, scaler, pca = preprocess(X_train, scaler_type=scaler_type, use_pca=use_pca, n_components=options['n_components'])
                X_val_scaled = scaler.transform(X_val)
                if pca:
                    X_val_scaled = pca.transform(X_val_scaled)
                scalers[key] = scaler
                pcas[key] = pca
                directory = os.path.join('models', key.lower())
                save_model(scaler, directory, 'scaler')
                if pca:
                    save_model(pca, directory, 'pca')
    return scalers, pcas

# Esegui il preprocessing per tutte le combinazioni di scaler e PCA
scalers, pcas = preprocess_and_save(X_train, X_val, preprocessing_options)

# Training trad models

## All the models (DO NOT run this cell if you don't want your PC to crash or to explode)

In [None]:
# Addestra i modelli tradizionali con tutte le combinazioni di scaler e PCA
models = {}
cv_performance = {}
validation_performance = {}

for clfName in ['LR', 'RF', 'KNR', 'SVR']:
    for scaler_type in preprocessing_options[clfName]['scaler_type']:
        for use_pca in preprocessing_options[clfName]['use_pca']:
            key = f"{clfName}_{scaler_type}_{'PCA' if use_pca else 'NoPCA'}"
            X_train_scaled = scalers[key].transform(X_train)
            if pcas[key]:
                X_train_scaled = pcas[key].transform(X_train_scaled)
            
            model = train_model(X_train_scaled, y_train, model_type=clfName)
            models[key] = model
            save_model(model, f'model_{key.lower()}')

            # Valutazione su validation set
            X_val_scaled = scalers[key].transform(X_val)
            if pcas[key]:
                X_val_scaled = pcas[key].transform(X_val_scaled)
            y_pred = model.predict(X_val_scaled)
            mse = mean_squared_error(y_val, y_pred)
            mae = mean_absolute_error(y_val, y_pred)
            r2 = r2_score(y_val, y_pred)
            validation_performance[key] = {'mse': mse, 'mae': mae, 'r2': r2}
            print(f"Validation Performance of {clfName} with {scaler_type} scaler and PCA={use_pca}: MSE={mse}, MAE={mae}, R2={r2}")

            # Cross-validation
            mse_cv, mae_cv, r2_cv = cross_val_score_with_preprocessing(
                model, X.values, y, cv=5, scaler_type=scaler_type, use_pca=use_pca, n_components=preprocessing_options[clfName]['n_components']
            )
            cv_performance[key] = {'mse': mse_cv, 'mae': mae_cv, 'r2': r2_cv}
            print(f"Cross-Validation Performance of {clfName} with {scaler_type} scaler and PCA={use_pca}: MSE={mse_cv}, MAE={mae_cv}, R2={r2_cv}")


In [None]:
models = {}
performance = {}

## Training LR

In [17]:
clfName = 'LR'
for scaler_type in preprocessing_options[clfName]['scaler_type']:
    for use_pca in preprocessing_options[clfName]['use_pca']:
        key = f"{clfName}_{scaler_type}_{'PCA' if use_pca else 'NoPCA'}"
        X_train_scaled = scalers[key].transform(X_train)
        if pcas[key]:
            X_train_scaled = pcas[key].transform(X_train_scaled)

        model = train_model(X_train_scaled, y_train, model_type=clfName)
        directory = os.path.join('models', key.lower())
        save_model(model, directory, 'model')

        # Valutazione su validation set
        X_val_scaled = scalers[key].transform(X_val)
        if pcas[key]:
            X_val_scaled = pcas[key].transform(X_val_scaled)
        y_pred = model.predict(X_val_scaled)
        mse = mean_squared_error(y_val, y_pred)
        mae = mean_absolute_error(y_val, y_pred)
        r2 = r2_score(y_val, y_pred)
        print(f"Validation Performance of {clfName} with {scaler_type} scaler and {'PCA' if use_pca else 'NoPCA'}: MSE={mse}, MAE={mae}, R2={r2}")

        # Salvare le performance
        performance_dict['Model'].append(clfName)
        performance_dict['Scaler'].append(scaler_type)
        performance_dict['PCA'].append(use_pca)
        performance_dict['MSE_Val'].append(mse)
        performance_dict['MAE_Val'].append(mae)
        performance_dict['R2_Val'].append(r2)
        performance_dict['MSE_CV'].append(None)  # No cross-validation for Linear Regression
        performance_dict['MAE_CV'].append(None)
        performance_dict['R2_CV'].append(None)


Validation Performance of LR with Standard scaler and PCA: MSE=98.2318351539259, MAE=7.392438758148582, R2=0.12050186127982243
Validation Performance of LR with Standard scaler and NoPCA: MSE=85.58234834475938, MAE=6.668881118801984, R2=0.23375638907108864
Validation Performance of LR with MinMax scaler and PCA: MSE=87.21013510994777, MAE=6.73281651718109, R2=0.21918234158461758
Validation Performance of LR with MinMax scaler and NoPCA: MSE=85.58234834475938, MAE=6.668881118801991, R2=0.23375638907108864


## Training RF

In [18]:
clfName = 'RF'
for scaler_type in preprocessing_options[clfName]['scaler_type']:
    for use_pca in preprocessing_options[clfName]['use_pca']:
        key = f"{clfName}_{scaler_type}_{'PCA' if use_pca else 'NoPCA'}"
        X_train_scaled = scalers[key].transform(X_train)
        if pcas[key]:
            X_train_scaled = pcas[key].transform(X_train_scaled)

        model = train_model(X_train_scaled, y_train, model_type=clfName)
        directory = os.path.join('models', key.lower())
        save_model(model, directory, 'model')

        # Valutazione su validation set
        X_val_scaled = scalers[key].transform(X_val)
        if pcas[key]:
            X_val_scaled = pcas[key].transform(X_val_scaled)
        y_pred = model.predict(X_val_scaled)
        mse = mean_squared_error(y_val, y_pred)
        mae = mean_absolute_error(y_val, y_pred)
        r2 = r2_score(y_val, y_pred)
        print(f"Validation Performance of {clfName} with {scaler_type} scaler and {'PCA' if use_pca else 'NoPCA'}: MSE={mse}, MAE={mae}, R2={r2}")

        # Cross-validation
        mse_cv, mae_cv, r2_cv = cross_val_score_with_preprocessing(
            model, X.values, y.values, cv=5, scaler_type=scaler_type, use_pca=use_pca, n_components=52
        )
        print(f"Cross-Validation Performance of {clfName} with {scaler_type} scaler and {'PCA' if use_pca else 'NoPCA'}: MSE={mse_cv}, MAE={mae_cv}, R2={r2_cv}")

        # Salvare le performance
        performance_dict['Model'].append(clfName)
        performance_dict['Scaler'].append(scaler_type)
        performance_dict['PCA'].append(use_pca)
        performance_dict['MSE_Val'].append(mse)
        performance_dict['MAE_Val'].append(mae)
        performance_dict['R2_Val'].append(r2)
        performance_dict['MSE_CV'].append(mse_cv)
        performance_dict['MAE_CV'].append(mae_cv)
        performance_dict['R2_CV'].append(r2_cv)


Validation Performance of RF with Standard scaler and PCA: MSE=92.01214625373923, MAE=7.210540541101935, R2=0.17618854169825393
Cross-Validation Performance of RF with Standard scaler and PCA: MSE=90.85271300574892, MAE=7.169012839722981, R2=0.17553096742065438
Validation Performance of RF with Standard scaler and NoPCA: MSE=79.40827703444918, MAE=6.436896507055287, R2=0.2890346419636943
Cross-Validation Performance of RF with Standard scaler and NoPCA: MSE=78.4878556946283, MAE=6.400234070255443, R2=0.28774069750333975
Validation Performance of RF with MinMax scaler and PCA: MSE=81.65413718735886, MAE=6.573940451406099, R2=0.26892680399838764
Cross-Validation Performance of RF with MinMax scaler and PCA: MSE=80.79298888304402, MAE=6.526083350469959, R2=0.2668224632088366
Validation Performance of RF with MinMax scaler and NoPCA: MSE=79.42058209449787, MAE=6.438306572816495, R2=0.28892447118868336
Cross-Validation Performance of RF with MinMax scaler and NoPCA: MSE=78.52371315641918, M

## Training KNR

In [19]:
clfName = 'KNR'
for scaler_type in preprocessing_options[clfName]['scaler_type']:
    for use_pca in preprocessing_options[clfName]['use_pca']:
        key = f"{clfName}_{scaler_type}_{'PCA' if use_pca else 'NoPCA'}"
        X_train_scaled = scalers[key].transform(X_train)
        if pcas[key]:
            X_train_scaled = pcas[key].transform(X_train_scaled)

        model = train_model(X_train_scaled, y_train, model_type=clfName)
        directory = os.path.join('models', key.lower())
        save_model(model, directory, 'model')

        # Valutazione su validation set
        X_val_scaled = scalers[key].transform(X_val)
        if pcas[key]:
            X_val_scaled = pcas[key].transform(X_val_scaled)
        y_pred = model.predict(X_val_scaled)
        mse = mean_squared_error(y_val, y_pred)
        mae = mean_absolute_error(y_val, y_pred)
        r2 = r2_score(y_val, y_pred)
        print(f"Validation Performance of {clfName} with {scaler_type} scaler and {'PCA' if use_pca else 'NoPCA'}: MSE={mse}, MAE={mae}, R2={r2}")

        # Cross-validation
        mse_cv, mae_cv, r2_cv = cross_val_score_with_preprocessing(
            model, X.values, y.values, cv=5, scaler_type=scaler_type, use_pca=use_pca, n_components=52
        )
        print(f"Cross-Validation Performance of {clfName} with {scaler_type} scaler and {'PCA' if use_pca else 'NoPCA'}: MSE={mse_cv}, MAE={mae_cv}, R2={r2_cv}")

        # Salvare le performance
        performance_dict['Model'].append(clfName)
        performance_dict['Scaler'].append(scaler_type)
        performance_dict['PCA'].append(use_pca)
        performance_dict['MSE_Val'].append(mse)
        performance_dict['MAE_Val'].append(mae)
        performance_dict['R2_Val'].append(r2)
        performance_dict['MSE_CV'].append(mse_cv)
        performance_dict['MAE_CV'].append(mae_cv)
        performance_dict['R2_CV'].append(r2_cv)


found 0 physical cores < 1
  File "c:\Users\Gabriele\AppData\Local\Programs\Python\Python312\Lib\site-packages\joblib\externals\loky\backend\context.py", line 282, in _count_physical_cores
    raise ValueError(f"found {cpu_count_physical} physical cores < 1")


Validation Performance of KNR with Standard scaler and PCA: MSE=95.77045226529194, MAE=7.149356597600872, R2=0.14253933686845544
Cross-Validation Performance of KNR with Standard scaler and PCA: MSE=94.88380767324279, MAE=7.103034797263805, R2=0.1388833328796148
Validation Performance of KNR with Standard scaler and NoPCA: MSE=89.67204282740155, MAE=6.853474769505305, R2=0.19714016705119275
Cross-Validation Performance of KNR with Standard scaler and NoPCA: MSE=88.92059514226231, MAE=6.822715177951821, R2=0.19298139456120558
Validation Performance of KNR with MinMax scaler and PCA: MSE=85.17326063249726, MAE=6.556490532368397, R2=0.23741907012498586
Cross-Validation Performance of KNR with MinMax scaler and PCA: MSE=84.32230474868643, MAE=6.529097650441163, R2=0.23475225724794607
Validation Performance of KNR with MinMax scaler and NoPCA: MSE=84.66267790225041, MAE=6.556375532864084, R2=0.24199046554085202
Cross-Validation Performance of KNR with MinMax scaler and NoPCA: MSE=83.8903333

## Training SVR

In [1]:
clfName = 'SVR'
for scaler_type in preprocessing_options[clfName]['scaler_type']:
    for use_pca in preprocessing_options[clfName]['use_pca']:
        key = f"{clfName}_{scaler_type}_{'PCA' if use_pca else 'NoPCA'}"
        X_train_scaled = scalers[key].transform(X_train)
        if pcas[key]:
            X_train_scaled = pcas[key].transform(X_train_scaled)

        model = train_model(X_train_scaled, y_train, model_type=clfName)
        directory = os.path.join('models', key.lower())
        save_model(model, directory, 'model')

        # Valutazione su validation set
        X_val_scaled = scalers[key].transform(X_val)
        if pcas[key]:
            X_val_scaled = pcas[key].transform(X_val_scaled)
        y_pred = model.predict(X_val_scaled)
        mse = mean_squared_error(y_val, y_pred)
        mae = mean_absolute_error(y_val, y_pred)
        r2 = r2_score(y_val, y_pred)
        print(f"Validation Performance of {clfName} with {scaler_type} scaler and {'PCA' if use_pca else 'NoPCA'}: MSE={mse}, MAE={mae}, R2={r2}")

        # Cross-validation
        mse_cv, mae_cv, r2_cv = cross_val_score_with_preprocessing(
            model, X.values, y.values, cv=5, scaler_type=scaler_type, use_pca=use_pca, n_components=52
        )
        print(f"Cross-Validation Performance of {clfName} with {scaler_type} scaler and {'PCA' if use_pca else 'NoPCA'}: MSE={mse_cv}, MAE={mae_cv}, R2={r2_cv}")

        # Salvare le performance
        performance_dict['Model'].append(clfName)
        performance_dict['Scaler'].append(scaler_type)
        performance_dict['PCA'].append(use_pca)
        performance_dict['MSE_Val'].append(mse)
        performance_dict['MAE_Val'].append(mae)
        performance_dict['R2_Val'].append(r2)
        performance_dict['MSE_CV'].append(mse_cv)
        performance_dict['MAE_CV'].append(mae_cv)
        performance_dict['R2_CV'].append(r2_cv)


NameError: name 'preprocessing_options' is not defined

In [2]:
# CSV zip folder's path
csv_file_name = '../data.zip'
# loading data from csv
df = pd.read_csv(csv_file_name)

X = df.drop('Year', axis=1)
y = df['Year']

# Divisione in training e validation set
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=89)

NameError: name 'pd' is not defined

# Training FFNN

In [21]:
clfName = 'FFNN'
input_dim = X_train.shape[1]
for scaler_type in preprocessing_options[clfName]['scaler_type']:
    for use_pca in preprocessing_options[clfName]['use_pca']:
        key = f"{clfName}_{scaler_type}_{'PCA' if use_pca else 'NoPCA'}"
        X_train_scaled = scalers[key].transform(X_train)
        if pcas[key]:
            X_train_scaled = pcas[key].transform(X_train_scaled)

        model = train_ffnn(X_train_scaled, y_train, input_dim=X_train_scaled.shape[1])
        directory = os.path.join('models', key.lower())
        save_model(model, directory, 'model')

        # Valutazione su validation set
        X_val_scaled = scalers[key].transform(X_val)
        if pcas[key]:
            X_val_scaled = pcas[key].transform(X_val_scaled)
        X_val_tensor = torch.tensor(X_val_scaled, dtype=torch.float32)
        model.eval()
        with torch.no_grad():
            y_pred = model(X_val_tensor).numpy()
        mse = mean_squared_error(y_val, y_pred)
        mae = mean_absolute_error(y_val, y_pred)
        r2 = r2_score(y_val, y_pred)
        print(f"Validation Performance of {clfName} with {scaler_type} scaler and {'PCA' if use_pca else 'NoPCA'}: MSE={mse}, MAE={mae}, R2={r2}")

        # Cross-validation
        mse_cv, mae_cv, r2_cv = cross_val_score_with_preprocessing(
            model, X.values, y.values, cv=5, scaler_type=scaler_type, use_pca=use_pca, n_components=52
        )
        print(f"Cross-Validation Performance of {clfName} with {scaler_type} scaler and {'PCA' if use_pca else 'NoPCA'}: MSE={mse_cv}, MAE={mae_cv}, R2={r2_cv}")

        # Salvare le performance
        performance_dict['Model'].append(clfName)
        performance_dict['Scaler'].append(scaler_type)
        performance_dict['PCA'].append(use_pca)
        performance_dict['MSE_Val'].append(mse)
        performance_dict['MAE_Val'].append(mae)
        performance_dict['R2_Val'].append(r2)
        performance_dict['MSE_CV'].append(mse_cv)
        performance_dict['MAE_CV'].append(mae_cv)
        performance_dict['R2_CV'].append(r2_cv)


AttributeError: Can't pickle local object 'train_ffnn.<locals>.FFNN'

# Training TabNet

In [22]:
clfName = 'TabNet'
for scaler_type in preprocessing_options[clfName]['scaler_type']:
    for use_pca in preprocessing_options[clfName]['use_pca']:
        key = f"{clfName}_{scaler_type}_{'PCA' if use_pca else 'NoPCA'}"
        X_train_scaled = scalers[key].transform(X_train)
        X_val_scaled = scalers[key].transform(X_val)

        model = train_tabnet(X_train_scaled, y_train, X_val_scaled, y_val)
        directory = os.path.join('models', key.lower())
        save_model(model, directory, 'model')

        # Valutazione su validation set
        y_pred = model.predict(X_val_scaled)
        mse = mean_squared_error(y_val, y_pred)
        mae = mean_absolute_error(y_val, y_pred)
        r2 = r2_score(y_val, y_pred)
        print(f"Validation Performance of {clfName} with {scaler_type} scaler and {'PCA' if use_pca else 'NoPCA'}: MSE={mse}, MAE={mae}, R2={r2}")

        # Cross-validation
        mse_cv, mae_cv, r2_cv = cross_val_score_with_preprocessing(
            model, X.values, y.values, cv=5, scaler_type=scaler_type, use_pca=use_pca, n_components=52
        )
        print(f"Cross-Validation Performance of {clfName} with {scaler_type} scaler and {'PCA' if use_pca else 'NoPCA'}: MSE={mse_cv}, MAE={mae_cv}, R2={r2_cv}")

        # Salvare le performance
        performance_dict['Model'].append(clfName)
        performance_dict['Scaler'].append(scaler_type)
        performance_dict['PCA'].append(use_pca)
        performance_dict['MSE_Val'].append(mse)
        performance_dict['MAE_Val'].append(mae)
        performance_dict['R2_Val'].append(r2)
        performance_dict['MSE_CV'].append(mse_cv)
        performance_dict['MAE_CV'].append(mae_cv)
        performance_dict['R2_CV'].append(r2_cv)




ValueError: Targets should be 2D : (n_samples, n_regression) but y_train.shape=(201740,) given.
Use reshape(-1, 1) for single regression.

# Training TabTransformer

In [23]:
clfName = 'TabTransformer'
for scaler_type in preprocessing_options[clfName]['scaler_type']:
    for use_pca in preprocessing_options[clfName]['use_pca']:
        key = f"{clfName}_{scaler_type}_{'PCA' if use_pca else 'NoPCA'}"
        X_train_scaled = scalers[key].transform(X_train)
        X_val_scaled = scalers[key].transform(X_val)

        df_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train.columns)
        df_train_scaled['target_column'] = y_train.values
        df_val_scaled = pd.DataFrame(X_val_scaled, columns=X_val.columns)
        df_val_scaled['target_column'] = y_val.values

        model = train_tabtransformer(df_train_scaled, df_val_scaled)
        directory = os.path.join('models', key.lower())
        save_model(model, directory, 'model')

        # Valutazione su validation set
        y_pred = model.predict(X_val_scaled)
        mse = mean_squared_error(y_val, y_pred)
        mae = mean_absolute_error(y_val, y_pred)
        r2 = r2_score(y_val, y_pred)
        print(f"Validation Performance of {clfName} with {scaler_type} scaler and {'PCA' if use_pca else 'NoPCA'}: MSE={mse}, MAE={mae}, R2={r2}")

        # Cross-validation
        mse_cv, mae_cv, r2_cv = cross_val_score_with_preprocessing(
            model, X.values, y.values, cv=5, scaler_type=scaler_type, use_pca=use_pca, n_components=52
        )
        print(f"Cross-Validation Performance of {clfName} with {scaler_type} scaler and {'PCA' if use_pca else 'NoPCA'}: MSE={mse_cv}, MAE={mae_cv}, R2={r2_cv}")

        # Salvare le performance
        performance_dict['Model'].append(clfName)
        performance_dict['Scaler'].append(scaler_type)
        performance_dict['PCA'].append(use_pca)
        performance_dict['MSE_Val'].append(mse)
        performance_dict['MAE_Val'].append(mae)
        performance_dict['R2_Val'].append(r2)
        performance_dict['MSE_CV'].append(mse_cv)
        performance_dict['MAE_CV'].append(mae_cv)
        performance_dict['R2_CV'].append(r2_cv)


TypeError: TrainerConfig.__init__() got an unexpected keyword argument 'gpus'

# Validation

## All the models (DO NOT run this cell if you don't want your PC to crash or to explode)

In [None]:
# Valutazione dei modelli con tutte le combinazioni di scaler e PCA
# performance = {}

# Valutazione modelli tradizionali
for clfName in ['LR', 'RF', 'KNR', 'SVR']:
    for scaler_type in preprocessing_options[clfName]['scaler_type']:
        for use_pca in preprocessing_options[clfName]['use_pca']:
            key = f"{clfName}_{scaler_type}_{'PCA' if use_pca else 'NoPCA'}"
            X_val_scaled = scalers[key].transform(X_val)
            if pcas[key]:
                X_val_scaled = pcas[key].transform(X_val_scaled)
            y_pred = models[key].predict(X_val_scaled)
            mse = mean_squared_error(y_val, y_pred)
            mae = mean_absolute_error(y_val, y_pred)
            r2 = r2_score(y_val, y_pred)
            performance[key] = {'mse': mse, 'mae': mae, 'r2': r2}
            print(f"Performance of {clfName} with {scaler_type} scaler and PCA={use_pca}: MSE={mse}, MAE={mae}, R2={r2}")

# Valutazione FFNN
for scaler_type in preprocessing_options['FFNN']['scaler_type']:
    for use_pca in preprocessing_options['FFNN']['use_pca']:
        key = f"FFNN_{scaler_type}_{'PCA' if use_pca else 'NoPCA'}"
        X_val_scaled = scalers[key].transform(X_val)
        if pcas[key]:
            X_val_scaled = pcas[key].transform(X_val_scaled)
        X_val_tensor = torch.tensor(X_val_scaled, dtype=torch.float32)
        y_pred_tensor = models[key](X_val_tensor).detach().numpy().squeeze()
        mse = mean_squared_error(y_val, y_pred_tensor)
        mae = mean_absolute_error(y_val, y_pred_tensor)
        r2 = r2_score(y_val, y_pred_tensor)
        performance[key] = {'mse': mse, 'mae': mae, 'r2': r2}
        print(f"Performance of FFNN with {scaler_type} scaler and PCA={use_pca}: MSE={mse}, MAE={mae}, R2={r2}")

# Valutazione TabNet
for scaler_type in preprocessing_options['TabNet']['scaler_type']:
    key = f"TabNet_{scaler_type}_NoPCA"
    X_val_scaled = scalers[key].transform(X_val)
    y_pred = models[key].predict(X_val_scaled)
    mse = mean_squared_error(y_val, y_pred)
    mae = mean_absolute_error(y_val, y_pred)
    r2 = r2_score(y_val, y_pred)
    performance[key] = {'mse': mse, 'mae': mae, 'r2': r2}
    print(f"Performance of TabNet with {scaler_type} scaler: MSE={mse}, MAE={mae}, R2={r2}")

# Valutazione TabTransformer
for scaler_type in preprocessing_options['TabTransformer']['scaler_type']:
    key = f"TabTransformer_{scaler_type}_NoPCA"
    y_pred = models[key].predict(df_val).squeeze()
    mse = mean_squared_error(y_val, y_pred)
    mae = mean_absolute_error(y_val, y_pred)
    r2 = r2_score(y_val, y_pred)
    performance[key] = {'mse': mse, 'mae': mae, 'r2': r2}
    print(f"Performance of TabTransformer with {scaler_type} scaler: MSE={mse}, MAE={mae}, R2={r2}")

# Salva le performance dei modelli
with open('performance.txt', 'w') as f:
    for clfName, metrics in performance.items():
        f.write(f"{clfName}: MSE={metrics['mse']}, MAE={metrics['mae']}, R2={metrics['r2']}\n")


## Validation LR

In [15]:
# Valutazione dei modelli con tutte le combinazioni di scaler e PCA
# performance = {}

# Valutazione modelli tradizionali
for clfName in ['LR']:
    for scaler_type in preprocessing_options[clfName]['scaler_type']:
        for use_pca in preprocessing_options[clfName]['use_pca']:
            key = f"{clfName}_{scaler_type}_{'PCA' if use_pca else 'NoPCA'}"
            X_val_scaled = scalers[key].transform(X_val)
            if pcas[key]:
                X_val_scaled = pcas[key].transform(X_val_scaled)
            y_pred = models[key].predict(X_val_scaled)
            mse = mean_squared_error(y_val, y_pred)
            mae = mean_absolute_error(y_val, y_pred)
            r2 = r2_score(y_val, y_pred)
            performance[key] = {'mse': mse, 'mae': mae, 'r2': r2}
            print(f"Performance of {clfName} with {scaler_type} scaler and PCA={use_pca}: MSE={mse}, MAE={mae}, R2={r2}")


Performance of LR with standard scaler and PCA=True: MSE=98.2318351539259, MAE=7.392438758148582, R2=0.12050186127982243
Performance of LR with standard scaler and PCA=False: MSE=85.58234834475938, MAE=6.668881118801984, R2=0.23375638907108864
Performance of LR with minmax scaler and PCA=True: MSE=87.21013510994777, MAE=6.73281651718109, R2=0.21918234158461758
Performance of LR with minmax scaler and PCA=False: MSE=85.58234834475938, MAE=6.668881118801991, R2=0.23375638907108864


## Validation RF 

In [16]:
# Valutazione dei modelli con tutte le combinazioni di scaler e PCA
performance = {}

# Valutazione modelli tradizionali
for clfName in ['RF']:
    for scaler_type in preprocessing_options[clfName]['scaler_type']:
        for use_pca in preprocessing_options[clfName]['use_pca']:
            key = f"{clfName}_{scaler_type}_{'PCA' if use_pca else 'NoPCA'}"
            X_val_scaled = scalers[key].transform(X_val)
            if pcas[key]:
                X_val_scaled = pcas[key].transform(X_val_scaled)
            y_pred = models[key].predict(X_val_scaled)
            mse = mean_squared_error(y_val, y_pred)
            mae = mean_absolute_error(y_val, y_pred)
            r2 = r2_score(y_val, y_pred)
            performance[key] = {'mse': mse, 'mae': mae, 'r2': r2}
            print(f"Performance of {clfName} with {scaler_type} scaler and PCA={use_pca}: MSE={mse}, MAE={mae}, R2={r2}")


Performance of RF with standard scaler and PCA=True: MSE=92.01214625373923, MAE=7.210540541101935, R2=0.17618854169825393
Performance of RF with standard scaler and PCA=False: MSE=79.40827703444918, MAE=6.436896507055287, R2=0.2890346419636943
Performance of RF with minmax scaler and PCA=True: MSE=81.65413718735886, MAE=6.573940451406099, R2=0.26892680399838764
Performance of RF with minmax scaler and PCA=False: MSE=79.42058209449787, MAE=6.438306572816495, R2=0.28892447118868336


## Validation KNR

In [19]:
# Valutazione modelli tradizionali
for clfName in ['KNR']:
    for scaler_type in preprocessing_options[clfName]['scaler_type']:
        for use_pca in preprocessing_options[clfName]['use_pca']:
            key = f"{clfName}_{scaler_type}_{'PCA' if use_pca else 'NoPCA'}"
            X_val_scaled = scalers[key].transform(X_val)
            if pcas[key]:
                X_val_scaled = pcas[key].transform(X_val_scaled)
            y_pred = models[key].predict(X_val_scaled)
            mse = mean_squared_error(y_val, y_pred)
            mae = mean_absolute_error(y_val, y_pred)
            r2 = r2_score(y_val, y_pred)
            performance[key] = {'mse': mse, 'mae': mae, 'r2': r2}
            print(f"Performance of {clfName} with {scaler_type} scaler and PCA={use_pca}: MSE={mse}, MAE={mae}, R2={r2}")


Performance of KNR with standard scaler and PCA=True: MSE=95.77045226529194, MAE=7.149356597600872, R2=0.14253933686845544
Performance of KNR with standard scaler and PCA=False: MSE=89.67204282740155, MAE=6.853474769505305, R2=0.19714016705119275
Performance of KNR with minmax scaler and PCA=True: MSE=85.17326063249726, MAE=6.556490532368397, R2=0.23741907012498586
Performance of KNR with minmax scaler and PCA=False: MSE=84.66267790225041, MAE=6.556375532864084, R2=0.24199046554085202


## Validation SVR

In [25]:
# Valutazione modelli tradizionali
for clfName in ['SVR']:
    for scaler_type in preprocessing_options[clfName]['scaler_type']:
        for use_pca in preprocessing_options[clfName]['use_pca']:
            key = f"{clfName}_{scaler_type}_{'PCA' if use_pca else 'NoPCA'}"
            X_val_scaled = scalers[key].transform(X_val)
            if pcas[key]:
                X_val_scaled = pcas[key].transform(X_val_scaled)
            y_pred = models[key].predict(X_val_scaled)
            mse = mean_squared_error(y_val, y_pred)
            mae = mean_absolute_error(y_val, y_pred)
            r2 = r2_score(y_val, y_pred)
            performance[key] = {'mse': mse, 'mae': mae, 'r2': r2}
            print(f"Performance of {clfName} with {scaler_type} scaler and PCA={use_pca}: MSE={mse}, MAE={mae}, R2={r2}")


Performance of SVR with standard scaler and PCA=True: MSE=89.76062511715364, MAE=6.356003295928954, R2=0.19634706409390346
Performance of SVR with standard scaler and PCA=False: MSE=78.95616094014734, MAE=5.841599492634313, R2=0.2930825686139583
Performance of SVR with minmax scaler and PCA=True: MSE=78.74912613182043, MAE=5.808798686392694, R2=0.29493621135908754
Performance of SVR with minmax scaler and PCA=False: MSE=87.8756541494357, MAE=6.264030260698204, R2=0.21322375641113178


## Validation FFNN 

In [26]:
# Valutazione FFNN
for scaler_type in preprocessing_options['FFNN']['scaler_type']:
    for use_pca in preprocessing_options['FFNN']['use_pca']:
        key = f"FFNN_{scaler_type}_{'PCA' if use_pca else 'NoPCA'}"
        X_val_scaled = scalers[key].transform(X_val)
        if pcas[key]:
            X_val_scaled = pcas[key].transform(X_val_scaled)
        X_val_tensor = torch.tensor(X_val_scaled, dtype=torch.float32)
        y_pred_tensor = models[key](X_val_tensor).detach().numpy().squeeze()
        mse = mean_squared_error(y_val, y_pred_tensor)
        mae = mean_absolute_error(y_val, y_pred_tensor)
        r2 = r2_score(y_val, y_pred_tensor)
        performance[key] = {'mse': mse, 'mae': mae, 'r2': r2}
        print(f"Performance of FFNN with {scaler_type} scaler and PCA={use_pca}: MSE={mse}, MAE={mae}, R2={r2}")

Performance of FFNN with standard scaler and PCA=True: MSE=3827140.8363093333, MAE=1956.1712132502987, R2=-34264.50390625
Performance of FFNN with standard scaler and PCA=False: MSE=3606756.0399361844, MAE=1897.8917027504365, R2=-32291.3359375
Performance of FFNN with minmax scaler and PCA=True: MSE=3965149.11985667, MAE=1991.2404726114385, R2=-35500.12890625
Performance of FFNN with minmax scaler and PCA=False: MSE=3461327.940968171, MAE=1860.4338750381446, R2=-30989.271484375


## Validation TabNet

In [None]:
# Valutazione TabNet
for scaler_type in preprocessing_options['TabNet']['scaler_type']:
    key = f"TabNet_{scaler_type}_NoPCA"
    X_val_scaled = scalers[key].transform(X_val)
    y_pred = models[key].predict(X_val_scaled)
    mse = mean_squared_error(y_val, y_pred)
    mae = mean_absolute_error(y_val, y_pred)
    r2 = r2_score(y_val, y_pred)
    performance[key] = {'mse': mse, 'mae': mae, 'r2': r2}
    print(f"Performance of TabNet with {scaler_type} scaler: MSE={mse}, MAE={mae}, R2={r2}")

## Validation TabTransformer

In [None]:
# Valutazione TabTransformer
for scaler_type in preprocessing_options['TabTransformer']['scaler_type']:
    key = f"TabTransformer_{scaler_type}_NoPCA"
    y_pred = models[key].predict(df_val).squeeze()
    mse = mean_squared_error(y_val, y_pred)
    mae = mean_absolute_error(y_val, y_pred)
    r2 = r2_score(y_val, y_pred)
    performance[key] = {'mse': mse, 'mae': mae, 'r2': r2}
    print(f"Performance of TabTransformer with {scaler_type} scaler: MSE={mse}, MAE={mae}, R2={r2}")

# Saving performance

In [24]:
# Salvataggio delle performance in un file CSV
performance_df = pd.DataFrame(performance_dict)
performance_df.to_csv('performance_results.csv', index=False)