# Installazione delle Librerie

In [1]:
%pip install numpy pandas requests scikit-learn 
%pip install torch==2.2.2 torchvision==0.17.2 torchaudio==2.2.2 --index-url https://download.pytorch.org/whl/cpu
%pip install -U pytorch-tabnet pytorch-tabular tab-transformer-pytorch

Note: you may need to restart the kernel to use updated packages.
Looking in indexes: https://download.pytorch.org/whl/cpu
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


# Import e definizioni delle funzioni

In [2]:
import os
import pandas as pd
import numpy as np
import random
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, Subset
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from torch.utils.tensorboard import SummaryWriter
# import torch
# from pytorch_tabnet.tab_model import TabNetRegressor
# from pytorch_tabular.models import TabTransformerModel
# from pytorch_tabular.config import ModelConfig, DataConfig, TrainerConfig
# from pytorch_tabular import TabularModel
import pickle

# Funzioni di utilità
def save_model(model, directory, filename):
    """Pickle ``model`` to ``<directory>/<filename>.pkl``.

    Args:
        model: Any picklable object (estimator, scaler, PCA, ...).
        directory: Destination folder; it is created together with any missing
            parents. An empty string means the current working directory.
        filename: Name of the file without the ``.pkl`` extension.
    """
    # exist_ok=True replaces the racy "exists? -> makedirs" pair; the guard
    # skips os.makedirs(''), which raises FileNotFoundError.
    if directory:
        os.makedirs(directory, exist_ok=True)
    with open(os.path.join(directory, filename + '.pkl'), 'wb') as f:
        pickle.dump(model, f)

def cross_val_score_with_preprocessing(model, X, y, cv, scaler_type, use_pca, n_components):
    """Run shuffled K-fold cross-validation with per-fold preprocessing.

    The scaler (and optionally the PCA) is fitted on the training part of every
    fold only, then applied to that fold's validation part.

    Args:
        model: Estimator exposing ``fit``/``predict``. It is copied for every
            fold, so the object passed in is left untouched.
        X: Feature matrix (array-like, converted with ``np.asarray``).
        y: Target vector (array-like, converted with ``np.asarray``).
        cv: Number of folds.
        scaler_type: ``'Standard'`` or ``'MinMax'`` (case-insensitive).
        use_pca: Whether to apply PCA after scaling.
        n_components: ``n_components`` forwarded to ``PCA`` when ``use_pca``.

    Returns:
        Tuple ``(mean MSE, mean MAE, mean R2)`` over the folds.

    Raises:
        ValueError: If ``scaler_type`` is not a supported scaler name.
    """
    # Function-scope import: `clone` is not part of the notebook's import cell.
    from sklearn.base import clone

    # Positional indexing below must not be reinterpreted as label/column
    # indexing when a DataFrame or Series is passed in.
    X = np.asarray(X)
    y = np.asarray(y)

    scaler_classes = {'standard': StandardScaler, 'minmax': MinMaxScaler}
    scaler_cls = scaler_classes.get(str(scaler_type).lower())
    if scaler_cls is None:
        raise ValueError(
            f"Unsupported scaler_type {scaler_type!r}; expected 'Standard' or 'MinMax'"
        )

    mse_scores, mae_scores, r2_scores = [], [], []
    for train_idx, val_idx in KFold(n_splits=cv, shuffle=True, random_state=89).split(X):
        X_train, X_val = X[train_idx], X[val_idx]
        y_train, y_val = y[train_idx], y[val_idx]

        # Scaling
        scaler = scaler_cls().fit(X_train)
        X_train_scaled = scaler.transform(X_train)
        X_val_scaled = scaler.transform(X_val)

        # PCA
        if use_pca:
            pca = PCA(n_components=n_components).fit(X_train_scaled)
            X_train_scaled = pca.transform(X_train_scaled)
            X_val_scaled = pca.transform(X_val_scaled)

        # Fit a fresh copy so the caller's (possibly already fitted and saved)
        # estimator is not silently refit on the last fold.
        fold_model = clone(model, safe=False)
        fold_model.fit(X_train_scaled, y_train)
        y_pred = fold_model.predict(X_val_scaled)
        mse_scores.append(mean_squared_error(y_val, y_pred))
        mae_scores.append(mean_absolute_error(y_val, y_pred))
        r2_scores.append(r2_score(y_val, y_pred))

    return np.mean(mse_scores), np.mean(mae_scores), np.mean(r2_scores)


## Function definitions

In [3]:
# random_state = 89

# def save_model(model, directory, filename):
#     if not os.path.exists(directory):
#         os.makedirs(directory)
#     joblib.dump(model, os.path.join(directory, f'{filename}.pkl'))

# def preprocess(X, scaler_type='standard', use_pca=False, n_components=None):
#     if scaler_type == 'standard':
#         scaler = StandardScaler()
#     elif scaler_type == 'minmax':
#         scaler = MinMaxScaler()
#     X_scaled = scaler.fit_transform(X)

#     pca = None
#     if use_pca and n_components:
#         pca = PCA(n_components=n_components, random_state=random_state)
#         X_scaled = pca.fit_transform(X_scaled)

#     return X_scaled, scaler, pca

# def cross_val_score_with_preprocessing(model, X, y, cv=5, scaler_type='standard', use_pca=False, n_components=None):
#     kf = KFold(n_splits=cv, shuffle=True, random_state=random_state)
#     mse_scores = []
#     mae_scores = []
#     r2_scores = []

#     for train_index, test_index in kf.split(X):
#         X_train, X_test = X[train_index], X[test_index]
#         y_train, y_test = y[train_index], y[test_index]

#         X_train_scaled, scaler, pca = preprocess(X_train, scaler_type=scaler_type, use_pca=use_pca, n_components=n_components)
#         X_test_scaled = scaler.transform(X_test)
#         if pca:
#             X_test_scaled = pca.transform(X_test_scaled)

#         model.fit(X_train_scaled, y_train)
#         y_pred = model.predict(X_test_scaled)

#         mse_scores.append(mean_squared_error(y_test, y_pred))
#         mae_scores.append(mean_absolute_error(y_test, y_pred))
#         r2_scores.append(r2_score(y_test, y_pred))

#     return np.mean(mse_scores), np.mean(mae_scores), np.mean(r2_scores)



# def grid_search_cv_with_preprocessing(model, param_grid, X, y, cv=5, scaler_type='standard', use_pca=False, n_components=None):
#     # Creazione del pipeline di preprocessing
#     steps = []
#     if scaler_type == 'standard':
#         steps.append(('scaler', StandardScaler()))
#     elif scaler_type == 'minmax':
#         steps.append(('scaler', MinMaxScaler()))
#     if use_pca:
#         steps.append(('pca', PCA(n_components=n_components)))

#     pipeline = Pipeline(steps + [('model', model)])
    
#     # Grid Search CV
#     grid_search = GridSearchCV(pipeline, param_grid, cv=cv, scoring='neg_mean_squared_error', n_jobs=-1)
#     grid_search.fit(X, y)
    
#     best_model = grid_search.best_estimator_
#     best_params = grid_search.best_params_
#     best_score = -grid_search.best_score_
    
#     return best_model, best_params, best_score




# def save_performance(model_name, scaler_type, use_pca, mse, mae, r2, mse_cv, mae_cv, r2_cv, filename='model_performance.csv'):
#     file_exists = os.path.isfile(filename)
#     with open(filename, mode='a', newline='') as file:
#         writer = csv.writer(file)
#         if not file_exists:
#             writer.writerow(['Model', 'Scaler', 'PCA', 'MSE', 'MAE', 'R2', 'CV_MSE', 'CV_MAE', 'CV_R2'])
#         writer.writerow([model_name, scaler_type, use_pca, mse, mae, r2, mse_cv, mae_cv, r2_cv])


# Caricamento dei Dati

In [15]:
# Model configuration: scaler / PCA combinations evaluated for each model family
RANDOM_STATE = 89
PCA_N_COMPONENTS = 52

preprocessing_options = {
    name: {'scaler_type': ['Standard', 'MinMax'], 'use_pca': [True, False]}
    for name in ('LR', 'RF', 'KNR', 'SVR', 'FFNN')
}
# TabNet and TabTransformer are only evaluated without PCA
preprocessing_options.update({
    name: {'scaler_type': ['Standard', 'MinMax'], 'use_pca': [False]}
    for name in ('TabNet', 'TabTransformer')
})

# Path of the zipped CSV
csv_file_name = '../data.zip'
# Load the data from the CSV
df = pd.read_csv(csv_file_name)

X = df.drop('Year', axis=1)
y = df['Year']

# Split into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE)

# Build the scaler / PCA lookup for every "<model>_<scaler>_<PCA|NoPCA>" key.
# The fitted objects depend only on the scaler type (and on whether PCA is
# used), never on the model family, so each one is fitted once and shared
# between keys instead of being refitted per model.
scaler_classes = {'Standard': StandardScaler, 'MinMax': MinMaxScaler}
fitted_scalers = {}
fitted_pcas = {}

scalers = {}
pcas = {}

# `model_name` (not `model`) so the loop does not shadow a fitted estimator.
for model_name, options in preprocessing_options.items():
    for scaler_type in options['scaler_type']:
        if scaler_type not in fitted_scalers:
            # An unknown scaler name fails loudly here (KeyError) instead of
            # silently reusing the scaler from the previous iteration.
            fitted_scalers[scaler_type] = scaler_classes[scaler_type]().fit(X_train)
        scaler = fitted_scalers[scaler_type]
        for use_pca in options['use_pca']:
            key = f"{model_name}_{scaler_type}_{'PCA' if use_pca else 'NoPCA'}"
            scalers[key] = scaler
            if use_pca:
                if scaler_type not in fitted_pcas:
                    # random_state only matters when PCA picks a randomized or
                    # arpack solver; fixing it keeps reruns reproducible.
                    fitted_pcas[scaler_type] = PCA(
                        n_components=PCA_N_COMPONENTS, random_state=RANDOM_STATE
                    ).fit(scaler.transform(X_train))
                pcas[key] = fitted_pcas[scaler_type]
            else:
                pcas[key] = None

# Funzioni di Training

In [16]:
# Funzione per addestrare i modelli
def train_model(X_train_scaled, y_train, model_type):
    """Fit one of the traditional scikit-learn regressors.

    Note: a later cell of this notebook defines another ``train_model`` (the
    PyTorch training loop); whichever cell ran last wins.

    Args:
        X_train_scaled: Preprocessed training features.
        y_train: Training targets.
        model_type: ``'LR'``, ``'RF'``, ``'KNR'`` or ``'SVR'``.

    Returns:
        The fitted estimator.

    Raises:
        ValueError: If ``model_type`` is not one of the supported codes.
    """
    # Factories are lazy so only the requested estimator is instantiated.
    factories = {
        'LR': lambda: LinearRegression(),
        'RF': lambda: RandomForestRegressor(random_state=89),
        'KNR': lambda: KNeighborsRegressor(),
        'SVR': lambda: SVR(),
    }
    factory = factories.get(model_type)
    if factory is None:
        # Previously this fell through to an UnboundLocalError on `model`.
        raise ValueError(
            f"Unknown model_type {model_type!r}; expected one of {sorted(factories)}"
        )
    return factory().fit(X_train_scaled, y_train)

# Funzione per addestrare una rete neurale feed-forward
def train_ffnn(X_train_scaled, y_train, input_dim, epochs=100, lr=0.001):
    """Train a small feed-forward regressor with full-batch Adam.

    Args:
        X_train_scaled: Training features (ndarray, DataFrame or nested list).
        y_train: Training targets (Series, ndarray or list); reshaped to a
            column vector.
        input_dim: Number of input features.
        epochs: Number of full-batch optimisation steps (default 100).
        lr: Adam learning rate (default 0.001).

    Returns:
        The trained ``torch.nn.Module`` (left in training mode).
    """
    class FFNN(torch.nn.Module):
        """Two hidden layers (128 and 64 units) with ReLU, one linear output."""

        def __init__(self, input_dim):
            super(FFNN, self).__init__()
            self.fc1 = torch.nn.Linear(input_dim, 128)
            self.fc2 = torch.nn.Linear(128, 64)
            self.fc3 = torch.nn.Linear(64, 1)

        def forward(self, x):
            x = torch.relu(self.fc1(x))
            x = torch.relu(self.fc2(x))
            x = self.fc3(x)
            return x

    ffnn = FFNN(input_dim=input_dim)
    criterion = torch.nn.MSELoss()
    optimizer = torch.optim.Adam(ffnn.parameters(), lr=lr)

    # np.asarray accepts Series/DataFrame/ndarray/list alike; the original
    # `y_train.values` only worked for pandas objects.
    X_tensor = torch.as_tensor(np.asarray(X_train_scaled), dtype=torch.float32)
    y_tensor = torch.as_tensor(np.asarray(y_train), dtype=torch.float32).reshape(-1, 1)

    ffnn.train()
    for epoch in range(epochs):
        optimizer.zero_grad()
        outputs = ffnn(X_tensor)
        loss = criterion(outputs, y_tensor)
        loss.backward()
        optimizer.step()

    return ffnn

# Funzione per addestrare TabNet
def train_tabnet(X_train_scaled, y_train, X_val_scaled, y_val):
    """Train a ``TabNetRegressor`` with early stopping on the validation set.

    Args:
        X_train_scaled: Training features (array-like).
        y_train: Training targets; 1-D input is reshaped to ``(n_samples, 1)``.
        X_val_scaled: Validation features used as ``eval_set``.
        y_val: Validation targets; reshaped like ``y_train``.

    Returns:
        The fitted ``TabNetRegressor``.
    """
    # Function-scope import: the module-level import is commented out in the
    # notebook's import cell, so the name would otherwise be undefined.
    from pytorch_tabnet.tab_model import TabNetRegressor

    def _as_2d_target(target):
        # TabNet regression expects NumPy targets shaped (n_samples, n_outputs).
        target = np.asarray(target)
        return target.reshape(-1, 1) if target.ndim == 1 else target

    tabnet = TabNetRegressor()
    tabnet.fit(
        np.asarray(X_train_scaled),
        _as_2d_target(y_train),
        eval_set=[(np.asarray(X_val_scaled), _as_2d_target(y_val))],
        patience=10,
        max_epochs=100,
    )
    return tabnet

# Funzione per addestrare TabTransformer
def train_tabtransformer(df_train, df_val):
    """Build and fit a pytorch_tabular ``TabularModel`` for regression.

    Args:
        df_train: Training DataFrame; every column except ``'target_column'``
            is declared as a continuous feature.
        df_val: Validation DataFrame passed to ``fit`` as ``validation``.

    Returns:
        The fitted ``TabularModel``.

    NOTE(review): ``DataConfig``, ``ModelConfig``, ``TrainerConfig`` and
    ``TabularModel`` are not imported by the notebook's import cell (those
    imports are commented out), so calling this function as-is raises
    ``NameError``.
    NOTE(review): the target name is the literal ``'target_column'``, while the
    data-loading cells use ``'Year'`` as the target -- confirm that callers
    rename the column before calling this.
    """
    data_config = DataConfig(
        target=['target_column'],
        # Every non-target column is treated as continuous; no categorical columns are declared.
        continuous_cols=df_train.columns.difference(['target_column']).tolist(),
    )

    # NOTE(review): this uses the generic ModelConfig; presumably a
    # TabTransformer-specific config was intended -- TODO confirm.
    model_config = ModelConfig(
        task="regression",
        metrics=["mean_squared_error", "mean_absolute_error", "r2_score"],
        metrics_params=[{}, {}, {}]
    )

    # NOTE(review): `gpus=0` presumably requests CPU-only training; verify the
    # argument is still accepted by the installed pytorch_tabular version.
    trainer_config = TrainerConfig(
        max_epochs=100,
        gpus=0
    )

    tabular_model = TabularModel(
        data_config=data_config,
        model_config=model_config,
        trainer_config=trainer_config
    )

    tabular_model.fit(train=df_train, validation=df_val)
    return tabular_model

# Preparazione del dizionario per salvare le performance
# Column-oriented results table: one independent, initially empty list per
# metric column; each training cell appends one value to every list.
performance_dict = {
    column: []
    for column in (
        'Model', 'Scaler', 'PCA',
        'MSE_Val', 'MAE_Val', 'R2_Val',
        'MSE_CV', 'MAE_CV', 'R2_CV',
    )
}


# Preprocessing e salvataggio dei risultati

In [6]:
# Esegui il preprocessing specifico per ogni modello con Standard Scaler e Min-Max Scaler
# The five classical/FFNN families share one configuration (both scalers, with
# and without a 52-component PCA); the two tabular deep models skip PCA.
preprocessing_options = {
    **{
        name: {'scaler_type': ['standard', 'minmax'], 'use_pca': [True, False], 'n_components': 52}
        for name in ('LR', 'RF', 'KNR', 'SVR', 'FFNN')
    },
    **{
        name: {'scaler_type': ['standard', 'minmax'], 'use_pca': [False], 'n_components': None}
        for name in ('TabNet', 'TabTransformer')
    },
}

# Funzione per eseguire il preprocessing e salvare i risultati
def preprocess_and_save(X_train, X_val, preprocessing_options):
    """Fit, collect and pickle the scaler/PCA for every preprocessing combination.

    Args:
        X_train: Training features the scalers and PCAs are fitted on.
        X_val: Unused; kept so existing call sites keep working (the
            validation transform computed here was previously discarded).
        preprocessing_options: Mapping ``model name -> options`` where options
            holds ``'scaler_type'`` (list of ``'standard'``/``'minmax'``,
            case-insensitive), ``'use_pca'`` (list of bools) and optionally
            ``'n_components'``.

    Returns:
        Tuple ``(scalers, pcas)`` of dicts keyed by
        ``"<model>_<scaler>_<PCA|NoPCA>"``; ``pcas[key]`` is ``None`` when no
        PCA was fitted for that key.

    Raises:
        ValueError: If a scaler type is not supported.
    """
    # The fitting is done inline because the `preprocess` helper this function
    # used to call only exists as commented-out code, so the call raised NameError.
    scaler_classes = {'standard': StandardScaler, 'minmax': MinMaxScaler}
    scalers = {}
    pcas = {}
    for clfName, options in preprocessing_options.items():
        n_components = options.get('n_components')
        for scaler_type in options['scaler_type']:
            scaler_cls = scaler_classes.get(str(scaler_type).lower())
            if scaler_cls is None:
                raise ValueError(
                    f"Unsupported scaler_type {scaler_type!r}; expected 'standard' or 'minmax'"
                )
            for use_pca in options['use_pca']:
                key = f"{clfName}_{scaler_type}_{'PCA' if use_pca else 'NoPCA'}"
                scaler = scaler_cls().fit(X_train)
                pca = None
                # As in the commented-out helper, PCA is only fitted when a
                # component count is configured.
                if use_pca and n_components:
                    pca = PCA(n_components=n_components, random_state=89).fit(
                        scaler.transform(X_train)
                    )
                scalers[key] = scaler
                pcas[key] = pca
                directory = os.path.join('models', key.lower())
                save_model(scaler, directory, 'scaler')
                if pca:
                    save_model(pca, directory, 'pca')
    return scalers, pcas

# Run the preprocessing for every scaler/PCA combination and keep the fitted
# objects in the `scalers` / `pcas` lookups
scalers, pcas = preprocess_and_save(X_train, X_val, preprocessing_options)

# Training traditional models

## All the models (DO NOT run this cell if you don't want your PC to crash or to explode)

In [None]:
# Train the traditional models on every scaler/PCA combination
models = {}
cv_performance = {}
validation_performance = {}

for clfName in ['LR', 'RF', 'KNR', 'SVR']:
    options = preprocessing_options[clfName]
    # Not every preprocessing_options variant carries 'n_components'; fall back
    # to the 52 components used by the per-model cells.
    n_components = options.get('n_components', 52)
    for scaler_type in options['scaler_type']:
        for use_pca in options['use_pca']:
            key = f"{clfName}_{scaler_type}_{'PCA' if use_pca else 'NoPCA'}"
            X_train_scaled = scalers[key].transform(X_train)
            if pcas[key]:
                X_train_scaled = pcas[key].transform(X_train_scaled)

            model = train_model(X_train_scaled, y_train, model_type=clfName)
            models[key] = model
            # save_model takes (model, directory, filename); use the same
            # models/<key>/model.pkl layout as the per-model cells.
            directory = os.path.join('models', key.lower())
            save_model(model, directory, 'model')

            # Evaluation on the validation set
            X_val_scaled = scalers[key].transform(X_val)
            if pcas[key]:
                X_val_scaled = pcas[key].transform(X_val_scaled)
            y_pred = model.predict(X_val_scaled)
            mse = mean_squared_error(y_val, y_pred)
            mae = mean_absolute_error(y_val, y_pred)
            r2 = r2_score(y_val, y_pred)
            validation_performance[key] = {'mse': mse, 'mae': mae, 'r2': r2}
            print(f"Validation Performance of {clfName} with {scaler_type} scaler and PCA={use_pca}: MSE={mse}, MAE={mae}, R2={r2}")

            # Cross-validation. The CV helper fits the estimator it receives,
            # so hand it an unfitted copy with the same hyper-parameters;
            # otherwise the model stored in models[key] would be refit on a
            # CV fold. Plain arrays keep the fold indexing positional.
            cv_model = type(model)(**model.get_params())
            mse_cv, mae_cv, r2_cv = cross_val_score_with_preprocessing(
                cv_model, X.values, y.values, cv=5, scaler_type=scaler_type, use_pca=use_pca, n_components=n_components
            )
            cv_performance[key] = {'mse': mse_cv, 'mae': mae_cv, 'r2': r2_cv}
            print(f"Cross-Validation Performance of {clfName} with {scaler_type} scaler and PCA={use_pca}: MSE={mse_cv}, MAE={mae_cv}, R2={r2_cv}")


In [None]:
# Fresh, independent containers for fitted models and their metrics
models, performance = dict(), dict()

## Training LR

In [17]:
clfName = 'LR'
lr_grid = preprocessing_options[clfName]
for scaler_type in lr_grid['scaler_type']:
    for use_pca in lr_grid['use_pca']:
        key = f"{clfName}_{scaler_type}_{'PCA' if use_pca else 'NoPCA'}"
        fitted_scaler, fitted_pca = scalers[key], pcas[key]

        # Training features: pre-fitted scaler, then the optional PCA
        X_train_scaled = fitted_scaler.transform(X_train)
        if fitted_pca:
            X_train_scaled = fitted_pca.transform(X_train_scaled)

        # Fit and persist under models/<key>/model.pkl
        model = train_model(X_train_scaled, y_train, model_type=clfName)
        directory = os.path.join('models', key.lower())
        save_model(model, directory, 'model')

        # Hold-out evaluation with the same preprocessing
        X_val_scaled = fitted_scaler.transform(X_val)
        if fitted_pca:
            X_val_scaled = fitted_pca.transform(X_val_scaled)
        y_pred = model.predict(X_val_scaled)
        mse, mae, r2 = (
            metric(y_val, y_pred)
            for metric in (mean_squared_error, mean_absolute_error, r2_score)
        )
        print(f"Validation Performance of {clfName} with {scaler_type} scaler and {'PCA' if use_pca else 'NoPCA'}: MSE={mse}, MAE={mae}, R2={r2}")

        # Record the run; no cross-validation for Linear Regression, so the
        # CV columns are filled with None.
        row = {
            'Model': clfName, 'Scaler': scaler_type, 'PCA': use_pca,
            'MSE_Val': mse, 'MAE_Val': mae, 'R2_Val': r2,
            'MSE_CV': None, 'MAE_CV': None, 'R2_CV': None,
        }
        for column, value in row.items():
            performance_dict[column].append(value)


Validation Performance of LR with Standard scaler and PCA: MSE=98.2318351539259, MAE=7.392438758148582, R2=0.12050186127982243
Validation Performance of LR with Standard scaler and NoPCA: MSE=85.58234834475938, MAE=6.668881118801984, R2=0.23375638907108864
Validation Performance of LR with MinMax scaler and PCA: MSE=87.21013510994777, MAE=6.73281651718109, R2=0.21918234158461758
Validation Performance of LR with MinMax scaler and NoPCA: MSE=85.58234834475938, MAE=6.668881118801991, R2=0.23375638907108864


## Training RF

In [18]:
clfName = 'RF'
rf_grid = preprocessing_options[clfName]
for scaler_type in rf_grid['scaler_type']:
    for use_pca in rf_grid['use_pca']:
        key = f"{clfName}_{scaler_type}_{'PCA' if use_pca else 'NoPCA'}"
        fitted_scaler, fitted_pca = scalers[key], pcas[key]

        # Training features: pre-fitted scaler, then the optional PCA
        X_train_scaled = fitted_scaler.transform(X_train)
        if fitted_pca:
            X_train_scaled = fitted_pca.transform(X_train_scaled)

        # Fit and persist under models/<key>/model.pkl
        model = train_model(X_train_scaled, y_train, model_type=clfName)
        directory = os.path.join('models', key.lower())
        save_model(model, directory, 'model')

        # Hold-out evaluation with the same preprocessing
        X_val_scaled = fitted_scaler.transform(X_val)
        if fitted_pca:
            X_val_scaled = fitted_pca.transform(X_val_scaled)
        y_pred = model.predict(X_val_scaled)
        mse, mae, r2 = (
            metric(y_val, y_pred)
            for metric in (mean_squared_error, mean_absolute_error, r2_score)
        )
        print(f"Validation Performance of {clfName} with {scaler_type} scaler and {'PCA' if use_pca else 'NoPCA'}: MSE={mse}, MAE={mae}, R2={r2}")

        # 5-fold cross-validation over the full dataset
        cv_scores = cross_val_score_with_preprocessing(
            model, X.values, y.values, cv=5, scaler_type=scaler_type, use_pca=use_pca, n_components=52
        )
        mse_cv, mae_cv, r2_cv = cv_scores
        print(f"Cross-Validation Performance of {clfName} with {scaler_type} scaler and {'PCA' if use_pca else 'NoPCA'}: MSE={mse_cv}, MAE={mae_cv}, R2={r2_cv}")

        # Record the run in the shared results table
        row = {
            'Model': clfName, 'Scaler': scaler_type, 'PCA': use_pca,
            'MSE_Val': mse, 'MAE_Val': mae, 'R2_Val': r2,
            'MSE_CV': mse_cv, 'MAE_CV': mae_cv, 'R2_CV': r2_cv,
        }
        for column, value in row.items():
            performance_dict[column].append(value)


Validation Performance of RF with Standard scaler and PCA: MSE=92.01214625373923, MAE=7.210540541101935, R2=0.17618854169825393
Cross-Validation Performance of RF with Standard scaler and PCA: MSE=90.85271300574892, MAE=7.169012839722981, R2=0.17553096742065438
Validation Performance of RF with Standard scaler and NoPCA: MSE=79.40827703444918, MAE=6.436896507055287, R2=0.2890346419636943
Cross-Validation Performance of RF with Standard scaler and NoPCA: MSE=78.4878556946283, MAE=6.400234070255443, R2=0.28774069750333975
Validation Performance of RF with MinMax scaler and PCA: MSE=81.65413718735886, MAE=6.573940451406099, R2=0.26892680399838764
Cross-Validation Performance of RF with MinMax scaler and PCA: MSE=80.79298888304402, MAE=6.526083350469959, R2=0.2668224632088366
Validation Performance of RF with MinMax scaler and NoPCA: MSE=79.42058209449787, MAE=6.438306572816495, R2=0.28892447118868336
Cross-Validation Performance of RF with MinMax scaler and NoPCA: MSE=78.52371315641918, M

## Training KNR

In [19]:
clfName = 'KNR'
knr_grid = preprocessing_options[clfName]
for scaler_type in knr_grid['scaler_type']:
    for use_pca in knr_grid['use_pca']:
        key = f"{clfName}_{scaler_type}_{'PCA' if use_pca else 'NoPCA'}"
        fitted_scaler, fitted_pca = scalers[key], pcas[key]

        # Training features: pre-fitted scaler, then the optional PCA
        X_train_scaled = fitted_scaler.transform(X_train)
        if fitted_pca:
            X_train_scaled = fitted_pca.transform(X_train_scaled)

        # Fit and persist under models/<key>/model.pkl
        model = train_model(X_train_scaled, y_train, model_type=clfName)
        directory = os.path.join('models', key.lower())
        save_model(model, directory, 'model')

        # Hold-out evaluation with the same preprocessing
        X_val_scaled = fitted_scaler.transform(X_val)
        if fitted_pca:
            X_val_scaled = fitted_pca.transform(X_val_scaled)
        y_pred = model.predict(X_val_scaled)
        mse, mae, r2 = (
            metric(y_val, y_pred)
            for metric in (mean_squared_error, mean_absolute_error, r2_score)
        )
        print(f"Validation Performance of {clfName} with {scaler_type} scaler and {'PCA' if use_pca else 'NoPCA'}: MSE={mse}, MAE={mae}, R2={r2}")

        # 5-fold cross-validation over the full dataset
        cv_scores = cross_val_score_with_preprocessing(
            model, X.values, y.values, cv=5, scaler_type=scaler_type, use_pca=use_pca, n_components=52
        )
        mse_cv, mae_cv, r2_cv = cv_scores
        print(f"Cross-Validation Performance of {clfName} with {scaler_type} scaler and {'PCA' if use_pca else 'NoPCA'}: MSE={mse_cv}, MAE={mae_cv}, R2={r2_cv}")

        # Record the run in the shared results table
        results = (
            ('Model', clfName), ('Scaler', scaler_type), ('PCA', use_pca),
            ('MSE_Val', mse), ('MAE_Val', mae), ('R2_Val', r2),
            ('MSE_CV', mse_cv), ('MAE_CV', mae_cv), ('R2_CV', r2_cv),
        )
        for column, value in results:
            performance_dict[column].append(value)


found 0 physical cores < 1
  File "c:\Users\Gabriele\AppData\Local\Programs\Python\Python312\Lib\site-packages\joblib\externals\loky\backend\context.py", line 282, in _count_physical_cores
    raise ValueError(f"found {cpu_count_physical} physical cores < 1")


Validation Performance of KNR with Standard scaler and PCA: MSE=95.77045226529194, MAE=7.149356597600872, R2=0.14253933686845544
Cross-Validation Performance of KNR with Standard scaler and PCA: MSE=94.88380767324279, MAE=7.103034797263805, R2=0.1388833328796148
Validation Performance of KNR with Standard scaler and NoPCA: MSE=89.67204282740155, MAE=6.853474769505305, R2=0.19714016705119275
Cross-Validation Performance of KNR with Standard scaler and NoPCA: MSE=88.92059514226231, MAE=6.822715177951821, R2=0.19298139456120558
Validation Performance of KNR with MinMax scaler and PCA: MSE=85.17326063249726, MAE=6.556490532368397, R2=0.23741907012498586
Cross-Validation Performance of KNR with MinMax scaler and PCA: MSE=84.32230474868643, MAE=6.529097650441163, R2=0.23475225724794607
Validation Performance of KNR with MinMax scaler and NoPCA: MSE=84.66267790225041, MAE=6.556375532864084, R2=0.24199046554085202
Cross-Validation Performance of KNR with MinMax scaler and NoPCA: MSE=83.8903333

## Training SVR

In [None]:
clfName = 'SVR'
svr_grid = preprocessing_options[clfName]
for scaler_type in svr_grid['scaler_type']:
    for use_pca in svr_grid['use_pca']:
        key = f"{clfName}_{scaler_type}_{'PCA' if use_pca else 'NoPCA'}"
        fitted_scaler, fitted_pca = scalers[key], pcas[key]

        # Training features: pre-fitted scaler, then the optional PCA
        X_train_scaled = fitted_scaler.transform(X_train)
        if fitted_pca:
            X_train_scaled = fitted_pca.transform(X_train_scaled)

        # Fit and persist under models/<key>/model.pkl
        model = train_model(X_train_scaled, y_train, model_type=clfName)
        directory = os.path.join('models', key.lower())
        save_model(model, directory, 'model')

        # Hold-out evaluation with the same preprocessing
        X_val_scaled = fitted_scaler.transform(X_val)
        if fitted_pca:
            X_val_scaled = fitted_pca.transform(X_val_scaled)
        y_pred = model.predict(X_val_scaled)
        mse, mae, r2 = (
            metric(y_val, y_pred)
            for metric in (mean_squared_error, mean_absolute_error, r2_score)
        )
        print(f"Validation Performance of {clfName} with {scaler_type} scaler and {'PCA' if use_pca else 'NoPCA'}: MSE={mse}, MAE={mae}, R2={r2}")

        # 5-fold cross-validation over the full dataset
        cv_scores = cross_val_score_with_preprocessing(
            model, X.values, y.values, cv=5, scaler_type=scaler_type, use_pca=use_pca, n_components=52
        )
        mse_cv, mae_cv, r2_cv = cv_scores
        print(f"Cross-Validation Performance of {clfName} with {scaler_type} scaler and {'PCA' if use_pca else 'NoPCA'}: MSE={mse_cv}, MAE={mae_cv}, R2={r2_cv}")

        # Record the run in the shared results table
        row = dict(
            Model=clfName, Scaler=scaler_type, PCA=use_pca,
            MSE_Val=mse, MAE_Val=mae, R2_Val=r2,
            MSE_CV=mse_cv, MAE_CV=mae_cv, R2_CV=r2_cv,
        )
        for column, value in row.items():
            performance_dict[column].append(value)


# Validation Performance of SVR with Standard scaler and PCA: MSE=89.76062511715364, MAE=6.356003295928954, R2=0.19634706409390346
# Cross-Validation Performance of SVR with Standard scaler and PCA: MSE=88.47693285428485, MAE=6.314976936016734, R2=0.19714760071223011
# Validation Performance of SVR with Standard scaler and NoPCA: MSE=78.95616094014734, MAE=5.841599492634313, R2=0.2930825686139583
# Cross-Validation Performance of SVR with Standard scaler and NoPCA: MSE=77.90689735248935, MAE=5.800933341091231, R2=0.29307553783482143
# Validation Performance of SVR with MinMax scaler and PCA: MSE=78.74912613182043, MAE=5.808798686392694, R2=0.29493621135908754
# Cross-Validation Performance of SVR with MinMax scaler and PCA: MSE=77.6737074521388, MAE=5.765969088166743, R2=0.29518672879002494
# Validation Performance of SVR with MinMax scaler and NoPCA: MSE=87.8756541494357, MAE=6.264030260698204, R2=0.21322375641113178
# Cross-Validation Performance of SVR with MinMax scaler and NoPCA: MSE=86.66237617661876, MAE=6.2207294324537274, R2=0.21361395079336915

In [3]:
# Location of the zipped CSV, read straight into a DataFrame
csv_file_name = '../data.zip'
df = pd.read_csv(csv_file_name)

# 'Year' is the regression target; every other column is a feature
y = df['Year']
X = df.drop(columns='Year')

# 80/20 train/validation split with a fixed seed
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=89,
)

In [4]:
X_Validation = X_val.copy()

# Min-Max scaling: the scaler is fitted on the training split only and the
# same transformation is then applied to the validation copy.
# (Persisting the fitted scaler to disk is currently disabled.)
scaler = MinMaxScaler().fit(X_train)
X_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_Validation)

# Show the normalised training matrix
print("\nDati di training normalizzati MinMaxScaling:")
print(X_scaled)


Dati di training normalizzati MinMaxScaling:
[[0.85141255 0.54124099 0.51320672 ... 0.46749133 0.40665778 0.42897257]
 [0.61060493 0.50150322 0.42992334 ... 0.45522043 0.45247919 0.50174392]
 [0.44105064 0.43994505 0.52900578 ... 0.48328292 0.3835727  0.38158482]
 ...
 [0.81026452 0.47977891 0.52228356 ... 0.45482219 0.40558655 0.40335066]
 [0.54172766 0.54405139 0.27972061 ... 0.50070901 0.41803509 0.39640128]
 [0.75397911 0.57359645 0.4808505  ... 0.46099608 0.41463365 0.43697254]]


In [14]:
from sklearn.model_selection import GridSearchCV

# One sub-grid per kernel, listing only the hyper-parameters that kernel uses:
# `degree` matters for 'poly' only, `coef0` for 'poly' and 'sigmoid', and
# `gamma` for 'rbf', 'poly' and 'sigmoid'. The previous single grid crossed
# every parameter with every kernel (64 candidates); this covers the same
# distinct models with 30. Note that best_params_ now only holds the keys of
# the winning kernel's sub-grid.
param = [
    {'kernel': ['linear'], 'C': [1, 5]},
    {'kernel': ['poly'], 'C': [1, 5], 'degree': [3, 5], 'coef0': [0.01, 0.5], 'gamma': ['auto', 'scale']},
    {'kernel': ['rbf'], 'C': [1, 5], 'gamma': ['auto', 'scale']},
    {'kernel': ['sigmoid'], 'C': [1, 5], 'coef0': [0.01, 0.5], 'gamma': ['auto', 'scale']},
]

modelsvr = SVR()

# 5-fold grid search on all cores; verbose=2 logs every fit
grids = GridSearchCV(modelsvr, param, cv=5, verbose=2, n_jobs=-1)

grids.fit(X_scaled, y_train)

# Evaluate the refitted best estimator on the validation split
y_pred = grids.predict(X_val_scaled)

mse = mean_squared_error(y_val, y_pred)
mae = mean_absolute_error(y_val, y_pred)
r2 = r2_score(y_val, y_pred)

print("Mean Squared Error:", mse)
print("Mean Absolute Error:", mae)
print("R2:", r2)

#### Codice da eseguire (Gabri): Training SVR (MinMax + PCA)

In [7]:
# Read the zipped CSV into a DataFrame
csv_file_name = '../data.zip'
df = pd.read_csv(csv_file_name)

# Separate the 'Year' target from the feature columns
y = df['Year']
X = df.drop(columns=['Year'])

# Hold out 20% of the rows for validation (fixed seed)
split = train_test_split(X, y, test_size=0.2, random_state=89)
X_train, X_val, y_train, y_val = split

In [8]:
X_Validation = X_val.copy()

# Min-Max scaling: learn the per-feature range from the training split only,
# then map both the training data and the validation copy with it.
# (Persisting the fitted scaler to disk is currently disabled.)
scaler = MinMaxScaler()
scaler.fit(X_train)
X_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_Validation)

# Show the normalised training matrix
print("\nDati di training normalizzati MinMaxScaling:")
print(X_scaled)


Dati di training normalizzati MinMaxScaling:
[[0.85141255 0.54124099 0.51320672 ... 0.46749133 0.40665778 0.42897257]
 [0.61060493 0.50150322 0.42992334 ... 0.45522043 0.45247919 0.50174392]
 [0.44105064 0.43994505 0.52900578 ... 0.48328292 0.3835727  0.38158482]
 ...
 [0.81026452 0.47977891 0.52228356 ... 0.45482219 0.40558655 0.40335066]
 [0.54172766 0.54405139 0.27972061 ... 0.50070901 0.41803509 0.39640128]
 [0.75397911 0.57359645 0.4808505  ... 0.46099608 0.41463365 0.43697254]]


In [9]:
# Fit a 52-component PCA on the scaled training data.
# random_state only matters when PCA selects a randomized or arpack solver;
# fixing it (same seed as the train/validation split) keeps reruns reproducible.
pca = PCA(n_components=52, random_state=89)
X_decomposed = pca.fit_transform(X_scaled)

# Principal axes, one row per component
components = pca.components_

# Validation data projected onto the components learnt from the training set
X_val_decomposed = pca.transform(X_val_scaled)

In [None]:
from sklearn.model_selection import GridSearchCV

# One sub-grid per kernel, listing only the hyper-parameters that kernel uses:
# `degree` matters for 'poly' only, `coef0` for 'poly' and 'sigmoid', and
# `gamma` for 'rbf', 'poly' and 'sigmoid'. The previous single grid crossed
# every parameter with every kernel (144 candidates); this covers the same
# distinct models with 63. Note that best_params_ now only holds the keys of
# the winning kernel's sub-grid.
C_values = [1, 5, 10]
coef0_values = [0.01, 10, 0.5]
gamma_values = ['auto', 'scale']

param = [
    {'kernel': ['linear'], 'C': C_values},
    {'kernel': ['poly'], 'C': C_values, 'degree': [3, 8], 'coef0': coef0_values, 'gamma': gamma_values},
    {'kernel': ['rbf'], 'C': C_values, 'gamma': gamma_values},
    {'kernel': ['sigmoid'], 'C': C_values, 'coef0': coef0_values, 'gamma': gamma_values},
]

modelsvr = SVR(cache_size=200)

# 5-fold grid search on all cores; verbose=2 logs every fit
grids = GridSearchCV(modelsvr, param, cv=5, verbose=2, n_jobs=-1)

grids.fit(X_decomposed, y_train)

# Evaluate the refitted best estimator on the PCA-projected validation split
y_pred = grids.predict(X_val_decomposed)

mse = mean_squared_error(y_val, y_pred)
mae = mean_absolute_error(y_val, y_pred)
r2 = r2_score(y_val, y_pred)

print("Mean Squared Error:", mse)
print("Mean Absolute Error:", mae)
print("R2:", r2)

# Training FFNN

In [5]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torch.utils.tensorboard import SummaryWriter
import random
import os

# Funzione per fissare la casualità
def fix_random(seed: int) -> None:
    """Seed NumPy, ``random`` and PyTorch (CPU and CUDA) with ``seed``.

    Also disables cuDNN benchmarking and turns on its deterministic mode.
    """
    seeders = (np.random.seed, random.seed, torch.manual_seed, torch.cuda.manual_seed)
    for apply_seed in seeders:
        apply_seed(seed)

    cudnn = torch.backends.cudnn
    cudnn.benchmark = False
    cudnn.deterministic = True

# Definizione del Dataset personalizzato
class MyDataset(Dataset):
    """Dataset of float feature rows paired with single-value targets.

    ``X`` is converted with ``torch.FloatTensor``; ``y`` must expose
    ``.values`` (e.g. a pandas Series) and is stored as a column tensor.
    """

    def __init__(self, X, y):
        features = torch.FloatTensor(X)
        targets = torch.FloatTensor(y.values)
        self.X = features
        self.y = targets.view(-1, 1)
        # Number of feature columns, read from the original input
        self.num_features = X.shape[1]

    def __len__(self):
        return self.X.shape[0]

    def __getitem__(self, idx):
        sample = self.X[idx, :]
        target = self.y[idx]
        return sample, target

class FeedForward(nn.Module):
    """Fully connected regressor: four ReLU-activated hidden layers and one linear output unit.

    Args:
        input_size: number of input features.
        hidden_size1..hidden_size4: widths of the four hidden layers, in order.

    The linear layers are registered as ``fc1`` ... ``fc5`` in that order, so
    the ``state_dict`` keys stay ``fc1.weight`` ... ``fc5.bias``.
    """

    def __init__(self, input_size, hidden_size1, hidden_size2, hidden_size3, hidden_size4):
        super().__init__()
        widths = (input_size, hidden_size1, hidden_size2, hidden_size3, hidden_size4, 1)
        # Consecutive width pairs define fc1..fc5; setattr registers each as a submodule
        for index, (n_in, n_out) in enumerate(zip(widths, widths[1:]), start=1):
            setattr(self, f"fc{index}", nn.Linear(n_in, n_out))
        self.relu = nn.ReLU()

    def forward(self, x):
        """Apply the four hidden layers with ReLU, then the linear output layer."""
        hidden = x
        for layer in (self.fc1, self.fc2, self.fc3, self.fc4):
            hidden = self.relu(layer(hidden))
        return self.fc5(hidden)

# Funzione per valutare le performance sul validation e test set
def test_model(model, data_loader, device):
    model.eval()
    y_pred = []
    y_test = []

    with torch.no_grad():
        for data, targets in data_loader:
            data, targets = data.to(device), targets.to(device)
            output = model(data)
            y_pred.append(output.cpu().numpy())
            y_test.append(targets.cpu().numpy())

    y_test = np.concatenate(y_test).squeeze()
    y_pred = np.concatenate(y_pred).squeeze()

    return y_test, y_pred

# Training loop with TensorBoard logging, best-checkpoint saving and early stopping
def train_model(model, criterion, optimizer, num_epochs, train_loader, val_loader, device, writer, log_name="model", patience=10):
    """Train ``model`` and keep the checkpoint with the lowest validation loss.

    Args:
        model: torch module to optimise (updated in place).
        criterion: loss used for the training batches and the validation pass.
        optimizer: optimizer stepping ``model``'s parameters.
        num_epochs: maximum number of epochs.
        train_loader: iterable of ``(data, targets)`` training batches.
        val_loader: loader handed to ``test_model`` after every epoch.
        device: device the training batches are moved to.
        writer: object exposing ``add_scalar(tag, value, step)``; receives
            "Loss/train" per batch and "Loss/val" per epoch.
        log_name: checkpoint name; the state dict is written to
            ``'models/' + log_name``.
        patience: number of consecutive non-improving epochs after which
            training stops.

    Returns:
        The model in its state after the last executed epoch. The best
        weights are only on disk, so callers reload the checkpoint.
    """
    checkpoint_path = 'models/' + log_name
    n_iter = 0
    best_valid_loss = float('inf')
    epochs_no_improve = 0

    for epoch in range(num_epochs):
        model.train()

        for data, targets in train_loader:
            data, targets = data.to(device), targets.to(device)
            optimizer.zero_grad()

            # Forward pass
            y_pred = model(data)

            # Compute Loss
            loss = criterion(y_pred, targets)
            writer.add_scalar("Loss/train", loss.item(), n_iter)

            # Backward pass
            loss.backward()
            optimizer.step()

            n_iter += 1

        # Validation: the criterion is applied once to the whole validation set
        labels, y_pred = test_model(model, val_loader, device)
        loss_val = criterion(torch.tensor(y_pred), torch.tensor(labels)).item()
        writer.add_scalar("Loss/val", loss_val, epoch)

        # Save best model
        if loss_val < best_valid_loss:
            best_valid_loss = loss_val
            epochs_no_improve = 0
            # Create the checkpoint's own parent directory: a log_name with
            # sub-folders (e.g. "runs/x") needs more than just "models/".
            os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)
            torch.save(model.state_dict(), checkpoint_path)
        else:
            epochs_no_improve += 1
            if epochs_no_improve >= patience:
                # Stop as soon as patience is exhausted
                print("Early stopping")
                break

    return model

# Look for GPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Device: {}".format(device))

# Seed every RNG before any split or model initialisation
seed = 89
fix_random(seed)

# Train hyperparameters
num_epochs = 150
initial_learning_rate = 0.0001
batch_size = 64  # fixed batch size (used for the training loader)
hidden_size1 = 128
hidden_size2 = 64
hidden_size3 = 32
hidden_size4 = 16

# Path of the zipped CSV file
csv_file_name = '../data.zip'
# loading data from csv
data = pd.read_csv(csv_file_name)

# Separate the features from the 'Year' target column
X = data.drop('Year', axis=1)
y = data['Year']

# Split the data into train, validation and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=seed)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=seed)  # 0.9 x 0.25 of the data goes to validation

# Scale the features; the scaler is fitted on the training split only
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# PCA fitted on the training split, then applied to validation and test
pca = PCA(n_components=52)
X_train_pca = pca.fit_transform(X_train_scaled)
X_val_pca = pca.transform(X_val_scaled)
X_test_pca = pca.transform(X_test_scaled)

# Test loader (one sample per batch, no shuffling)
test_dataset = MyDataset(X_test_pca, y_test)
test_loader = DataLoader(test_dataset, batch_size=1)

# Fixed validation set (one sample per batch, no shuffling)
val_dataset = MyDataset(X_val_pca, y_val)
val_loader = DataLoader(val_dataset, batch_size=1)

# Train loader (reshuffled batches of `batch_size` samples)
train_dataset = MyDataset(X_train_pca, y_train)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Define the model, the loss and the optimizer
model = FeedForward(train_dataset.num_features, hidden_size1, hidden_size2, hidden_size3, hidden_size4)
model.to(device)
criterion = torch.nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=initial_learning_rate)

# Start tensorboard
writer = SummaryWriter()

# Test before the training (baseline metrics of the untrained network on the validation set)
y_val_initial, y_pred_initial = test_model(model, val_loader, device)
initial_mse = torch.mean((torch.tensor(y_val_initial) - torch.tensor(y_pred_initial)) ** 2).item()
initial_mae = mean_absolute_error(y_val_initial, y_pred_initial)
initial_r2 = r2_score(y_val_initial, y_pred_initial)
print(f"Initial MSE before training: {initial_mse}")
print(f"Initial MAE before training: {initial_mae}")
print(f"Initial R2 before training: {initial_r2}")

# Train the model (default log_name, so the best checkpoint is written to "models/model")
model = train_model(model, criterion, optimizer, num_epochs, train_loader, val_loader, device, writer)

# Load best model: train_model returns the last-epoch weights, the best ones are on disk
model.load_state_dict(torch.load("models/model"))
model.to(device)

# Test after the training
y_val_final, y_pred_final = test_model(model, val_loader, device)
final_mse = torch.mean((torch.tensor(y_val_final) - torch.tensor(y_pred_final)) ** 2).item()
final_mae = mean_absolute_error(y_val_final, y_pred_final)
final_r2 = r2_score(y_val_final, y_pred_final)
print(f"Final MSE after training: {final_mse}")
print(f"Final MAE after training: {final_mae}")
print(f"Final R2 after training: {final_r2}")

# Final evaluation on the held-out test set (y_pred_final is reused for the test predictions)
print("Final evaluation on the test set")
y_test_final, y_pred_final = test_model(model, test_loader, device)
test_mse = torch.mean((torch.tensor(y_test_final) - torch.tensor(y_pred_final)) ** 2).item()
test_mae = mean_absolute_error(y_test_final, y_pred_final)
test_r2 = r2_score(y_test_final, y_pred_final)
print(f"Final MSE on the test set: {test_mse}")
print(f"Final MAE on the test set: {test_mae}")
print(f"Final R2 on the test set: {test_r2}")

# Close tensorboard writer after training
writer.flush()
writer.close()


Device: cpu
Initial MSE before training: 3992763.25
Initial MAE before training: 1998.16259765625
Initial R2 before training: -36563.94140625
Early stopping
Final MSE after training: 75.72067260742188
Final MAE after training: 6.161563396453857
Final R2 after training: 0.30656498670578003
Final evaluation on the test set
Final MSE on the test set: 77.37577819824219
Final MAE on the test set: 6.265819549560547
Final R2 on the test set: 0.3131319284439087


## FEED FORWARD FINALE

In [10]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.decomposition import PCA
from feedforward import FeedForward
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import ParameterSampler
from torch.utils.tensorboard import SummaryWriter
import random
import hashlib

# Function to set random seed
def fix_random(seed: int) -> None:
    """Seed Python, NumPy and PyTorch RNGs and configure cuDNN for repeatable runs.

    Args:
        seed: integer seed applied to ``random``, ``numpy.random``,
            ``torch`` and ``torch.cuda``.

    Side effects:
        Sets ``torch.backends.cudnn.benchmark`` to False and
        ``torch.backends.cudnn.deterministic`` to True.
    """
    for seed_fn in (np.random.seed, random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seed_fn(seed)

    cudnn = torch.backends.cudnn
    cudnn.benchmark = False
    cudnn.deterministic = True

# Training process function
def train_model(model, criterion, optimizer, num_epochs, train_loader, val_loader, device, writer, log_name="FF", patience=10, params=None, checkpoint_path='../pickle_saves/models/FF.save'):
    """Train ``model`` with early stopping, checkpointing the best epoch together with its hyperparameters.

    Args:
        model: torch module to optimise (updated in place).
        criterion: loss used for the training batches and the validation pass.
        optimizer: optimizer stepping ``model``'s parameters.
        num_epochs: maximum number of epochs.
        train_loader: iterable of ``(data, targets)`` training batches.
        val_loader: loader handed to ``test_model`` after every epoch.
        device: device the training batches are moved to.
        writer: object exposing ``add_scalar(tag, value, step)``; receives
            "Loss/train" per batch and "Loss/val" per epoch.
        log_name: accepted for compatibility with existing callers; it does
            not influence where the checkpoint is written.
        patience: number of consecutive non-improving epochs after which
            training stops.
        params: hyperparameter dict stored in the checkpoint under
            ``'params'``. When omitted, the notebook-level variable named
            ``params`` is used, which is what this function previously read
            implicitly.
        checkpoint_path: file receiving
            ``{'model_state_dict': ..., 'params': ...}`` each time the
            validation loss improves within this call.

    Returns:
        The model in its state after the last executed epoch.
    """
    import os  # local import: the imports of this cell do not include os

    if params is None:
        # Backward compatibility: older callers rely on the global `params`
        # (the loop variable of the search loop) instead of passing it in.
        params = globals().get('params')

    checkpoint_dir = os.path.dirname(checkpoint_path)
    n_iter = 0
    best_valid_loss = float('inf')
    epochs_no_improve = 0

    for epoch in range(num_epochs):
        model.train()

        for data, targets in train_loader:
            data, targets = data.to(device), targets.to(device)
            optimizer.zero_grad()

            # Forward pass
            y_pred = model(data)

            # Compute Loss
            loss = criterion(y_pred, targets)
            writer.add_scalar("Loss/train", loss.item(), n_iter)

            # Backward pass
            loss.backward()
            optimizer.step()

            n_iter += 1

        # Validation: the criterion is applied once to the whole validation set
        labels, y_pred = test_model(model, val_loader, device)
        loss_val = criterion(torch.tensor(y_pred), torch.tensor(labels)).item()
        writer.add_scalar("Loss/val", loss_val, epoch)

        # Save best model
        if loss_val < best_valid_loss:
            best_valid_loss = loss_val
            epochs_no_improve = 0
            if checkpoint_dir:
                # torch.save does not create missing folders
                os.makedirs(checkpoint_dir, exist_ok=True)
            torch.save({
                'model_state_dict': model.state_dict(),
                'params': params
            }, checkpoint_path)
        else:
            epochs_no_improve += 1
            if epochs_no_improve >= patience:
                print("Early stopping")
                break

    return model

# Function to generate a unique log directory name based on parameters
def get_log_dir(params):
    """Return ``runs/param_<md5>`` for a hyperparameter dict.

    The hash is taken over the string form of the key-sorted ``(key, value)``
    pairs, so the insertion order of ``params`` does not affect the result.
    """
    canonical = str(sorted(params.items()))
    digest = hashlib.md5(canonical.encode('utf-8'))
    return "runs/param_" + digest.hexdigest()

# Look for GPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Device: {}".format(device))

# Seed every RNG before the split and any model initialisation
seed = 89
fix_random(seed)

# Path of the zipped CSV file
csv_file_name = '../data.zip'
# loading data from csv
data = pd.read_csv(csv_file_name)

# Separate the features from the 'Year' target column
X = data.drop('Year', axis=1)
y = data['Year']

# Splitting the data into train, val --> 80% training - 20% validation - NO TEST SET
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=seed)

# Scaling the data (scaler fitted on the training split only)
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

# PCA fitted on the training split, then applied to validation
pca = PCA(n_components=52)
X_train_pca = pca.fit_transform(X_train_scaled)
X_val_pca = pca.transform(X_val_scaled)

# Creating the validation set
# NOTE(review): MyDataset (and test_model, used further down) are not defined
# or imported in this cell; they must already exist in the kernel.
val_dataset = MyDataset(X_val_pca, y_val)
val_loader = DataLoader(val_dataset, batch_size=1)

# Creating the train loader
# NOTE(review): random_search builds its own loaders from train_dataset and
# val_dataset, so this module-level train_loader is not what it trains on.
train_dataset = MyDataset(X_train_pca, y_train)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)

# Define the hyperparameter space
# Each key maps to the candidate values ParameterSampler draws from.
# hidden_size1..hidden_size8 and negative_slope are forwarded to FeedForward
# by random_search; learning_rate, batch_size and num_epochs drive training.
param_grid = {
    'num_epochs': [150, 200, 250],
    'learning_rate': [0.000001, 0.00001, 0.0001, 0.001],
    'batch_size': [64, 128, 256],
    'hidden_size1': [256, 512, 1024],
    'hidden_size2': [128, 256, 512],
    'hidden_size3': [64, 128, 256],
    'hidden_size4': [64, 128],
    'hidden_size5': [32, 64],
    'hidden_size6': [16, 32],
    'hidden_size7': [8, 16, 32],
    'hidden_size8': [4, 8],
    'negative_slope': [0.001, 0.01, 0.09] # 0.01 is the default and usually the best choice
}

# Number of parameter samples to evaluate
n_iter_search = 10

# Generate parameter samples (seeded, so the same configurations are drawn on every run)
param_list = list(ParameterSampler(param_grid, n_iter=n_iter_search, random_state=seed))

# Function to run the training with given hyperparameters
def random_search(params):
    """Train one FeedForward configuration and score it on the validation set.

    Args:
        params: dict with the keys ``hidden_size1`` ... ``hidden_size8``,
            ``negative_slope``, ``learning_rate``, ``batch_size`` and
            ``num_epochs``.

    Returns:
        dict with ``params``, the validation ``mse``, ``mae`` and ``r2`` of
        the model returned by ``train_model``, and the ``model`` itself.
    """
    # hidden_size1..hidden_size8, in order, followed by the negative slope
    layer_sizes = [params[f'hidden_size{i}'] for i in range(1, 9)]
    model = FeedForward(train_dataset.num_features, *layer_sizes, params['negative_slope'])
    model.to(device)
    criterion = torch.nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=params['learning_rate'])

    # Loaders are rebuilt per configuration because the batch size is sampled
    train_loader = DataLoader(train_dataset, batch_size=params['batch_size'], shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=1)

    # One TensorBoard run directory per parameter set
    log_dir = get_log_dir(params)
    writer = SummaryWriter(log_dir=log_dir)
    model = train_model(model, criterion, optimizer, params['num_epochs'], train_loader, val_loader, device, writer, log_name=log_dir)

    targets, predictions = test_model(model, val_loader, device)
    squared_error = (torch.tensor(targets) - torch.tensor(predictions)) ** 2
    summary = {
        'params': params,
        'mse': torch.mean(squared_error).item(),
        'mae': mean_absolute_error(targets, predictions),
        'r2': r2_score(targets, predictions),
        'model': model
    }

    writer.flush()
    writer.close()

    return summary

# Run the training for all parameter samples
# NOTE: the loop variable must stay named `params`; train_model reads that
# notebook-level name when it writes its checkpoint.
results = []
for params in param_list:
    result = random_search(params)
    results.append(result)
    print(f"Trained with parameters: {params}, MSE: {result['mse']}, MAE: {result['mae']}, R2: {result['r2']}")

# Find the best parameters
best_result = min(results, key=lambda x: x['mse'])
print(f"Best parameters: {best_result['params']}, MSE: {best_result['mse']}, MAE: {best_result['mae']}, R2: {best_result['r2']}")

# Persist the winning run before reloading it.
# Every call to train_model overwrites the same checkpoint file, so after the
# loop the file holds the LAST configuration trained, not necessarily the
# best one. Re-saving the selected model and its parameters makes the
# checkpoint match `best_result` (the weights scored above, i.e. the model
# returned by train_model for that run).
checkpoint_path = '../pickle_saves/models/FF.save'
torch.save({
    'model_state_dict': best_result['model'].state_dict(),
    'params': best_result['params']
}, checkpoint_path)

# Load the best model and parameters
checkpoint = torch.load(checkpoint_path)
best_params = checkpoint['params']

# Create a new model instance with the best parameters
loaded_model = FeedForward(
    train_dataset.num_features,
    best_params['hidden_size1'],
    best_params['hidden_size2'],
    best_params['hidden_size3'],
    best_params['hidden_size4'],
    best_params['hidden_size5'],
    best_params['hidden_size6'],
    best_params['hidden_size7'],
    best_params['hidden_size8'],
    best_params['negative_slope']
)

# Load the state dictionary
loaded_model.load_state_dict(checkpoint['model_state_dict'])
loaded_model.to(device)

# Test the loaded model on the validation set
loaded_model.eval()
y_val_final, y_pred_final = test_model(loaded_model, val_loader, device)
final_mse = torch.mean((torch.tensor(y_val_final) - torch.tensor(y_pred_final)) ** 2).item()
final_mae = mean_absolute_error(y_val_final, y_pred_final)
final_r2 = r2_score(y_val_final, y_pred_final)
print(f"Final MSE on the validation set: {final_mse}")
print(f"Final MAE on the validation set: {final_mae}")
print(f"Final R2 on the validation set: {final_r2}")



Device: cpu
Early stopping
Trained with parameters: {'num_epochs': 150, 'negative_slope': 0.01, 'learning_rate': 0.001, 'hidden_size8': 4, 'hidden_size7': 32, 'hidden_size6': 16, 'hidden_size5': 64, 'hidden_size4': 64, 'hidden_size3': 64, 'hidden_size2': 512, 'hidden_size1': 256, 'batch_size': 128}, MSE: 231.80857849121094, MAE: 12.55402946472168, R2: -1.0754497051239014
Trained with parameters: {'num_epochs': 200, 'negative_slope': 0.001, 'learning_rate': 0.001, 'hidden_size8': 4, 'hidden_size7': 16, 'hidden_size6': 16, 'hidden_size5': 32, 'hidden_size4': 128, 'hidden_size3': 256, 'hidden_size2': 256, 'hidden_size1': 512, 'batch_size': 128}, MSE: 2832427.25, MAE: 1682.94873046875, R2: -25358.541015625
Trained with parameters: {'num_epochs': 150, 'negative_slope': 0.09, 'learning_rate': 1e-05, 'hidden_size8': 8, 'hidden_size7': 32, 'hidden_size6': 32, 'hidden_size5': 64, 'hidden_size4': 128, 'hidden_size3': 64, 'hidden_size2': 512, 'hidden_size1': 1024, 'batch_size': 256}, MSE: 77.6301

## TAB TRANSFORMER

## Tuning 1

In [2]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from pytorch_tabular import TabularModel
from pytorch_tabular.config import DataConfig, TrainerConfig, OptimizerConfig
from pytorch_tabular.models import TabTransformerConfig

import numpy as np

def mean_absolute_percentage_error(y_true, y_pred):
    """Mean absolute percentage error, expressed in percent.

    Args:
        y_true: array-like of reference values (Series, ndarray or list).
        y_pred: array-like of predictions with the same length.

    Returns:
        ``mean(|y_true - y_pred| / |y_true|) * 100`` as a float. Entries of
        ``y_true`` equal to zero produce infinite or NaN terms.
    """
    # Compare position by position: pandas Series would otherwise be aligned
    # on their index labels, yielding NaN when the two indexes differ.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

# Load dataset: the first column is the target, the remaining columns are the features
data = pd.read_csv("../data.zip")
X = data[data.columns[1:]]
Y = data[data.columns[0]]

# `num_col_names`: names of all feature columns (passed below as the continuous columns)
num_col_names = X.columns.tolist()

# Split data
seed = 89
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.1, random_state=seed, shuffle=True)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=seed, shuffle=True)  # 0.9 x 0.25 = ~0.22 of the data for validation

# Scale the features with MinMaxScaler (fitted on the training split only)
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# Build the train / validation / test DataFrames from the scaled features.
# `.values` attaches the targets by position, since the new frames have a
# fresh index while the y_* Series keep the shuffled original one.
train = pd.DataFrame(X_scaled, columns=X.columns)
train['Year'] = y_train.values

val = pd.DataFrame(X_val_scaled, columns=X.columns)
val['Year'] = y_val.values

test = pd.DataFrame(X_test_scaled, columns=X.columns)
test['Year'] = y_test.values

# Configurations
# Data: every feature is declared continuous; library-side normalisation is
# switched off because the features were already min-max scaled above.
data_config = DataConfig(
    target=["Year"],  # target should always be a list
    continuous_cols=num_col_names,
    categorical_cols=[],
    normalize_continuous_features=False
    # num_workers=21 # Windows does not support num_workers > 0. Setting num_workers to 0
)

# Trainer configuration
trainer_config = TrainerConfig(
    auto_lr_find=False,
    batch_size=32, 
    max_epochs=150,
    early_stopping_patience=10,  # number of epochs to wait for an improvement
    precision=32,
    seed=seed
)

# Optimizer configuration
optimizer_config = OptimizerConfig(
    optimizer="AdamW"
)

# TabTransformer model configuration
model_config = TabTransformerConfig(
    task="regression",
    learning_rate=0.00001,
    num_heads=8,
    num_attn_blocks=6,
    ff_hidden_multiplier=6,
    seed=seed
)

# Assemble the model from the four configurations (training starts with fit() below)
tabular_model = TabularModel(
    data_config=data_config,
    model_config=model_config,
    optimizer_config=optimizer_config,
    trainer_config=trainer_config,
)

# Train the model
tabular_model.fit(train=train, validation=val, seed=seed)

# Evaluate the model (the frame passed here is the validation split)
result = tabular_model.evaluate(val)
print(result)

# Predict on the test split
pred_df = tabular_model.predict(test)
print(pred_df)

y_true = test['Year']
y_pred = pred_df['Year_prediction']

# Test-set metrics; MAPE comes from the helper defined in this cell (percent scale)
mse = mean_squared_error(y_true, y_pred)
mae = mean_absolute_error(y_true, y_pred)
mape = mean_absolute_percentage_error(y_true, y_pred)
r2score = r2_score(y_true, y_pred)
print("MSE: ", mse, " MAE:", mae, " MAPE:", mape, " R2_SCORE:", r2score)


Seed set to 89


GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


c:\Users\Gabriele\AppData\Local\Programs\Python\Python311\Lib\site-packages\pytorch_lightning\callbacks\model_checkpoint.py:639: Checkpoint directory C:\Users\Gabriele\Documents\GitHub\song-publication-year-recognizer\TrainingModule\saved_models exists and is not empty.


Output()

`Trainer.fit` stopped: `max_epochs=150` reached.


Output()

c:\Users\Gabriele\AppData\Local\Programs\Python\Python311\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:441: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=21` in the `DataLoader` to improve performance.


[{'test_loss': 85.21570587158203, 'test_mean_squared_error': 85.21570587158203}]
       Year_prediction
0          1992.533325
1          1991.585571
2          1989.807495
3          1997.795654
4          1990.723633
...                ...
25213      2001.279663
25214      1983.474731
25215      1996.569336
25216      1998.419800
25217      2000.441284

[25218 rows x 1 columns]
MSE:  87.64905173338944  MAE: 6.8474529882626625  MAPE: 0.34367431697567447  R2_SCORE: 0.22193562984466553


## Tab transformer 2 con PCA

In [2]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from pytorch_tabular import TabularModel
from pytorch_tabular.config import DataConfig, TrainerConfig, OptimizerConfig
from pytorch_tabular.models import TabTransformerConfig
import numpy as np

def mean_absolute_percentage_error(y_true, y_pred):
    """Mean absolute percentage error, expressed in percent.

    Args:
        y_true: array-like of reference values (Series, ndarray or list).
        y_pred: array-like of predictions with the same length.

    Returns:
        ``mean(|y_true - y_pred| / |y_true|) * 100`` as a float. Entries of
        ``y_true`` equal to zero produce infinite or NaN terms.
    """
    # Compare position by position: pandas Series would otherwise be aligned
    # on their index labels, yielding NaN when the two indexes differ.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

# Load dataset: the first column is the target, the remaining columns are the features
data = pd.read_csv("../data.zip")
X = data[data.columns[1:]]
Y = data[data.columns[0]]

# Split data
seed = 89
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.1, random_state=seed, shuffle=True)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=seed, shuffle=True)  # 0.9 x 0.25 = ~0.22 of the data for validation

# Scale the features with MinMaxScaler (fitted on the training split only)
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# PCA fitted on the training split, then applied to validation and test
pca = PCA(n_components=52)
X_train_pca = pca.fit_transform(X_scaled)
X_val_pca = pca.transform(X_val_scaled)
X_test_pca = pca.transform(X_test_scaled)

# Create new column names for PCA components
# (PCA_1 .. PCA_52: the range must match n_components above)
pca_col_names = [f'PCA_{i}' for i in range(1, 53)]

# Build the train / validation / test DataFrames from the PCA components;
# `.values` attaches the targets by position.
train = pd.DataFrame(X_train_pca, columns=pca_col_names)
train['Year'] = y_train.values

val = pd.DataFrame(X_val_pca, columns=pca_col_names)
val['Year'] = y_val.values

test = pd.DataFrame(X_test_pca, columns=pca_col_names)
test['Year'] = y_test.values

# Configurations
# Data: the PCA components are the continuous inputs; library-side
# normalisation is switched off.
data_config = DataConfig(
    target=["Year"],  # target should always be a list
    continuous_cols=pca_col_names,
    categorical_cols=[],
    normalize_continuous_features=False
)

# Trainer configuration
trainer_config = TrainerConfig(
    auto_lr_find=False,
    batch_size=32, 
    max_epochs=150,
    early_stopping_patience=10,  # number of epochs to wait for an improvement
    precision=32,
    seed=seed
)

# Optimizer configuration
optimizer_config = OptimizerConfig(
    optimizer="AdamW"
)

# TabTransformer model configuration (larger than in the previous tuning cell;
# the listed metrics are reported in addition to the loss)
model_config = TabTransformerConfig(
    task="regression",
    learning_rate=0.00001,
    num_heads=32,
    num_attn_blocks=24,
    ff_hidden_multiplier=24,
    seed=seed,
    metrics=["mean_squared_error","mean_absolute_error","mean_absolute_percentage_error","r2_score"],  
    )

# Assemble the model from the four configurations (training starts with fit() below)
tabular_model = TabularModel(
    data_config=data_config,
    model_config=model_config,
    optimizer_config=optimizer_config,
    trainer_config=trainer_config
)

# Train the model
tabular_model.fit(train=train, validation=val, seed=seed)

# Evaluate the model (the frame passed here is the validation split)
result = tabular_model.evaluate(val)
print(result)

# Predict on the test split
pred_df = tabular_model.predict(test)
print(pred_df)

y_true = test['Year']
y_pred = pred_df['Year_prediction']

# Test-set metrics; MAPE comes from the helper defined in this cell (percent scale)
mse = mean_squared_error(y_true, y_pred)
mae = mean_absolute_error(y_true, y_pred)
mape = mean_absolute_percentage_error(y_true, y_pred)
r2score = r2_score(y_true, y_pred)
print("MSE: ", mse, " MAE:", mae, " MAPE:", mape, " R2_SCORE:", r2score)


Seed set to 89


GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


c:\Users\Gabriele\AppData\Local\Programs\Python\Python311\Lib\site-packages\pytorch_lightning\callbacks\model_checkpoint.py:639: Checkpoint directory C:\Users\Gabriele\Documents\GitHub\song-publication-year-recognizer\TrainingModule\saved_models exists and is not empty.


Output()

`Trainer.fit` stopped: `max_epochs=150` reached.


Output()

[{'test_loss': 1324310.625, 'test_mean_squared_error': 1324310.625, 'test_mean_absolute_error': 1150.748779296875, 'test_mean_absolute_percentage_error': 0.5758378505706787, 'test_r2_score': -13198.2822265625}]
       Year_prediction
0           841.612244
1           839.556030
2           838.818909
3           847.098694
4           839.342712
...                ...
25213       851.963867
25214       835.353455
25215       845.740662
25216       849.822876
25217       854.007019

[25218 rows x 1 columns]
MSE:  1324188.9807020717  MAE: 1150.6959721482579  MAPE: 57.5831031900501  R2_SCORE: -11753.8828125


## TAB TRANSFORMER CON PIU' PARAMETRI ANCORA 

In [3]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error,mean_absolute_percentage_error, r2_score
from pytorch_tabular import TabularModel
from pytorch_tabular.config import DataConfig, TrainerConfig, OptimizerConfig
from pytorch_tabular.models import TabTransformerConfig

# Load dataset: the first column is the target, the remaining columns are the features
data = pd.read_csv("../data.zip")
X = data[data.columns[1:]]
Y = data[data.columns[0]]

# `num_col_names`: names of all feature columns (passed below as the continuous columns)
num_col_names = X.columns.tolist()

# Split data
seed = 89
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=seed, shuffle=True)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=seed, shuffle=True)  # 0.8 x 0.25 = 0.20 of the data for validation

# Scale the features with MinMaxScaler (fitted on the training split only)
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# Build the DataFrames with the target column in first position.
# reset_index(drop=True) lines the shuffled targets up with the fresh
# positional index of the scaled frames before concatenating.
train = pd.DataFrame(X_scaled, columns=X.columns)
train = pd.concat([y_train.reset_index(drop=True), train], axis=1)

val = pd.DataFrame(X_val_scaled, columns=X.columns)
val = pd.concat([y_val.reset_index(drop=True), val], axis=1)

# The cell predicts on `test` further down, so build it here from this
# cell's own split and scaler instead of relying on a leftover `test`
# variable from a previously executed cell.
test = pd.DataFrame(X_test_scaled, columns=X.columns)
test = pd.concat([y_test.reset_index(drop=True), test], axis=1)

# Updated Configurations
data_config = DataConfig(
    target=["Year"], 
    continuous_cols=num_col_names,
    categorical_cols=[],
    normalize_continuous_features=True # TODO: set to True when MinMax scaling is not applied beforehand --> the library then applies standard scaling. NOTE(review): this cell still min-max scales the features above, so they are scaled twice - confirm this is intended.
)

# Trainer configuration
trainer_config = TrainerConfig(
    auto_lr_find=False,
    batch_size=32, 
    max_epochs=150,
    early_stopping_patience=10,
    precision=32,
    seed=seed
)

# Optimizer configuration
optimizer_config = OptimizerConfig(
    optimizer="AdamW"
)

# TabTransformer model configuration, with dropout and extra reported metrics
model_config = TabTransformerConfig(
    task="regression",
    learning_rate=0.00001,
    num_heads=16,
    num_attn_blocks=12,
    ff_hidden_multiplier=12,
    seed=seed,
    metrics=["mean_squared_error", "mean_absolute_error", "mean_absolute_percentage_error", "r2_score"],
    attn_dropout=0.2,  # Added dropout for attention layers
    embedding_dropout=0.2,  # Added dropout for embedding layers
)

# Assemble the model from the four configurations (training starts with fit() below)
tabular_model = TabularModel(
    data_config=data_config,
    model_config=model_config,
    optimizer_config=optimizer_config,
    trainer_config=trainer_config,
)

# Train the model
tabular_model.fit(train=train, validation=val, seed=seed)

# Evaluate the model (the frame passed here is the validation split)
result = tabular_model.evaluate(val)
print(result)

# Make predictions
# `test` must be a DataFrame holding the scaled test features plus the 'Year' column
pred_df = tabular_model.predict(test)
print(pred_df)

y_true = test['Year']
y_pred = pred_df['Year_prediction']

# Test-set metrics.
# NOTE(review): in this cell mean_absolute_percentage_error is the
# scikit-learn import, which reports a fraction rather than a 0-100
# percentage, so MAPE is not on the same scale as in the earlier cells that
# define their own helper.
mse = mean_squared_error(y_true, y_pred)
mae = mean_absolute_error(y_true, y_pred)
mape = mean_absolute_percentage_error(y_true, y_pred)
r2score = r2_score(y_true, y_pred)
print("MSE: ", mse, " MAE:", mae, " MAPE:", mape, " R2_SCORE:", r2score)

Seed set to 89


AttributeError: module 'torch.nn' has no attribute 'mean_absolute_error'