In [1]:
import psycopg2
from psycopg2 import sql
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV,RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report,confusion_matrix,mean_absolute_error, r2_score,f1_score, recall_score
from sklearn.feature_selection import SelectKBest, f_classif
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from functions.functions_f1_all_circuits import process_f1_dataset_single_position, process_f1_dataset_grouped_by_2, process_f1_dataset_grouped_by_4
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

In [2]:
# Cridar la funció per generar el dataset sense agrupació
process_f1_dataset_single_position()
dataframe_single_position = pd.read_csv('datasets/f1_top20_target_single_position_all_circuits.csv', delimiter=';')
print("Dataset sense agrupació carregat:")
print(dataframe_single_position.head())

# Cridar la funció per generar el dataset agrupat per 2 posicions
process_f1_dataset_grouped_by_2()
dataframe_grouped_by_2 = pd.read_csv('datasets/f1_top20_target_agrupacio_2_all_circuits.csv', delimiter=';')
print("\nDataset agrupat per 2 posicions carregat:")
print(dataframe_grouped_by_2.head())

# Cridar la funció per generar el dataset agrupat per 4 posicions
process_f1_dataset_grouped_by_4()
dataframe_grouped_by_4 = pd.read_csv('datasets/f1_top20_target_agrupacio_4_all_circuits.csv', delimiter=';')
print("\nDataset agrupat per 4 posicions carregat:")
print(dataframe_grouped_by_4.head())


  df = pd.read_sql(query, connection)


Dataset sense agrupació guardat en: ./datasets/f1_top20_target_single_position_all_circuits.csv
Dataset sense agrupació carregat:
   raceid  driverid driverref  constructorid  circuitref  circuitid  grid  \
0     257        57  hakkinen              1  interlagos         18     8   
1     257        84   brundle              1  interlagos         18    18   
2     258        57  hakkinen              1     okayama         28     4   
3     258        84   brundle              1     okayama         28     6   
4     259        57  hakkinen              1       imola         21     8   

   positionorder        date  year         dob  experience  hability  \
0             21  1994-03-27  1994  1968-09-28          13      80.0   
1             14  1994-03-27  1994  1959-06-01          35      74.0   
2             21  1994-04-17  1994  1968-09-28          13      72.0   
3             15  1994-04-17  1994  1959-06-01          35      74.0   
4              3  1994-05-01  1994  1968-09-28 

  df = pd.read_sql(query, connection)


Dataset agrupació 2 guardat en: ./datasets/f1_top20_target_agrupacio_2_all_circuits.csv

Dataset agrupat per 2 posicions carregat:
   raceid  driverid driverref  constructorid  circuitref  circuitid  grid  \
0     257        57  hakkinen              1  interlagos         18     8   
1     257        84   brundle              1  interlagos         18    18   
2     258        57  hakkinen              1     okayama         28     4   
3     258        84   brundle              1     okayama         28     6   
4     259        57  hakkinen              1       imola         21     8   

   positionorder        date  year         dob  experience  hability  \
0             21  1994-03-27  1994  1968-09-28          13      80.0   
1             14  1994-03-27  1994  1959-06-01          35      74.0   
2             21  1994-04-17  1994  1968-09-28          13      72.0   
3             15  1994-04-17  1994  1959-06-01          35      74.0   
4              3  1994-05-01  1994  1968-09-28

  df = pd.read_sql(query, connection)


Dataset agrupació 4 guardat en: ./datasets/f1_top20_target_agrupacio_4_all_circuits.csv

Dataset agrupat per 4 posicions carregat:
   raceid  driverid driverref  constructorid  circuitref  circuitid  grid  \
0     257        57  hakkinen              1  interlagos         18     8   
1     257        84   brundle              1  interlagos         18    18   
2     258        57  hakkinen              1     okayama         28     4   
3     258        84   brundle              1     okayama         28     6   
4     259        57  hakkinen              1       imola         21     8   

   positionorder        date  year         dob  experience  hability  \
0             21  1994-03-27  1994  1968-09-28          13      80.0   
1             14  1994-03-27  1994  1959-06-01          35      74.0   
2             21  1994-04-17  1994  1968-09-28          13      72.0   
3             15  1994-04-17  1994  1959-06-01          35      74.0   
4              3  1994-05-01  1994  1968-09-28

In [3]:
import optuna
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE
import numpy as np

def objective_nn(trial, df, agrupacio):
    df = df[df['year'] >= 2014]
    max_classes = {1: 20, 2: 10, 4: 5}[agrupacio]
    df = df[df['target'] <= max_classes]
    X = df.drop(columns=['target', 'raceid', 'driverid', 'driverref', 'circuitid', 'circuitref', 'positionorder', 'date', 'dob', 'year'])
    y = df['target']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
    
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    smote = SMOTE(random_state=42)
    X_train_smote, y_train_smote = smote.fit_resample(X_train_scaled, y_train)
    
    encoder = OneHotEncoder(sparse_output=False)
    y_train_onehot = encoder.fit_transform(np.array(y_train_smote).reshape(-1, 1))
    y_test_onehot = encoder.transform(np.array(y_test).reshape(-1, 1))
    
    num_hidden_layers = trial.suggest_int('num_hidden_layers', 1, 3)
    num_neurons = trial.suggest_int('num_neurons', 32, 128)
    dropout_rate = trial.suggest_float('dropout_rate', 0.1, 0.5)
    learning_rate = trial.suggest_float('learning_rate', 1e-4, 1e-2)
    
    model = Sequential()
    model.add(Dense(num_neurons, input_dim=X_train_smote.shape[1], activation='relu'))
    for _ in range(num_hidden_layers - 1):
        model.add(Dense(num_neurons, activation='relu'))
        model.add(Dropout(dropout_rate))
    model.add(Dense(len(encoder.categories_[0]), activation='softmax'))
    
    model.compile(optimizer=Adam(learning_rate=learning_rate),
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    
    model.fit(X_train_smote, y_train_onehot, epochs=10, batch_size=32, verbose=0)
    
    y_pred_probs = model.predict(X_test_scaled)
    y_pred_classes = np.argmax(y_pred_probs, axis=1)
    y_test_classes = np.argmax(y_test_onehot, axis=1)
    
    accuracy = accuracy_score(y_test_classes, y_pred_classes)
    
    return accuracy

datasets = {
    1: dataframe_single_position,
    2: dataframe_grouped_by_2,
    4: dataframe_grouped_by_4,
}

best_params_per_group = {}
for agrupacio, df in datasets.items():
    print(f"\nOptimitzant xarxa neuronal per agrupació {agrupacio}...")
    study = optuna.create_study(direction='maximize')
    study.optimize(lambda trial: objective_nn(trial, df, agrupacio), n_trials=50)
    best_params_per_group[agrupacio] = study.best_params
    print(f"\nMillors hiperparàmetres per agrupació {agrupacio}: {study.best_params}")
    print(f"Millor valor d'accuracy per agrupació {agrupacio}: {study.best_value:.4f}")


In [4]:
import optuna
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import accuracy_score, f1_score, recall_score, classification_report
from imblearn.over_sampling import SMOTE
from sklearn.svm import SVC
import numpy as np

def entrenar_svm(df, agrupacio):
    df = df[df['year'] >= 2014]
    max_classes = {1: 20, 2: 10, 4: 5}[agrupacio]
    df = df[df['target'] <= max_classes]
    print(f"Entrenant amb agrupació {agrupacio}. Classes després del filtratge: {df['target'].nunique()}")
    X = df.drop(columns=['target', 'raceid', 'driverid', 'driverref', 'circuitref', 'positionorder', 'date', 'dob', 'year'])
    y = df['target']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    smote = SMOTE(random_state=42)
    X_train_smote, y_train_smote = smote.fit_resample(X_train_scaled, y_train)
    print(f"Distribució després de SMOTE: {y_train_smote.value_counts()}")
    model = SVC(kernel='linear', probability=True, random_state=42)
    model.fit(X_train_smote, y_train_smote)
    y_pred = model.predict(X_test_scaled)
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    print(f"\nResultats per agrupació {agrupacio}:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"F1-Score: {f1:.4f}")
    print(f"Recall: {recall:.4f}")
    print("\nInforme de classificació:")
    print(classification_report(y_test, y_pred))
    return model

print("Entrenant model per agrupació 1...")
model_agrupacio_1 = entrenar_svm(dataframe_single_position, agrupacio=1)
print("\nEntrenant model per agrupació 2...")
model_agrupacio_2 = entrenar_svm(dataframe_grouped_by_2, agrupacio=2)
print("\nEntrenant model per agrupació 4...")
model_agrupacio_4 = entrenar_svm(dataframe_grouped_by_4, agrupacio=4)


Entrenant model per agrupació 1...
Entrenant amb agrupació 1. Classes després del filtratge: 20
Distribució després de SMOTE: target
13    173
7     173
11    173
9     173
5     173
14    173
12    173
19    173
4     173
17    173
18    173
16    173
20    173
10    173
15    173
3     173
8     173
1     173
6     173
2     173
Name: count, dtype: int64

Resultats per agrupació 1:
Accuracy: 0.1449
F1-Score: 0.1238
Recall: 0.1449

Informe de classificació:
              precision    recall  f1-score   support

           1       0.39      0.77      0.52        43
           2       0.21      0.16      0.18        43
           3       0.18      0.26      0.21        43
           4       0.14      0.21      0.17        43
           5       0.11      0.14      0.12        43
           6       0.07      0.12      0.09        43
           7       0.08      0.12      0.10        43
           8       0.20      0.12      0.15        43
           9       0.05      0.02      0.03       

In [5]:
import optuna
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE
import pandas as pd

def objective_svm(trial, df, agrupacio):
    df = df[df['year'] >= 2014]
    max_classes = {1: 20, 2: 10, 4: 5}[agrupacio]
    df = df[df['target'] <= max_classes]
    X = df.drop(columns=['target', 'raceid', 'driverid', 'driverref', 'circuitid', 'circuitref', 'positionorder', 'date', 'dob', 'year'])
    y = df['target']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    smote = SMOTE(random_state=42)
    X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)
    C = trial.suggest_loguniform('C', 1e-3, 1e2)
    kernel = trial.suggest_categorical('kernel', ['linear', 'rbf', 'poly', 'sigmoid'])
    gamma = trial.suggest_loguniform('gamma', 1e-4, 1e0) if kernel != 'linear' else 'scale'
    model = SVC(C=C, kernel=kernel, gamma=gamma, probability=True, random_state=42)
    model.fit(X_train_smote, y_train_smote)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    return accuracy

datasets = {
    1: pd.read_csv('datasets/f1_top20_target_single_position_all_circuits.csv', delimiter=';'),
    2: pd.read_csv('datasets/f1_top20_target_agrupacio_2_all_circuits.csv', delimiter=';'),
    4: pd.read_csv('datasets/f1_top20_target_agrupacio_4_all_circuits.csv', delimiter=';'),
}

best_params_per_group = {}
for agrupacio, df in datasets.items():
    print(f"\nOptimitzant SVM per agrupació {agrupacio}...")
    study = optuna.create_study(direction='maximize')
    study.optimize(lambda trial: objective_svm(trial, df, agrupacio), n_trials=50)
    best_params_per_group[agrupacio] = study.best_params
    print(f"\nMillors hiperparàmetres per agrupació {agrupacio}: {study.best_params}")
    print(f"Millor valor d'accuracy per agrupació {agrupacio}: {study.best_value:.4f}")


In [6]:
import optuna
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
import pandas as pd

def objective_rf(trial, df, agrupacio):
    df = df[df['year'] >= 2014]
    max_classes = {1: 20, 2: 10, 4: 5}[agrupacio]
    df = df[df['target'] <= max_classes]
    X = df.drop(columns=['target', 'raceid', 'driverid', 'driverref', 'circuitid', 'circuitref', 'positionorder', 'date', 'dob', 'year'])
    y = df['target']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    n_estimators = trial.suggest_int('n_estimators', 50, 300)
    max_depth = trial.suggest_int('max_depth', 2, 20)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)
    model = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=42
    )
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    return accuracy

datasets = {
    1: pd.read_csv('datasets/f1_top20_target_single_position_all_circuits.csv', delimiter=';'),
    2: pd.read_csv('datasets/f1_top20_target_agrupacio_2_all_circuits.csv', delimiter=';'),
    4: pd.read_csv('datasets/f1_top20_target_agrupacio_4_all_circuits.csv', delimiter=';'),
}

best_params_per_group = {}
for agrupacio, df in datasets.items():
    print(f"\nOptimitzant Random Forest per agrupació {agrupacio}...")
    study = optuna.create_study(direction='maximize')
    study.optimize(lambda trial: objective_rf(trial, df, agrupacio), n_trials=50)
    best_params_per_group[agrupacio] = study.best_params
    print(f"\nMillors hiperparàmetres per agrupació {agrupacio}: {study.best_params}")
    print(f"Millor valor d'accuracy per agrupació {agrupacio}: {study.best_value:.4f}")


[I 2025-02-02 19:48:43,327] A new study created in memory with name: no-name-a45659da-ec55-4654-9bac-aafcf1d068fd



Optimitzant Random Forest per agrupació 1...


[I 2025-02-02 19:48:44,324] Trial 0 finished with value: 0.15076560659599528 and parameters: {'n_estimators': 151, 'max_depth': 16, 'min_samples_split': 3, 'min_samples_leaf': 2}. Best is trial 0 with value: 0.15076560659599528.
[I 2025-02-02 19:48:44,686] Trial 1 finished with value: 0.15783274440518258 and parameters: {'n_estimators': 110, 'max_depth': 5, 'min_samples_split': 10, 'min_samples_leaf': 10}. Best is trial 1 with value: 0.15783274440518258.
[I 2025-02-02 19:48:45,009] Trial 2 finished with value: 0.15665488810365136 and parameters: {'n_estimators': 109, 'max_depth': 4, 'min_samples_split': 7, 'min_samples_leaf': 2}. Best is trial 1 with value: 0.15783274440518258.
[I 2025-02-02 19:48:45,676] Trial 3 finished with value: 0.1519434628975265 and parameters: {'n_estimators': 252, 'max_depth': 3, 'min_samples_split': 9, 'min_samples_leaf': 2}. Best is trial 1 with value: 0.15783274440518258.
[I 2025-02-02 19:48:46,801] Trial 4 finished with value: 0.14958775029446408 and param


Millors hiperparàmetres per agrupació 1: {'n_estimators': 136, 'max_depth': 8, 'min_samples_split': 4, 'min_samples_leaf': 6}
Millor valor d'accuracy per agrupació 1: 0.1684

Optimitzant Random Forest per agrupació 2...


[I 2025-02-02 19:49:25,180] Trial 0 finished with value: 0.2767962308598351 and parameters: {'n_estimators': 272, 'max_depth': 19, 'min_samples_split': 5, 'min_samples_leaf': 4}. Best is trial 0 with value: 0.2767962308598351.
[I 2025-02-02 19:49:25,530] Trial 1 finished with value: 0.29328621908127206 and parameters: {'n_estimators': 108, 'max_depth': 3, 'min_samples_split': 3, 'min_samples_leaf': 9}. Best is trial 1 with value: 0.29328621908127206.
[I 2025-02-02 19:49:25,891] Trial 2 finished with value: 0.29799764428739695 and parameters: {'n_estimators': 100, 'max_depth': 6, 'min_samples_split': 4, 'min_samples_leaf': 4}. Best is trial 2 with value: 0.29799764428739695.
[I 2025-02-02 19:49:26,826] Trial 3 finished with value: 0.28032979976442873 and parameters: {'n_estimators': 204, 'max_depth': 11, 'min_samples_split': 3, 'min_samples_leaf': 6}. Best is trial 2 with value: 0.29799764428739695.
[I 2025-02-02 19:49:27,729] Trial 4 finished with value: 0.28975265017667845 and paramet


Millors hiperparàmetres per agrupació 2: {'n_estimators': 50, 'max_depth': 7, 'min_samples_split': 5, 'min_samples_leaf': 7}
Millor valor d'accuracy per agrupació 2: 0.3074

Optimitzant Random Forest per agrupació 4...


[I 2025-02-02 19:49:49,090] Trial 0 finished with value: 0.4793875147232038 and parameters: {'n_estimators': 61, 'max_depth': 12, 'min_samples_split': 6, 'min_samples_leaf': 7}. Best is trial 0 with value: 0.4793875147232038.
[I 2025-02-02 19:49:49,466] Trial 1 finished with value: 0.4734982332155477 and parameters: {'n_estimators': 87, 'max_depth': 15, 'min_samples_split': 3, 'min_samples_leaf': 10}. Best is trial 0 with value: 0.4793875147232038.
[I 2025-02-02 19:49:50,029] Trial 2 finished with value: 0.45936395759717313 and parameters: {'n_estimators': 123, 'max_depth': 16, 'min_samples_split': 6, 'min_samples_leaf': 7}. Best is trial 0 with value: 0.4793875147232038.
[I 2025-02-02 19:49:50,513] Trial 3 finished with value: 0.46878680800942285 and parameters: {'n_estimators': 109, 'max_depth': 19, 'min_samples_split': 10, 'min_samples_leaf': 9}. Best is trial 0 with value: 0.4793875147232038.
[I 2025-02-02 19:49:51,022] Trial 4 finished with value: 0.4581861012956419 and parameters


Millors hiperparàmetres per agrupació 4: {'n_estimators': 87, 'max_depth': 13, 'min_samples_split': 7, 'min_samples_leaf': 8}
Millor valor d'accuracy per agrupació 4: 0.4853


In [7]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, f1_score, recall_score, accuracy_score
from imblearn.over_sampling import SMOTE
import pandas as pd

def entrenar_random_forest(df, agrupacio):
    df = df[df['year'] >= 2014]
    max_classes = {1: 20, 2: 10, 4: 5}[agrupacio]
    df = df[df['target'] <= max_classes]
    print(f"Entrenant amb agrupació {agrupacio}. Classes després del filtratge: {df['target'].nunique()}")
    X = df.drop(columns=['target', 'raceid', 'driverid', 'driverref', 'circuitref', 'positionorder', 'date', 'dob', 'year'])
    y = df['target']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    smote = SMOTE(random_state=42)
    X_train_smote, y_train_smote = smote.fit_resample(X_train_scaled, y_train)
    print(f"Distribució després de SMOTE: {y_train_smote.value_counts()}")
    model = RandomForestClassifier(random_state=42, n_estimators=100)
    model.fit(X_train_smote, y_train_smote)
    y_pred = model.predict(X_test_scaled)
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    print(f"\nResultats per agrupació {agrupacio}:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"F1-Score: {f1:.4f}")
    print(f"Recall: {recall:.4f}")
    print("\nInforme de classificació:")
    print(classification_report(y_test, y_pred))
    return model

print("Entrenant model per agrupació 1...")
model_rf_agrupacio_1 = entrenar_random_forest(dataframe_single_position, agrupacio=1)
print("\nEntrenant model per agrupació 2...")
model_rf_agrupacio_2 = entrenar_random_forest(dataframe_grouped_by_2, agrupacio=2)
print("\nEntrenant model per agrupació 4...")
model_rf_agrupacio_4 = entrenar_random_forest(dataframe_grouped_by_4, agrupacio=4)


Entrenant model per agrupació 1...
Entrenant amb agrupació 1. Classes després del filtratge: 20
Distribució després de SMOTE: target
13    173
7     173
11    173
9     173
5     173
14    173
12    173
19    173
4     173
17    173
18    173
16    173
20    173
10    173
15    173
3     173
8     173
1     173
6     173
2     173
Name: count, dtype: int64

Resultats per agrupació 1:
Accuracy: 0.1555
F1-Score: 0.1491
Recall: 0.1555

Informe de classificació:
              precision    recall  f1-score   support

           1       0.50      0.58      0.54        43
           2       0.28      0.23      0.25        43
           3       0.17      0.21      0.19        43
           4       0.16      0.16      0.16        43
           5       0.10      0.09      0.09        43
           6       0.20      0.28      0.24        43
           7       0.20      0.28      0.23        43
           8       0.19      0.19      0.19        43
           9       0.08      0.07      0.07       

In [8]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import classification_report, f1_score, recall_score, accuracy_score
from imblearn.over_sampling import SMOTE
import pandas as pd
import numpy as np


def entrenar_neural_network(df, agrupacio):
    
    df = df[df['year'] >= 2014]
    
    max_classes = {1: 20, 2: 10, 4: 5}[agrupacio]
    df = df[df['target'] <= max_classes]
    
    print(f"Entrenant amb agrupació {agrupacio}. Classes després del filtratge: {df['target'].nunique()}")
    
    X = df.drop(columns=['target', 'raceid', 'driverid', 'driverref', 'circuitid', 'circuitref', 'positionorder', 'date', 'dob', 'year'])
    y = df['target']
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
    
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    smote = SMOTE(random_state=42)
    X_train_smote, y_train_smote = smote.fit_resample(X_train_scaled, y_train)
    
    print(f"Distribució després de SMOTE: {y_train_smote.value_counts()}")
    

    encoder = OneHotEncoder(sparse_output=False)
    y_train_onehot = encoder.fit_transform(np.array(y_train_smote).reshape(-1, 1))
    y_test_onehot = encoder.transform(np.array(y_test).reshape(-1, 1))
    

    model = Sequential([
        Dense(128, input_dim=X_train_smote.shape[1], activation='relu'),
        Dropout(0.3),
        Dense(64, activation='relu'),
        Dropout(0.3),
        Dense(len(encoder.categories_[0]), activation='softmax')  # Nombre de sortides igual al nombre de classes
    ])
    

    model.compile(optimizer=Adam(learning_rate=0.001),
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])

    history = model.fit(X_train_smote, y_train_onehot,
                        validation_data=(X_test_scaled, y_test_onehot),
                        epochs=50,
                        batch_size=32,
                        verbose=0)
    
    y_pred_probs = model.predict(X_test_scaled)
    y_pred_classes = np.argmax(y_pred_probs, axis=1)
    y_test_classes = np.argmax(y_test_onehot, axis=1)
    
    accuracy = accuracy_score(y_test_classes, y_pred_classes)
    f1 = f1_score(y_test_classes, y_pred_classes, average='weighted')
    recall = recall_score(y_test_classes, y_pred_classes, average='weighted')
    
    print(f"\nResultats per agrupació {agrupacio}:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"F1-Score: {f1:.4f}")
    print(f"Recall: {recall:.4f}")
    print("\nInforme de classificació:")
    print(classification_report(y_test_classes, y_pred_classes))
    
    return model, history

print("Entrenant model per agrupació 1...")
model_nn_agrupacio_1, history_nn_agrupacio_1 = entrenar_neural_network(dataframe_single_position, agrupacio=1)

print("\nEntrenant model per agrupació 2...")
model_nn_agrupacio_2, history_nn_agrupacio_2 = entrenar_neural_network(dataframe_grouped_by_2, agrupacio=2)

print("\nEntrenant model per agrupació 4...")
model_nn_agrupacio_4, history_nn_agrupacio_4 = entrenar_neural_network(dataframe_grouped_by_4, agrupacio=4)


Entrenant model per agrupació 1...
Entrenant amb agrupació 1. Classes després del filtratge: 20
Distribució després de SMOTE: target
13    173
7     173
11    173
9     173
5     173
14    173
12    173
19    173
4     173
17    173
18    173
16    173
20    173
10    173
15    173
3     173
8     173
1     173
6     173
2     173
Name: count, dtype: int64


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step

Resultats per agrupació 1:
Accuracy: 0.1484
F1-Score: 0.1359
Recall: 0.1484

Informe de classificació:
              precision    recall  f1-score   support

           0       0.45      0.70      0.55        43
           1       0.28      0.28      0.28        43
           2       0.12      0.16      0.14        43
           3       0.15      0.19      0.17        43
           4       0.08      0.07      0.07        43
           5       0.12      0.19      0.15        43
           6       0.12      0.16      0.14        43
           7       0.18      0.14      0.16        43
           8       0.08      0.09      0.08        43
           9       0.08      0.07      0.07        43
          10       0.13      0.09      0.11        43
          11       0.07      0.07      0.07        41
          12       0.20      0.14      0.17        42
          13       0.09      0.07      0.08        42
          14

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step

Resultats per agrupació 2:
Accuracy: 0.2898
F1-Score: 0.2668
Recall: 0.2898

Informe de classificació:
              precision    recall  f1-score   support

           0       0.60      0.83      0.69        86
           1       0.41      0.47      0.44        85
           2       0.28      0.27      0.28        86
           3       0.25      0.34      0.29        86
           4       0.20      0.20      0.20        86
           5       0.13      0.14      0.14        85
           6       0.21      0.15      0.18        84
           7       0.19      0.29      0.23        84
           8       0.24      0.20      0.22        84
           9       0.00      0.00      0.00        83

    accuracy                           0.29       849
   macro avg       0.25      0.29      0.27       849
weighted avg       0.25      0.29      0.27       849


Entrenant model per agrupació 4...
Entrenant amb agrupació 4. C

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step

Resultats per agrupació 4:
Accuracy: 0.4853
F1-Score: 0.4691
Recall: 0.4853

Informe de classificació:
              precision    recall  f1-score   support

           0       0.69      0.84      0.76       171
           1       0.46      0.53      0.49       171
           2       0.36      0.30      0.33       170
           3       0.40      0.51      0.45       169
           4       0.49      0.24      0.32       168

    accuracy                           0.49       849
   macro avg       0.48      0.48      0.47       849
weighted avg       0.48      0.49      0.47       849



In [9]:
import optuna
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE
import numpy as np

def objective_nn(trial, df, agrupacio):

    df = df[df['year'] >= 2014]
    
    max_classes = {1: 20, 2: 10, 4: 5}[agrupacio]
    df = df[df['target'] <= max_classes]
    
    X = df.drop(columns=['target', 'raceid', 'driverid', 'driverref', 'circuitid', 'circuitref', 'positionorder', 'date', 'dob', 'year'])
    y = df['target']
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
    
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    smote = SMOTE(random_state=42)
    X_train_smote, y_train_smote = smote.fit_resample(X_train_scaled, y_train)

    encoder = OneHotEncoder(sparse_output=False)
    y_train_onehot = encoder.fit_transform(np.array(y_train_smote).reshape(-1, 1))
    y_test_onehot = encoder.transform(np.array(y_test).reshape(-1, 1))
    
    num_hidden_layers = trial.suggest_int('num_hidden_layers', 1, 3)
    num_neurons = trial.suggest_int('num_neurons', 32, 128)
    dropout_rate = trial.suggest_float('dropout_rate', 0.1, 0.5)
    learning_rate = trial.suggest_float('learning_rate', 1e-4, 1e-2)
    
    model = Sequential()
    model.add(Dense(num_neurons, input_dim=X_train_smote.shape[1], activation='relu'))
    for _ in range(num_hidden_layers - 1):
        model.add(Dense(num_neurons, activation='relu'))
        model.add(Dropout(dropout_rate))
    model.add(Dense(len(encoder.categories_[0]), activation='softmax'))
    
    model.compile(optimizer=Adam(learning_rate=learning_rate),
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    
    model.fit(X_train_smote, y_train_onehot, epochs=10, batch_size=32, verbose=0)
    
    y_pred_probs = model.predict(X_test_scaled)
    y_pred_classes = np.argmax(y_pred_probs, axis=1)
    y_test_classes = np.argmax(y_test_onehot, axis=1)
    
    accuracy = accuracy_score(y_test_classes, y_pred_classes)
    
    return accuracy


datasets = {
    1: dataframe_single_position, 
    2: dataframe_grouped_by_2,    
    4: dataframe_grouped_by_4,   
}


best_params_per_group = {}
for agrupacio, df in datasets.items():
    print(f"\nOptimitzant xarxa neuronal per agrupació {agrupacio}...")
    study = optuna.create_study(direction='maximize')
    study.optimize(lambda trial: objective_nn(trial, df, agrupacio), n_trials=50)
    
    
    best_params_per_group[agrupacio] = study.best_params
    print(f"\nMillors hiperparàmetres per agrupació {agrupacio}: {study.best_params}")
    print(f"Millor valor d'accuracy per agrupació {agrupacio}: {study.best_value:.4f}")


In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score, recall_score, classification_report
import joblib
import os

def entrenar_svm(df, agrupacio):
    """
    Entrena un modelo SVM usando las siguientes características:
      - constructorid
      - circuitid
      - grid
      - experience
      - hability
      - constructor_experience
      - constructor_fiability
      - constructor_performance
      - gap_to_best_time
      - age
    """

    df = df[df['year'] >= 2014]
    
    max_classes = {1: 20, 2: 10, 4: 5}[agrupacio]
    df = df[df['target'] <= max_classes]
    print(f"Entrenant amb agrupació {agrupacio}. Clases después del filtrado: {df['target'].nunique()}")
    
    features = [
        'constructorid',
        'circuitid',
        'grid',
        'experience',
        'hability',
        'constructor_experience',
        'constructor_fiability',
        'constructor_performance',
        'gap_to_best_time',
        'age'
    ]
    
    missing = [col for col in features if col not in df.columns]
    if missing:
        print("Faltan las siguientes columnas en el DataFrame:", missing)
        return None, None
    
    X = df[features]
    y = df['target']
    print("Columnas usadas en el entrenamiento SVM:", list(X.columns))
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
    
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    smote = SMOTE(random_state=42)
    X_train_smote, y_train_smote = smote.fit_resample(X_train_scaled, y_train)
    print("Distribución después de SMOTE:")
    print(pd.Series(y_train_smote).value_counts())
    
    model = SVC(kernel='linear', probability=True, random_state=42)
    model.fit(X_train_smote, y_train_smote)
    
    y_pred = model.predict(X_test_scaled)
    print(f"\nResultats per agrupació {agrupacio}:")
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
    print(f"F1-Score: {f1_score(y_test, y_pred, average='weighted'):.4f}")
    print(f"Recall: {recall_score(y_test, y_pred, average='weighted'):.4f}")
    print("\nInforme de clasificación:")
    print(classification_report(y_test, y_pred))
    
    return model, scaler

print("Entrenant model per agrupació 1...")
model_agrupacio_1, scaler_1 = entrenar_svm(dataframe_single_position, agrupacio=1)

ruta_modelo_1 = r'C:\Users\Lenovo\Desktop\TFG\TFG GIT\TFG\F1_prediction_all_circuits\SVM models\svm_model_agrupacio_1.pkl'
ruta_scaler_1 = r'C:\Users\Lenovo\Desktop\TFG\TFG GIT\TFG\F1_prediction_all_circuits\SVM models\scaler_agrupacio_1.pkl'

joblib.dump(model_agrupacio_1, os.path.abspath(ruta_modelo_1))
joblib.dump(scaler_1, os.path.abspath(ruta_scaler_1))

print("\n¡Modelos SVM entrenats i guardats amb èxit!")

Entrenant model per agrupació 1...
Entrenant amb agrupació 1. Clases después del filtrado: 20
Columnas usadas en el entrenamiento SVM: ['constructorid', 'circuitid', 'grid', 'experience', 'hability', 'constructor_experience', 'constructor_fiability', 'constructor_performance', 'gap_to_best_time', 'age']
Distribución después de SMOTE:
target
13    173
7     173
11    173
9     173
5     173
14    173
12    173
19    173
4     173
17    173
18    173
16    173
20    173
10    173
15    173
3     173
8     173
1     173
6     173
2     173
Name: count, dtype: int64

Resultats per agrupació 1:
Accuracy: 0.1449
F1-Score: 0.1238
Recall: 0.1449

Informe de clasificación:
              precision    recall  f1-score   support

           1       0.39      0.77      0.52        43
           2       0.21      0.16      0.18        43
           3       0.18      0.26      0.21        43
           4       0.14      0.21      0.17        43
           5       0.11      0.14      0.12        43
  

In [11]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, f1_score, recall_score, accuracy_score
from imblearn.over_sampling import SMOTE
import joblib

def entrenar_random_forest(df, agrupacio):
  
    df = df[df['year'] >= 2014]
    
    
    max_classes = {1: 20, 2: 10, 4: 5}[agrupacio]
    df = df[df['target'] <= max_classes]
    
    print(f"Entrenando modelo para agrupación {agrupacio}. Clases tras el filtrado: {df['target'].nunique()}")
    
   
    X = df.drop(columns=['target', 'raceid', 'driverid', 'driverref', 'circuitref', 'positionorder', 'date', 'dob', 'year'])
    y = df['target']
    
   
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
    
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    smote = SMOTE(random_state=42)
    X_train_smote, y_train_smote = smote.fit_resample(X_train_scaled, y_train)
    
    print(f"Distribución tras SMOTE:\n{pd.Series(y_train_smote).value_counts()}")
    
    model = RandomForestClassifier(random_state=42, n_estimators=100)
    model.fit(X_train_smote, y_train_smote)
    
    y_pred = model.predict(X_test_scaled)
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    
    print(f"\nResultados para agrupación {agrupacio}:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"F1-Score: {f1:.4f}")
    print(f"Recall: {recall:.4f}")
    print("\nInforme de clasificación:")
    print(classification_report(y_test, y_pred))
    
    return model, scaler

print("Entrenando modelo Random Forest para agrupación 1...")
model_rf_agrupacio_1, scaler_rf_1 = entrenar_random_forest(dataframe_single_position, agrupacio=1)

print("\nEntrenando modelo Random Forest para agrupación 2...")
model_rf_agrupacio_2, scaler_rf_2 = entrenar_random_forest(dataframe_grouped_by_2, agrupacio=2)

print("\nEntrenando modelo Random Forest para agrupación 4...")
model_rf_agrupacio_4, scaler_rf_4 = entrenar_random_forest(dataframe_grouped_by_4, agrupacio=4)

joblib.dump(model_rf_agrupacio_1, 'random_forest_model_agrupacio_1.pkl')
joblib.dump(scaler_rf_1, 'scaler_rf_agrupacio_1.pkl')

joblib.dump(model_rf_agrupacio_2, 'random_forest_model_agrupacio_2.pkl')
joblib.dump(scaler_rf_2, 'scaler_rf_agrupacio_2.pkl')

joblib.dump(model_rf_agrupacio_4, 'random_forest_model_agrupacio_4.pkl')
joblib.dump(scaler_rf_4, 'scaler_rf_agrupacio_4.pkl')

print("\n¡Modelos Random Forest entrenados y guardados exitosamente!")


Entrenando modelo Random Forest para agrupación 1...
Entrenando modelo para agrupación 1. Clases tras el filtrado: 20
Distribución tras SMOTE:
target
13    173
7     173
11    173
9     173
5     173
14    173
12    173
19    173
4     173
17    173
18    173
16    173
20    173
10    173
15    173
3     173
8     173
1     173
6     173
2     173
Name: count, dtype: int64

Resultados para agrupación 1:
Accuracy: 0.1555
F1-Score: 0.1491
Recall: 0.1555

Informe de clasificación:
              precision    recall  f1-score   support

           1       0.50      0.58      0.54        43
           2       0.28      0.23      0.25        43
           3       0.17      0.21      0.19        43
           4       0.16      0.16      0.16        43
           5       0.10      0.09      0.09        43
           6       0.20      0.28      0.24        43
           7       0.20      0.28      0.23        43
           8       0.19      0.19      0.19        43
           9       0.08      0