# Create Time-Series ML Model

## Load libraries

First, we will load the libraries.

In [1]:
import pandas as pd
import pprint
import logging
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from xgboost import XGBRegressor

pp = pprint.PrettyPrinter(indent=2)

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)

## Prepare data

Then we load, and prepare the data.

In [2]:
def sort_by_year_repetition(df):
    # Extract election id components from the index
    df['year'] = df.index.map(lambda x: int(x.split('_')[0][1:5]))  # Assumes year is four digits long
    df['repetitionid'] = df.index.map(lambda x: int(x.split('_')[0][5:]))  # Assumes repetitionid immediately follows year

    # Sort the DataFrame by year and then by repetitionid
    df_sorted = df.sort_values(by=['year', 'repetitionid'])

    # Drop the temporary columns used for sorting
    df_sorted = df_sorted.drop(columns=['year', 'repetitionid'])

    return df_sorted

In [3]:
def add_shifted_columns_grouped(df):
    # Extract 'mundissec' from the index
    df['mundissec'] = df.index.map(lambda x: x.split('_')[1])

    # Identify unique party codes by splitting each column name
    party_codes = set(col.split('_')[-1] for col in df.columns if ('_' in col) and (col.split('_')[-1].isdigit()))

    # Iterate over each party code to create shifted columns group-wise
    for party_code in party_codes:
        # Identify columns for the current party code
        party_columns = [col for col in df.columns if col.endswith(party_code)]
        for col in party_columns:
            # Create a new shifted column name
            shifted_col_name = f"{col}_shifted"
            # Group by 'mundissec' and shift within each group
            df[shifted_col_name] = df.groupby('mundissec')[col].shift(-1)  # Shift within each group

    # Drop the temporary 'mundissec' column after shifting
    df.drop(columns='mundissec', inplace=True)

    return df

In [4]:
df_timeseries = pd.read_pickle("../../data/output/timeseries_2010_2024_6_1_True_True_True_True.pkl")
df_sorted = sort_by_year_repetition(df_timeseries)
df_timeseries = add_shifted_columns_grouped(df_sorted)
df_timeseries.head()

Unnamed: 0_level_0,cens_electoral_percentage_10,cens_electoral_percentage_1001,cens_electoral_percentage_1003,cens_electoral_percentage_1013,cens_electoral_percentage_1015,cens_electoral_percentage_1030,cens_electoral_percentage_1031,cens_electoral_percentage_201914111,cens_electoral_percentage_3000000,cens_electoral_percentage_301,...,cens_electoral_percentage_337_shifted,cens_electoral_percentage_1031_shifted,cens_electoral_percentage_1030_shifted,cens_electoral_percentage_82484191_shifted,cens_electoral_percentage_365_shifted,cens_electoral_percentage_10_shifted,cens_electoral_percentage_431094190_shifted,cens_electoral_percentage_751_shifted,cens_electoral_percentage_999999999_shifted,cens_electoral_percentage_1015_shifted
electionid_mundissec,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A20101_08001801001,3.577513,0.0,0.0,0.0,0.0,0.0,17.802385,0.0,0.0,2.555366,...,5.821918,13.69863,0.0,0.0,0.0,4.023973,0.0,0.0,0.870434,0.0
A20101_08001801002,1.992337,0.0,0.0,0.0,0.0,0.0,15.402299,0.0,0.0,1.762452,...,5.490196,9.490196,0.0,0.0,0.0,2.352941,0.0,0.0,0.928105,0.0
A20101_08001801003,3.676471,0.0,0.0,0.0,0.0,0.0,22.610294,0.0,0.0,4.595588,...,5.57554,20.683453,0.0,0.0,0.0,2.158273,0.0,0.0,0.719424,0.0
A20101_08001801004,4.765818,0.0,0.0,0.0,0.0,0.0,21.446179,0.0,0.0,2.793755,...,6.344171,17.287867,0.0,0.0,0.0,3.647898,0.0,0.0,0.660851,0.0
A20101_08001801005,2.55848,0.0,0.0,0.0,0.0,0.0,14.473684,0.0,0.0,1.900585,...,7.211185,9.050773,0.0,0.0,0.0,2.796174,0.0,0.0,1.017905,0.0


In [5]:
df_timeseries.tail(5)

Unnamed: 0_level_0,cens_electoral_percentage_10,cens_electoral_percentage_1001,cens_electoral_percentage_1003,cens_electoral_percentage_1013,cens_electoral_percentage_1015,cens_electoral_percentage_1030,cens_electoral_percentage_1031,cens_electoral_percentage_201914111,cens_electoral_percentage_3000000,cens_electoral_percentage_301,...,cens_electoral_percentage_337_shifted,cens_electoral_percentage_1031_shifted,cens_electoral_percentage_1030_shifted,cens_electoral_percentage_82484191_shifted,cens_electoral_percentage_365_shifted,cens_electoral_percentage_10_shifted,cens_electoral_percentage_431094190_shifted,cens_electoral_percentage_751_shifted,cens_electoral_percentage_999999999_shifted,cens_electoral_percentage_1015_shifted
electionid_mundissec,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A20211_43906001002,11.009174,0.0,2.54842,1.630989,0.0,0.0,20.897044,0.0,0.0,2.038736,...,,,,,,,,,,
A20211_43907601001,8.389831,0.0,1.610169,3.644068,0.0,0.0,5.423729,0.0,0.0,3.389831,...,,,,,,,,,,
A20211_43907601002,4.393673,0.0,0.615114,2.372583,0.0,0.0,0.790861,0.0,0.0,3.690685,...,,,,,,,,,,
A20211_43907601003,7.974138,0.0,2.262931,2.586207,0.0,0.0,6.573276,0.0,0.0,3.232759,...,,,,,,,,,,
A20211_43907601004,5.937235,0.0,2.120441,2.374894,0.0,0.0,3.307888,0.0,0.0,2.544529,...,,,,,,,,,,


In [6]:
def split_data(df):
    # Identify unique elections from the index
    elections = df.index.map(lambda x: x.split('_')[0]).unique()

    # Latest and penultimate election identifiers
    last_election = elections[-1]
    penultimate_election = elections[-2] if len(elections) > 1 else None

    # Split the DataFrame based on the election identifiers
    new_data = df.loc[df.index.map(lambda x: x.split('_')[0]) == last_election]
    test_data = df.loc[df.index.map(lambda x: x.split('_')[0]) == penultimate_election] if penultimate_election else pd.DataFrame()
    train_data = df.loc[~df.index.map(lambda x: x.split('_')[0]).isin([last_election, penultimate_election])]

    # Columns that are not shifted
    non_shifted_columns = [col for col in df.columns if not col.endswith('_shifted')]
    # Columns that are shifted
    shifted_columns = [col for col in df.columns if col.endswith('_shifted')]

    # Creating new_data, X_test, y_test, X_train, y_train
    new_data = new_data[non_shifted_columns]
    X_test = test_data[non_shifted_columns]
    y_test = test_data[shifted_columns]
    X_train = train_data[non_shifted_columns]
    y_train = train_data[shifted_columns]

    return X_train, y_train, X_test, y_test, new_data

# Test the function
X_train, y_train, X_test, y_test, new_data = split_data(df_timeseries)

# Printing shapes to verify the splits
print(f'X_train shape: {X_train.shape}, y_train shape: {y_train.shape}')
print(f'X_test shape: {X_test.shape}, y_test shape: {y_test.shape}')
print(f'New data shape: {new_data.shape}')

X_train shape: (66079, 29), y_train shape: (66079, 21)
X_test shape: (5083, 29), y_test shape: (5083, 21)
New data shape: (5083, 29)


In [7]:
X_train = X_train.astype(float)
y_train = y_train.astype(float)
X_test = X_test.astype(float)
y_test = y_test.astype(float)
new_data = new_data.astype(float)

In [22]:
# Set up the parameters for XGBoost
params_multioutput = {
    "n_estimators": 400,  # Number of boosting rounds
    "max_depth": 10,  # Typically 3-10. Higher values can lead to overfitting.
    "eta": 0.01,  # Learning rate, typically between 0.01 and 0.2
    "objective": "reg:squarederror",  # Regression with squared loss
    "eval_metric": "rmse",  # Root Mean Square Error for evaluation
    "tree_method": "hist",  # Fast histogram optimized approximate greedy algorithm
    "multi_strategy": "multi_output_tree",
    "early_stopping_rounds": 5,
    "reg_alpha": 100,  # L1 regularization term on weights. Increasing this value will make model more conservative. 
    "reg_lambda": 100,  # L2 regularization term on weights. Increasing this value will make model more conservative.
}
# Define the model_multioutput
model_multioutput = XGBRegressor(**params_multioutput)

# Train the model_multioutput
eval_set = [(X_test, y_test)]
model_multioutput.fit(X_train, y_train, eval_set=eval_set, verbose=True)

[0]	validation_0-rmse:4.64218
[1]	validation_0-rmse:4.59022
[2]	validation_0-rmse:4.54875
[3]	validation_0-rmse:4.49814
[4]	validation_0-rmse:4.45868
[5]	validation_0-rmse:4.40950
[6]	validation_0-rmse:4.36123
[7]	validation_0-rmse:4.31384
[8]	validation_0-rmse:4.26983
[9]	validation_0-rmse:4.22430
[10]	validation_0-rmse:4.18718
[11]	validation_0-rmse:4.14277
[12]	validation_0-rmse:4.10618
[13]	validation_0-rmse:4.06231
[14]	validation_0-rmse:4.01865
[15]	validation_0-rmse:3.97832
[16]	validation_0-rmse:3.94329
[17]	validation_0-rmse:3.90358
[18]	validation_0-rmse:3.86121
[19]	validation_0-rmse:3.82184
[20]	validation_0-rmse:3.78093
[21]	validation_0-rmse:3.74324
[22]	validation_0-rmse:3.71099
[23]	validation_0-rmse:3.67420
[24]	validation_0-rmse:3.63585
[25]	validation_0-rmse:3.59858
[26]	validation_0-rmse:3.56773
[27]	validation_0-rmse:3.53123
[28]	validation_0-rmse:3.49660
[29]	validation_0-rmse:3.46081
[30]	validation_0-rmse:3.43240
[31]	validation_0-rmse:3.39761
[32]	validation_0-

In [23]:
predictions = model_multioutput.predict(X_test, iteration_range=(0, model_multioutput.best_iteration + 1))
if not isinstance(predictions, pd.DataFrame):
    predictions = pd.DataFrame(predictions, index=y_test.index, columns=y_test.columns)

# Initialize a dictionary to store metrics for each column
metrics = {}

# Loop through each column in y_test to calculate metrics
for column in y_test.columns:
    mae = mean_absolute_error(y_test[column], predictions[column])
    mse = mean_squared_error(y_test[column], predictions[column])
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test[column], predictions[column])

    # Store metrics in the dictionary
    metrics[column] = {'MAE': mae, 'MSE': mse, 'RMSE': rmse, 'R^2': r2}

# Print or return the metrics
for col, values in metrics.items():
    print(f"Metrics for {col}:")
    print(f"  MAE: {values['MAE']}")
    print(f"  MSE: {values['MSE']}")
    print(f"  RMSE: {values['RMSE']}")
    print(f"  R^2: {values['R^2']}\n")

Metrics for cens_electoral_percentage_1001_shifted:
  MAE: 0.7666111234461912
  MSE: 0.59780966304232
  RMSE: 0.7731815201117522
  R^2: 0.0

Metrics for cens_electoral_percentage_86_shifted:
  MAE: 1.7128607204643578
  MSE: 3.870433550191575
  RMSE: 1.9673417471785564
  R^2: -0.573821342331621

Metrics for cens_electoral_percentage_637_shifted:
  MAE: 0.7685917627147725
  MSE: 0.598204293464145
  RMSE: 0.7734366770874944
  R^2: 0.0

Metrics for cens_electoral_percentage_3000000_shifted:
  MAE: 0.7450559554127242
  MSE: 0.5628983719794678
  RMSE: 0.7502655343139972
  R^2: 0.0

Metrics for cens_electoral_percentage_5000000_shifted:
  MAE: 0.8863778505859097
  MSE: 0.7958542090562846
  RMSE: 0.8921066130548997
  R^2: 0.0

Metrics for cens_electoral_percentage_1013_shifted:
  MAE: 1.7033195314864116
  MSE: 4.481394278022045
  RMSE: 2.1169303904526586
  R^2: -0.3940993009747451

Metrics for cens_electoral_percentage_38_shifted:
  MAE: 0.9743876672391462
  MSE: 1.0007442430288085
  RMSE: 1.0

This results are promising, as we got good $R^2$ values for 4 parties. These parties are:
- JxCat (1031): $R^2 = 0.820$
- PSC (6): $R^2 = 0.569$
- CUP (1003): $R^2 = 0.445$
- ERC (10): $R^2 = 0.471$

We've added the election type for each election, as this is an important feature. The results are similar, but slighly worse:

- JxCat (1031): $R^2 = 0.819$
- PSC (6): $R^2 = 0.241$
- CUP (1003): $R^2 = 0.329$
- ERC (10): $R^2 = 0.681$

We've also added the proportion of born abroad and the proportion of different groups of age. The results are similar, but they have slightly improved for PSC, and ERC, and slightly worsened for JxCat and CUP. The results are as follows:

- JxCat (1031): $R^2 = 0.800$
- PSC (6): $R^2 = 0.479$
- CUP (1003): $R^2 = 0.299$
- ERC (10): $R^2 = 0.443$

We've also added the mean income data by census section. The results are again similar, but slightly worse, as the $R^2$ values are very low. That is something strange because the mean income is a very important feature in the prediction of the election results. The results are as follows:

- JxCat (1031): $R^2 = 0.805$
- PSC (6): $R^2 = 0.463$
- CUP (1003): $R^2 = 0.261$
- ERC (10): $R^2 = 0.461$

In [10]:
# Set up the parameters for XGBoost
params = {
    'max_depth': 6,  # Typically 3-10. Higher values can lead to overfitting.
    'eta': 0.1,  # Learning rate, typically between 0.01 and 0.2
    'objective': 'reg:squarederror',  # Regression with squared loss
    'eval_metric': 'rmse',  # Root Mean Square Error for evaluation
    "early_stopping_rounds": 10, 
}
num_boost_round = 200  # Number of boosting rounds

# Define the model
model = XGBRegressor(**params)

# Train the model
eval_set = [(X_test, y_test)]
model.fit(X_train, y_train, eval_set=eval_set, verbose=True)

[0]	validation_0-rmse:4.24881
[1]	validation_0-rmse:3.76417
[2]	validation_0-rmse:3.43164
[3]	validation_0-rmse:3.12083
[4]	validation_0-rmse:2.85455
[5]	validation_0-rmse:2.63265
[6]	validation_0-rmse:2.44811
[7]	validation_0-rmse:2.25955
[8]	validation_0-rmse:2.15956
[9]	validation_0-rmse:2.07259
[10]	validation_0-rmse:2.00732
[11]	validation_0-rmse:1.95113
[12]	validation_0-rmse:1.90680
[13]	validation_0-rmse:1.87444
[14]	validation_0-rmse:1.87778
[15]	validation_0-rmse:1.87561
[16]	validation_0-rmse:1.90633
[17]	validation_0-rmse:1.92145
[18]	validation_0-rmse:1.94002
[19]	validation_0-rmse:1.96837
[20]	validation_0-rmse:1.99231
[21]	validation_0-rmse:2.02724
[22]	validation_0-rmse:2.04395
[23]	validation_0-rmse:2.06933


In [11]:
# Assuming 'predictions' is a DataFrame or array with the same structure as 'y_test'
predictions = model.predict(X_test)
if not isinstance(predictions, pd.DataFrame):
    predictions = pd.DataFrame(predictions, index=y_test.index, columns=y_test.columns)

# Initialize a dictionary to store metrics for each column
metrics = {}

# Loop through each column in y_test to calculate metrics
for column in y_test.columns:
    mae = mean_absolute_error(y_test[column], predictions[column])
    mse = mean_squared_error(y_test[column], predictions[column])
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test[column], predictions[column])

    # Store metrics in the dictionary
    metrics[column] = {'MAE': mae, 'MSE': mse, 'RMSE': rmse, 'R^2': r2}

# Print or return the metrics
for col, values in metrics.items():
    print(f"Metrics for {col}:")
    print(f"  MAE: {values['MAE']}")
    print(f"  MSE: {values['MSE']}")
    print(f"  RMSE: {values['RMSE']}")
    print(f"  R^2: {values['R^2']}\n")

Metrics for cens_electoral_percentage_1001_shifted:
  MAE: 0.6633839247359457
  MSE: 0.44008060400414944
  RMSE: 0.6633857128429504
  R^2: 0.0

Metrics for cens_electoral_percentage_86_shifted:
  MAE: 1.1636995691030763
  MSE: 2.550627948396235
  RMSE: 1.5970685484337341
  R^2: -0.03715324122663666

Metrics for cens_electoral_percentage_637_shifted:
  MAE: 0.667268783317268
  MSE: 0.4458231100739962
  RMSE: 0.6676998652643239
  R^2: 0.0

Metrics for cens_electoral_percentage_3000000_shifted:
  MAE: 0.6627719747065653
  MSE: 0.4394082509498331
  RMSE: 0.6628787603701246
  R^2: 0.0

Metrics for cens_electoral_percentage_5000000_shifted:
  MAE: 0.7594773340783391
  MSE: 0.7735676596391464
  RMSE: 0.8795269521959781
  R^2: 0.0

Metrics for cens_electoral_percentage_1013_shifted:
  MAE: 2.5668932155515956
  MSE: 9.552917665266683
  RMSE: 3.0907794591763875
  R^2: -1.971779542971975

Metrics for cens_electoral_percentage_38_shifted:
  MAE: 0.6646655479590067
  MSE: 0.44178056403335
  RMSE: 0

This results are promising, as we got good $R^2$ values for 4 parties. These parties are:
- JxCat (1031): $R^2 = 0.709$
- PSC (6): $R^2 = 0.726$
- CUP (1003): $R^2 = 0.419$
- ERC (10): $R^2 = 0.444$

This means that in some cases the past percentage of votes explains a lot of the future percentage of votes. This is a good sign, as we can use this information to predict the future. But we need to be careful, as this is not always the case.

This parties have in common that they are important, and have consistent data for the past years. Some parties have surprising behaviours that are not explained by the past data.

We've added the election type for each election, as this is an important feature. The results are similar, but slighly worse:

- JxCat (1031): $R^2 = 0.738$
- PSC (6): $R^2 = 0.679$
- CUP (1003): $R^2 = 0.402$
- ERC (10): $R^2 = 0.291$

## Predictions with new data

In [44]:
# Make predictions
new_predictions = model.predict(new_data)
print(f"Predictions for New Data: {new_predictions}")

Predictions for New Data: [[ 2.9525998   0.6624042  14.284147   ...  0.6636474   0.98431915
   0.6673583 ]
 [ 2.6769886   0.6624042  15.805771   ...  0.6636474   1.0255357
   0.6686701 ]
 [ 2.971561    0.6624042  15.143438   ...  0.6636474   0.99451333
   0.6673583 ]
 ...
 [ 1.6247534   0.6624042  16.779013   ...  0.6636474   1.8688223
   0.6630192 ]
 [ 2.7526872   0.6624042  14.180771   ...  0.6636474   0.8404626
   0.6647433 ]
 [ 2.5658808   0.6624042  13.727295   ...  0.6636474   1.8688223
   0.6630192 ]]


In [45]:
# for each column in the predictions, calculate the mean
mean_predictions = new_predictions.mean(axis=0)
mean_predictions

array([ 3.4947915 ,  0.6625365 , 10.9308    ,  3.241869  ,  0.66860753,
        0.94408894,  3.0239863 ,  1.7547996 ,  0.6647472 ,  0.66306853,
        0.7042014 ,  0.66365457,  0.7472769 , 12.07125   ,  0.66242945,
        8.5561695 ,  0.66252285,  0.6624268 ,  0.6646441 ,  1.1047701 ,
        0.70819294], dtype=float32)

In [46]:
mean_predictions = mean_predictions.reshape(1, -1)  # Reshape the array
mean_predictions_df = pd.DataFrame(mean_predictions, columns=y_test.columns)

In [47]:
mean_predictions_df

Unnamed: 0,cens_electoral_percentage_1003_shifted,cens_electoral_percentage_3000000_shifted,cens_electoral_percentage_6_shifted,cens_electoral_percentage_86_shifted,cens_electoral_percentage_751_shifted,cens_electoral_percentage_1013_shifted,cens_electoral_percentage_301_shifted,cens_electoral_percentage_38_shifted,cens_electoral_percentage_337_shifted,cens_electoral_percentage_82484191_shifted,...,cens_electoral_percentage_1030_shifted,cens_electoral_percentage_1001_shifted,cens_electoral_percentage_10_shifted,cens_electoral_percentage_431094190_shifted,cens_electoral_percentage_1031_shifted,cens_electoral_percentage_201914111_shifted,cens_electoral_percentage_365_shifted,cens_electoral_percentage_637_shifted,cens_electoral_percentage_999999999_shifted,cens_electoral_percentage_1015_shifted
0,3.494792,0.662537,10.9308,3.241869,0.668608,0.944089,3.023986,1.7548,0.664747,0.663069,...,0.663655,0.747277,12.07125,0.662429,8.55617,0.662523,0.662427,0.664644,1.10477,0.708193


In [48]:
total_census = 5754840
total_votes = (mean_predictions_df/100) * total_census
total_votes

Unnamed: 0,cens_electoral_percentage_1003_shifted,cens_electoral_percentage_3000000_shifted,cens_electoral_percentage_6_shifted,cens_electoral_percentage_86_shifted,cens_electoral_percentage_751_shifted,cens_electoral_percentage_1013_shifted,cens_electoral_percentage_301_shifted,cens_electoral_percentage_38_shifted,cens_electoral_percentage_337_shifted,cens_electoral_percentage_82484191_shifted,...,cens_electoral_percentage_1030_shifted,cens_electoral_percentage_1001_shifted,cens_electoral_percentage_10_shifted,cens_electoral_percentage_431094190_shifted,cens_electoral_percentage_1031_shifted,cens_electoral_percentage_201914111_shifted,cens_electoral_percentage_365_shifted,cens_electoral_percentage_637_shifted,cens_electoral_percentage_999999999_shifted,cens_electoral_percentage_1015_shifted
0,201119.669893,38127.915518,629050.075922,186564.377538,38477.292572,54330.809002,174025.575035,100985.913414,38255.136638,38158.532304,...,38192.257665,43004.589328,694681.102439,38121.754643,492393.851702,38127.130335,38121.604574,38249.206226,63577.748285,40755.371706


## PyCaret

In [108]:
train_data = X_train.copy()
train_data[y_train.columns] = y_train
test_data = X_test.copy()
test_data[y_test.columns] = y_test

# Combine features and target columns for the setup
all_columns = X_train.columns.tolist() + y_train.columns.tolist()

In [109]:
from pycaret.regression import *

for col in y_train.columns:
    print(col)
    train_data = X_train.copy()
    train_data[col] = y_train[col]
    test_data = X_test.copy()
    test_data[col] = y_test[col]

    # Initialize the setup
    reg_setup = setup(
        data=train_data,
        test_data=test_data,
        target=col,
        session_id=123,
        normalize=True,
        transformation=True,
        transform_target=True,
        verbose=False,
    )
    best_model = compare_models()
    print(f"The best model for {col} is: {best_model}")

cens_electoral_percentage_999999999_shifted


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
lasso,Lasso Regression,1.0376,10.4541,2.6493,-0.829,0.5399,1.5561,0.698
en,Elastic Net,1.0376,10.4541,2.6493,-0.829,0.5399,1.5561,0.137
dummy,Dummy Regressor,1.0376,10.4541,2.6493,-0.829,0.5399,1.5561,0.2
llar,Lasso Least Angle Regression,1.0376,10.4541,2.6493,-0.829,0.5399,1.5561,0.135
omp,Orthogonal Matching Pursuit,1.02,10.2508,2.6199,-1.1153,0.5147,1.3409,0.137
ada,AdaBoost Regressor,0.978,10.1882,2.6024,-1.3978,0.4902,1.16,0.484
br,Bayesian Ridge,0.994,10.178,2.6267,-2.6629,0.5012,1.2286,0.136
ridge,Ridge Regression,0.994,10.1779,2.6267,-2.6658,0.5012,1.2286,0.14
lr,Linear Regression,0.994,10.1779,2.6267,-2.666,0.5012,1.2286,0.779
lar,Least Angle Regression,1.0006,10.1931,2.6343,-3.0986,0.5043,1.2604,0.137


The best model for cens_electoral_percentage_999999999_shifted is: Lasso(random_state=123)
cens_electoral_percentage_1031_shifted


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
rf,Random Forest Regressor,5.6057,77.9349,8.5935,0.1289,0.7357,0.4499,7.829
et,Extra Trees Regressor,5.5754,77.4537,8.4739,0.066,0.728,0.4439,1.898
xgboost,Extreme Gradient Boosting,5.8696,80.3865,8.7201,0.0418,0.7661,0.4871,0.26
lightgbm,Light Gradient Boosting Machine,5.8445,79.4175,8.6907,0.0375,0.7445,0.4764,0.401
huber,Huber Regressor,6.8264,95.9717,9.5813,-0.0749,0.8164,0.6251,0.193
omp,Orthogonal Matching Pursuit,6.9934,98.7084,9.7488,-0.1031,0.8717,0.7345,0.162
gbr,Gradient Boosting Regressor,6.8622,94.5992,9.5346,-0.1085,0.8167,0.6143,2.706
lar,Least Angle Regression,6.9804,100.4649,9.7885,-0.1381,0.8288,0.6342,0.162
ridge,Ridge Regression,7.053,101.5133,9.8654,-0.1446,0.8273,0.6359,0.157
lr,Linear Regression,7.053,101.5132,9.8654,-0.1446,0.8273,0.6359,0.149


The best model for cens_electoral_percentage_1031_shifted is: RandomForestRegressor(n_jobs=-1, random_state=123)
cens_electoral_percentage_86_shifted


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
lightgbm,Light Gradient Boosting Machine,2.3683,12.9915,3.3952,0.1571,0.4951,0.443,0.439
rf,Random Forest Regressor,2.2592,12.7974,3.4335,0.1223,0.4877,0.4266,6.047
xgboost,Extreme Gradient Boosting,2.4267,13.8228,3.4944,0.1052,0.5118,0.4515,0.272
et,Extra Trees Regressor,2.412,14.6107,3.5397,0.0859,0.5068,0.43,2.232
ada,AdaBoost Regressor,2.6687,15.1457,3.5893,0.0718,0.5542,0.5214,0.744
gbr,Gradient Boosting Regressor,2.6263,15.4564,3.62,0.0429,0.5295,0.4923,2.704
knn,K Neighbors Regressor,2.5983,17.5604,3.7935,-0.0525,0.5807,0.465,0.61
omp,Orthogonal Matching Pursuit,3.0108,17.0535,3.8496,-0.0829,0.5942,0.6173,0.164
br,Bayesian Ridge,3.0159,18.2956,3.9505,-0.1535,0.6056,0.5926,0.156
ridge,Ridge Regression,3.0161,18.298,3.9508,-0.1537,0.6056,0.5926,0.171


The best model for cens_electoral_percentage_86_shifted is: LGBMRegressor(n_jobs=-1, random_state=123)
cens_electoral_percentage_301_shifted


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
knn,K Neighbors Regressor,3.5751,40.6103,4.9991,-3.7105,0.7678,0.6128,0.959
omp,Orthogonal Matching Pursuit,4.2948,42.0943,5.1976,-6.0082,0.8763,0.8448,0.213
ada,AdaBoost Regressor,4.3174,46.9726,5.3734,-6.9937,0.8736,0.7497,1.773
br,Bayesian Ridge,4.8115,46.3062,5.7908,-7.0151,0.9537,1.0249,0.233
ridge,Ridge Regression,4.8122,46.3139,5.7917,-7.0164,0.9538,1.0249,0.176
lr,Linear Regression,4.8123,46.3143,5.7918,-7.0165,0.9538,1.0249,0.167
lar,Least Angle Regression,4.8123,46.3143,5.7918,-7.0165,0.9538,1.0249,0.196
gbr,Gradient Boosting Regressor,4.2368,45.9012,5.5086,-7.9447,0.8494,0.6839,6.471
huber,Huber Regressor,5.2345,51.9274,6.3681,-9.6661,1.0019,1.0963,0.45
lightgbm,Light Gradient Boosting Machine,3.9607,43.625,5.2137,-10.9639,0.8141,0.6082,0.821


The best model for cens_electoral_percentage_301_shifted is: KNeighborsRegressor(n_jobs=-1)
cens_electoral_percentage_237_shifted


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
par,Passive Aggressive Regressor,0.0112,0.4011,0.2828,0.7997,0.0289,,0.177
knn,K Neighbors Regressor,0.0112,0.4011,0.2828,0.3997,0.0289,,0.617
dt,Decision Tree Regressor,0.0112,0.4011,0.2831,0.3997,0.0292,,0.204
omp,Orthogonal Matching Pursuit,0.0112,0.4011,0.2828,0.2997,0.0289,,0.162


The best model for cens_electoral_percentage_237_shifted is: PassiveAggressiveRegressor(random_state=123)
cens_electoral_percentage_1008_shifted


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
et,Extra Trees Regressor,2.9694,22.5308,4.0598,-0.4672,0.8236,,2.677
ada,AdaBoost Regressor,4.3548,38.484,5.4447,-0.4772,1.0025,,1.098
lightgbm,Light Gradient Boosting Machine,3.345,26.3848,4.4547,-0.5944,0.8474,,0.676
rf,Random Forest Regressor,3.0907,24.0038,4.275,-0.7099,0.8516,,10.791
knn,K Neighbors Regressor,3.1974,25.9554,4.4948,-0.7435,0.8797,,0.789
br,Bayesian Ridge,4.2512,45.2515,5.5537,-0.7612,0.9815,,0.199
ridge,Ridge Regression,4.252,45.2807,5.5553,-0.7623,0.9816,,0.189
lr,Linear Regression,4.2521,45.2826,5.5555,-0.7624,0.9816,,0.177
lar,Least Angle Regression,4.2521,45.2827,5.5555,-0.7624,0.9816,,0.194
lasso,Lasso Regression,5.5832,59.7304,6.9262,-0.8665,1.2715,,0.19


The best model for cens_electoral_percentage_1008_shifted is: ExtraTreesRegressor(n_jobs=-1, random_state=123)
cens_electoral_percentage_1003_shifted


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
huber,Huber Regressor,0.874,5.2493,1.326,-0.1291,0.3694,,0.299
knn,K Neighbors Regressor,1.7025,11.455,2.9588,-0.1702,0.7284,,0.632
xgboost,Extreme Gradient Boosting,1.1942,7.586,2.0861,-0.1714,0.5336,,0.235
lightgbm,Light Gradient Boosting Machine,1.7968,12.1308,3.0051,-0.2137,0.7437,,0.352
et,Extra Trees Regressor,1.7949,12.1866,2.9935,-0.2249,0.7489,,1.393
rf,Random Forest Regressor,1.7858,12.1713,3.0122,-0.2287,0.7529,,5.303
gbr,Gradient Boosting Regressor,1.8711,12.407,3.0413,-0.2452,0.7679,,2.652
ada,AdaBoost Regressor,1.9594,12.5355,3.1009,-0.2679,0.8034,,0.449
en,Elastic Net,2.1389,13.6837,3.2644,-0.401,0.881,,0.238
llar,Lasso Least Angle Regression,2.1389,13.6837,3.2644,-0.401,0.881,,0.242


The best model for cens_electoral_percentage_1003_shifted is: HuberRegressor()
cens_electoral_percentage_82484191_shifted


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
lasso,Lasso Regression,0.0015,0.0484,0.0939,0.8,0.0103,,0.143
en,Elastic Net,0.0015,0.0484,0.0939,0.8,0.0103,,0.15
lar,Least Angle Regression,0.0015,0.0484,0.0939,0.8,0.0103,,0.146
llar,Lasso Least Angle Regression,0.0015,0.0484,0.0939,0.8,0.0103,,0.157
par,Passive Aggressive Regressor,0.0015,0.0484,0.0939,0.8,0.0103,,0.277
dt,Decision Tree Regressor,0.0015,0.0484,0.0939,0.8,0.0103,,0.274
ada,AdaBoost Regressor,0.0015,0.0484,0.0939,0.8,0.0103,,0.593
xgboost,Extreme Gradient Boosting,0.0015,0.0484,0.0939,0.8,0.0103,,0.317
dummy,Dummy Regressor,0.0015,0.0484,0.0939,0.8,0.0103,,0.22
omp,Orthogonal Matching Pursuit,0.0015,0.0484,0.0939,0.7,0.0103,,0.174


The best model for cens_electoral_percentage_82484191_shifted is: Lasso(random_state=123)
cens_electoral_percentage_10_shifted


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
rf,Random Forest Regressor,4.4437,48.7867,6.4297,-0.1839,0.7082,0.3652,7.245
xgboost,Extreme Gradient Boosting,4.6045,50.3563,6.6287,-0.1863,0.7174,0.3845,0.27
et,Extra Trees Regressor,4.3971,48.7562,6.3597,-0.2055,0.7036,0.3535,2.006
lightgbm,Light Gradient Boosting Machine,4.537,50.2267,6.4876,-0.2174,0.7062,0.3749,0.471
huber,Huber Regressor,5.642,61.4518,7.4505,-0.3595,0.7751,0.4925,0.387
gbr,Gradient Boosting Regressor,5.224,60.0204,7.1925,-0.3964,0.7546,0.4447,2.664
br,Bayesian Ridge,5.9768,69.2146,7.8291,-0.4532,0.7995,0.4921,0.211
ridge,Ridge Regression,5.9776,69.2267,7.8297,-0.4534,0.7995,0.4922,0.151
lar,Least Angle Regression,5.9777,69.2282,7.8298,-0.4534,0.7996,0.4922,0.192
lr,Linear Regression,5.9777,69.2283,7.8298,-0.4534,0.7996,0.4922,0.162


The best model for cens_electoral_percentage_10_shifted is: RandomForestRegressor(n_jobs=-1, random_state=123)
cens_electoral_percentage_1000_shifted


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
xgboost,Extreme Gradient Boosting,2.4272,97.3283,3.147,-0.2113,0.3175,,0.426
et,Extra Trees Regressor,2.4285,97.3272,3.1523,-0.3511,0.3231,,1.096
llar,Lasso Least Angle Regression,2.4413,97.3261,3.1468,-0.4418,0.3215,,0.212
dummy,Dummy Regressor,2.4413,97.3261,3.1468,-0.4418,0.3215,,0.255
en,Elastic Net,2.4413,97.3261,3.1468,-0.4418,0.3215,,0.206
lasso,Lasso Regression,2.4413,97.3261,3.1468,-0.4418,0.3215,,0.196
rf,Random Forest Regressor,2.4332,97.4149,3.2938,-0.5935,0.3409,,6.211
omp,Orthogonal Matching Pursuit,2.4431,97.3266,3.1516,-0.8384,0.3261,,0.217
lightgbm,Light Gradient Boosting Machine,2.4263,97.3276,3.1533,-0.8514,0.3225,,0.608
gbr,Gradient Boosting Regressor,2.4357,97.3281,3.1568,-1.0799,0.3285,,4.381


The best model for cens_electoral_percentage_1000_shifted is: XGBRegressor(base_score=None, booster='gbtree', callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device='cpu', early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=None, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=None, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             multi_strategy=None, n_estimators=None, n_jobs=-1,
             num_parallel_tree=None, random_state=123, ...)
cens_electoral_percentage_6_shifted


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
lightgbm,Light Gradient Boosting Machine,3.3378,24.4317,4.5765,0.488,0.478,0.3478,0.509
et,Extra Trees Regressor,3.3087,24.2739,4.5628,0.485,0.4801,0.3438,2.893
rf,Random Forest Regressor,3.3751,24.9835,4.6577,0.4681,0.4908,0.3549,10.759
xgboost,Extreme Gradient Boosting,3.4283,25.2413,4.6817,0.4634,0.4923,0.3605,0.321
gbr,Gradient Boosting Regressor,3.5491,25.8033,4.7539,0.4549,0.4896,0.375,3.359
knn,K Neighbors Regressor,3.6589,28.7396,5.0306,0.3548,0.5361,0.3895,0.846
br,Bayesian Ridge,4.2142,32.024,5.4242,0.3106,0.5788,0.4567,0.239
lr,Linear Regression,4.2144,32.0276,5.4245,0.3105,0.5788,0.4568,0.232
ridge,Ridge Regression,4.2144,32.0272,5.4245,0.3105,0.5788,0.4568,0.241
lar,Least Angle Regression,4.2383,32.3168,5.4457,0.3053,0.5821,0.4591,0.236


The best model for cens_electoral_percentage_6_shifted is: LGBMRegressor(n_jobs=-1, random_state=123)
cens_electoral_percentage_693_shifted


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
xgboost,Extreme Gradient Boosting,0.017,0.0079,0.0474,-0.0053,0.0303,,0.287
lightgbm,Light Gradient Boosting Machine,0.0164,0.0078,0.045,-0.0053,0.0287,,0.48
gbr,Gradient Boosting Regressor,0.039,0.0206,0.0939,-0.3063,0.0565,,2.841
lasso,Lasso Regression,0.7678,2.687,0.9871,-1.2793,0.4362,,0.192
dummy,Dummy Regressor,0.7678,2.687,0.9871,-1.2793,0.4362,,0.186
llar,Lasso Least Angle Regression,0.7678,2.687,0.9871,-1.2793,0.4362,,0.197
en,Elastic Net,0.7678,2.687,0.9871,-1.2793,0.4362,,0.207
huber,Huber Regressor,0.3227,0.6747,0.4395,-1.6295,0.2087,,0.374
par,Passive Aggressive Regressor,0.6805,2.5268,0.8944,-1.7321,0.3626,,0.214
br,Bayesian Ridge,0.3558,0.7709,0.484,-1.7416,0.2343,,0.219


The best model for cens_electoral_percentage_693_shifted is: XGBRegressor(base_score=None, booster='gbtree', callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device='cpu', early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=None, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=None, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             multi_strategy=None, n_estimators=None, n_jobs=-1,
             num_parallel_tree=None, random_state=123, ...)
