# Polynomial Regression Model

## Imports

In [2]:
import os
from pathlib import Path

import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.base import clone
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet, RANSACRegressor
from sklearn import metrics as mt
from sklearn import model_selection as ms
from sklearn.pipeline import Pipeline

## Dataset

In [3]:
# Directory holding the pre-split CSV files. It defaults to the location the
# notebook was written against; set the DATASETS_DIR environment variable to
# run the notebook on another machine without editing any path below.
DATA_DIR = Path(os.environ.get("DATASETS_DIR", "/home/ds_deivisson/repos/datasets"))


def _load_split(split):
    """Read the feature and target CSVs of one split.

    Parameters
    ----------
    split : str
        Middle part of the file names: "training", "validation" or "test".

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` is the DataFrame read from
        ``X_<split>_regressao.csv`` and ``y`` is the content of
        ``y_<split>_regressao.csv`` squeezed along the column axis.
    """
    features = pd.read_csv(DATA_DIR / f"X_{split}_regressao.csv")
    # Squeeze only the column axis: a one-column file becomes a Series, and a
    # file that happens to hold a single row is not collapsed to a scalar.
    target = pd.read_csv(DATA_DIR / f"y_{split}_regressao.csv").squeeze("columns")
    return features, target


# Training data
X_train, y_train = _load_split("training")

# Validation data
X_val, y_val = _load_split("validation")

# Test data
X_test, y_test = _load_split("test")

## Avaliação do grau do polinômio

In [None]:
# Polynomial degree search
# -------------------------------
# Each candidate degree is scored by the mean RMSE of a 5-fold
# cross-validation run on the training data only.
graus = range(1, 5)
resultados = {}

for grau in graus:
    # Expand the features to the candidate degree, standardise them, then fit
    # an ordinary least-squares model.
    pipeline = Pipeline(
        steps=[
            ("poly", PolynomialFeatures(degree=grau, include_bias=False)),
            ("scaler", StandardScaler()),
            ("model", LinearRegression()),
        ]
    )

    scores = ms.cross_val_score(
        pipeline,
        X_train,
        y_train,
        scoring="neg_root_mean_squared_error",
        cv=5,
    )

    # The scorer yields negated RMSE values, so flip the sign of the mean.
    rmse_medio = -scores.mean()
    resultados[grau] = rmse_medio

    print(f"Grau {grau} | RMSE médio: {rmse_medio:.2f}")

# -------------------------------
# Best degree: the key with the smallest mean RMSE
# -------------------------------
melhor_grau, _ = min(resultados.items(), key=lambda item: item[1])

print(
    "\n==============================",
    f"Melhor grau do polinômio: {melhor_grau}",
    f"RMSE: {resultados[melhor_grau]:.2f}",
    "==============================",
    sep="\n",
)

Grau 1 | RMSE médio: 21.37
Grau 2 | RMSE médio: 21.00
Grau 3 | RMSE médio: 21.72
Grau 4 | RMSE médio: 176.20

Melhor grau do polinômio: 2
RMSE: 21.00


## Função padrão de métricas

In [10]:
def regression_metrics(y_true, y_pred, prefix=""):
    """Compute a fixed set of regression scores for one set of predictions.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted target values, aligned with ``y_true``.
    prefix : str, optional
        Text appended to every metric name. Despite the parameter name it is
        placed *after* the name, e.g. ``"_train"`` produces ``"R2_train"``.

    Returns
    -------
    dict
        Scores keyed by ``R2``, ``MSE``, ``RMSE``, ``MAE`` and ``MAPE`` (in
        that order), each followed by ``prefix``.
    """
    # RMSE is the square root of MSE, so the squared error is computed once.
    squared_error = mt.mean_squared_error(y_true, y_pred)

    named_scores = (
        ("R2", mt.r2_score(y_true, y_pred)),
        ("MSE", squared_error),
        ("RMSE", np.sqrt(squared_error)),
        ("MAE", mt.mean_absolute_error(y_true, y_pred)),
        ("MAPE", mt.mean_absolute_percentage_error(y_true, y_pred)),
    )
    return {f"{label}{prefix}": value for label, value in named_scores}

## Pipeline do Modelo

In [6]:
# Settings shared by every candidate pipeline.
# NOTE(review): the degree search earlier in this notebook expands full
# polynomial terms (interaction_only left at its default), whereas these
# pipelines keep interaction terms only — confirm that this is intended.
POLY_DEGREE = 2
# Seed for estimators that draw random samples, so reruns give the same fit.
RANDOM_STATE = 42


def _build_pipeline(model):
    """Wrap ``model`` in the common preprocessing chain.

    Parameters
    ----------
    model : estimator
        Unfitted regressor placed as the final ``"model"`` step.

    Returns
    -------
    Pipeline
        Steps ``"poly"`` (degree-``POLY_DEGREE`` interaction features, no bias
        column), ``"scaler"`` (standardisation) and ``"model"``.
    """
    return Pipeline([
        ("poly", PolynomialFeatures(include_bias=False, interaction_only=True, degree=POLY_DEGREE)),
        ("scaler", StandardScaler()),
        ("model", model),
    ])


# One pipeline per regressor; the keys are reused by the hyperparameter grids.
pipelines = {
    "base": _build_pipeline(LinearRegression()),
    "lasso": _build_pipeline(Lasso(max_iter=5000)),
    "ridge": _build_pipeline(Ridge(solver="auto")),
    "elastic": _build_pipeline(ElasticNet(max_iter=5000)),
    # RANSAC fits on randomly drawn subsets; without a seed every run (and
    # every grid-search candidate) would produce different numbers.
    "ransac": _build_pipeline(RANSACRegressor(random_state=RANDOM_STATE)),
}

## Grid de hiperparâmetros (fine tuning)

In [7]:
# Hyperparameter candidates for each pipeline, keyed by the same names used in
# `pipelines`. The "model__" prefix addresses the pipeline step named "model".
# The empty grid for "base" means there is nothing to tune for that entry.
param_grids = dict(
    base={},
    lasso={"model__alpha": [0.01, 0.1, 1.0]},
    ridge={"model__alpha": [0.1, 1.0, 10.0, 50.0]},
    elastic={
        "model__alpha": [0.01, 0.1, 1.0],
        "model__l1_ratio": [0.2, 0.5, 0.8],
    },
    ransac={"model__min_samples": [0.5, 0.7, 0.9]},
)

## Tuning com validação (GridSearch)

In [11]:
# Tune every pipeline with a 5-fold grid search on the training data and keep
# the best estimator found for each name.
best_models = {}

for name, pipe in pipelines.items():
    print(f"\nTuning model: {name}")

    # fit() returns the search object itself, so it can be chained.
    gs = ms.GridSearchCV(
        pipe,
        param_grids[name],
        scoring="neg_root_mean_squared_error",
        n_jobs=-1,
        cv=5,
    ).fit(X_train, y_train)

    best_models[name] = gs.best_estimator_

    print("Best params:", gs.best_params_)


Tuning model: base
Best params: {}

Tuning model: lasso
Best params: {'model__alpha': 0.01}

Tuning model: ridge
Best params: {'model__alpha': 50.0}

Tuning model: elastic
Best params: {'model__alpha': 0.01, 'model__l1_ratio': 0.2}

Tuning model: ransac
Best params: {'model__min_samples': 0.9}


## Avaliação — Treino e Validação

In [12]:
# Score each tuned model on the training and validation sets and report the
# metrics side by side.
results = {}

for name, model in best_models.items():
    print(f"\nModel: {name}")

    y_train_pred = model.predict(X_train)
    y_val_pred = model.predict(X_val)

    metrics_train = regression_metrics(y_true=y_train, y_pred=y_train_pred, prefix="_train")
    metrics_val = regression_metrics(y_true=y_val, y_pred=y_val_pred, prefix="_val")

    # Training metrics first, then validation metrics; the suffixes keep the
    # keys distinct.
    combined = dict(metrics_train)
    combined.update(metrics_val)
    results[name] = combined

    for metric_name, metric_value in combined.items():
        print(f"{metric_name}: {metric_value:.4f}")


Model: base
R2_train: 0.0888
MSE_train: 435.5889
RMSE_train: 20.8708
MAE_train: 16.5260
MAPE_train: 8.4067
R2_val: 0.0631
MSE_val: 447.3675
RMSE_val: 21.1511
MAE_val: 16.7812
MAPE_val: 8.5704

Model: lasso
R2_train: 0.0878
MSE_train: 436.0271
RMSE_train: 20.8813
MAE_train: 16.5329
MAPE_train: 8.4225
R2_val: 0.0658
MSE_val: 446.0945
RMSE_val: 21.1209
MAE_val: 16.7540
MAPE_val: 8.5808

Model: ridge
R2_train: 0.0880
MSE_train: 435.9363
RMSE_train: 20.8791
MAE_train: 16.5292
MAPE_train: 8.4254
R2_val: 0.0650
MSE_val: 446.4685
RMSE_val: 21.1298
MAE_val: 16.7580
MAPE_val: 8.5827

Model: elastic
R2_train: 0.0874
MSE_train: 436.2258
RMSE_train: 20.8860
MAE_train: 16.5346
MAPE_train: 8.4332
R2_val: 0.0657
MSE_val: 446.1320
RMSE_val: 21.1218
MAE_val: 16.7493
MAPE_val: 8.5896

Model: ransac
R2_train: 0.0799
MSE_train: 439.8000
RMSE_train: 20.9714
MAE_train: 16.3422
MAPE_train: 8.6959
R2_val: 0.0555
MSE_val: 451.0279
RMSE_val: 21.2374
MAE_val: 16.6393
MAPE_val: 8.8639


## Treinamento final (Train + Val)

In [13]:
# Stack the training rows followed by the validation rows for the final fit.
# The features stay a DataFrame; the target becomes a NumPy array.
X_final = pd.concat(objs=[X_train, X_val], axis=0)
y_final = np.concatenate((y_train, y_val), axis=0)

In [14]:
# Refit every tuned model on train + validation data.
final_models = {}

for name, model in best_models.items():
    # Fit an unfitted copy carrying the same hyperparameters. Calling fit() on
    # `model` directly would overwrite the estimator stored in `best_models`,
    # so re-running the train/validation evaluation afterwards would score
    # models that have already seen the validation data.
    final_models[name] = clone(model).fit(X_final, y_final)

## Avaliação final — Dados Teste

In [15]:
# Score the refitted models on the held-out test set.
test_results = {}

for name, model in final_models.items():
    print(f"\nFinal Test - Model: {name}")

    y_test_pred = model.predict(X_test)
    metrics_test = regression_metrics(y_true=y_test, y_pred=y_test_pred, prefix="_test")
    test_results[name] = metrics_test

    for metric_name, metric_value in metrics_test.items():
        print(f"{metric_name}: {metric_value:.4f}")


Final Test - Model: base
R2_test: 0.0894
MSE_test: 443.3884
RMSE_test: 21.0568
MAE_test: 16.7451
MAPE_test: 8.3592

Final Test - Model: lasso
R2_test: 0.0892
MSE_test: 443.4562
RMSE_test: 21.0584
MAE_test: 16.7504
MAPE_test: 8.3553

Final Test - Model: ridge
R2_test: 0.0890
MSE_test: 443.5427
RMSE_test: 21.0605
MAE_test: 16.7537
MAPE_test: 8.3724

Final Test - Model: elastic
R2_test: 0.0881
MSE_test: 444.0112
RMSE_test: 21.0716
MAE_test: 16.7644
MAPE_test: 8.3774

Final Test - Model: ransac
R2_test: 0.0816
MSE_test: 447.1842
RMSE_test: 21.1467
MAE_test: 16.6074
MAPE_test: 8.6290
