In [1]:
%load_ext autoreload
%autoreload 2

import utils

import numpy as np
import pandas as pd

from pickle import load
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PowerTransformer, OrdinalEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import ElasticNetCV
from sklearn.metrics import mean_squared_error

# Single seed passed to both train_test_split and ElasticNetCV below, so the
# notebook uses one consistent source of randomness.
RANDOM_STATE = 42

In [2]:
# Read the prepared dataset from disk.
# NOTE: unpickling can execute arbitrary code, so this file must come from a
# trusted source (presumably an earlier notebook in this project -- confirm).
with open('data/wine_quality_fixed.pkl', 'rb') as pickle_handle:
    wine_data = load(pickle_handle)

In [3]:
# Separate the predictors from the regression target ('quality').
X = wine_data.drop(columns=['quality'])
y = wine_data['quality']

In [4]:
# Hold out 20% of the rows for testing. The split is stratified on the target
# and seeded so it is the same on every run.
X_treino, X_teste, y_treino, y_teste = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=RANDOM_STATE,
)

In [5]:
# Pick the heavy-tailed numeric columns that will be sent through the
# PowerTransformer instead of plain standardization.
#
# numeric_only=True is required because the frame also holds object-dtype
# columns (selected as `label_vars` in the next cell): without it, recent
# pandas versions raise a TypeError instead of silently skipping them.
#
# pandas reports kurtosis using Fisher's definition (a normal distribution
# scores 0.0), so the threshold is applied to the absolute *excess* kurtosis.
# NOTE(review): if 3 was chosen as the normal-distribution baseline of the
# non-excess definition, the threshold should be revisited -- confirm.
KURTOSIS_THRESHOLD = 3

abs_kurtosis = X_treino.kurtosis(numeric_only=True).abs()
kurtosis_vars = abs_kurtosis[abs_kurtosis > KURTOSIS_THRESHOLD].index.tolist()

In [6]:
# Columns stored with object dtype are the ones handed to the ordinal encoder.
object_columns = X_treino.select_dtypes(include='object')
label_vars = object_columns.columns.tolist()

In [7]:
# Whatever is neither heavy-tailed nor categorical gets plain standardization.
# The original column order of X_treino is kept.
already_assigned = {*kurtosis_vars, *label_vars}
scale_vars = [
    column for column in X_treino.columns if column not in already_assigned
]

In [8]:
# Route each group of columns to its own transformer. The transformed groups
# are concatenated in the order the steps are listed, so the output column
# order is: encoded categoricals, standardized columns, gaussianized columns.
categorical_step = ("encode_categorical", OrdinalEncoder(), label_vars)
scaling_step = ("scale", StandardScaler(copy=False), scale_vars)
gaussianize_step = ("gaussianize", PowerTransformer(copy=False), kurtosis_vars)

preprocessing = ColumnTransformer(
    transformers=[categorical_step, scaling_step, gaussianize_step]
)

In [9]:
# Elastic net whose regularization is tuned by 5-fold cross-validation over
# 50 evenly spaced l1_ratio values in [0.01, 1].
l1_ratio_grid = np.linspace(0.01, 1, 50)
regressor = ElasticNetCV(
    l1_ratio=l1_ratio_grid,
    cv=5,
    n_jobs=-1,
    random_state=RANDOM_STATE,
)

# make_pipeline auto-names the steps after the lower-cased class names, so the
# regressor is reachable later as named_steps['elasticnetcv'].
elasticnet = make_pipeline(preprocessing, regressor)

In [10]:
# Fit the preprocessing steps and the cross-validated regressor on the
# training partition only, keeping the returned fitted pipeline.
elasticnet = elasticnet.fit(X=X_treino, y=y_treino)

In [11]:
# Show the fitted coefficients. They follow the column order produced by the
# ColumnTransformer (label_vars, then scale_vars, then kurtosis_vars), not the
# column order of X.
fitted_regressor = elasticnet.named_steps['elasticnetcv']
fitted_regressor.coef_

array([-0.23155479,  0.005065  ,  0.12121122,  0.10623422, -0.06961334,
        0.00312749,  0.40594162, -0.00286126, -0.24263865, -0.05955402,
       -0.01692839,  0.08960097])

In [12]:
# In-sample predictions, used in the next cell to measure the training error.
y_pred = elasticnet.predict(X=X_treino)

In [13]:
# Training RMSE: square root of the mean squared error on the training rows.
mse_treino = mean_squared_error(y_true=y_treino, y_pred=y_pred)
rmse_treino = mse_treino ** 0.5
print(rmse_treino)

0.7318572834937624


In [14]:
# Out-of-sample predictions on the held-out partition.
pred_teste = elasticnet.predict(X=X_teste)

In [15]:
# Test RMSE, computed the same way as the training RMSE so the two figures
# can be compared directly.
mse_teste = mean_squared_error(y_true=y_teste, y_pred=pred_teste)
rmse_teste = mse_teste ** 0.5
print(rmse_teste)

0.7467277621632225
