In [14]:
import warnings
warnings.filterwarnings("ignore")

In [15]:
import pandas as pd
from sklearn.metrics import (
    make_scorer,
    mean_absolute_error,
    mean_squared_error,
    r2_score
)
from sklearn.model_selection import cross_validate, StratifiedKFold
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import  MLPRegressor
from sklearn.svm import SVR
import numpy as np

In [16]:
def root_mean_squared_error(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

METRICS_REGRESSION = {
    "MAE": make_scorer(mean_absolute_error),
    "MSE": make_scorer(mean_squared_error),
    "RMSE": make_scorer(root_mean_squared_error),
    "R2": make_scorer(r2_score)
}

In [17]:
df = pd.read_csv("../cenario1_engine.csv")

In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3207 entries, 0 to 3206
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Brand_Mod         3207 non-null   object 
 1   model             3207 non-null   object 
 2   model_year        3207 non-null   int64  
 3   KM                3207 non-null   int64  
 4   fuel              3207 non-null   object 
 5   HP                2578 non-null   float64
 6   Litros            2891 non-null   float64
 7   Cilindros         2705 non-null   float64
 8   transmission_mod  3207 non-null   object 
 9   ext_col_mod       3207 non-null   object 
 10  int_col_mod       3207 non-null   object 
 11  accident_mod      3207 non-null   object 
 12  clean_title_mod   3207 non-null   object 
 13  price_eur         3207 non-null   int64  
dtypes: float64(3), int64(3), object(8)
memory usage: 350.9+ KB


In [19]:
medias_colunas = df[['HP', 'Litros', 'Cilindros']].mean()
colunas = ['HP', 'Litros', 'Cilindros']

for coluna in colunas:
    df[coluna] = df[coluna].fillna(medias_colunas[coluna])

In [20]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3207 entries, 0 to 3206
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Brand_Mod         3207 non-null   object 
 1   model             3207 non-null   object 
 2   model_year        3207 non-null   int64  
 3   KM                3207 non-null   int64  
 4   fuel              3207 non-null   object 
 5   HP                3207 non-null   float64
 6   Litros            3207 non-null   float64
 7   Cilindros         3207 non-null   float64
 8   transmission_mod  3207 non-null   object 
 9   ext_col_mod       3207 non-null   object 
 10  int_col_mod       3207 non-null   object 
 11  accident_mod      3207 non-null   object 
 12  clean_title_mod   3207 non-null   object 
 13  price_eur         3207 non-null   int64  
dtypes: float64(3), int64(3), object(8)
memory usage: 350.9+ KB


In [21]:
categorical_columns = df.select_dtypes(include='object').columns

for column in categorical_columns:
    df[column] = df[column].astype('category').cat.codes

In [22]:
X, y = df.drop("price_eur", axis=1), df["price_eur"]

In [23]:
splitter = StratifiedKFold(n_splits=10, shuffle=True, random_state=1234)

Random Forest

In [25]:
rf = RandomForestRegressor(random_state=1234)
scores_rf = cross_validate(rf, X, y, cv=splitter, scoring=METRICS_REGRESSION)
rf_scores = pd.DataFrame(scores_rf)
pd.DataFrame(rf_scores.mean()).T

Unnamed: 0,fit_time,score_time,test_MAE,test_MSE,test_RMSE,test_R2
0,2.934233,0.011292,12496.153486,4181885000.0,51388.412742,0.417696


Redes Neuronais

In [26]:
nn = MLPRegressor(hidden_layer_sizes=(50,50), max_iter=200, random_state=1234)
scores_nn = cross_validate(nn, X, y, cv=splitter, scoring=METRICS_REGRESSION)
nn_scores = pd.DataFrame(scores_nn)
pd.DataFrame(nn_scores.mean()).T

Unnamed: 0,fit_time,score_time,test_MAE,test_MSE,test_RMSE,test_R2
0,2.076837,0.002571,19871.121908,4977425000.0,58596.907188,0.280613


Support Vector Machine

In [27]:
svm = SVR()
scores_svm = cross_validate(svm, X, y, cv=splitter, scoring=METRICS_REGRESSION)
svm_scores = pd.DataFrame(scores_svm)
pd.DataFrame(svm_scores.mean()).T

Unnamed: 0,fit_time,score_time,test_MAE,test_MSE,test_RMSE,test_R2
0,0.300433,0.06383,24566.91277,5946056000.0,67169.597083,-0.036693
