In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import pandas as pd
from sklearn.metrics import (
    make_scorer,
    mean_absolute_error,
    mean_squared_error,
    r2_score
)
from sklearn.model_selection import cross_validate, StratifiedKFold
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import  MLPRegressor
from sklearn.svm import SVR
import numpy as np

In [3]:
def root_mean_squared_error(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

METRICS_REGRESSION = {
    "MAE": make_scorer(mean_absolute_error),
    "MSE": make_scorer(mean_squared_error),
    "RMSE": make_scorer(root_mean_squared_error),
    "R2": make_scorer(r2_score)
}

In [4]:
df = pd.read_csv("../cenario2_engine.csv")
df2 = pd.read_csv("../cenario2_engine.csv")

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2217 entries, 0 to 2216
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Brand_Mod         2217 non-null   object 
 1   model             2217 non-null   object 
 2   model_year        2217 non-null   int64  
 3   KM                2217 non-null   int64  
 4   fuel              2217 non-null   object 
 5   HP                2217 non-null   float64
 6   Litros            2217 non-null   float64
 7   Cilindros         2217 non-null   float64
 8   transmission_mod  2217 non-null   object 
 9   ext_col_mod       2217 non-null   object 
 10  int_col_mod       2217 non-null   object 
 11  accident_mod      2217 non-null   object 
 12  clean_title_mod   2217 non-null   object 
 13  price_eur         2217 non-null   int64  
dtypes: float64(3), int64(3), object(8)
memory usage: 242.6+ KB


In [6]:
df.head()

Unnamed: 0,Brand_Mod,model,model_year,KM,fuel,HP,Litros,Cilindros,transmission_mod,ext_col_mod,int_col_mod,accident_mod,clean_title_mod,price_eur
0,Outra,Wrangler Sport,2014,114263,Gasoline,285.0,3.6,6.0,Automatic,Outra,Black,None reported,Yes,20020
1,Outra,Highlander XLE,2015,110361,Gasoline,270.0,3.5,6.0,Automatic,Outra,Outra,At least 1 accident or damage reported,Yes,23660
2,Outra,Camry Hybrid XLE,2018,112076,Hybrid,208.0,2.5,4.0,CVT,white,Black,At least 1 accident or damage reported,Yes,21835
3,Outra,Outback Touring XT,2021,53913,Gasoline,260.0,2.4,4.0,CVT,Outra,Outra,None reported,Yes,30940
4,Outra,GX 460 Base,2018,120701,Gasoline,301.0,4.6,8.0,Automatic,white,beige,At least 1 accident or damage reported,Yes,34666


In [7]:
categorical_columns = df.select_dtypes(include='object').columns

for column in categorical_columns:
    df[column] = df[column].astype('category').cat.codes

In [8]:
df2.head()

Unnamed: 0,Brand_Mod,model,model_year,KM,fuel,HP,Litros,Cilindros,transmission_mod,ext_col_mod,int_col_mod,accident_mod,clean_title_mod,price_eur
0,Outra,Wrangler Sport,2014,114263,Gasoline,285.0,3.6,6.0,Automatic,Outra,Black,None reported,Yes,20020
1,Outra,Highlander XLE,2015,110361,Gasoline,270.0,3.5,6.0,Automatic,Outra,Outra,At least 1 accident or damage reported,Yes,23660
2,Outra,Camry Hybrid XLE,2018,112076,Hybrid,208.0,2.5,4.0,CVT,white,Black,At least 1 accident or damage reported,Yes,21835
3,Outra,Outback Touring XT,2021,53913,Gasoline,260.0,2.4,4.0,CVT,Outra,Outra,None reported,Yes,30940
4,Outra,GX 460 Base,2018,120701,Gasoline,301.0,4.6,8.0,Automatic,white,beige,At least 1 accident or damage reported,Yes,34666


In [9]:
X, y = df.drop("price_eur", axis=1), df["price_eur"]

In [10]:
splitter = StratifiedKFold(n_splits=10, shuffle=True, random_state=1234)

Random Forest

In [11]:
rf = RandomForestRegressor(random_state=1234)
scores_rf = cross_validate(rf, X, y, cv=splitter, scoring=METRICS_REGRESSION)
rf_scores = pd.DataFrame(scores_rf)
pd.DataFrame(rf_scores.mean()).T

Unnamed: 0,fit_time,score_time,test_MAE,test_MSE,test_RMSE,test_R2
0,1.795718,0.007946,5341.927022,58699710.0,7645.199779,0.818805


In [12]:
rf.fit(X, y)
previsoes = rf.predict(X)
prevs = pd.DataFrame()

prevs['Preco Previsto'] = previsoes

prevs.head()

Unnamed: 0,Preco Previsto
0,19624.46
1,22411.66
2,20960.42
3,31090.11
4,31699.17


In [13]:
df_final = pd.concat([df2, prevs], axis=1)
df_final.head()

Unnamed: 0,Brand_Mod,model,model_year,KM,fuel,HP,Litros,Cilindros,transmission_mod,ext_col_mod,int_col_mod,accident_mod,clean_title_mod,price_eur,Preco Previsto
0,Outra,Wrangler Sport,2014,114263,Gasoline,285.0,3.6,6.0,Automatic,Outra,Black,None reported,Yes,20020,19624.46
1,Outra,Highlander XLE,2015,110361,Gasoline,270.0,3.5,6.0,Automatic,Outra,Outra,At least 1 accident or damage reported,Yes,23660,22411.66
2,Outra,Camry Hybrid XLE,2018,112076,Hybrid,208.0,2.5,4.0,CVT,white,Black,At least 1 accident or damage reported,Yes,21835,20960.42
3,Outra,Outback Touring XT,2021,53913,Gasoline,260.0,2.4,4.0,CVT,Outra,Outra,None reported,Yes,30940,31090.11
4,Outra,GX 460 Base,2018,120701,Gasoline,301.0,4.6,8.0,Automatic,white,beige,At least 1 accident or damage reported,Yes,34666,31699.17


In [17]:
df_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2217 entries, 0 to 2216
Data columns (total 15 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Brand_Mod         2217 non-null   object 
 1   model             2217 non-null   object 
 2   model_year        2217 non-null   int64  
 3   KM                2217 non-null   int64  
 4   fuel              2217 non-null   object 
 5   HP                2217 non-null   float64
 6   Litros            2217 non-null   float64
 7   Cilindros         2217 non-null   float64
 8   transmission_mod  2217 non-null   object 
 9   ext_col_mod       2217 non-null   object 
 10  int_col_mod       2217 non-null   object 
 11  accident_mod      2217 non-null   object 
 12  clean_title_mod   2217 non-null   object 
 13  price_eur         2217 non-null   int64  
 14  Preco Previsto    2217 non-null   float64
dtypes: float64(4), int64(3), object(8)
memory usage: 259.9+ KB


In [14]:
df_final.to_csv("dataset_previsoes.csv", index=False)

In [15]:
# Calcule os resíduos
residuals = previsoes - y

perc = (residuals/y)*100

# Calcule o desvio padrão dos resíduos
std_residuals = residuals.std()

prevs['dif'] = residuals
prevs['perc'] = perc
prevs.head()

Unnamed: 0,Preco Previsto,dif,perc
0,19624.46,-395.54,-1.975724
1,22411.66,-1248.34,-5.276162
2,20960.42,-874.58,-4.005404
3,31090.11,150.11,0.485165
4,31699.17,-2966.83,-8.558328
