In [75]:
from basepc_mal import *

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import mean_absolute_error
from sklearn.compose import ColumnTransformer

In [76]:
# Load the training CSV from the local ./data directory; `pd` is not imported in this
# notebook's visible import cell, so it presumably comes from the `basepc_mal` star-import.
df_train = pd.read_csv('./data/train_limpio_pipeline.csv')

In [77]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
train_set, validation_set = train_test_split(
    df_train,
    test_size=0.2,
    random_state=42,
)

# Features: every column except the target ('Price_euros') and the 'id' column.
x_train = train_set.drop(columns=['Price_euros', 'id'])
x_validation = validation_set.drop(columns=['Price_euros', 'id'])

# The training target is moved to a log10 scale, while the validation target stays
# on its original scale, so predictions have to be mapped back with 10**pred
# before they are compared against y_validation.
y_train = np.log10(train_set['Price_euros'])
y_validation = validation_set['Price_euros']

In [78]:
# Sanity check of the feature frame: column names, dtypes and non-null counts.
x_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 729 entries, 25 to 102
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Company           729 non-null    object 
 1   Product           729 non-null    object 
 2   TypeName          729 non-null    object 
 3   Inches            729 non-null    float64
 4   ScreenResolution  729 non-null    object 
 5   Cpu               729 non-null    float64
 6   Ram               729 non-null    int64  
 7   Memory            729 non-null    object 
 8   Gpu               729 non-null    object 
 9   OpSys             729 non-null    object 
 10  Weight            729 non-null    float64
dtypes: float64(3), int64(1), object(7)
memory usage: 68.3+ KB


In [79]:
# Feature columns grouped by the preprocessing they receive.
categoricas = [
    'Company',
    'Product',
    'TypeName',
    'ScreenResolution',
    'Memory',
    'Gpu',
    'OpSys',
]
numericas = [
    'Inches',
    'Cpu',
    'Ram',
    'Weight',
]

# One-hot encode the categorical columns and standardize the numeric ones.
# handle_unknown='ignore' keeps transform from raising on categories that were
# not present when the encoder was fitted (e.g. in the validation rows).
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore'), categoricas),
    ('num', StandardScaler(), numericas),
])

In [80]:
# Base regressor; the seed is fixed so repeated fits give the same forest.
# The remaining hyperparameters are left at their defaults here.
model_forest = RandomForestRegressor(random_state= 42)

In [81]:
# Chain preprocessing and the regressor into a single estimator. The step names
# ('preprocessor', 'model') are the prefixes used to address parameters of each
# step, e.g. 'model__max_depth'.
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('model', model_forest),
    ],
)

In [82]:
# Hyperparameter grid for the pipeline step named 'model'
# (2 * 3 * 2 * 3 = 36 combinations).
param_grid = dict(
    model__n_estimators=[100, 200],      # number of trees
    model__max_depth=[10, 15, 20],       # maximum depth of each tree
    model__min_samples_split=[10, 15],   # minimum samples required to split a node
    model__min_samples_leaf=[4, 6, 8],   # minimum samples required in a leaf node
)

In [83]:
# Exhaustive 5-fold cross-validated search over param_grid, using all CPU cores.
# Because y_train is on a log10 scale, the (negated) MAE used to rank the
# candidates is measured in log10 units, not in the original price units.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='neg_mean_absolute_error',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(x_train, y_train)

In [84]:
# Pipeline with the best-scoring parameter combination. The search above does not
# override `refit`, so with scikit-learn's default this estimator has been refitted
# on the whole of x_train / y_train.
best_model = grid_search.best_estimator_

In [85]:
# The model predicts on the log10 scale it was trained on; raising 10 to the
# prediction brings it back to the original 'Price_euros' scale.
y_pred = np.power(10, best_model.predict(x_validation))

# Mean absolute error on the hold-out set, in the original target units.
mean_absolute_error(y_true=y_validation, y_pred=y_pred)

204.16883380329557

In [86]:
# Display the back-transformed validation predictions (full array).
y_pred

array([1498.5109332 , 1093.54101645,  697.4742975 ,  905.94176888,
       1588.85905873,  442.1421945 ,  289.36910487,  256.97413381,
        459.09157805,  880.63093143,  397.20896421, 1791.94129357,
        675.54828755, 1440.1815214 ,  412.55120284, 1370.73822819,
        449.52888678, 1404.37624877,  259.09511608,  894.86162482,
        413.84608212, 1181.78367299, 1274.45837289,  459.04594008,
       1009.38145169,  668.05274317, 1286.09356372, 1236.82242996,
       1369.02007489, 1410.32499757, 1026.41533152, 2342.39135536,
        338.21206303,  675.31506461,  303.17756038,  463.83408021,
        248.03279542, 2453.8562125 , 1066.84918064, 2369.45836182,
       1065.06222437,  749.72930896, 1927.04090647,  450.4094372 ,
        414.45105728,  406.39305382, 1448.41246818,  601.5184666 ,
        923.10280917, 1232.47664175, 1223.03182744, 1607.66348583,
        755.21015561, 1337.62011946, 1150.28004012,  889.57881799,
        445.69732101,  534.3525624 ,  748.43696887, 1681.49619