In [180]:
from basepc_mal import *

from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import mean_absolute_error
from sklearn.compose import ColumnTransformer

In [181]:
df_train = pd.read_csv('./data/train_limpio_pipeline.csv')

In [182]:
# df_test = pd.read_csv('./data/test_limpio.csv')

In [183]:
train_set, validation_set = train_test_split(df_train, test_size= 0.2, random_state= 42)

In [184]:
x_train = train_set.drop(['Price_euros'], axis= 1)
y_train = train_set['Price_euros']

x_validation = validation_set.drop(['Price_euros'], axis= 1)
y_validation = validation_set['Price_euros']

In [185]:
y_train = np.log10(y_train)

In [186]:
model_catboost = CatBoostRegressor()

In [187]:
categoricas = ['Company', 'Product', 'TypeName', 'ScreenResolution', 'Memory', 'Gpu', 'OpSys']
numericas = ['Inches', 'Cpu', 'Ram', 'Weight']

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown= 'ignore'), categoricas),
        ('num', StandardScaler(), numericas)
        ])

pipeline = Pipeline(steps=[
('preprocessor', preprocessor),
('model', model_catboost)
])

In [188]:
param_grid = {
    'model__iterations': [100, 200, 500],
    'model__depth': [4, 6, 10],
    'model__learning_rate': [0.01, 0.1, 0.3],
    'model__l2_leaf_reg': [1, 3, 5]
}

In [189]:
grid_search = GridSearchCV(pipeline, param_grid, cv=5, n_jobs= -1, scoring= 'neg_mean_absolute_error', verbose= 0)
grid_search.fit(x_train, y_train)

0:	learn: 0.2526376	total: 27.8ms	remaining: 13.9s
1:	learn: 0.2365349	total: 33.7ms	remaining: 8.38s
2:	learn: 0.2212327	total: 58.4ms	remaining: 9.68s
3:	learn: 0.2075689	total: 83.2ms	remaining: 10.3s
4:	learn: 0.1953277	total: 107ms	remaining: 10.6s
5:	learn: 0.1848094	total: 133ms	remaining: 11s
6:	learn: 0.1752315	total: 162ms	remaining: 11.4s
7:	learn: 0.1670168	total: 187ms	remaining: 11.5s
8:	learn: 0.1597856	total: 200ms	remaining: 10.9s
9:	learn: 0.1542981	total: 203ms	remaining: 9.94s
10:	learn: 0.1481441	total: 227ms	remaining: 10.1s
11:	learn: 0.1431608	total: 252ms	remaining: 10.2s
12:	learn: 0.1381421	total: 279ms	remaining: 10.5s
13:	learn: 0.1334618	total: 313ms	remaining: 10.9s
14:	learn: 0.1292228	total: 340ms	remaining: 11s
15:	learn: 0.1247121	total: 355ms	remaining: 10.7s
16:	learn: 0.1207487	total: 382ms	remaining: 10.8s
17:	learn: 0.1173766	total: 412ms	remaining: 11s
18:	learn: 0.1144759	total: 442ms	remaining: 11.2s
19:	learn: 0.1115366	total: 469ms	remaining

In [190]:
best_model = grid_search.best_estimator_

In [191]:
y_pred = best_model.predict(x_validation)
y_pred = np.power(10, y_pred)

mean_absolute_error(y_validation, y_pred)

160.31714582014948

In [192]:
df_test = pd.read_csv('./data/test_limpio_pipeline.csv')

In [196]:
y_pred_test = best_model.predict(df_test.drop(['id'], axis= 1))
y_pred_test = np.power(10, y_pred_test)



In [199]:
df_test['Price_euros'] = y_pred_test

submision = df_test[['id', 'Price_euros']]
submision

Unnamed: 0,id,Price_euros
0,181,1473.193907
1,708,612.825355
2,862,348.085703
3,1064,1441.308959
4,702,1090.285257
...,...,...
386,1281,876.673275
387,524,1739.971747
388,1015,515.990176
389,1236,572.885419


In [200]:
submision.to_csv("submission.csv", index = False)