# Entrenamiento y evaluación.

## Importaciones.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from sklearn.preprocessing import StandardScaler, PolynomialFeatures

from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeRegressor

from sklearn.decomposition import PCA

from sklearn import metrics

import pickle

## Normalización.

In [2]:
df = pd.read_csv("../data/train/coches_train.csv")

In [28]:
# x = df[["kms", "power", "antiquity", "doors", "num_make"]]
x = df[["kms", "power", "antiquity", "doors"]]
#x = df.drop(columns=["make", "model", "version", "fuel", "shift", "color"])
y = df["price"]

scaler = StandardScaler()
x_scaler = scaler.fit_transform(x)

x_train, x_test, y_train, y_test = train_test_split(x_scaler, y, test_size=0.2, random_state=10)

## Modelos.

### Modelo Lineal Regresion.

In [29]:
modelo_lr = LinearRegression()
modelo_lr.fit(x_train, y_train)

predictions = modelo_lr.predict(x_test)

print("MAE:", metrics.mean_absolute_error(y_test, predictions))
print("MAPE:", metrics.mean_absolute_percentage_error(y_test, predictions))
print("MSE:", metrics.mean_squared_error(y_test, predictions))
print("RMSE:", np.sqrt(metrics.mean_squared_error(y_test, predictions)))
print("r2_score train", modelo_lr.score(x_train, y_train))
print("r2_score test",modelo_lr.score(x_test, y_test))

MAE: 3583.7838967012826
MAPE: 0.4268264598971
MSE: 22657410.021619365
RMSE: 4759.980044245917
r2_score train 0.7099096036994559
r2_score test 0.7081511189910973


In [5]:
with open('../models/modelo_lr.pkl', "wb") as archivo:
    pickle.dump(modelo_lr, archivo)

### Modelo Lineal Regresion con regresion polinomial de 3 

In [30]:
scaler = StandardScaler()
x_scaler = scaler.fit_transform(x)

poly_feats = PolynomialFeatures(degree = 3)
poly_feats.fit(x_scaler)
X_poly = poly_feats.transform(x_scaler)

x_train, x_test, y_train, y_test = train_test_split(X_poly,y, test_size = 0.2, random_state=12)

lin_reg = LinearRegression()
lin_reg.fit(x_train, y_train)

predictions = lin_reg.predict(x_test)

print("MAE:", metrics.mean_absolute_error(y_test, predictions))
print("MAPE:", metrics.mean_absolute_percentage_error(y_test, predictions))
print("MSE:", metrics.mean_squared_error(y_test, predictions))
print("RMSE:", np.sqrt(metrics.mean_squared_error(y_test, predictions)))
print("r2_score train", lin_reg.score(x_train, y_train))
print("r2_score test",lin_reg.score(x_test, y_test))


MAE: 2700.360114625511
MAPE: 0.2543162655538583
MSE: 14408751.691395836
RMSE: 3795.886153639995
r2_score train 0.815680588895998
r2_score test 0.818031430739291


In [None]:
x_test

In [None]:
with open('../models/modelo_polinomial_regression.pkl', "wb") as archivo:
    pickle.dump(modelo_lr, archivo)

### Modelo DecisionTreeRegressor

In [31]:
from sklearn.tree import DecisionTreeRegressor

tree_reg = DecisionTreeRegressor(max_depth=20, random_state=42)
tree_reg.fit(x_train, y_train)

predictions = tree_reg.predict(x_test)

print("MAE:", metrics.mean_absolute_error(y_test, predictions))
print("MAPE:", metrics.mean_absolute_percentage_error(y_test, predictions))
print("MSE:", metrics.mean_squared_error(y_test, predictions))
print("RMSE:", np.sqrt(metrics.mean_squared_error(y_test, predictions)))
print("r2_score train", tree_reg.score(x_train, y_train))
print("r2_score test",tree_reg.score(x_test, y_test))

MAE: 2910.1066866179317
MAPE: 0.2675354869098931
MSE: 19200955.48250685
RMSE: 4381.889487710393
r2_score train 0.9678626896376712
r2_score test 0.7575105413415683


In [None]:
with open('../models/modelo_DecisionTreeRegressor.pkl', "wb") as archivo:
    pickle.dump(modelo_lr, archivo)

### Modelo KNN.

In [32]:
knn_reg = KNeighborsRegressor(n_neighbors=3)

knn_reg.fit(x_train, y_train)

predictions_knn = knn_reg.predict(x_test)

print("MAE:", metrics.mean_absolute_error(y_test, predictions_knn))
print("MAPE:", metrics.mean_absolute_percentage_error(y_test, predictions_knn))
print("MSE:", metrics.mean_squared_error(y_test, predictions_knn))
print("RMSE:", np.sqrt(metrics.mean_squared_error(y_test, predictions_knn)))
print("r2_score train", knn_reg.score(x_train, y_train))
print("r2_score test",knn_reg.score(x_test, y_test))

MAE: 2692.8170836005993
MAPE: 0.2545390234779409
MSE: 15220307.756360913
RMSE: 3901.3212834065475
r2_score train 0.9024759174138186
r2_score test 0.8077822641786143


In [23]:
with open('../models/modelo_KNeighborsRegressor.pkl', "wb") as archivo:
    pickle.dump(knn_reg, archivo)

### Modelo Pipeline (PCA y KNeighborsRegressor)

In [33]:
pipeline = Pipeline([
    ('scaler', StandardScaler()),  # Paso de estandarización
    ('pca', PCA(n_components=3)),  # Paso de reducción de dimensionalidad con PCA
    ('knn', KNeighborsRegressor(n_neighbors=3))  # Paso de clasificación con KNN
])

# Ajustamos el modelo en el conjunto de entrenamiento
pipeline.fit(x_train, y_train)

prediciones_pipeline = pipeline.predict(x_test)

print("MAE:", metrics.mean_absolute_error(y_test, prediciones_pipeline))
print("MAPE:", metrics.mean_absolute_percentage_error(y_test, prediciones_pipeline))
print("MSE:", metrics.mean_squared_error(y_test, prediciones_pipeline))
print("RMSE:", np.sqrt(metrics.mean_squared_error(y_test, prediciones_pipeline)))
print("r2_score train", pipeline.score(x_train, y_train))
print("r2_score test",pipeline.score(x_test, y_test))

MAE: 2735.1908809065644
MAPE: 0.258482061975968
MSE: 15736156.80413014
RMSE: 3966.882504452349
r2_score train 0.8990665470351993
r2_score test 0.8012675906532791


In [25]:
with open('../models/modelo_Pipeline.pkl', "wb") as archivo:
    pickle.dump(pipeline, archivo)