# Entrenamiento y evaluación.

## Importaciones.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from sklearn.preprocessing import StandardScaler, PolynomialFeatures

from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeRegressor

from sklearn.decomposition import PCA

from sklearn import metrics

import pickle

## Normalización.

In [2]:
df = pd.read_csv("../data/raw/coches_segunda_mano_modelos.csv")

In [3]:
x = df[["year", "kms", "power", "antiquity", "num_make"]]
y = df["price"]

scaler = StandardScaler()
x_scaler = scaler.fit_transform(x)

x_train, x_test, y_train, y_test = train_test_split(x_scaler, y, test_size=0.2, random_state=10)

## Modelos.

### Modelo Lineal Regresion.

In [6]:
modelo_lr = LinearRegression()
modelo_lr.fit(x_train, y_train)

predictions = modelo_lr.predict(x_test)

print("MAE:", metrics.mean_absolute_error(y_test, predictions))
print("MAPE:", metrics.mean_absolute_percentage_error(y_test, predictions))
print("MSE:", metrics.mean_squared_error(y_test, predictions))
print("RMSE:", np.sqrt(metrics.mean_squared_error(y_test, predictions)))
print("r2_score train", modelo_lr.score(x_train, y_train))
print("r2_score test",modelo_lr.score(x_test, y_test))

MAE: 4487.438291166739
MAPE: 0.5116388945296748
MSE: 48313149.71569854
RMSE: 6950.766124370647
r2_score train 0.7268439407466206
r2_score test 0.7126410091983857


In [None]:
with open('../models/modelo_lr.pkl', "wb") as archivo:
    pickle.dump(modelo_lr, archivo)

### Modelo Lineal Regresion con regresion polinomial de 3 

In [8]:
scaler = StandardScaler()
x_scaler = scaler.fit_transform(x)

poly_feats = PolynomialFeatures(degree = 3)
poly_feats.fit(x_scaler)
X_poly = poly_feats.transform(x_scaler)

x_train, x_test, y_train, y_test = train_test_split(X_poly,y, test_size = 0.2, random_state=12)

lin_reg = LinearRegression()
lin_reg.fit(x_train, y_train)

predictions = lin_reg.predict(x_test)

print("MAE:", metrics.mean_absolute_error(y_test, predictions))
print("MAPE:", metrics.mean_absolute_percentage_error(y_test, predictions))
print("MSE:", metrics.mean_squared_error(y_test, predictions))
print("RMSE:", np.sqrt(metrics.mean_squared_error(y_test, predictions)))
print("r2_score train", lin_reg.score(x_train, y_train))
print("r2_score test",lin_reg.score(x_test, y_test))


MAE: 3295.5444178422294
MAPE: 0.27977775418798806
MSE: 29281568.984970607
RMSE: 5411.244679828349
r2_score train 0.8227704513775838
r2_score test 0.8139934799247438


In [None]:
with open('../models/modelo_polinomial_regression.pkl', "wb") as archivo:
    pickle.dump(modelo_lr, archivo)

### Modelo DecisionTreeRegressor

In [None]:
from sklearn.tree import DecisionTreeRegressor

tree_reg = DecisionTreeRegressor(max_depth=20, random_state=42)
tree_reg.fit(x_train, y_train)

predictions = tree_reg.predict(x_test)

print("MAE:", metrics.mean_absolute_error(y_test, predictions))
print("MAPE:", metrics.mean_absolute_percentage_error(y_test, predictions))
print("MSE:", metrics.mean_squared_error(y_test, predictions))
print("RMSE:", np.sqrt(metrics.mean_squared_error(y_test, predictions)))
print("r2_score train", lin_reg.score(x_train, y_train))
print("r2_score test",lin_reg.score(x_test, y_test))

MAE: 2966.160747500418
MAPE: 0.24944425430939107
MSE: 28731630.01717145
RMSE: 5360.189363928428
r2_score train 0.8227704513775838
r2_score test 0.8139934799247438


In [None]:
with open('../models/modelo_DecisionTreeRegressor.pkl', "wb") as archivo:
    pickle.dump(modelo_lr, archivo)

### Modelo KNN.

In [24]:
knn_reg = KNeighborsRegressor(n_neighbors=3)

knn_reg.fit(x_train, y_train)

predictions_knn = knn_reg.predict(x_test)

print("MAE:", metrics.mean_absolute_error(y_test, predictions_knn))
print("MAPE:", metrics.mean_absolute_percentage_error(y_test, predictions_knn))
print("MSE:", metrics.mean_squared_error(y_test, predictions_knn))
print("RMSE:", np.sqrt(metrics.mean_squared_error(y_test, predictions_knn)))
print("r2_score train", knn_reg.score(x_train, y_train))
print("r2_score test",knn_reg.score(x_test, y_test))

MAE: 2799.419974651458
MAPE: 0.23322760723384625
MSE: 23167451.34342205
RMSE: 4813.2578721092905
r2_score train 0.9389804225670646
r2_score test 0.8622036551815196


In [17]:
with open('../models/modelo_KNeighborsRegressor.pkl', "wb") as archivo:
    pickle.dump(knn_reg, archivo)

### Modelo Pipeline (PCA y KNeighborsRegressor)

In [24]:
pipeline = Pipeline([
    ('scaler', StandardScaler()),  # Paso de estandarización
    ('pca', PCA(n_components=3)),  # Paso de reducción de dimensionalidad con PCA
    ('knn', KNeighborsRegressor(n_neighbors=3))  # Paso de clasificación con KNN
])

# Ajustamos el modelo en el conjunto de entrenamiento
pipeline.fit(x_train, y_train)

prediciones_pipeline = pipeline.predict(x_test)

print("MAE:", metrics.mean_absolute_error(y_test, prediciones_pipeline))
print("MAPE:", metrics.mean_absolute_percentage_error(y_test, prediciones_pipeline))
print("MSE:", metrics.mean_squared_error(y_test, prediciones_pipeline))
print("RMSE:", np.sqrt(metrics.mean_squared_error(y_test, prediciones_pipeline)))
print("r2_score train", pipeline.score(x_train, y_train))
print("r2_score test",pipeline.score(x_test, y_test))

MAE: 2684.0390367553864
MAPE: 0.22032871051583805
MSE: 21920108.96561047
RMSE: 4681.891601223855
r2_score train 0.9410835035734734
r2_score test 0.8696226508168954


In [25]:
with open('../models/modelo_Pipeline.pkl', "wb") as archivo:
    pickle.dump(pipeline, archivo)