# Entrenamiento y evaluación.

## Importaciones.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from sklearn.preprocessing import StandardScaler, PolynomialFeatures

from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeRegressor

from sklearn.decomposition import PCA

from sklearn import metrics

import pickle

## Normalización.

In [29]:
df = pd.read_csv("../data/train/coches_train.csv")

In [30]:
#x = df[["kms", "power", "antiquity", "doors", "num_make"]]
x = df.drop(columns=["make", "model", "version", "fuel", "shift", "color"])
y = df["price"]

scaler = StandardScaler()
x_scaler = scaler.fit_transform(x)

x_train, x_test, y_train, y_test = train_test_split(x_scaler, y, test_size=0.2, random_state=10)

## Modelos.

### Modelo Lineal Regresion.

In [37]:
modelo_lr = LinearRegression()
modelo_lr.fit(x_train, y_train)

predictions = modelo_lr.predict(x_test)

print("MAE:", metrics.mean_absolute_error(y_test, predictions))
print("MAPE:", metrics.mean_absolute_percentage_error(y_test, predictions))
print("MSE:", metrics.mean_squared_error(y_test, predictions))
print("RMSE:", np.sqrt(metrics.mean_squared_error(y_test, predictions)))
print("r2_score train", modelo_lr.score(x_train, y_train))
print("r2_score test",modelo_lr.score(x_test, y_test))

MAE: 20.114965379109844
MAPE: 0.0005170942256873131
MSE: 2523159.3856109506
RMSE: 1588.4455878659962
r2_score train 1.0
r2_score test 0.9674993195340941


In [None]:
with open('../models/modelo_lr.pkl', "wb") as archivo:
    pickle.dump(modelo_lr, archivo)

### Modelo Lineal Regresion con regresion polinomial de 3 

In [38]:
scaler = StandardScaler()
x_scaler = scaler.fit_transform(x)

poly_feats = PolynomialFeatures(degree = 3)
poly_feats.fit(x_scaler)
X_poly = poly_feats.transform(x_scaler)

x_train, x_test, y_train, y_test = train_test_split(X_poly,y, test_size = 0.2, random_state=12)

lin_reg = LinearRegression()
lin_reg.fit(x_train, y_train)

predictions = lin_reg.predict(x_test)

print("MAE:", metrics.mean_absolute_error(y_test, predictions))
print("MAPE:", metrics.mean_absolute_percentage_error(y_test, predictions))
print("MSE:", metrics.mean_squared_error(y_test, predictions))
print("RMSE:", np.sqrt(metrics.mean_squared_error(y_test, predictions)))
print("r2_score train", lin_reg.score(x_train, y_train))
print("r2_score test",lin_reg.score(x_test, y_test))


MemoryError: Unable to allocate 29.2 GiB for an array with shape (31177, 125580) and data type float64

In [None]:
with open('../models/modelo_polinomial_regression.pkl', "wb") as archivo:
    pickle.dump(modelo_lr, archivo)

### Modelo DecisionTreeRegressor

In [36]:
from sklearn.tree import DecisionTreeRegressor

tree_reg = DecisionTreeRegressor(max_depth=20, random_state=42)
tree_reg.fit(x_train, y_train)

predictions = tree_reg.predict(x_test)

print("MAE:", metrics.mean_absolute_error(y_test, predictions))
print("MAPE:", metrics.mean_absolute_percentage_error(y_test, predictions))
print("MSE:", metrics.mean_squared_error(y_test, predictions))
print("RMSE:", np.sqrt(metrics.mean_squared_error(y_test, predictions)))
print("r2_score train", tree_reg.score(x_train, y_train))
print("r2_score test",tree_reg.score(x_test, y_test))

MAE: 0.7148813341885825
MAPE: 6.090478901409408e-05
MSE: 30.001282873636946
RMSE: 5.4773426836046095
r2_score train 1.0
r2_score test 0.9999996135550874


In [None]:
with open('../models/modelo_DecisionTreeRegressor.pkl', "wb") as archivo:
    pickle.dump(modelo_lr, archivo)

### Modelo KNN.

In [34]:
knn_reg = KNeighborsRegressor(n_neighbors=3)

knn_reg.fit(x_train, y_train)

predictions_knn = knn_reg.predict(x_test)

print("MAE:", metrics.mean_absolute_error(y_test, predictions_knn))
print("MAPE:", metrics.mean_absolute_percentage_error(y_test, predictions_knn))
print("MSE:", metrics.mean_squared_error(y_test, predictions_knn))
print("RMSE:", np.sqrt(metrics.mean_squared_error(y_test, predictions_knn)))
print("r2_score train", knn_reg.score(x_train, y_train))
print("r2_score test",knn_reg.score(x_test, y_test))

MAE: 833.4043724609793
MAPE: 0.10643851722146545
MSE: 2099369.5040802513
RMSE: 1448.9201165282548
r2_score train 0.9884059799346457
r2_score test 0.9729581342260475


In [17]:
with open('../models/modelo_KNeighborsRegressor.pkl', "wb") as archivo:
    pickle.dump(knn_reg, archivo)

### Modelo Pipeline (PCA y KNeighborsRegressor)

In [35]:
pipeline = Pipeline([
    ('scaler', StandardScaler()),  # Paso de estandarización
    ('pca', PCA(n_components=3)),  # Paso de reducción de dimensionalidad con PCA
    ('knn', KNeighborsRegressor(n_neighbors=3))  # Paso de clasificación con KNN
])

# Ajustamos el modelo en el conjunto de entrenamiento
pipeline.fit(x_train, y_train)

prediciones_pipeline = pipeline.predict(x_test)

print("MAE:", metrics.mean_absolute_error(y_test, prediciones_pipeline))
print("MAPE:", metrics.mean_absolute_percentage_error(y_test, prediciones_pipeline))
print("MSE:", metrics.mean_squared_error(y_test, prediciones_pipeline))
print("RMSE:", np.sqrt(metrics.mean_squared_error(y_test, prediciones_pipeline)))
print("r2_score train", pipeline.score(x_train, y_train))
print("r2_score test",pipeline.score(x_test, y_test))

MAE: 1482.5583707504811
MAPE: 0.18589473994936484
MSE: 6002104.013897799
RMSE: 2449.9191851768906
r2_score train 0.9601579733388461
r2_score test 0.9226872207157101


In [25]:
with open('../models/modelo_Pipeline.pkl', "wb") as archivo:
    pickle.dump(pipeline, archivo)