In [1]:
import pandas as pd
import numpy as np
# from models import LinearReg
# from models import Dataprocessor

In [None]:
class Dataprocessor:
    def __init__(self, df):
        self.df = df
        self.mean_std = []
    def preprocess(self):
        df = self.df.copy()

        # 1. Limpieza básica
        df_clean = df.drop(columns=["Unnamed: 0", "Título", "Descripción"], errors="ignore")

        # 2. Precio en USD
        usd_conversion_rate = 1185.26
        df_clean["Precio_usd"] = np.where(
            df_clean["Moneda"] == "$",
            df_clean["Precio"] / usd_conversion_rate,
            df_clean["Precio"]
        )

        # 3. Antigüedad
        df_clean["Antigüedad"] = 2025 - df_clean["Año"]

        # 4. Kilómetros a número
        df_clean["Kilómetros"] = (
            df_clean["Kilómetros"]
            .astype(str)
            .str.replace(" km", "", regex=False)
            .str.replace(".", "", regex=False)
            .str.replace(",", "", regex=False)
            .astype(float)
        )

        # 5. Rellenar categóricos faltantes
        categorical_cols = ["Color", "Transmisión", "Motor", "Con cámara de retroceso"]
        for col in categorical_cols:
            if col in df_clean.columns:
                df_clean[col] = df_clean[col].fillna("desconocido")

        # 6. One-hot encoding (manual)
        def one_hot_encode(df, column):
            unique_values = df[column].unique()
            one_hot = np.zeros((df.shape[0], len(unique_values)), dtype=float)
            value_to_index = {val: i for i, val in enumerate(unique_values)}
            for i, val in enumerate(df[column]):
                one_hot[i, value_to_index[val]] = 1
            col_names = [f"{column}_{val}" for val in unique_values]
            one_hot_df = pd.DataFrame(one_hot, columns=col_names, index=df.index)
            return one_hot_df

        columns_to_encode = [
            "Marca", "Modelo", "Color", "Tipo de combustible",
            "Transmisión", "Motor", "Tipo de carrocería",
            "Con cámara de retroceso", "Tipo de vendedor"
        ]

        encoded_parts = []
        for col in columns_to_encode:
            if col in df_clean.columns:
                encoded_parts.append(one_hot_encode(df_clean, col))

        df_encoded = pd.concat(encoded_parts, axis=1) if encoded_parts else pd.DataFrame(index=df_clean.index)

        # 7. Eliminar columnas categóricas originales si existen
        df_clean = df_clean.drop(columns=[col for col in columns_to_encode if col in df_clean.columns])
        df_clean = df_clean.drop(columns=["Versión", "Moneda"], errors="ignore")

        # 8. Agregar columnas codificadas
        df_clean = pd.concat([df_clean, df_encoded], axis=1)


        return df_clean.reset_index(drop=True)
    def normalize(self, X):
        for i in range(X.shape[1]):
            col = X[:, i]
            mean = col.mean()
            std = col.std()
            std = 1 if std == 0 else std
            self.mean_std.append((mean, std))
            X[:, i] = (col - mean) / std
        return X
    def normalize_new_data(self, X):
        for i in range(X.shape[1]):
            col = X[:, i]
            mean = self.mean_std[i][0]
            std = self.mean_std[i][1]
            # std = 1 if std == 0 else std
            X[:, i] = (col - mean) / std
        return X
    def get_means_std(self):
        return self.mean_std
    
    
class LinearReg:
    def __init__(self, X, y, l1=0, l2=0):
        self.X = np.column_stack((np.ones(X.shape[0]), X))
        self.y = np.array(y)
        self.W = np.zeros(self.X.shape[1])
        self.l1 = l1
        self.l2 = l2

    def train_pinv(self, reg=0):
        if reg == "l2":
            X = self.X
            I = np.eye(self.X.shape[1])
            I[0, 0] = 0  # No regularizar el término de bias
            self.W = np.linalg.inv(X.T @ X + self.l2 * I) @ X.T @ self.y
        else:
            U, S, vt = np.linalg.svd(self.X , full_matrices=False)
            S_inv = np.diag(1 / S)
            p_inv = vt.T @ S_inv @ U.T 
            self.W = p_inv @ self.y
            
    def gd(self, lr, reg):
        y_pred = self.X @ self.W
        gradient = -2 * self.X.T @ (self.y - y_pred) / len(self.y)
        if reg == "l1":
            reg_term =  self.l1 * np.sign(self.W)
        elif reg == "l2":
            reg_term = 2 * self.l2 * self.W
        else:
            reg_term = 0
        self.W -= lr * (gradient + reg_term)
        
    def train_gd(self, lr, epochs, reg=0):
        for _ in range(epochs):
            self.gd(lr, reg)
    
    def print_W(self, feature_names):
        for title, w in zip(feature_names, self.W):
            print(f"{title}: {w}")
        return self.W
    
    def predict(self, X):
        X = np.column_stack((np.ones(X.shape[0]), X))
        return X @ self.W
    def mean_squared_error(self, y_true, y_pred):
        mse = np.mean((y_true - y_pred) ** 2)
        return mse
    
    def rmse(self, y_true, y_pred):
        return np.sqrt(self.mean_squared_error(y_true, y_pred))





In [14]:

# Cargar el dataset
df = pd.read_csv("pf_suvs_i302_1s2025.csv")

dp = Dataprocessor(df)
df =pd.DataFrame(dp.preprocess())



df_train = df.sample(frac=0.8, random_state=42)
y = df["Precio_usd"].values
X = df.drop(columns=["Precio_usd"]).values
df_val = df.drop(df_train.index)
y_val = df_val["Precio_usd"].values
X_val = df_val.drop(columns=["Precio_usd"]).values

dp_train = Dataprocessor(df_train)
X = dp_train.normalize(X)
X_val = dp_train.normalize_new_data(X_val)


In [15]:
print(X.shape, y.shape)
print(X_val.shape, y_val.shape)

(18254, 552) (18254,)
(3651, 552) (3651,)


In [26]:
model = LinearReg(X, y,0, 0.001)
model.train_gd(0.01, 1000, "l2")
y_pred = model.predict(X_val)
rmse = model.rmse(y_val, y_pred)
print(f"RMSE: {rmse:.2f}")


RMSE: 8850.61


In [27]:
model.train_pinv("l2")
y_pred_pinv = model.predict(X_val)
rmse_pinv = model.rmse(y_val, y_pred_pinv)
print(f"RMSE con Pseudo-inversa: {rmse_pinv:.2f}")


RMSE con Pseudo-inversa: 8825.13


In [28]:
model.train_pinv()
y_pred_pinv = model.predict(X_val)
rmse_pinv = model.rmse(y_val, y_pred_pinv)
print(f"RMSE con Pseudo-inversa: {rmse_pinv:.2f}")


RMSE con Pseudo-inversa: 12577.43
