In [1]:
import pandas as pd
import os
import numpy as np

# Resolve the repository root by climbing two directories from the working
# directory (expected layout: notebooks/diego/ -> notebooks/ -> repo root)
notebook_dir = os.getcwd()
repo_root = os.path.abspath(os.path.join(notebook_dir, "..", ".."))

# Location of the raw dataset inside the repository
data_path = os.path.join(repo_root, "data", "car_sales_dataset.csv")
print("Ruta usada:", data_path)

# Read the CSV into a DataFrame and preview its first rows
df = pd.read_csv(data_path)
df.head()



Ruta usada: c:\Users\tecno\Documents\CUNEF 3º\Herramientas de Trabajo Colaborativo\car-sales-prediction\data\car_sales_dataset.csv


Unnamed: 0,Manufacturer,Model,Engine size,Fuel type,Year of manufacture,Mileage,Price
0,Ford,Fiesta,1.0,Petrol,2002,127300,3074
1,Porsche,718 Cayman,4.0,Petrol,2016,57850,49704
2,Ford,Mondeo,1.6,Diesel,2014,39190,24072
3,Toyota,RAV4,1.8,Hybrid,1988,210814,1705
4,VW,Polo,1.0,Petrol,2006,127869,4101


In [2]:
# Check the dataset dimensions as a (rows, columns) tuple
df.shape


(50000, 7)

In [3]:

# Inspect the dtype pandas assigned to each column
df.dtypes


Manufacturer            object
Model                   object
Engine size            float64
Fuel type               object
Year of manufacture      int64
Mileage                  int64
Price                    int64
dtype: object

In [4]:

# Basic summary statistics; include="all" also reports the non-numeric
# columns (count / unique / top / freq) alongside the numeric ones
df.describe(include="all")


Unnamed: 0,Manufacturer,Model,Engine size,Fuel type,Year of manufacture,Mileage,Price
count,50000,50000,50000.0,50000,50000.0,50000.0,50000.0
unique,5,15,,3,,,
top,Ford,Mondeo,,Petrol,,,
freq,14959,5058,,25488,,,
mean,,,1.773058,,2004.20944,112497.3207,13828.90316
std,,,0.734108,,9.645965,71632.515602,16416.681336
min,,,1.0,,1984.0,630.0,76.0
25%,,,1.4,,1996.0,54352.25,3060.75
50%,,,1.6,,2004.0,100987.5,7971.5
75%,,,2.0,,2012.0,158601.0,19026.5


## Análisis de valores nulos

Comprobamos si el dataset contiene valores faltantes que requieran imputación.


In [12]:
# Missing values per column (isna is the canonical name of the isnull alias)
df.isna().sum()


Manufacturer           0
Model                  0
Engine size            0
Fuel type              0
Year of manufacture    0
Mileage                0
Price                  0
dtype: int64

No hay valores nulos en ninguna columna, por lo que no es necesaria ninguna imputación.

## Eliminación de duplicados

Verificamos si existen filas duplicadas en el dataset y las eliminamos en caso necesario.


In [5]:
# Count rows that are exact copies (all columns equal) of an earlier row
duplicados = df.duplicated().sum()
print("Número de duplicados:", duplicados)

if duplicados > 0:
    # Keep the first occurrence of each row and rebuild a contiguous 0..n-1
    # index: drop_duplicates() alone leaves gaps in the index, which would
    # misalign `df` against any frame built later from a plain array.
    df = df.drop_duplicates().reset_index(drop=True)
    print("Duplicados eliminados. Nuevas dimensiones:", df.shape)
else:
    print("No se han encontrado duplicados.")


Número de duplicados: 12
Duplicados eliminados. Nuevas dimensiones: (49988, 7)


## Identificación de variables numéricas y categóricas
Separaremos las columnas en:
- Variables numéricas
- Variables categóricas

Este paso es necesario para la codificación y el escalado.


In [12]:
# Feature columns grouped by type; the target column ("Price") is left out
num_cols = [
    "Engine size",
    "Year of manufacture",
    "Mileage",
]
cat_cols = [
    "Manufacturer",
    "Model",
    "Fuel type",
]

print("Numéricas:", num_cols)
print("Categóricas:", cat_cols)



Numéricas: ['Engine size', 'Year of manufacture', 'Mileage']
Categóricas: ['Manufacturer', 'Model', 'Fuel type']


## Codificación de variables categóricas

Las columnas:
- Manufacturer
- Model
- Fuel type

serán transformadas mediante **One Hot Encoding** con `OneHotEncoder`, eliminando la primera categoría de cada variable para evitar multicolinealidad (`drop="first"`).


In [15]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

# Numeric features: a single scaling step
numeric_pipeline = Pipeline([("scaler", StandardScaler())])

# Categorical features: one-hot encoding that drops the first category of
# each column; handle_unknown="ignore" is kept exactly as configured before
one_hot = OneHotEncoder(drop="first", handle_unknown="ignore")
categorical_pipeline = Pipeline([("encoder", one_hot)])

# Route each column group through its own pipeline
column_steps = [
    ("num", numeric_pipeline, num_cols),
    ("cat", categorical_pipeline, cat_cols),
]
preprocessor = ColumnTransformer(column_steps)


In [16]:
# Feature matrix: every column except the target
X = df.drop(columns=["Price"])

# Fit the preprocessor on the features only and transform them in one pass.
# NOTE(review): the scaler and encoder are fitted on the whole dataset; if a
# train/test split is made later, refit on the training split only.
X_processed = preprocessor.fit_transform(X)

# Column names generated by the one-hot encoder for the categorical features
encoded_cat_cols = (
    preprocessor.named_transformers_["cat"]
    .named_steps["encoder"]
    .get_feature_names_out(cat_cols)
)

# Final column order follows the transformer declaration: numeric first,
# then the encoded categorical columns
all_cols = num_cols + encoded_cat_cols.tolist()

print("Número de columnas finales:", len(all_cols))

# The transformer may return a sparse matrix; densify it before building the frame
dense_features = X_processed.toarray() if hasattr(X_processed, "toarray") else X_processed

# Reuse the index of X so each processed row stays label-aligned with `df`
# (and therefore with df["Price"]) even if rows were dropped earlier
df_processed = pd.DataFrame(dense_features, columns=all_cols, index=X.index)

df_processed.head()

Número de columnas finales: 23


Unnamed: 0,Engine size,Year of manufacture,Mileage,Manufacturer_Ford,Manufacturer_Porsche,Manufacturer_Toyota,Manufacturer_VW,Model_911,Model_Cayenne,Model_Fiesta,...,Model_Mondeo,Model_Passat,Model_Polo,Model_Prius,Model_RAV4,Model_X3,Model_Yaris,Model_Z4,Fuel type_Hybrid,Fuel type_Petrol
0,-1.053121,-0.229073,0.206418,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,3.033285,1.222312,-0.763234,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,-0.235839,1.014971,-1.023762,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.036588,-1.680458,1.37243,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
4,-1.053121,0.185608,0.214363,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


## Guardado del dataset procesado

Guardamos el dataset final en `data/processed/car_sales_processed.csv`, dentro de la raíz del repositorio.



In [8]:
# Make sure the output directory data/processed/ exists under the repo root
processed_path = os.path.join(repo_root, "data", "processed")
os.makedirs(processed_path, exist_ok=True)

output_path = os.path.join(processed_path, "car_sales_processed.csv")

# Persist the scaled/encoded features rather than the raw `df`, which was
# what this cell wrote before. The target is attached by position
# (to_numpy) so the result does not depend on the two frames sharing
# index labels; df_processed was built row-for-row from df.
assert len(df_processed) == len(df), "df_processed and df must have the same number of rows"
df_to_save = df_processed.assign(Price=df["Price"].to_numpy())
df_to_save.to_csv(output_path, index=False)

print("Dataset procesado guardado en:", output_path)


Dataset procesado guardado en: c:\Users\tecno\Documents\CUNEF 3º\Herramientas de Trabajo Colaborativo\car-sales-prediction\data\processed\car_sales_processed.csv
