# Car Price Data Analysis

In [47]:
import pandas as pd
import kaggle as kg
import zipfile as zip
import subprocess
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
import numpy as np

kaggle = True

In [18]:

def get_kaggle_dataset(dataset):
    """
    Downloads a dataset from Kaggle using the Kaggle API.

    This function executes the `kaggle datasets download` command to fetch 
    the specified dataset from Kaggle. It handles errors gracefully and 
    prints relevant messages based on the download status.
    
    unzio the compress file into data folder in the project folder

    Args:
        dataset (str): The Kaggle dataset identifier in the format 
                      "owner/dataset-name" (e.g., "asinow/car-price-dataset").

    Raises:
        subprocess.CalledProcessError: If the command execution fails, 
                                       an error message is displayed.

    Example:
        get_kaggle_dataset("asinow/car-price-dataset")
    """
    try:
        # execute the kaggle commando 
        result = subprocess.run(
            ["kaggle", "datasets", "download", "-d", dataset],
            check=True,
            capture_output=True,
            text=True
        )
        print("✅ Dataset donwloaded with succesfully!")
        print(result.stdout)
        print("Uncompress data file in data folder")
        
        #unzio file
        with zip.ZipFile(dataset.split('/')[-1]+".zip", 'r') as zip_ref:
            zip_ref.extractall("data")
        
    except subprocess.CalledProcessError as e:
        print("❌ Error descargando dataset ->", e.stderr)


# Download and read kaggle dataset

In [20]:
# Llamada a la función con el dataset de ejemplo
get_kaggle_dataset("asinow/car-price-dataset")

df_data = pd.read_csv('./data/car_price_dataset.csv', sep=',')

✅ Dataset donwloaded with succesfully!
Dataset URL: https://www.kaggle.com/datasets/asinow/car-price-dataset
License(s): other
car-price-dataset.zip: Skipping, found more recently modified local copy (use --force to force download)

Uncompress data file in data folder


In [21]:
df_data

Unnamed: 0,Brand,Model,Year,Engine_Size,Fuel_Type,Transmission,Mileage,Doors,Owner_Count,Price
0,Kia,Rio,2020,4.2,Diesel,Manual,289944,3,5,8501
1,Chevrolet,Malibu,2012,2.0,Hybrid,Automatic,5356,2,3,12092
2,Mercedes,GLA,2020,4.2,Diesel,Automatic,231440,4,2,11171
3,Audi,Q5,2023,2.0,Electric,Manual,160971,2,1,11780
4,Volkswagen,Golf,2003,2.6,Hybrid,Semi-Automatic,286618,3,3,2867
...,...,...,...,...,...,...,...,...,...,...
9995,Kia,Optima,2004,3.7,Diesel,Semi-Automatic,5794,2,4,8884
9996,Chevrolet,Impala,2002,1.4,Electric,Automatic,168000,2,1,6240
9997,BMW,3 Series,2010,3.0,Petrol,Automatic,86664,5,1,9866
9998,Ford,Explorer,2002,1.4,Hybrid,Automatic,225772,4,1,4084


In [22]:
df_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Brand         10000 non-null  object 
 1   Model         10000 non-null  object 
 2   Year          10000 non-null  int64  
 3   Engine_Size   10000 non-null  float64
 4   Fuel_Type     10000 non-null  object 
 5   Transmission  10000 non-null  object 
 6   Mileage       10000 non-null  int64  
 7   Doors         10000 non-null  int64  
 8   Owner_Count   10000 non-null  int64  
 9   Price         10000 non-null  int64  
dtypes: float64(1), int64(5), object(4)
memory usage: 781.4+ KB


In [23]:
df_data.describe()

Unnamed: 0,Year,Engine_Size,Mileage,Doors,Owner_Count,Price
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,2011.5437,3.00056,149239.1118,3.4971,2.9911,8852.9644
std,6.897699,1.149324,86322.348957,1.110097,1.422682,3112.59681
min,2000.0,1.0,25.0,2.0,1.0,2000.0
25%,2006.0,2.0,74649.25,3.0,2.0,6646.0
50%,2012.0,3.0,149587.0,3.0,3.0,8858.5
75%,2017.0,4.0,223577.5,4.0,4.0,11086.5
max,2023.0,5.0,299947.0,5.0,5.0,18301.0


In [24]:
df_data.isna().count()

Brand           10000
Model           10000
Year            10000
Engine_Size     10000
Fuel_Type       10000
Transmission    10000
Mileage         10000
Doors           10000
Owner_Count     10000
Price           10000
dtype: int64

# EDA Cars

In [None]:
for column in df_data.columns:
    fig = px.histogram(df_data,
                       x=column,
                       nbins=30)
    fig.update_layout(title='Distribution of '+column)
    
    if kaggle:
        fig.show(renderer='iframe_connected')
    else:
        fig.show()

In [None]:
df_corr = df_data[['Year',
                   'Engine_Size',
                   'Mileage',
                   'Doors',
                   'Owner_Count',
                   'Price']].corr(method='pearson')

fig = go.Figure(go.Heatmap(x = df_corr.columns,
                           y = df_corr.columns,
                           z = df_corr.values.tolist(),
                           colorscale = 'rdbu',
                           zmin = -1,
                           zmax = 1))

fig.update_layout(width = 800,
                  height = 700)

if kaggle:
    fig.show(renderer='iframe_connected')
else:
    fig.show()

In [39]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Seleccionar características (X) y variable objetivo (y)
X = df_data.drop(columns=["Price"])  # Variables predictoras
y = df_data["Price"]  # Variable objetivo

# Identificar columnas categóricas
categorical_cols = X.select_dtypes(include=["object"]).columns

# Convertir variables categóricas a numéricas usando Label Encoding
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])
    label_encoders[col] = le  # Guardamos el encoder por si lo necesitamos después

# Dividir en conjunto de entrenamiento (80%) y prueba (20%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Mostrar tamaño de los conjuntos
X_train.shape, X_test.shape


((8000, 9), (2000, 9))

In [45]:
X

Unnamed: 0,Brand,Model,Year,Engine_Size,Fuel_Type,Transmission,Mileage,Doors,Owner_Count
0,6,24,2020,4.2,0,1,289944,3,5
1,2,19,2012,2.0,2,0,5356,2,3
2,7,16,2020,4.2,0,0,231440,4,2
3,0,22,2023,2.0,1,1,160971,2,1
4,9,17,2003,2.6,2,2,286618,3,3
...,...,...,...,...,...,...,...,...,...
9995,6,20,2004,3.7,0,2,5794,2,4
9996,2,18,2002,1.4,1,0,168000,2,1
9997,1,0,2010,3.0,3,0,86664,5,1
9998,3,13,2002,1.4,2,0,225772,4,1


In [40]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Inicializar el modelo de regresión lineal
model = LinearRegression()

# Entrenar el modelo con los datos de entrenamiento
model.fit(X_train, y_train)

# Hacer predicciones en el conjunto de prueba
y_pred = model.predict(X_test)

# Evaluar el modelo
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Mostrar resultados
print(f"MAE: {mae:.2f}")
print(f"MSE: {mse:.2f}")
print(f"R²: {r2:.4f}")


MAE: 793.48
MSE: 818588.25
R²: 0.9109


In [None]:
# Calcular errores (residuos)
errors = y_test - y_pred

# Crear histograma interactivo de los errores
fig = px.histogram(errors, nbins=50, title="Distribución de Errores (Residuos)")
if kaggle:
    fig.show(renderer='iframe_connected')
else:
    fig.show()

In [None]:
fig = go.Figure()

fig.add_trace(go.Scatter(x=y_test, y=y_pred, mode="markers", name="Predicción vs Real"))
fig.add_trace(go.Scatter(x=y_test, y=y_test, mode="lines", name="Línea Ideal", line=dict(color="red")))

fig.update_layout(title="Comparación: Precio Real vs Predicho",
                  xaxis_title="Precio Real",
                  yaxis_title="Precio Predicho")

if kaggle:
    fig.show(renderer='iframe_connected')
else:
    fig.show()