In [41]:
import pandas as pd
import numpy as np
pd.options.display.float_format = '{:,.0f}'.format

# --- Synthetic data configuration --------------------------------------------
SEED = 42                 # fixed seed so a fresh "Run All" reproduces the same data
N_ROWS = 1000             # number of synthetic rows to generate
REFERENCE_YEAR = 2024     # model year that is assigned zero mileage
MILEAGE_PER_YEAR = 20_000 # mileage added for every year before REFERENCE_YEAR

manufacturers = ['Nissan', 'Chevrolet', 'Volkswagen', 'Toyota', 'Honda', 'Ford']

# Vehicle names available for each manufacturer.
descriptions = {
    'Nissan':['March', 'Sentra', 'Altima', 'Kicks', 'Tida'],
    'Chevrolet': ['Spark', 'Trax', 'Sonic', 'Onix', 'Aveo'],
    'Volkswagen': ['Vento', 'Nivus', 'T-Cross', 'Tiguan', 'Taigun'],
    'Toyota': ['Prius', 'Corolla', 'Camry', 'Rav4', 'Tundra'],
    'Honda':['Fit', 'CRV', 'HRV', 'Civic', 'Accord'],
    'Ford':['Escape', 'Bronco', 'Edge', 'Explorer', 'Expedition']
}

models = [2019, 2020, 2021, 2022, 2023, 2024]

# Price band per model year as [low, high) in thousands; the drawn integer is
# multiplied by 1000 when the price column is built.
prices = {
    2019:[90,100],
    2020:[100, 150],
    2021:[150, 200],
    2022:[200, 250],
    2023:[250, 300],
    2024:[300, 500],
}


def generate_vehicles(n_rows=N_ROWS, seed=SEED):
    """Build a reproducible synthetic table of vehicles.

    Each row gets a random manufacturer, one of that manufacturer's vehicle
    names, a random model year, and a price drawn uniformly from the year's
    band in ``prices`` (upper bound exclusive). Mileage is derived from the
    model year, not sampled.

    Parameters
    ----------
    n_rows : int
        Number of rows to generate.
    seed : int or None
        Seed for the random generator. The same seed always yields the same
        frame; ``None`` draws fresh entropy on every call.

    Returns
    -------
    pandas.DataFrame
        Columns ``manufacturer``, ``vehicle``, ``model``, ``price``, ``mileage``.
    """
    # A local generator keeps the draw independent of NumPy's global random state.
    rng = np.random.default_rng(seed)

    makes = rng.choice(manufacturers, size=n_rows)
    # The vehicle name depends on the manufacturer, so it is drawn row by row.
    vehicles = [rng.choice(descriptions[make]) for make in makes]
    years = rng.choice(models, size=n_rows)

    # Per-row price bounds, looked up from the row's model year.
    low = np.array([prices[year][0] for year in years], dtype=np.int64)
    high = np.array([prices[year][1] for year in years], dtype=np.int64)
    price = rng.integers(low, high) * 1000.0

    frame = pd.DataFrame({
        'manufacturer': makes,
        'vehicle': vehicles,
        'model': years,
        'price': price,
    })
    frame['mileage'] = (REFERENCE_YEAR - frame['model']) * MILEAGE_PER_YEAR * 1.0
    return frame


df = generate_vehicles()

df.head(10)

Unnamed: 0,manufacturer,vehicle,model,price,mileage
0,Volkswagen,Tiguan,2020,111000,80000
1,Nissan,Kicks,2020,117000,80000
2,Toyota,Rav4,2020,123000,80000
3,Volkswagen,Vento,2020,125000,80000
4,Honda,Civic,2022,241000,40000
5,Ford,Expedition,2020,109000,80000
6,Honda,Fit,2024,465000,0
7,Honda,HRV,2024,313000,0
8,Nissan,March,2022,232000,40000
9,Toyota,Prius,2020,104000,80000


In [49]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error

# Seed shared by the split and the forest so the reported scores are repeatable.
RANDOM_STATE = 42

# Feature groups, selected by name.
categorical_cols = ['manufacturer','vehicle']
numeric_cols = ['model', 'mileage']

# Separate features and target variable.
# Selecting columns explicitly (instead of "everything except price") keeps
# derived columns out of the feature matrix: a later cell adds `prediction` to
# `df`, and re-running this cell afterwards would otherwise feed it to the model.
X = df[categorical_cols + numeric_cols]
y = df['price']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

# Preprocessing: one-hot encode the categoricals and pass the numerics through
# unchanged. Output column order is encoded categoricals first, then numeric_cols.
# handle_unknown='ignore' encodes a category not seen during fit as all zeros
# instead of raising at predict time.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
        ('num', 'passthrough', numeric_cols),
    ]
)

# Full pipeline: preprocessing followed by the regressor
model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=RANDOM_STATE))
])

# Train the model
model.fit(X_train, y_train)

# Evaluate the model (Pipeline.score delegates to the regressor's R^2)
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)

print("Training R^2 score:", train_score)
print("Testing R^2 score:", test_score)


def predict_with_rmse(features, target):
    """Return ``(predictions, rmse)`` of the fitted pipeline on one data split."""
    predictions = model.predict(features)
    return predictions, np.sqrt(mean_squared_error(target, predictions))


y_train_pred, train_rmse = predict_with_rmse(X_train, y_train)
print("Train RMSE:", train_rmse)

y_test_pred, test_rmse = predict_with_rmse(X_test, y_test)
print("Test RMSE:", test_rmse)

Training R^2 score: 0.9494069013856703
Testing R^2 score: 0.9128371971252878
Train RMSE: 23368.241532823187
Test RMSE: 28782.693189720336


In [50]:
# Score every row of the feature matrix and store the result next to the
# actual price so both can be compared side by side.
predicted_prices = model.predict(X)
df['prediction'] = predicted_prices
df.head(5)

Unnamed: 0,manufacturer,vehicle,model,price,mileage,prediction
0,Volkswagen,Tiguan,2020,111000,80000,122897
1,Nissan,Kicks,2020,117000,80000,119014
2,Toyota,Rav4,2020,123000,80000,125234
3,Volkswagen,Vento,2020,125000,80000,127740
4,Honda,Civic,2022,241000,40000,226934


In [56]:
# Get the feature importances and pair each one with a readable feature name.
# The encoder expands every categorical column into one column per category.
# The numeric columns are assumed to follow in the order of `numeric_cols`;
# the length check below catches the case where that assumption breaks.
onehot_columns = list(model.named_steps['preprocessor'].named_transformers_['cat'].get_feature_names_out(categorical_cols))
numeric_columns = numeric_cols
feature_names = onehot_columns + numeric_columns
feature_importance = model.named_steps['regressor'].feature_importances_

if len(feature_names) != len(feature_importance):
    raise ValueError(
        f"Feature name/importance mismatch: {len(feature_names)} names for "
        f"{len(feature_importance)} importances; the model was fitted on a "
        "different set of columns than categorical_cols + numeric_cols."
    )

feature_importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': feature_importance
})

# Show the most influential features first. The default float format set at the
# top of the notebook rounds to 0 decimals, so it is overridden for this display.
with pd.option_context('display.float_format', '{:.4f}'.format):
    display(feature_importance_df.sort_values('Importance', ascending=False).head(10))

In [58]:
import joblib

# Persist the fitted pipeline; the path is relative to the working directory.
MODEL_PATH = 'car_price_model.pkl'
joblib.dump(model, MODEL_PATH)

['car_price_model.pkl']