In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the raw vehicle listings from the CSV file into a DataFrame.
# NOTE(review): the path sits under /content/drive, so this presumably needs
# Google Drive mounted in Colab first — confirm before running elsewhere.
df = pd.read_csv(
    filepath_or_buffer='/content/drive/MyDrive/Dataset/Unified mentors/Predict Vehicle Prices.csv'
)

In [None]:
# Preview five randomly chosen rows (unseeded, so the rows differ on every run)
df.sample(n=5)

Unnamed: 0,name,description,make,model,year,price,engine,cylinders,fuel,mileage,transmission,trim,body,doors,exterior_color,interior_color,drivetrain
208,2024 RAM 3500 Laramie Mega Cab 4x4 6'4' Box,Pearl White 2024 Ram 3500 Laramie 4WD Aisin 6-...,RAM,3500,2024,89040.0,"6.7L I-6 diesel direct injection, VVT intercoo...",6.0,Diesel,5.0,Aisin 6-Speed Automatic,Laramie Mega Cab 4x4 6&#39;4&#39; Box,Pickup Truck,4.0,Pearl White,Black,Four-wheel Drive
76,2024 Kia Sportage S,\n \n 25/33 City/Highway MPGThe Cl...,Kia,Sportage,2024,27969.0,"4 port/direct injection, DOHC, CVVT variable v...",4.0,Gasoline,0.0,8-Speed Automatic,S,SUV,4.0,Gravity Gray,Black,Front-wheel Drive
852,2024 Dodge Hornet GT AWD,"\n \n Glass Chrysler, Dodge, Jeep,...",Dodge,Hornet,2024,40880.0,"ne 2L I-4 gasoline direct injection, DOHC, var...",4.0,Gasoline,24.0,Automatic,GT AWD,SUV,4.0,Acapulco Gold,Black,All-wheel Drive
766,2023 Jeep Gladiator Rubicon,\n \n Summary: Home of the Limite...,Jeep,Gladiator,2023,54210.0,24V MPFI DOHC,6.0,Gasoline,0.0,8-Speed Automatic,Rubicon,Pickup Truck,4.0,Sarge Green Clearcoat,Black,Four-wheel Drive
674,2024 Hyundai Sonata Hybrid SEL,This 2024 Hyundai Sonata Hybrid SEL is equippe...,Hyundai,Sonata Hybrid,2024,33005.0,16V GDI DOHC Hybrid,4.0,Hybrid,13.0,6-Speed Automatic,SEL,Sedan,4.0,White,Black,Front-wheel Drive


In [None]:
# Print the column names, non-null counts and dtypes of the loaded frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1002 entries, 0 to 1001
Data columns (total 17 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   name            1002 non-null   object 
 1   description     946 non-null    object 
 2   make            1002 non-null   object 
 3   model           1002 non-null   object 
 4   year            1002 non-null   int64  
 5   price           979 non-null    float64
 6   engine          1000 non-null   object 
 7   cylinders       897 non-null    float64
 8   fuel            995 non-null    object 
 9   mileage         968 non-null    float64
 10  transmission    1000 non-null   object 
 11  trim            1001 non-null   object 
 12  body            999 non-null    object 
 13  doors           995 non-null    float64
 14  exterior_color  997 non-null    object 
 15  interior_color  964 non-null    object 
 16  drivetrain      1002 non-null   object 
dtypes: float64(4), int64(1), object(12)

In [None]:
# Count the missing values in each column
df.isna().sum()

Unnamed: 0,0
name,0
description,56
make,0
model,0
year,0
price,23
engine,2
cylinders,105
fuel,7
mileage,34


"""Here's a breakdown of the data:

Columns

1. name: The full name of the vehicle.
2. description: A brief description of the vehicle.
3. make: The manufacturer of the vehicle (e.g., Hyundai, Dodge, Audi).
4. model: The model name of the vehicle.
5. year: The year the vehicle was manufactured.
6. price: The price of the vehicle in USD.
7. engine: Details about the engine, including type and specifications.
8. cylinders: The number of cylinders in the vehicle's engine.
9. fuel: The type of fuel used by the vehicle (e.g., Gasoline, Hybrid).
10. mileage: The mileage of the vehicle, typically in miles.
11. transmission: The type of transmission (e.g., Automatic, Manual).
12. trim: The trim level of the vehicle, indicating different feature sets or packages.
13. body: The body style of the vehicle (e.g., Sedan, SUV, Hatchback).
14. doors: The number of doors on the vehicle.
15. exterior_color: The exterior color of the vehicle.
16. interior_color: The interior color of the vehicle.
17. drivetrain: The drivetrain of the vehicle (e.g., Front-wheel Drive, All-wheel Drive).

dtypes: float64(4), int64(1), object(12)
- name, description, make, model, engine, fuel, transmission, trim, body, exterior_color, interior_color, and drivetrain are strings.
- year is an integer (int64).
- price, cylinders, mileage, and doors are floating-point values (float64).

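Missing Values: Several columns have missing values: cylinders (105), description (56), interior_color (38), mileage (34), and price (23), plus a handful in fuel (7), doors (7), exterior_color (5), body (3), engine (2), transmission (2), and trim (1).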

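Data Quality: The data is well-structured, but it is not fully clean: besides the missing values listed above, some text fields contain raw artifacts (e.g., leading newlines and whitespace in description, and HTML entities such as &#39; in trim). Further data exploration and cleaning is necessary to ensure data quality."""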

In [None]:
# Fill missing prices with the column median.
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on df['price']: that chained in-place form acts on
# an intermediate object, raises a FutureWarning, and will stop updating `df`
# in pandas 3.0.
# NOTE(review): price is the prediction target and the median is taken over
# all rows before the train/test split — consider dropping rows with a missing
# price instead of imputing them.
df['price'] = df['price'].fillna(df['price'].median())


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['price'].fillna(df['price'].median(),inplace=True)


In [None]:
# Separate the target from the features, then hold out 20% of the rows for testing
from sklearn.model_selection import train_test_split

y = df['price']                 # target
X = df.drop(columns=['price'])  # every remaining column is a candidate feature

# A fixed random_state keeps the split identical across runs
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Column groups that decide which preprocessing each feature receives.
# NOTE(review): 'doors' is in neither list — confirm that leaving it out of
# the model is intended.

# Features treated as numbers
numericals_cols = [
    'year',
    'cylinders',
    'mileage',
]

# Features treated as categories
categoricals_cols = [
    'name',
    'description',
    'make',
    'model',
    'engine',
    'fuel',
    'transmission',
    'trim',
    'body',
    'exterior_color',
    'interior_color',
    'drivetrain',
]

In [None]:
# Build one preprocessing pipeline per feature type
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Numeric features: fill gaps with the column median, then standardize
numericals_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
)

# Categorical features: replace gaps with the literal 'missing', then one-hot
# encode; handle_unknown='ignore' keeps categories unseen during fitting from
# raising an error at transform time
categoricals_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]
)

In [None]:
# Route each column group through its matching transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numericals_transformer, numericals_cols),
        ('cat', categoricals_transformer, categoricals_cols),
    ]
)



In [None]:
# Random-forest model: preprocessing followed by a RandomForestRegressor.
from sklearn.ensemble import RandomForestRegressor

# random_state is fixed (same seed as the train/test split) so that the fitted
# forest, and therefore every reported metric, is reproducible across runs.
# The final step is named 'regressor' because the hyperparameter grid refers
# to it through the 'regressor__' prefix.
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('regressor', RandomForestRegressor(random_state=42))])

Alternatively, a Ridge regression model can be used with the same preprocessing:

In [None]:
from sklearn.linear_model import Ridge

# Alternative linear baseline: the same preprocessing followed by Ridge regression.
# It is bound to its own name so it does not overwrite the random-forest
# `pipeline`. The grid search further down tunes 'regressor__*' parameters,
# and this pipeline has no step called 'regressor' (its final step is
# 'ridge'), so reusing the name `pipeline` made the grid-search cell fail
# when the notebook was run top to bottom.
ridge_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                 ('ridge', Ridge())])


In [None]:
# Hyperparameter tuning: exhaustive grid search with 5-fold cross-validation
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import make_scorer, mean_absolute_error

# Candidate settings for the pipeline step named 'regressor'
param_grid = {
    'regressor__n_estimators': [100, 200, 300],
    'regressor__max_depth': [None, 5, 10],
}

# Score with mean absolute error. greater_is_better=False makes the scorer
# return the negated MAE, so the best score printed below is a negative number.
scorer = make_scorer(mean_absolute_error, greater_is_better=False)

# Fit every parameter combination on the training split
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring=scorer,
    cv=5,
)
grid_search.fit(X_train, y_train)

# Report the winning combination and its cross-validated score
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)


Best Parameters: {'regressor__max_depth': None, 'regressor__n_estimators': 300}
Best Score: -5197.945367347811


A simpler alternative: fit the pipeline directly on the training data, without a grid search

In [None]:
pipeline.fit(X_train, y_train) # Fit the whole pipeline (preprocessing and model) on the training split

In [None]:
# Evaluate the fitted pipeline on the held-out test split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

y_pred = pipeline.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

# '.2f' prints two decimal places. The previous spec '2f' (no dot) only set a
# minimum field width of 2 and left the default six decimals in place.
print(f'MAE: {mae:.2f}')
print(f'r2: {r2:.2f}')
print(f'MSE: {mse:.2f}')



MAE: 4002.002231
r2: 0.820626
MSE: 45722869.046087


In [None]:
# Save the predicted test-set prices as a one-column CSV named 'submission.csv'
submission_df = pd.Series(y_pred, name='price').to_frame()
submission_df.to_csv('submission.csv', index=False)

In [None]:
# Colab-specific helper module; this import is only available inside Google Colab
from google.colab import files

# Download 'submission.csv' (written by the previous cell) through the Colab files helper
files.download('submission.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

"""
1. Data Loading: Loads the vehicle data from a CSV file.
2. Data Splitting: Splits the data into training and testing sets.
3. Numerical Preprocessing: Applies median imputation and standard scaling to numerical columns.
4. Categorical Preprocessing: Applies constant imputation and one-hot encoding to categorical columns.
5. Pipeline Creation: Creates a pipeline that combines preprocessing and a random forest regressor.
6. Pipeline Training: Trains the pipeline on the training data.
7. Pipeline Evaluation: Evaluates the pipeline on the testing data using mean absolute error, R², and mean squared error."""