In [1]:
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv("cardata.csv")

In [3]:
data.head()

Unnamed: 0,Car_Name,Year,Selling_Price,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner
0,ritz,2014,3.35,5.59,27000,Petrol,Dealer,Manual,0
1,sx4,2013,4.75,9.54,43000,Diesel,Dealer,Manual,0
2,ciaz,2017,7.25,9.85,6900,Petrol,Dealer,Manual,0
3,wagon r,2011,2.85,4.15,5200,Petrol,Dealer,Manual,0
4,swift,2014,4.6,6.87,42450,Diesel,Dealer,Manual,0


In [4]:
data = data.drop(columns=['Car_Name'],axis=1)

In [5]:
data.head()

Unnamed: 0,Year,Selling_Price,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner
0,2014,3.35,5.59,27000,Petrol,Dealer,Manual,0
1,2013,4.75,9.54,43000,Diesel,Dealer,Manual,0
2,2017,7.25,9.85,6900,Petrol,Dealer,Manual,0
3,2011,2.85,4.15,5200,Petrol,Dealer,Manual,0
4,2014,4.6,6.87,42450,Diesel,Dealer,Manual,0


In [6]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
# Label Encoding for Transmission and Seller_Type
label_encoder_transmission = LabelEncoder()
data['Transmission'] = label_encoder_transmission.fit_transform(data['Transmission'])

label_encoder_seller_type = LabelEncoder()
data['Seller_Type'] = label_encoder_seller_type.fit_transform(data['Seller_Type'])

In [7]:
# One-Hot Encoding for Fuel_Type
onehot_encoder_fuel_type = OneHotEncoder()
fuel_type_encoded = onehot_encoder_fuel_type.fit_transform(data[['Fuel_Type']]).toarray()

# Create a DataFrame for the one-hot encoded Fuel_Type
fuel_type_df = pd.DataFrame(fuel_type_encoded, columns=onehot_encoder_fuel_type.get_feature_names_out(['Fuel_Type']))

# Concatenate the original DataFrame with the one-hot encoded Fuel_Type DataFrame
data = pd.concat([data, fuel_type_df], axis=1)

# Drop the original Fuel_Type column as it is now one-hot encoded
data.drop(['Fuel_Type'], axis=1, inplace=True)

In [8]:
data.head()

Unnamed: 0,Year,Selling_Price,Present_Price,Kms_Driven,Seller_Type,Transmission,Owner,Fuel_Type_CNG,Fuel_Type_Diesel,Fuel_Type_Petrol
0,2014,3.35,5.59,27000,0,1,0,0.0,0.0,1.0
1,2013,4.75,9.54,43000,0,1,0,0.0,1.0,0.0
2,2017,7.25,9.85,6900,0,1,0,0.0,0.0,1.0
3,2011,2.85,4.15,5200,0,1,0,0.0,0.0,1.0
4,2014,4.6,6.87,42450,0,1,0,0.0,1.0,0.0


In [9]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [10]:
# Features and target
X = data.drop('Selling_Price', axis=1)
y = data['Selling_Price']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model training
model = RandomForestRegressor()
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)

# Evaluation
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = model.score(X_test, y_test)

print(f'MAE: {mae}, MSE: {mse}, R-squared: {r2}')

MAE: 0.5816147540983612, MSE: 0.7442708463934428, R-squared: 0.9676903859010882


In [11]:
X_test.head()

Unnamed: 0,Year,Present_Price,Kms_Driven,Seller_Type,Transmission,Owner,Fuel_Type_CNG,Fuel_Type_Diesel,Fuel_Type_Petrol
177,2016,0.57,24000,1,0,0,0.0,0.0,1.0
289,2016,13.6,10980,0,1,0,0.0,0.0,1.0
228,2012,9.4,60000,0,1,0,0.0,1.0,0.0
198,2011,0.57,35000,1,1,1,0.0,0.0,1.0
60,2013,18.61,40001,0,1,0,0.0,0.0,1.0


In [12]:
X_test.columns

Index(['Year', 'Present_Price', 'Kms_Driven', 'Seller_Type', 'Transmission',
       'Owner', 'Fuel_Type_CNG', 'Fuel_Type_Diesel', 'Fuel_Type_Petrol'],
      dtype='object')

In [13]:
import pickle

# Save the trained model
with open('random_forest_model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

# Save the LabelEncoders
with open('label_encoder_transmission.pkl', 'wb') as le_trans_file:
    pickle.dump(label_encoder_transmission, le_trans_file)

with open('label_encoder_seller_type.pkl', 'wb') as le_seller_file:
    pickle.dump(label_encoder_seller_type, le_seller_file)

# Save the OneHotEncoder
with open('onehot_encoder_fuel_type.pkl', 'wb') as ohe_file:
    pickle.dump(onehot_encoder_fuel_type, ohe_file)


In [14]:
import pandas as pd
import pickle

# Load the trained model and encoders
with open('random_forest_model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

with open('label_encoder_transmission.pkl', 'rb') as le_trans_file:
    label_encoder_transmission = pickle.load(le_trans_file)

with open('label_encoder_seller_type.pkl', 'rb') as le_seller_file:
    label_encoder_seller_type = pickle.load(le_seller_file)

with open('onehot_encoder_fuel_type.pkl', 'rb') as ohe_file:
    onehot_encoder_fuel_type = pickle.load(ohe_file)

# Default data
default_data = pd.DataFrame({
    'Year': [2020],
    'Present_Price': [8.5],
    'Kms_Driven': [25000],
    'Seller_Type': [1],  # Encoded value for 'Dealer'
    'Transmission': [0],  # Encoded value for 'Manual'
    'Owner': [1],  # Example value for Owner
    'Fuel_Type_CNG': [0],
    'Fuel_Type_Diesel': [1],
    'Fuel_Type_Petrol': [0]
})

# Preprocess the default data
# In this case, preprocessing is minimal as the default data already has the required format

# Make predictions
predictions = model.predict(default_data)

# Print predictions
print(f'Predicted Selling Price: {predictions[0]}')


Predicted Selling Price: 6.943
