In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.dummy import DummyRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import confusion_matrix, classification_report 
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

In [2]:
# Load the master car-listing table; the first CSV column becomes the row index.
car_data = pd.read_csv(
    "../Resources/master_data.csv",
    index_col=0,
)

In [3]:
# Show the first five rows of the freshly loaded table.
car_data.head(n=5)

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,manufacturer,made_in,price_group
0,Maruti Swift Dzire VDI,2014,450000,145500,Diesel,Individual,Manual,First Owner,Maruti,India,"400,000 - 600,000"
1,Skoda Rapid 1.5 TDI Ambition,2014,370000,120000,Diesel,Individual,Manual,Second Owner,Skoda,Europe,"200,000 - 400,000"
2,Honda City 2017-2020 EXi,2006,158000,140000,Petrol,Individual,Manual,Third Owner,Honda,Asia,"20,000 - 200,000"
3,Hyundai i20 Sportz Diesel,2010,225000,127000,Diesel,Individual,Manual,First Owner,Hyundai,Asia,"200,000 - 400,000"
4,Maruti Swift VXI BSIII,2007,130000,120000,Petrol,Individual,Manual,First Owner,Maruti,India,"20,000 - 200,000"


In [4]:
# Switch controlling whether manufacturer-derived information is used in modelling.
# NOTE: as read by the branch that consumes this flag, True keeps the "made_in"
# column as a feature while "manufacturer" itself is still removed; False removes
# both "manufacturer" and "made_in".  "name" is removed in either case.
include_manufacturer = True

In [5]:
# The car name and the manufacturer are taken out of the feature table in every
# configuration; the popped Series are kept in case they are needed later.
car_names = car_data.pop("name")
car_makes = car_data.pop("manufacturer")

# With the flag off, the region of manufacture is removed as well, so no
# manufacturer-derived column is left.  With the flag on, "made_in" stays in
# car_data and is used as a feature.
if not include_manufacturer:
    car_made_ins = car_data.pop("made_in")

In [6]:
# Feature-ablation switches: enable a line to see how removing that column
# changes model accuracy.
# car_data = car_data.drop(columns=["fuel"])
# car_data = car_data.drop(columns=["transmission"])
car_data = car_data.drop(columns=["owner"])  # Slight improvement if excluded

In [7]:
# Show the first five rows after the identifier columns were removed
car_data.head(n=5)

Unnamed: 0,year,selling_price,km_driven,fuel,seller_type,transmission,made_in,price_group
0,2014,450000,145500,Diesel,Individual,Manual,India,"400,000 - 600,000"
1,2014,370000,120000,Diesel,Individual,Manual,Europe,"200,000 - 400,000"
2,2006,158000,140000,Petrol,Individual,Manual,Asia,"20,000 - 200,000"
3,2010,225000,127000,Diesel,Individual,Manual,Asia,"200,000 - 400,000"
4,2007,130000,120000,Petrol,Individual,Manual,India,"20,000 - 200,000"


In [8]:
# Check that the dtypes pandas inferred while reading the CSV are correct:
# numeric columns should show an integer dtype, text/categorical columns `object`.
car_data.dtypes

year              int64
selling_price     int64
km_driven         int64
fuel             object
seller_type      object
transmission     object
made_in          object
price_group      object
dtype: object

In [9]:
# Set the label (price group) aside before the features are one-hot encoded
target = car_data.loc[:, "price_group"]

# Raw selling prices are kept for verifying the model; they are not a training feature
selling_prices = car_data.loc[:, "selling_price"]

# Everything that remains in car_data from here on is a model input
car_data = car_data.drop(columns=["price_group", "selling_price"])

In [10]:
# Re-inspect the dtypes of the remaining feature columns; the `object` columns
# are the ones that will need encoding.
car_data.dtypes

year             int64
km_driven        int64
fuel            object
seller_type     object
transmission    object
made_in         object
dtype: object

In [11]:
# Collect the names of the categorical (object-dtype) feature columns
cars_cat = [
    column_name
    for column_name, column_dtype in car_data.dtypes.items()
    if column_dtype == "object"
]

# Number of distinct categories per column, i.e. how many one-hot columns each will produce
car_data.loc[:, cars_cat].nunique()

fuel            3
seller_type     3
transmission    2
made_in         5
dtype: int64

In [12]:
# One-hot encode the categorical columns listed in cars_cat.
#
# The encoder is created with its default (sparse) output and densified with
# .toarray() rather than by passing `sparse=False`: that keyword was renamed to
# `sparse_output` in scikit-learn 1.2 and the old spelling was later removed,
# so this form runs on both old and new releases.
enc = OneHotEncoder()
encoded_values = enc.fit_transform(car_data[cars_cat]).toarray()

# `get_feature_names` was superseded by `get_feature_names_out` (added in
# scikit-learn 1.0; the old method was removed in 1.2).  Use whichever one the
# installed version provides.
feature_name_getter = getattr(enc, "get_feature_names_out", None) or enc.get_feature_names

# Reuse car_data's index instead of the default 0..n-1 RangeIndex.  The encoded
# frame is merged back onto car_data by index, so a mismatch (e.g. an index read
# from the CSV that has gaps or does not start at 0) would silently drop or
# mis-pair rows.
encode_df = pd.DataFrame(
    encoded_values,
    index=car_data.index,
    columns=feature_name_getter(cars_cat),
)
encode_df.head()

Unnamed: 0,fuel_Diesel,fuel_Other,fuel_Petrol,seller_type_Dealer,seller_type_Individual,seller_type_Trustmark Dealer,transmission_Automatic,transmission_Manual,made_in_America,made_in_Asia,made_in_Europe,made_in_India,made_in_Unknown
0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
2,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
4,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0


In [13]:
# Assemble the modelling table in one chain: attach the one-hot columns by
# index, remove the original categorical columns they replace, then attach the
# price-group label by index.  Both merges are (default) inner joins.
prepared_data = (
    car_data
    .merge(encode_df, left_index=True, right_index=True)
    .drop(columns=cars_cat)
    .merge(target, left_index=True, right_index=True)
)

In [14]:
# Separate features from the label, then hold out 12% of the rows for testing.
# The fixed random_state makes the split repeatable.
y = prepared_data.loc[:, 'price_group']
X = prepared_data.drop('price_group', axis=1)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.12,
    random_state=65,
)

In [15]:
# Train Decision Tree model (depth capped at 9 levels).
#
# random_state is pinned so the fitted tree, and therefore the accuracy
# reported for it, is reproducible: scikit-learn randomly permutes the
# candidate features at each split, so ties between equally good splits can
# otherwise be resolved differently from run to run.  The seed value matches
# the one used for the Random Forest model.
dtree_model = DecisionTreeClassifier(max_depth=9, random_state=101)
dtree_model.fit(X_train, y_train)

# Predicted price group for every held-out test row
dtree_predictions = dtree_model.predict(X_test)

In [16]:
# Assess Decision Tree model: share of test rows whose price group was predicted exactly
print(
    "Decision Tree Accuracy: ",
    accuracy_score(y_true=y_test, y_pred=dtree_predictions),
)
# Optional side-by-side look at targets vs. predictions:
# pd.DataFrame({"Test Target": y_test, "Model Prediction": dtree_predictions}).head()

Decision Tree Accuracy:  0.4510268562401264


In [17]:
# Train Random Forest Model: 350 trees, seeded for repeatable results.
# fit() returns the estimator itself, so construction and fitting are chained.
rf_model = RandomForestClassifier(
    n_estimators=350,
    random_state=101,
).fit(X_train, y_train)

In [18]:
# Assess Random Forest Model: predict the held-out rows, then report the share predicted exactly
rf_predictions = rf_model.predict(X_test)
print(
    "Random Forest Accuracy: ",
    accuracy_score(y_true=y_test, y_pred=rf_predictions),
)

Random Forest Accuracy:  0.3902053712480253
