In [72]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.dummy import DummyRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import confusion_matrix, classification_report 
from sklearn.tree import DecisionTreeClassifier

In [59]:
car_data = pd.read_csv("Resources/master_data.csv", index_col=0)

In [60]:
car_data.head()

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,manufacturer,price_group
0,Maruti Swift Dzire VDI,2014,450000,145500,Diesel,Individual,Manual,First Owner,Maruti,"400,000 - 600,000"
1,Skoda Rapid 1.5 TDI Ambition,2014,370000,120000,Diesel,Individual,Manual,Second Owner,Skoda,"200,000 - 400,000"
2,Honda City 2017-2020 EXi,2006,158000,140000,Petrol,Individual,Manual,Third Owner,Honda,"0 - 200,000"
3,Hyundai i20 Sportz Diesel,2010,225000,127000,Diesel,Individual,Manual,First Owner,Hyundai,"200,000 - 400,000"
4,Maruti Swift VXI BSIII,2007,130000,120000,Petrol,Individual,Manual,First Owner,Maruti,"0 - 200,000"


In [61]:
# Switch between including manufacturer in modelling or not:
include_manufacturer = False

In [62]:
if include_manufacturer:
    car_names = car_data.pop("name")
else:
    car_names = car_data.pop("name")
    car_makes = car_data.pop("manufacturer")

In [63]:
# Preview the data
car_data.head()

Unnamed: 0,year,selling_price,km_driven,fuel,seller_type,transmission,owner,price_group
0,2014,450000,145500,Diesel,Individual,Manual,First Owner,"400,000 - 600,000"
1,2014,370000,120000,Diesel,Individual,Manual,Second Owner,"200,000 - 400,000"
2,2006,158000,140000,Petrol,Individual,Manual,Third Owner,"0 - 200,000"
3,2010,225000,127000,Diesel,Individual,Manual,First Owner,"200,000 - 400,000"
4,2007,130000,120000,Petrol,Individual,Manual,First Owner,"0 - 200,000"


In [64]:
# Check that inferred datatypes are correct
car_data.dtypes

year              int64
selling_price     int64
km_driven         int64
fuel             object
seller_type      object
transmission     object
owner            object
price_group      object
dtype: object

In [65]:
# Split off the target (price group) before One-Hot Encoding
target = car_data["price_group"]

# Selling Price data will be used to verify model, but won't be useful for training the model
selling_prices = car_data["selling_price"]

car_data = car_data.drop(["price_group", "selling_price"], axis=1)

In [66]:
# Split off categorical data 
cars_cat = car_data.dtypes[car_data.dtypes == "object"].index.tolist()
car_data[cars_cat].nunique()

fuel            3
seller_type     3
transmission    2
owner           4
dtype: int64

In [67]:
# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using the categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(car_data[cars_cat]))

# Add the encoded variable names to the DataFrame
encode_df.columns = enc.get_feature_names(cars_cat)
encode_df.head()

Unnamed: 0,fuel_Diesel,fuel_Other,fuel_Petrol,seller_type_Dealer,seller_type_Individual,seller_type_Trustmark Dealer,transmission_Automatic,transmission_Manual,owner_First Owner,owner_Fourth & Above Owner,owner_Second Owner,owner_Third Owner
0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
3,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
4,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0


In [68]:
prepared_data = car_data.merge(encode_df, left_index=True, right_index=True)
prepared_data = prepared_data.drop(cars_cat, axis=1)
prepared_data = prepared_data.merge(target, left_index=True, right_index=True)

prepared_data.head()

Unnamed: 0,year,km_driven,fuel_Diesel,fuel_Other,fuel_Petrol,seller_type_Dealer,seller_type_Individual,seller_type_Trustmark Dealer,transmission_Automatic,transmission_Manual,owner_First Owner,owner_Fourth & Above Owner,owner_Second Owner,owner_Third Owner,price_group
0,2014,145500,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,"400,000 - 600,000"
0,2014,145500,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,"0 - 200,000"
0,2007,70000,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,"400,000 - 600,000"
0,2007,70000,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,"0 - 200,000"
1,2014,120000,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,"200,000 - 400,000"


In [69]:
# Split model into train/test groups
X = prepared_data.drop(columns=['price_group'])
y = prepared_data['price_group']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 42)

In [96]:
# Train Decision Tree model
dtree_model = DecisionTreeClassifier(max_depth = 7).fit(X_train, y_train)
dtree_predictions = dtree_model.predict(X_test)

In [97]:
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, dtree_predictions))

0.3723631192225646


In [98]:
test_pred = pd.DataFrame({"Test Target" : y_test, "Model Prediction" : dtree_predictions})
test_pred.head()

Unnamed: 0,Test Target,Model Prediction
4253,"800,000+","0 - 200,000"
2316,"0 - 200,000","200,000 - 400,000"
578,"400,000 - 600,000","200,000 - 400,000"
3078,"800,000+","800,000+"
598,"600,000 - 800,000","200,000 - 400,000"


In [71]:
# TODO: Test model with OHE'd manufacturer data and see if it improves model accuracy
# TODO: Test additional binning of manufacturer (Ex: "Luxury Brand" or "Non-Luxury Brand", or by manufacturer vountry) 