In [68]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.dummy import DummyRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import confusion_matrix, classification_report 
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

In [71]:
import warnings
warnings.filterwarnings(action='ignore')

  and should_run_async(code)


In [33]:
car_data = pd.read_csv("../Resources/master_data.csv", index_col=0)

In [34]:
car_data.head()

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,manufacturer,made_in,price_group
0,Maruti Swift Dzire VDI,2014,450000,145500,Diesel,Individual,Manual,First Owner,Maruti,India,"400,000 - 600,000"
1,Skoda Rapid 1.5 TDI Ambition,2014,370000,120000,Diesel,Individual,Manual,Second Owner,Skoda,Europe,"200,000 - 400,000"
2,Honda City 2017-2020 EXi,2006,158000,140000,Petrol,Individual,Manual,Third Owner,Honda,Asia,"20,000 - 200,000"
3,Hyundai i20 Sportz Diesel,2010,225000,127000,Diesel,Individual,Manual,First Owner,Hyundai,Asia,"200,000 - 400,000"
4,Maruti Swift VXI BSIII,2007,130000,120000,Petrol,Individual,Manual,First Owner,Maruti,India,"20,000 - 200,000"


In [35]:
# Switch between including manufacturer in modelling or not:
include_manufacturer = True

In [36]:
if include_manufacturer:
    car_names = car_data.pop("name")

    # Only one of the below should be uncommented at a time:
    car_makes = car_data.pop("manufacturer")
    # car_made_ins = car_data.pop("made_in")

else:
    car_names = car_data.pop("name")
    car_makes = car_data.pop("manufacturer")
    car_made_ins = car_data.pop("made_in")

In [37]:
# For testing: Drop columns here to test effect on model accuracy
# car_data = car_data.drop(["fuel"], axis=1)
# car_data = car_data.drop(["transmission"], axis=1)
car_data = car_data.drop(["owner"], axis=1) # Slight improvement if excluded

In [38]:
# Preview the data
car_data.head()

Unnamed: 0,year,selling_price,km_driven,fuel,seller_type,transmission,made_in,price_group
0,2014,450000,145500,Diesel,Individual,Manual,India,"400,000 - 600,000"
1,2014,370000,120000,Diesel,Individual,Manual,Europe,"200,000 - 400,000"
2,2006,158000,140000,Petrol,Individual,Manual,Asia,"20,000 - 200,000"
3,2010,225000,127000,Diesel,Individual,Manual,Asia,"200,000 - 400,000"
4,2007,130000,120000,Petrol,Individual,Manual,India,"20,000 - 200,000"


In [39]:
# Check that inferred datatypes are correct
car_data.dtypes

year              int64
selling_price     int64
km_driven         int64
fuel             object
seller_type      object
transmission     object
made_in          object
price_group      object
dtype: object

In [40]:
# Split off the target (price group) before One-Hot Encoding
target = car_data["price_group"]

# Selling Price data will be used to verify model, but won't be useful for training the model
selling_prices = car_data["selling_price"]

car_data = car_data.drop(["price_group", "selling_price"], axis=1)

In [41]:
car_data.dtypes

year             int64
km_driven        int64
fuel            object
seller_type     object
transmission    object
made_in         object
dtype: object

In [42]:
# Split off categorical data 
cars_cat = car_data.dtypes[car_data.dtypes == "object"].index.tolist()
car_data[cars_cat].nunique()

fuel            3
seller_type     3
transmission    2
made_in         5
dtype: int64

In [43]:
# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using the categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(car_data[cars_cat]))

# Add the encoded variable names to the DataFrame
encode_df.columns = enc.get_feature_names(cars_cat)
encode_df.head()

Unnamed: 0,fuel_Diesel,fuel_Other,fuel_Petrol,seller_type_Dealer,seller_type_Individual,seller_type_Trustmark Dealer,transmission_Automatic,transmission_Manual,made_in_America,made_in_Asia,made_in_Europe,made_in_India,made_in_Unknown
0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
2,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
4,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0


In [44]:
prepared_data = car_data.merge(encode_df, left_index=True, right_index=True)
prepared_data = prepared_data.drop(cars_cat, axis=1)
prepared_data = prepared_data.merge(target, left_index=True, right_index=True)

## Train/Test Split

In [45]:
# Split model into train/test groups
X = prepared_data.drop(columns=['price_group'])
y = prepared_data['price_group']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state = 65)

In [46]:
X.head()

Unnamed: 0,year,km_driven,fuel_Diesel,fuel_Other,fuel_Petrol,seller_type_Dealer,seller_type_Individual,seller_type_Trustmark Dealer,transmission_Automatic,transmission_Manual,made_in_America,made_in_Asia,made_in_Europe,made_in_India,made_in_Unknown
0,2014,145500,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
0,2014,145500,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
0,2007,70000,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
0,2007,70000,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,2014,120000,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0


## Decision Tree

In [47]:
# Train Decision Tree model
dtree_model = DecisionTreeClassifier(max_depth = 5).fit(X_train, y_train)
dtree_predictions = dtree_model.predict(X_test)

In [48]:
# Assess Decision Tree model
print("Decision Tree Accuracy: ", accuracy_score(y_test, dtree_predictions))
#test_pred = pd.DataFrame({"Test Target" : y_test, "Model Prediction" : dtree_predictions}); test_pred.head()

Decision Tree Accuracy:  0.45691596539054163


In [22]:
# Save decision tree model. To load:
#      loaded_model = pickle.load(open("dtree_model.sav", 'rb'))
import pickle
pickle.dump(dtree_model, open("dtree_model.sav", 'wb'))

In [23]:
from sklearn.tree import export_text
text_representation = export_text(dtree_model, feature_names=list(X.columns))
with open("tree_text.txt", "wb") as f:
    f.truncate(0)
    f.write(text_representation.encode())
    f.close()

### Assessing Misclassification Severity

In [75]:
pred_vs_real = pd.DataFrame({"predicted" : dtree_predictions, "actual" : y_test})
misclassed = pred_vs_real[pred_vs_real["predicted"] != pred_vs_real["actual"]]
misclassed.reset_index()

def get_label(record):
    if record == "20,000 - 200,000":
        return 1
    elif record == "200,000 - 400,000":
        return 2
    elif record == "400,000 - 600,000":
        return 3
    elif record == "600,000+":
        return 4
        
misclassed["pred_label"] = misclassed["predicted"].apply(get_label)
misclassed["actual_label"] = misclassed["actual"].apply(get_label)
misclassed["pred_distance"] = misclassed["pred_label"] - misclassed["actual_label"]

misclassed.head()

Unnamed: 0,predicted,actual,pred_label,actual_label,pred_distance
1265,"200,000 - 400,000","20,000 - 200,000",2,1,1
4800,"200,000 - 400,000","600,000+",2,4,-2
1642,"200,000 - 400,000","20,000 - 200,000",2,1,1
197,"600,000+","200,000 - 400,000",4,2,2
2103,"600,000+","200,000 - 400,000",4,2,2


In [76]:
misclassed["pred_distance"].value_counts()

 1    1552
-1    1081
 2     771
-2     731
 3     284
-3     163
Name: pred_distance, dtype: int64

57.4% of erroneous predictions were within one price range of the correct answer.

9.7% of erroneous predictions were maximally incorrect (three price ranges away from correct answer).

## Random Forest

In [24]:
# Train Random Forest Model
rf_model = RandomForestClassifier(n_estimators=350, random_state=101)
rf_model = rf_model.fit(X_train, y_train)

In [25]:
# Assess Random Forest Model
rf_predictions = rf_model.predict(X_test)
print("Random Forest Accuracy: ", accuracy_score(y_test, rf_predictions))

Random Forest Accuracy:  0.36280668484058315


## Subset: Manufactured 2014 or Later

In [26]:
# Split model into train/test groups
prepared_decade = prepared_data[prepared_data["year"] > 2013]
X = prepared_decade.drop(columns=['price_group'])
y = prepared_decade['price_group']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state = 120)

In [27]:
# Train Decision Tree model
dtree_decade = DecisionTreeClassifier(max_depth = 4).fit(X_train, y_train)
dtree_predictions_decade = dtree_decade.predict(X_test)
print("Decision Tree Accuracy: ", accuracy_score(y_test, dtree_predictions_decade))

Decision Tree Accuracy:  0.45683247570040025


Do newer cars have more consistency in how their features affect their selling price? What features are used to differentiate between brackets?

In [28]:
text_representation = export_text(dtree_decade, feature_names=list(X.columns))
with open("recent_tree_text.txt", "wb") as f:
    f.truncate(0)
    f.write(text_representation.encode())
    f.close()

In [29]:
# 2014 or later decision tree logic:
#|--- year <= 2016.50
#|   |--- transmission_Manual <= 0.50
#|   |   |--- fuel_Petrol <= 0.50
#|   |   |   |--- class: 600,000+
#|   |   |--- fuel_Petrol >  0.50
#|   |   |   |--- class: 600,000+
#|   |--- transmission_Manual >  0.50
#|   |   |--- fuel_Diesel <= 0.50
#|   |   |   |--- class: 200,000 - 400,000
#|   |   |--- fuel_Diesel >  0.50
#|   |   |   |--- class: 600,000+
#|--- year >  2016.50
#|   |--- transmission_Automatic <= 0.50
#|   |   |--- fuel_Diesel <= 0.50
#|   |   |   |--- class: 600,000+
#|   |   |--- fuel_Diesel >  0.50
#|   |   |   |--- class: 600,000+
#|   |--- transmission_Automatic >  0.50
#|   |   |--- seller_type_Individual <= 0.50
#|   |   |   |--- class: 600,000+
#|   |   |--- seller_type_Individual >  0.50
#|   |   |   |--- class: 600,000+


In [30]:
prepared_decade["price_group"].value_counts()

600,000+             4882
400,000 - 600,000    2947
200,000 - 400,000    2800
20,000 - 200,000     1029
Name: price_group, dtype: int64

In [31]:
4882 / (prepared_decade["price_group"].value_counts().sum())

0.41876822782638534

Conclusion: Better to train on as many cars as possible. Subset model guesses the highest price group on almost every record at low depth, and higher depth model loses accuracy quickly. 