In [48]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.dummy import DummyRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import confusion_matrix, classification_report 

In [49]:
car_data = pd.read_csv("Resources/master_data.csv", index_col=0)

In [50]:
car_data.head()

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,manufacturer
0,Maruti Swift Dzire VDI,2014,450000,145500,Diesel,Individual,Manual,First Owner,Maruti
1,Skoda Rapid 1.5 TDI Ambition,2014,370000,120000,Diesel,Individual,Manual,Second Owner,Skoda
2,Honda City 2017-2020 EXi,2006,158000,140000,Petrol,Individual,Manual,Third Owner,Honda
3,Hyundai i20 Sportz Diesel,2010,225000,127000,Diesel,Individual,Manual,First Owner,Hyundai
4,Maruti Swift VXI BSIII,2007,130000,120000,Petrol,Individual,Manual,First Owner,Maruti


In [51]:
# Switch between including manufacturer in modelling or not:
include_manufacturer = False

In [52]:
if include_manufacturer:
    car_names = car_data.pop("name")
else:
    car_names = car_data.pop("name")
    car_makes = car_data.pop("manufacturer")

In [53]:
# Preview the data
car_data.head()

Unnamed: 0,year,selling_price,km_driven,fuel,seller_type,transmission,owner
0,2014,450000,145500,Diesel,Individual,Manual,First Owner
1,2014,370000,120000,Diesel,Individual,Manual,Second Owner
2,2006,158000,140000,Petrol,Individual,Manual,Third Owner
3,2010,225000,127000,Diesel,Individual,Manual,First Owner
4,2007,130000,120000,Petrol,Individual,Manual,First Owner


In [54]:
# Check that inferred datatypes are correct
car_data.dtypes

year              int64
selling_price     int64
km_driven         int64
fuel             object
seller_type      object
transmission     object
owner            object
dtype: object

In [55]:
# Split off categorical data 
cars_cat = car_data.dtypes[car_data.dtypes == "object"].index.tolist()
car_data[cars_cat].nunique()

fuel            3
seller_type     3
transmission    2
owner           4
dtype: int64

In [56]:
# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using the categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(car_data[cars_cat]))

# Add the encoded variable names to the DataFrame
encode_df.columns = enc.get_feature_names(cars_cat)
encode_df.head()

Unnamed: 0,fuel_Diesel,fuel_Other,fuel_Petrol,seller_type_Dealer,seller_type_Individual,seller_type_Trustmark Dealer,transmission_Automatic,transmission_Manual,owner_First Owner,owner_Fourth & Above Owner,owner_Second Owner,owner_Third Owner
0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
3,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
4,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0


In [57]:
prepared_data = car_data.merge(encode_df,left_index=True, right_index=True)
prepared_data = prepared_data.drop(cars_cat,1)
prepared_data.head()

Unnamed: 0,year,selling_price,km_driven,fuel_Diesel,fuel_Other,fuel_Petrol,seller_type_Dealer,seller_type_Individual,seller_type_Trustmark Dealer,transmission_Automatic,transmission_Manual,owner_First Owner,owner_Fourth & Above Owner,owner_Second Owner,owner_Third Owner
0,2014,450000,145500,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
0,2007,60000,70000,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
1,2014,370000,120000,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1,2007,135000,50000,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2,2006,158000,140000,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


In [58]:
# TODO: Create another column called "price_bin" which represents the relevant bin for each record's selling price (ex: 60000 -> "50000-10000")
# TODO: Train model to classify each record into a bin
# TODO: Test model with OHE'd manufacturer data and see if it improves model accuracy
# TODO: Test additional binning of manufacturer (Ex: "Luxury Brand" or "Non-Luxury Brand", or by manufacturer vountry) 