In [124]:
import numpy as np
import pandas as pd
import matplotlib.pylab as plt

from sklearn import preprocessing
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix, classification_report, mean_squared_error
from sklearn.tree import DecisionTreeRegressor, plot_tree
from dmba import classificationSummary

In [125]:
# read in data file

corollas = pd.read_csv('ToyotaCorolla.csv')
corollas

Unnamed: 0,Id,Price,Age_08_22,KM,Fuel_Type,HP,Color,Automatic,CC,Doors,...,Airbag_1,Airbag_2,Airco,CD_Player,Powered_Windows,Power_Steering,Radio,Mistlamps,Sport_Model,Metallic_Rim
0,,,,,,,,,,,...,,,,,,,,,,
1,1.0,13500.0,23.0,46986.0,Diesel,90.0,Blue,0.0,2000.0,3.0,...,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
2,,,,,,,,,,,...,,,,,,,,,,
3,2.0,13750.0,23.0,72937.0,Diesel,90.0,Silver,0.0,2000.0,3.0,...,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
4,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2867,1440.0,8500.0,71.0,17016.0,Petrol,86.0,Blue,0.0,1300.0,3.0,...,1.0,1.0,0.0,0.0,0.0,1.0,0.0,,0.0,0.0
2868,,,,,,,,,,,...,,,,,,,,,,
2869,1441.0,7250.0,70.0,16916.0,Petrol,86.0,Grey,0.0,1300.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0
2870,,,,,,,,,,,...,,,,,,,,,,


In [126]:
corollas=corollas.dropna(axis=0, how="all")

In [127]:
# how many observations are in this dataset? How many different columns?

corollas.shape

(1436, 24)

In [128]:
# look at data types of each column

corollas.dtypes

Id                 float64
Price              float64
Age_08_22          float64
KM                 float64
Fuel_Type           object
HP                 float64
Color               object
Automatic          float64
CC                 float64
Doors              float64
Cylinders          float64
Gears              float64
Mfr_Guarantee      float64
ABS                float64
Airbag_1           float64
Airbag_2           float64
Airco              float64
CD_Player          float64
Powered_Windows    float64
Power_Steering     float64
Radio              float64
Mistlamps          float64
Sport_Model        float64
Metallic_Rim       float64
dtype: object

In [129]:
#3 check for null values
corollas.isnull().sum()

Id                    0
Price                 1
Age_08_22             0
KM                    0
Fuel_Type             0
HP                    0
Color                 9
Automatic             0
CC                    5
Doors                 0
Cylinders             0
Gears                 0
Mfr_Guarantee         1
ABS                   0
Airbag_1              0
Airbag_2              0
Airco                 1
CD_Player             0
Powered_Windows       0
Power_Steering        0
Radio                 0
Mistlamps          1035
Sport_Model           0
Metallic_Rim          0
dtype: int64

In [130]:
# Removing columns which is not needed
corollas = corollas.drop(["Id"], axis = 1)
corollas = corollas.drop(["Mistlamps"], axis = 1)
corollas = corollas.drop(["Cylinders"], axis = 1)

In [131]:
corollas.dtypes

Price              float64
Age_08_22          float64
KM                 float64
Fuel_Type           object
HP                 float64
Color               object
Automatic          float64
CC                 float64
Doors              float64
Gears              float64
Mfr_Guarantee      float64
ABS                float64
Airbag_1           float64
Airbag_2           float64
Airco              float64
CD_Player          float64
Powered_Windows    float64
Power_Steering     float64
Radio              float64
Sport_Model        float64
Metallic_Rim       float64
dtype: object

In [132]:
corollas = corollas.dropna(axis=0, how = 'any').copy()

In [133]:
predictors_df_corollas = corollas[['Age_08_22', 'KM', 'Fuel_Type', 'HP', 'Color', 'Automatic', 'CC', 
              'Doors', 'Gears','Mfr_Guarantee', 'ABS', 'Airbag_1', 'Airbag_2', 'Airco', 'CD_Player', 
              'Powered_Windows', 'Power_Steering', 'Radio','Sport_Model', 'Metallic_Rim',]]
response_corollas = corollas['Price']

In [134]:
# flag categorical varibales
predictors_df_corollas = pd.get_dummies(predictors_df_corollas, columns=['Fuel_Type'])
predictors_df_corollas = pd.get_dummies(predictors_df_corollas, columns=['Color'])

In [135]:
predictors_df_corollas.dtypes

Age_08_22           float64
KM                  float64
HP                  float64
Automatic           float64
CC                  float64
Doors               float64
Gears               float64
Mfr_Guarantee       float64
ABS                 float64
Airbag_1            float64
Airbag_2            float64
Airco               float64
CD_Player           float64
Powered_Windows     float64
Power_Steering      float64
Radio               float64
Sport_Model         float64
Metallic_Rim        float64
Fuel_Type_CNG         uint8
Fuel_Type_Diesel      uint8
Fuel_Type_Petrol      uint8
Color_Beige           uint8
Color_Black           uint8
Color_Blue            uint8
Color_Green           uint8
Color_Grey            uint8
Color_Red             uint8
Color_Silver          uint8
Color_Violet          uint8
Color_White           uint8
Color_Yellow          uint8
dtype: object

In [136]:
predictors_df_corollas.isnull().sum()

Age_08_22           0
KM                  0
HP                  0
Automatic           0
CC                  0
Doors               0
Gears               0
Mfr_Guarantee       0
ABS                 0
Airbag_1            0
Airbag_2            0
Airco               0
CD_Player           0
Powered_Windows     0
Power_Steering      0
Radio               0
Sport_Model         0
Metallic_Rim        0
Fuel_Type_CNG       0
Fuel_Type_Diesel    0
Fuel_Type_Petrol    0
Color_Beige         0
Color_Black         0
Color_Blue          0
Color_Green         0
Color_Grey          0
Color_Red           0
Color_Silver        0
Color_Violet        0
Color_White         0
Color_Yellow        0
dtype: int64

In [137]:
response_corollas.isnull().sum()

0

In [138]:
corr = predictors_df_corollas.corr()

In [139]:
corr

Unnamed: 0,Age_08_22,KM,HP,Automatic,CC,Doors,Gears,Mfr_Guarantee,ABS,Airbag_1,...,Color_Beige,Color_Black,Color_Blue,Color_Green,Color_Grey,Color_Red,Color_Silver,Color_Violet,Color_White,Color_Yellow
Age_08_22,1.0,0.502924,-0.156036,0.043666,-0.097749,-0.133099,-0.008561,-0.171304,-0.411433,-0.103782,...,0.022568,-0.025746,-0.035293,0.103253,-0.129805,0.096089,-0.024972,0.017124,0.048724,-0.042731
KM,0.502924,1.0,-0.337537,-0.08057,0.103691,-0.030873,0.013986,-0.213465,-0.174443,-0.016729,...,-0.007109,0.037488,-0.006108,-0.018948,-0.10506,0.049549,0.005765,0.018476,0.129816,-0.038519
HP,-0.156036,-0.337537,1.0,0.009248,0.035123,0.089607,0.21316,0.145907,0.056733,0.024776,...,0.026537,-0.00161,-0.027922,0.010416,0.027018,0.014345,0.016415,-0.012317,-0.094687,-0.00033
Automatic,0.043666,-0.08057,0.009248,1.0,0.067807,-0.035258,-0.099489,0.021621,-0.019531,-0.013233,...,-0.011025,-0.039373,-0.017148,0.053233,0.008033,-0.023395,0.050183,-0.012736,-0.035798,-0.011025
CC,-0.097749,0.103691,0.035123,0.067807,1.0,0.078904,0.014792,-0.057471,0.037441,0.022545,...,0.002545,-0.007727,0.030657,-0.011357,-0.007694,-0.010298,-0.003718,-0.015776,0.027496,-0.011857
Doors,-0.133099,-0.030873,0.089607,-0.035258,0.078904,1.0,-0.159654,0.041332,0.059066,0.052309,...,-0.033355,-0.098722,-0.045589,0.061439,0.056914,0.001087,0.02716,0.012662,-0.008714,0.014991
Gears,-0.008561,0.013986,0.21316,-0.099489,0.014792,-0.159654,1.0,0.010691,0.087171,0.002736,...,0.074442,0.13005,-0.023339,-0.049882,-0.026357,0.033898,-0.069666,-0.007512,-0.021114,-0.006503
Mfr_Guarantee,-0.171304,-0.213465,0.145907,0.021621,-0.057471,0.041332,0.010691,1.0,0.120506,0.052637,...,0.024005,0.008717,-0.02823,0.034122,0.012078,0.002893,-0.006271,0.036737,-0.095215,0.055199
ABS,-0.411433,-0.174443,0.056733,-0.019531,0.037441,0.059066,0.087171,0.120506,1.0,0.276941,...,-0.016989,-0.037621,0.003992,-0.024104,0.060091,0.014224,-0.021589,-0.008305,-0.026422,0.022211
Airbag_1,-0.103782,-0.016729,0.024776,-0.013233,0.022545,0.052309,0.002736,0.052637,0.276941,1.0,...,0.008039,0.019826,0.003004,0.028281,0.06876,-0.082266,-0.021643,-0.069144,-0.030791,0.008039


# KNN

In [140]:
# create train and test splits

X_prediction = predictors_df_corollas
y_prediction = response_corollas
train_X_prediction, test_X_prediction, train_y_prediction, test_y_prediction = train_test_split(X_prediction, 
                                                            y_prediction, test_size=0.3, random_state=616)

In [141]:
z_score_norm2 = preprocessing.StandardScaler()
z_score_norm2.fit(predictors_df_corollas)

train_X_prediction = pd.DataFrame(z_score_norm2.transform(train_X_prediction), 
                                          columns = predictors_df_corollas.columns)
test_X_prediction = pd.DataFrame(z_score_norm2.transform(test_X_prediction), 
                                          columns = predictors_df_corollas.columns)

In [142]:
# train the k-NN model and look at performance on train data

knn_p = KNeighborsRegressor(n_neighbors=5,leaf_size=30).fit(train_X_prediction, train_y_prediction)
predicted_y_training3 = knn_p.predict(train_X_prediction)
print("Root Mean Squared Error (RMSE): ", round(mean_squared_error(train_y_prediction, predicted_y_training3) ** 0.5, 4))

Root Mean Squared Error (RMSE):  1391.1371


In [143]:
# performance of k-NN on test data

predicted_y_test3 = knn_p.predict(test_X_prediction)
print("Root Mean Squared Error (RMSE): ", round(mean_squared_error(test_y_prediction, predicted_y_test3) ** 0.5, 4))

Root Mean Squared Error (RMSE):  1804.4122


# Linear

In [144]:
linear_model = LinearRegression()
linear_model = linear_model.fit(train_X_prediction, train_y_prediction)
predicted_y_training4 = linear_model.predict(train_X_prediction)
print("Root Mean Squared Error (RMSE): ", round(mean_squared_error(train_y_prediction, predicted_y_training4) ** 0.5, 4))

Root Mean Squared Error (RMSE):  1351.1285


In [145]:
# performance of linear regression on test data

predicted_y_test4 = linear_model.predict(test_X_prediction)
print("Root Mean Squared Error (RMSE): ", round(mean_squared_error(test_y_prediction, predicted_y_test4) ** 0.5, 4))

Root Mean Squared Error (RMSE):  1322.8666


## Decision tree

In [146]:
DT_corollas = DecisionTreeRegressor(max_depth=5, random_state=13, splitter="best").fit(train_X_prediction, train_y_prediction) #not allowing it to grow fully,by setting max depth to 7
predicted_y_training_corollas = DT_corollas.predict(train_X_prediction)

In [147]:
print("Root Mean Squared Error (RMSE): ", round(mean_squared_error(train_y_prediction, predicted_y_training_corollas) ** 0.5, 4))

Root Mean Squared Error (RMSE):  1067.0795


In [148]:
predicted_y_test_corollas = DT_corollas.predict(test_X_prediction)

In [149]:
print("Root Mean Squared Error (RMSE): ", round(mean_squared_error(test_y_prediction, predicted_y_test_corollas) ** 0.5, 4))

Root Mean Squared Error (RMSE):  1173.5588
