In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from mpl_toolkits import mplot3d

# %matplotlib inline
# %matplotlib notebook

# Let pandas show up to 100 columns when rendering a DataFrame, so the wide
# car-spec table is not elided in cell output.
pd.options.display.max_columns = 100

In [2]:
# Load the cleaned car listings. The first CSV column becomes the row index;
# parse_dates=True asks pandas to try parsing that index as dates.
df = pd.read_csv(
    'cars_cleaned.csv',
    index_col=0,
    parse_dates=True,
    encoding="ISO-8859-1",
    low_memory=False,
)

In [3]:
# Preview the first five rows of the freshly loaded frame.
df.head()

Unnamed: 0,Year,Model,Specs,MSRP,EPA_Class,Body_Stype,Drivetrain,Psg_Cp,Psg_Doors,Front_Wheel_Mat,Rear_Wheel_Mat,Basic_Miles,Basic_Year,Drivetrain_Miles,Drivetrain_Years,Road_Assist_Miles,Road_Assist_Years,Max_Alt_Cp,Maint_Miles,Maint_Year,Max_Alt_Watts,Other_Features,Weight,Trailer_cpt,Volume,Width,Height,Length,Size_avg,Airbag_Front_Driver,Airbag_Front_Psg,Airbag_Psg_Switch,Airbag_SBF,Airbag_SBR,Airbag_SHF,Airbag_SHR,Child_Door_Locks,Day_Lights,Trac_Control,Night_Vision,Rollover_Prt,Fog_Lamps,Parking_Aid,Tire_P_Monitor,Backup_Camera,Stability_Ctl,Num_safety,Fuel_Tank_Cpt,MPG_avg,MPG_City,MPG_Hwy,Susp_Front,Susp_Front2,Susp_Rear,Susp_Rear2,Trans_Speed,Battery_Amps,HP1,HP2,Engine_Type,Displacement,Corrosion_Miles,Corrosion_Years
0,2019,Acura RDX,Acura RDX Specs: FWD w/Technology Pkg,40600,Small Sport Utility Vehicles 2WD,Sport Utility,Front Wheel Drive,5,4,Aluminum,Aluminum,50000.0,4.0,70000.0,6.0,50000.0,4.0,,,,,Vehicle Stability Assist Electronic Stability ...,3790.0,,,74.8,65.7,,46.833333,Yes,Yes,No,Yes,No,Yes,Yes,Yes,Yes,Yes,No,No,No,Yes,Yes,Yes,Yes,12,17.1,24.0,22.0,28.0,Strut,Strut,Multi-Link,Multi-Link,10.0,,280.0,272.0,Turbo Premium Unleaded I-4,2.0,300000.0,5.0
1,2019,Acura RDX,Acura RDX Specs: FWD w/Advance Pkg,45500,Small Sport Utility Vehicles 2WD,Sport Utility,Front Wheel Drive,5,4,Aluminum,Aluminum,50000.0,4.0,70000.0,6.0,50000.0,4.0,,,,,Vehicle Stability Assist Electronic Stability ...,3829.0,,,74.8,65.7,,46.833333,Yes,Yes,No,Yes,No,Yes,Yes,Yes,Yes,Yes,No,No,Yes,Yes,Yes,Yes,Yes,13,17.1,24.0,22.0,28.0,Strut,Strut,Multi-Link,Multi-Link,10.0,,280.0,272.0,Turbo Premium Unleaded I-4,2.0,300000.0,5.0
2,2019,Acura RDX,Acura RDX Specs: FWD w/A-Spec Pkg,43600,Small Sport Utility Vehicles 2WD,Sport Utility,Front Wheel Drive,5,4,Aluminum,Aluminum,50000.0,4.0,70000.0,6.0,50000.0,4.0,,,,,Vehicle Stability Assist Electronic Stability ...,3821.0,,,74.8,65.7,,46.833333,Yes,Yes,No,Yes,No,Yes,Yes,Yes,Yes,Yes,No,No,Yes,Yes,Yes,Yes,Yes,13,17.1,24.0,22.0,27.0,Strut,Strut,Multi-Link,Multi-Link,10.0,,280.0,272.0,Turbo Premium Unleaded I-4,2.0,300000.0,5.0
3,2019,Acura RDX,Acura RDX Specs: FWD,37400,Small Sport Utility Vehicles 2WD,Sport Utility,Front Wheel Drive,5,4,Aluminum,Aluminum,50000.0,4.0,70000.0,6.0,50000.0,4.0,,,,,Vehicle Stability Assist Electronic Stability ...,3783.0,,,74.8,65.7,,46.833333,Yes,Yes,No,Yes,No,Yes,Yes,Yes,Yes,Yes,No,No,No,No,Yes,Yes,Yes,11,17.1,24.0,22.0,28.0,Strut,Strut,Multi-Link,Multi-Link,10.0,,280.0,272.0,Turbo Premium Unleaded I-4,2.0,300000.0,5.0
4,2019,Acura RDX,Acura RDX Specs: AWD w/Technology Pkg,42600,Small Sport Utility Vehicles 4WD,Sport Utility,All Wheel Drive,5,4,Aluminum,Aluminum,50000.0,4.0,70000.0,6.0,50000.0,4.0,,,,,Vehicle Stability Assist Electronic Stability ...,4026.0,,,74.8,65.7,,46.833333,Yes,Yes,No,Yes,No,Yes,Yes,Yes,Yes,Yes,No,No,No,Yes,Yes,Yes,Yes,12,17.1,23.0,21.0,27.0,Strut,Strut,Multi-Link,Multi-Link,10.0,,280.0,272.0,Turbo Premium Unleaded I-4,2.0,300000.0,5.0


### Transform categorical features

In [4]:
from sklearn.preprocessing import LabelEncoder

# A single encoder is re-fitted for every column; after the loop it holds the
# classes of the last column in `columns_to_encode` only.
lb_make = LabelEncoder()

# String-valued columns that are converted to integer codes in place.
columns_to_encode = ['Model', 'EPA_Class', 'Body_Stype', 'Drivetrain', 'Front_Wheel_Mat', 'Rear_Wheel_Mat', 'Engine_Type', 
                     'Susp_Front', 'Susp_Front2', 'Susp_Rear', 'Susp_Rear2']
for column in columns_to_encode:
    # Record which rows are missing BEFORE filling. Once the column has been
    # encoded it contains integers only, so a later `.replace('', np.nan)`
    # can never match and the missing values would silently stay as a class.
    missing = df[column].isna().values

    # Fill with '' so the encoder only ever sees strings. '' sorts first and
    # therefore takes code 0, which keeps the codes of real values unchanged.
    codes = lb_make.fit_transform(df[column].fillna(''))

    if missing.any():
        # Integer arrays cannot hold NaN: switch to float and put the gaps back
        # so they are still visible as missing to later processing steps.
        codes = codes.astype(float)
        codes[missing] = np.nan

    df[column] = codes

In [5]:
# Preview the first five rows again to check the label-encoded columns.
df.head()

Unnamed: 0,Year,Model,Specs,MSRP,EPA_Class,Body_Stype,Drivetrain,Psg_Cp,Psg_Doors,Front_Wheel_Mat,Rear_Wheel_Mat,Basic_Miles,Basic_Year,Drivetrain_Miles,Drivetrain_Years,Road_Assist_Miles,Road_Assist_Years,Max_Alt_Cp,Maint_Miles,Maint_Year,Max_Alt_Watts,Other_Features,Weight,Trailer_cpt,Volume,Width,Height,Length,Size_avg,Airbag_Front_Driver,Airbag_Front_Psg,Airbag_Psg_Switch,Airbag_SBF,Airbag_SBR,Airbag_SHF,Airbag_SHR,Child_Door_Locks,Day_Lights,Trac_Control,Night_Vision,Rollover_Prt,Fog_Lamps,Parking_Aid,Tire_P_Monitor,Backup_Camera,Stability_Ctl,Num_safety,Fuel_Tank_Cpt,MPG_avg,MPG_City,MPG_Hwy,Susp_Front,Susp_Front2,Susp_Rear,Susp_Rear2,Trans_Speed,Battery_Amps,HP1,HP2,Engine_Type,Displacement,Corrosion_Miles,Corrosion_Years
0,2019,3,Acura RDX Specs: FWD w/Technology Pkg,40600,68,23,16,5,4,3,3,50000.0,4.0,70000.0,6.0,50000.0,4.0,,,,,Vehicle Stability Assist Electronic Stability ...,3790.0,,,74.8,65.7,,46.833333,Yes,Yes,No,Yes,No,Yes,Yes,Yes,Yes,Yes,No,No,No,Yes,Yes,Yes,Yes,12,17.1,24.0,22.0,28.0,132,56,144,46,10.0,,280.0,272.0,120,2.0,300000.0,5.0
1,2019,3,Acura RDX Specs: FWD w/Advance Pkg,45500,68,23,16,5,4,3,3,50000.0,4.0,70000.0,6.0,50000.0,4.0,,,,,Vehicle Stability Assist Electronic Stability ...,3829.0,,,74.8,65.7,,46.833333,Yes,Yes,No,Yes,No,Yes,Yes,Yes,Yes,Yes,No,No,Yes,Yes,Yes,Yes,Yes,13,17.1,24.0,22.0,28.0,132,56,144,46,10.0,,280.0,272.0,120,2.0,300000.0,5.0
2,2019,3,Acura RDX Specs: FWD w/A-Spec Pkg,43600,68,23,16,5,4,3,3,50000.0,4.0,70000.0,6.0,50000.0,4.0,,,,,Vehicle Stability Assist Electronic Stability ...,3821.0,,,74.8,65.7,,46.833333,Yes,Yes,No,Yes,No,Yes,Yes,Yes,Yes,Yes,No,No,Yes,Yes,Yes,Yes,Yes,13,17.1,24.0,22.0,27.0,132,56,144,46,10.0,,280.0,272.0,120,2.0,300000.0,5.0
3,2019,3,Acura RDX Specs: FWD,37400,68,23,16,5,4,3,3,50000.0,4.0,70000.0,6.0,50000.0,4.0,,,,,Vehicle Stability Assist Electronic Stability ...,3783.0,,,74.8,65.7,,46.833333,Yes,Yes,No,Yes,No,Yes,Yes,Yes,Yes,Yes,No,No,No,No,Yes,Yes,Yes,11,17.1,24.0,22.0,28.0,132,56,144,46,10.0,,280.0,272.0,120,2.0,300000.0,5.0
4,2019,3,Acura RDX Specs: AWD w/Technology Pkg,42600,69,23,9,5,4,3,3,50000.0,4.0,70000.0,6.0,50000.0,4.0,,,,,Vehicle Stability Assist Electronic Stability ...,4026.0,,,74.8,65.7,,46.833333,Yes,Yes,No,Yes,No,Yes,Yes,Yes,Yes,Yes,No,No,No,Yes,Yes,Yes,Yes,12,17.1,23.0,21.0,27.0,132,56,144,46,10.0,,280.0,272.0,120,2.0,300000.0,5.0


In [6]:
# Recode the literal values 'Yes' / 'No' as 1 / 0 wherever they occur in the frame.
yes_no_codes = {'Yes': 1, 'No': 0}
df = df.replace(yes_no_codes)

In [7]:
# Remove the two free-text columns; they are not used as numeric features.
df = df.drop(columns=['Specs', 'Other_Features'])

In [8]:
# Display the frame after the text columns have been dropped.
df

Unnamed: 0,Year,Model,MSRP,EPA_Class,Body_Stype,Drivetrain,Psg_Cp,Psg_Doors,Front_Wheel_Mat,Rear_Wheel_Mat,Basic_Miles,Basic_Year,Drivetrain_Miles,Drivetrain_Years,Road_Assist_Miles,Road_Assist_Years,Max_Alt_Cp,Maint_Miles,Maint_Year,Max_Alt_Watts,Weight,Trailer_cpt,Volume,Width,Height,Length,Size_avg,Airbag_Front_Driver,Airbag_Front_Psg,Airbag_Psg_Switch,Airbag_SBF,Airbag_SBR,Airbag_SHF,Airbag_SHR,Child_Door_Locks,Day_Lights,Trac_Control,Night_Vision,Rollover_Prt,Fog_Lamps,Parking_Aid,Tire_P_Monitor,Backup_Camera,Stability_Ctl,Num_safety,Fuel_Tank_Cpt,MPG_avg,MPG_City,MPG_Hwy,Susp_Front,Susp_Front2,Susp_Rear,Susp_Rear2,Trans_Speed,Battery_Amps,HP1,HP2,Engine_Type,Displacement,Corrosion_Miles,Corrosion_Years
0,2019,3,40600,68,23,16,5,4,3,3,50000.0,4.0,70000.0,6.0,50000.0,4.0,,,,,3790.0,,,74.8,65.7,,46.833333,1,1,0,1,0,1,1,1,1,1,0,0,0,1,1,1,1,12,17.1,24.0,22.0,28.0,132,56,144,46,10.0,,280.0,272.0,120,2.0,300000.0,5.0
1,2019,3,45500,68,23,16,5,4,3,3,50000.0,4.0,70000.0,6.0,50000.0,4.0,,,,,3829.0,,,74.8,65.7,,46.833333,1,1,0,1,0,1,1,1,1,1,0,0,1,1,1,1,1,13,17.1,24.0,22.0,28.0,132,56,144,46,10.0,,280.0,272.0,120,2.0,300000.0,5.0
2,2019,3,43600,68,23,16,5,4,3,3,50000.0,4.0,70000.0,6.0,50000.0,4.0,,,,,3821.0,,,74.8,65.7,,46.833333,1,1,0,1,0,1,1,1,1,1,0,0,1,1,1,1,1,13,17.1,24.0,22.0,27.0,132,56,144,46,10.0,,280.0,272.0,120,2.0,300000.0,5.0
3,2019,3,37400,68,23,16,5,4,3,3,50000.0,4.0,70000.0,6.0,50000.0,4.0,,,,,3783.0,,,74.8,65.7,,46.833333,1,1,0,1,0,1,1,1,1,1,0,0,0,0,1,1,1,11,17.1,24.0,22.0,28.0,132,56,144,46,10.0,,280.0,272.0,120,2.0,300000.0,5.0
4,2019,3,42600,69,23,9,5,4,3,3,50000.0,4.0,70000.0,6.0,50000.0,4.0,,,,,4026.0,,,74.8,65.7,,46.833333,1,1,0,1,0,1,1,1,1,1,0,0,0,1,1,1,1,12,17.1,23.0,21.0,27.0,132,56,144,46,10.0,,280.0,272.0,120,2.0,300000.0,5.0
5,2019,3,47500,69,23,9,5,4,3,3,50000.0,4.0,70000.0,6.0,50000.0,4.0,,,,,4068.0,,,74.8,65.7,,46.833333,1,1,0,1,0,1,1,1,1,1,0,0,1,1,1,1,1,13,17.1,23.0,21.0,27.0,132,56,144,46,10.0,,280.0,272.0,120,2.0,300000.0,5.0
6,2019,3,45600,69,23,9,5,4,3,3,50000.0,4.0,70000.0,6.0,50000.0,4.0,,,,,4015.0,,,74.8,65.7,,46.833333,1,1,0,1,0,1,1,1,1,1,0,0,1,1,1,1,1,13,17.1,23.0,21.0,26.0,132,56,144,46,10.0,,280.0,272.0,120,2.0,300000.0,5.0
7,2018,3,37500,69,23,9,5,4,3,3,50000.0,4.0,70000.0,6.0,50000.0,4.0,130.0,,,,3902.0,1500.0,,73.7,65.0,,46.233333,1,1,0,1,0,1,1,1,1,1,0,0,0,0,1,1,1,11,16.0,22.0,19.0,27.0,132,56,144,46,6.0,550.0,252.0,279.0,93,3.5,300000.0,5.0
8,2018,3,41000,68,23,16,5,4,3,3,50000.0,4.0,70000.0,6.0,50000.0,4.0,130.0,,,,3772.0,1500.0,,73.7,65.0,,46.233333,1,1,0,1,0,1,1,1,1,1,0,0,0,0,1,1,1,11,16.0,23.0,20.0,28.0,132,56,144,46,6.0,550.0,252.0,279.0,93,3.5,300000.0,5.0
9,2018,3,39700,68,23,16,5,4,3,3,50000.0,4.0,70000.0,6.0,50000.0,4.0,130.0,,,,3768.0,1500.0,,73.7,65.0,,46.233333,1,1,0,1,0,1,1,1,1,1,0,0,0,0,1,1,1,11,16.0,23.0,20.0,28.0,132,56,144,46,6.0,550.0,252.0,279.0,93,3.5,300000.0,5.0


In [9]:
# Target: the MSRP column. Features: every remaining column.
y = df['MSRP']
X = df.drop(columns='MSRP')

In [10]:
# `sklearn.preprocessing.Imputer` was deprecated in scikit-learn 0.20 and
# removed in 0.22; `SimpleImputer` is its replacement. It always works
# column-wise (the old `axis=0`) and expects `np.nan` rather than the string
# 'NaN' as the missing-value marker.
from sklearn.impute import SimpleImputer

# Replace every missing entry with the mean of its column.
imp = SimpleImputer(missing_values=np.nan, strategy='mean')

# Fit the column means and apply them in one step; X becomes a NumPy array.
X = imp.fit_transform(X)

In [11]:
# Display the imputed feature matrix (a NumPy array after the transform).
X

array([[2.019e+03, 3.000e+00, 6.800e+01, ..., 2.000e+00, 3.000e+05,
        5.000e+00],
       [2.019e+03, 3.000e+00, 6.800e+01, ..., 2.000e+00, 3.000e+05,
        5.000e+00],
       [2.019e+03, 3.000e+00, 6.800e+01, ..., 2.000e+00, 3.000e+05,
        5.000e+00],
       ...,
       [2.016e+03, 4.170e+02, 4.400e+01, ..., 2.500e+00, 3.000e+05,
        1.200e+01],
       [2.015e+03, 4.170e+02, 4.400e+01, ..., 2.500e+00, 3.000e+05,
        1.200e+01],
       [2.015e+03, 4.170e+02, 4.400e+01, ..., 2.500e+00, 3.000e+05,
        1.200e+01]])