The purpose of this section is to have final preparations before modeling.  My goal is to complete three steps.

-Create dummy or indicator features for categorical variables

-Standardize the magnitude of numeric features using a scaler

-Split my data into testing and training datasets

In [49]:
#Import anything that I might use
import pandas as pd
import numpy as np
import os
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

In [29]:
cars = pd.read_csv('cars.csv', index_col=0)
cars.head()

Unnamed: 0,id,price,year,manufacturer,model,cylinders,odometer,transmission,drive,paint_color,state
1,7184773187,8750,2013.0,hyundai,sonata,4 cylinders,90821.0,automatic,fwd,grey,MN
2,7193375964,10900,2013.0,toyota,prius,4 cylinders,92800.0,automatic,fwd,blue,CT
6,7184710887,7995,2010.0,chevrolet,equinox,4 cylinders,108124.0,automatic,4wd,grey,MN
14,7184705759,10995,2008.0,chevrolet,tahoe,Unknown,143528.0,automatic,4wd,grey,MN
17,7184703651,14995,2011.0,chevrolet,silverado 1500,8 cylinders,102462.0,automatic,4wd,blue,MN


The first thing I will do is get dummy variables for my categorical variables.  This is manufacturer, cylinders, transmission, drive, color, and state. I'm also going to drop one column for each to avoid the dummy variable trap.

In [30]:
dummy_manufacturer = pd.get_dummies(cars['manufacturer'],drop_first = True)
dummy_manufacturer


Unnamed: 0,alfa-romeo,aston-martin,audi,bmw,buick,cadillac,chevrolet,chrysler,datsun,dodge,...,pontiac,porche,ram,rover,saturn,subaru,tesla,toyota,volkswagen,volvo
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
6,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
17,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
423846,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
423847,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
423849,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
423850,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [31]:
dummy_cylinders=pd.get_dummies(cars['cylinders'],drop_first = True)
dummy_cylinders

Unnamed: 0,12 cylinders,3 cylinders,4 cylinders,5 cylinders,6 cylinders,8 cylinders,Unknown,other
1,0,0,1,0,0,0,0,0
2,0,0,1,0,0,0,0,0
6,0,0,1,0,0,0,0,0
14,0,0,0,0,0,0,1,0
17,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...
423846,0,0,0,0,0,0,1,0
423847,0,0,0,0,0,0,1,0
423849,0,0,1,0,0,0,0,0
423850,0,0,1,0,0,0,0,0


In [32]:
dummy_transmission=pd.get_dummies(cars['transmission'],drop_first = True)
dummy_transmission

Unnamed: 0,automatic,manual,other
1,1,0,0
2,1,0,0
6,1,0,0
14,1,0,0
17,1,0,0
...,...,...,...
423846,1,0,0
423847,1,0,0
423849,1,0,0
423850,0,1,0


In [33]:
dummy_drive=pd.get_dummies(cars['drive'],drop_first = True)
dummy_drive

Unnamed: 0,Unknown,fwd,rwd
1,0,1,0
2,0,1,0
6,0,0,0
14,0,0,0
17,0,0,0
...,...,...,...
423846,0,0,0
423847,0,1,0
423849,0,0,0
423850,0,1,0


In [34]:
dummy_color=pd.get_dummies(cars['paint_color'],drop_first = True)
dummy_color

Unnamed: 0,black,blue,brown,custom,green,grey,orange,purple,red,silver,white,yellow
1,0,0,0,0,0,1,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,1,0,0,0,0,0,0
14,0,0,0,0,0,1,0,0,0,0,0,0
17,0,1,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
423846,0,0,0,0,0,0,0,0,0,0,0,0
423847,0,0,0,0,1,0,0,0,0,0,0,0
423849,0,0,0,0,0,0,0,0,0,1,0,0
423850,0,1,0,0,0,0,0,0,0,0,0,0


In [35]:
dummy_state=pd.get_dummies(cars['state'],drop_first = True)
dummy_state

Unnamed: 0,AL,AR,AZ,CA,CO,CT,DC,DE,FL,GA,...,SD,TN,TX,UT,VA,VT,WA,WI,WV,WY
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
17,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
423846,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
423847,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
423849,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
423850,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Now it's time to merge all of these variables into a main database, and I'm also going to drop the corresponding columns that go with the dummy columns.

In [36]:
cars_concat = pd.concat([cars, dummy_manufacturer, dummy_cylinders, dummy_transmission, dummy_drive, dummy_color, dummy_state],axis=1)
cars_concat.head()

Unnamed: 0,id,price,year,manufacturer,model,cylinders,odometer,transmission,drive,paint_color,...,SD,TN,TX,UT,VA,VT,WA,WI,WV,WY
1,7184773187,8750,2013.0,hyundai,sonata,4 cylinders,90821.0,automatic,fwd,grey,...,0,0,0,0,0,0,0,0,0,0
2,7193375964,10900,2013.0,toyota,prius,4 cylinders,92800.0,automatic,fwd,blue,...,0,0,0,0,0,0,0,0,0,0
6,7184710887,7995,2010.0,chevrolet,equinox,4 cylinders,108124.0,automatic,4wd,grey,...,0,0,0,0,0,0,0,0,0,0
14,7184705759,10995,2008.0,chevrolet,tahoe,Unknown,143528.0,automatic,4wd,grey,...,0,0,0,0,0,0,0,0,0,0
17,7184703651,14995,2011.0,chevrolet,silverado 1500,8 cylinders,102462.0,automatic,4wd,blue,...,0,0,0,0,0,0,0,0,0,0


In [39]:
cars_dum = cars_concat.drop(['id','manufacturer','model','cylinders','transmission','drive','paint_color','state'], axis=1)
cars_dum.head()

Unnamed: 0,price,year,odometer,alfa-romeo,aston-martin,audi,bmw,buick,cadillac,chevrolet,...,SD,TN,TX,UT,VA,VT,WA,WI,WV,WY
1,8750,2013.0,90821.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,10900,2013.0,92800.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,7995,2010.0,108124.0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
14,10995,2008.0,143528.0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
17,14995,2011.0,102462.0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


With so many categorical variables, and some having many options, the list got quite a bit longer.  Now it is time to scale the data because these numbers vary by a large amount.

In [48]:
scaler = StandardScaler()
scaled_cars = scaler.fit_transform(cars_dum)
scaled_cars = pd.DataFrame(scaled_cars,index=cars_dum.index,columns=cars_dum.columns) 
scaled_cars

Unnamed: 0,price,year,odometer,alfa-romeo,aston-martin,audi,bmw,buick,cadillac,chevrolet,...,SD,TN,TX,UT,VA,VT,WA,WI,WV,WY
1,-0.705143,0.091782,0.351621,-0.02515,-0.011673,-0.114359,-0.174982,-0.127267,-0.130003,-0.403711,...,-0.065165,-0.166997,-0.219686,-0.052321,-0.164289,-0.085085,-0.19948,-0.192601,-0.037812,-0.041268
2,-0.536635,0.091782,0.399892,-0.02515,-0.011673,-0.114359,-0.174982,-0.127267,-0.130003,-0.403711,...,-0.065165,-0.166997,-0.219686,-0.052321,-0.164289,-0.085085,-0.19948,-0.192601,-0.037812,-0.041268
6,-0.764317,-0.313997,0.773672,-0.02515,-0.011673,-0.114359,-0.174982,-0.127267,-0.130003,2.477019,...,-0.065165,-0.166997,-0.219686,-0.052321,-0.164289,-0.085085,-0.19948,-0.192601,-0.037812,-0.041268
14,-0.529189,-0.584516,1.637240,-0.02515,-0.011673,-0.114359,-0.174982,-0.127267,-0.130003,2.477019,...,-0.065165,-0.166997,-0.219686,-0.052321,-0.164289,-0.085085,-0.19948,-0.192601,-0.037812,-0.041268
17,-0.215686,-0.178738,0.635566,-0.02515,-0.011673,-0.114359,-0.174982,-0.127267,-0.130003,2.477019,...,-0.065165,-0.166997,-0.219686,-0.052321,-0.164289,-0.085085,-0.19948,-0.192601,-0.037812,-0.041268
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
423846,0.802807,0.903339,-1.066056,-0.02515,-0.011673,-0.114359,-0.174982,-0.127267,-0.130003,-0.403711,...,-0.065165,-0.166997,-0.219686,-0.052321,-0.164289,-0.085085,-0.19948,-0.192601,-0.037812,-0.041268
423847,-0.999836,-0.719776,1.352807,-0.02515,-0.011673,-0.114359,-0.174982,-0.127267,-0.130003,-0.403711,...,-0.065165,-0.166997,-0.219686,-0.052321,-0.164289,-0.085085,-0.19948,-0.192601,-0.037812,-0.041268
423849,-0.616187,-0.043478,0.136461,-0.02515,-0.011673,-0.114359,-0.174982,-0.127267,-0.130003,-0.403711,...,-0.065165,-0.166997,-0.219686,-0.052321,-0.164289,-0.085085,-0.19948,-0.192601,-0.037812,-0.041268
423850,-1.332149,-1.396073,-0.887995,-0.02515,-0.011673,-0.114359,-0.174982,-0.127267,-0.130003,-0.403711,...,-0.065165,-0.166997,-0.219686,-0.052321,-0.164289,-0.085085,-0.19948,-0.192601,-0.037812,-0.041268


The final step in preparation will be to split the data into training and testing data.  Price is my y variable, everything else is the x variable.  Since I have such a large amount of data, I am going with a 70/30 split.

In [50]:
X_train, X_test, y_train, y_test = train_test_split(scaled_cars.drop(columns='price'),scaled_cars.price,test_size=0.3, random_state=42)

In [51]:
X_train.shape, X_test.shape

((128459, 119), (55055, 119))

In [52]:
y_train.shape, y_test.shape

((128459,), (55055,))

In [53]:
cars_dum.to_csv('cars_dum.csv')
scaled_cars.to_csv('scaled_cars.csv')

Everything is saved and ready to be modeled.
