In [14]:
import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
# importing the initial model I am going to use
# will be adding more models later on in the notebook!
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
# need statistical analysis tools
from scipy import stats
from scipy.stats import ttest_ind
import statsmodels.api as sm
from statsmodels.formula.api import ols
# importing some evaluation metrics
from sklearn import metrics
from sklearn.metrics import mean_squared_error, mean_absolute_error, accuracy_score
from sklearn.model_selection import cross_val_score
# adding in some regularization tools
from sklearn.linear_model import Lasso, Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LassoCV, RidgeCV
# feature selection tools
from sklearn.feature_selection import f_regression
from sklearn.feature_selection import SelectFromModel
# pipelines
from sklearn.pipeline import Pipeline

In [41]:
# Importing the car price data
data = pd.read_csv('car_price_data.csv')
print(data.shape)
data.head()

(205, 26)


Unnamed: 0,car_ID,symboling,CarName,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,carlength,carwidth,carheight,curbweight,enginetype,cylindernumber,enginesize,fuelsystem,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price
0,1,3,alfa-romero giulia,gas,std,two,convertible,rwd,front,88.6,168.8,64.1,48.8,2548,dohc,four,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495.0
1,2,3,alfa-romero stelvio,gas,std,two,convertible,rwd,front,88.6,168.8,64.1,48.8,2548,dohc,four,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500.0
2,3,1,alfa-romero Quadrifoglio,gas,std,two,hatchback,rwd,front,94.5,171.2,65.5,52.4,2823,ohcv,six,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500.0
3,4,2,audi 100 ls,gas,std,four,sedan,fwd,front,99.8,176.6,66.2,54.3,2337,ohc,four,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950.0
4,5,2,audi 100ls,gas,std,four,sedan,4wd,front,99.4,176.6,66.4,54.3,2824,ohc,five,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450.0


Some of these variables will need to be encoded. I am going to use `pd.get_dummies` for that.

In [16]:
data['CarName'].value_counts()

toyota corolla           6
toyota corona            6
peugeot 504              6
subaru dl                4
toyota mark ii           3
                        ..
toyota corona mark ii    1
bmw x4                   1
subaru r1                1
dodge colt (sw)          1
mitsubishi pajero        1
Name: CarName, Length: 147, dtype: int64

I am noticing that the car's MAKE always comes before the first space in the `CarName` variable. I can possibly use a Regex function to capture just the car's MAKE.

In [17]:
# Importing Regex
import re

In [42]:
make_list = data['CarName'].to_list()

In [20]:
make_list[0:5]

['alfa-romero giulia',
 'alfa-romero stelvio',
 'alfa-romero Quadrifoglio',
 'audi 100 ls',
 'audi 100ls']

Regex was actually not necessary. I used the `.partition()` method.

In [43]:
companies = []
for name in make_list:
    companies.append(name.partition(' ')[0])
print(companies[0:10])

['alfa-romero', 'alfa-romero', 'alfa-romero', 'audi', 'audi', 'audi', 'audi', 'audi', 'audi', 'audi']


In [44]:
companies_set = set(companies)
len(companies_set)

28

In [45]:
companies_set

{'Nissan',
 'alfa-romero',
 'audi',
 'bmw',
 'buick',
 'chevrolet',
 'dodge',
 'honda',
 'isuzu',
 'jaguar',
 'maxda',
 'mazda',
 'mercury',
 'mitsubishi',
 'nissan',
 'peugeot',
 'plymouth',
 'porcshce',
 'porsche',
 'renault',
 'saab',
 'subaru',
 'toyota',
 'toyouta',
 'vokswagen',
 'volkswagen',
 'volvo',
 'vw'}

There are some issues with the make names and also some duplicates. Will need to use a conditional statement to fix the ones that are spelt wrong or duplicated.  
  
Since there aren't too many I can just do this by hand. But I will still write a function so I can correct any of the entries in my data with incorrect MAKE names.

In [46]:
# First I am going to lowercase all of the CarNames in the original data
companies_lower = [x.lower() for x in companies]

In [47]:
data.insert(2,'make',companies)

In [48]:
data.head()

Unnamed: 0,car_ID,symboling,make,CarName,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,carlength,carwidth,carheight,curbweight,enginetype,cylindernumber,enginesize,fuelsystem,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price
0,1,3,alfa-romero,alfa-romero giulia,gas,std,two,convertible,rwd,front,88.6,168.8,64.1,48.8,2548,dohc,four,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495.0
1,2,3,alfa-romero,alfa-romero stelvio,gas,std,two,convertible,rwd,front,88.6,168.8,64.1,48.8,2548,dohc,four,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500.0
2,3,1,alfa-romero,alfa-romero Quadrifoglio,gas,std,two,hatchback,rwd,front,94.5,171.2,65.5,52.4,2823,ohcv,six,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500.0
3,4,2,audi,audi 100 ls,gas,std,four,sedan,fwd,front,99.8,176.6,66.2,54.3,2337,ohc,four,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950.0
4,5,2,audi,audi 100ls,gas,std,four,sedan,4wd,front,99.4,176.6,66.4,54.3,2824,ohc,five,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450.0


In [49]:
data.drop(columns=['CarName'], axis=1, inplace=True)

In [50]:
# Replacing those with easily fixed names. Volkswagen is another issue.
data.replace({'maxda': 'mazda', 'porcshce': 'porsche', 'toyouta':'toyota'}, inplace=True)


In [51]:
data.replace({'vw':'volkswagen', 'vokswagen':'volkswagen'}, inplace=True)


In [53]:
data.make.value_counts()

toyota         32
nissan         17
mazda          17
honda          13
mitsubishi     13
subaru         12
volkswagen     12
peugeot        11
volvo          11
dodge           9
bmw             8
buick           8
plymouth        7
audi            7
saab            6
porsche         5
isuzu           4
chevrolet       3
alfa-romero     3
jaguar          3
renault         2
mercury         1
Nissan          1
Name: make, dtype: int64

In [54]:
# Nissan slipped through somehow
data.replace('Nissan', 'nissan', inplace=True)

In [55]:
data.make.value_counts()

toyota         32
nissan         18
mazda          17
honda          13
mitsubishi     13
volkswagen     12
subaru         12
peugeot        11
volvo          11
dodge           9
buick           8
bmw             8
plymouth        7
audi            7
saab            6
porsche         5
isuzu           4
alfa-romero     3
jaguar          3
chevrolet       3
renault         2
mercury         1
Name: make, dtype: int64

Now our data has only make and we can start comparing companies as part of our EDA