## Exploratory Analysis

In [41]:
import pandas as pd
import numpy as np
from scipy import stats

In [2]:
auto_price = pd.read_csv('data/imports-85.csv', header=None)
auto_price.columns = ['symboling','normalized_losses','make','fuel_type','aspiration','num_of_doors','body_style',
                       'drive_wheels','engine_location','wheel_base','length','width','height','curb_weight','engine_type',
                       'num_of_cylinders','engine_size','fuel_system','bore','stroke','compression_ratio','horsepower',
                       'peak_rpm','city_mpg','highway_mpg','price']
auto_price.head()

Unnamed: 0,symboling,normalized_losses,make,fuel_type,aspiration,num_of_doors,body_style,drive_wheels,engine_location,wheel_base,...,engine_size,fuel_system,bore,stroke,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg,price
0,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495
1,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
2,1,?,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
3,2,164,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
4,2,164,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450


In [3]:
auto_price = auto_price.replace('?', np.nan)

In [4]:
auto_price.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 26 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   symboling          205 non-null    int64  
 1   normalized_losses  164 non-null    object 
 2   make               205 non-null    object 
 3   fuel_type          205 non-null    object 
 4   aspiration         205 non-null    object 
 5   num_of_doors       203 non-null    object 
 6   body_style         205 non-null    object 
 7   drive_wheels       205 non-null    object 
 8   engine_location    205 non-null    object 
 9   wheel_base         205 non-null    float64
 10  length             205 non-null    float64
 11  width              205 non-null    float64
 12  height             205 non-null    float64
 13  curb_weight        205 non-null    int64  
 14  engine_type        205 non-null    object 
 15  num_of_cylinders   205 non-null    object 
 16  engine_size        205 non

In [5]:
auto_price.iloc[:,:-1] = auto_price.iloc[:,:-1].fillna(auto_price.iloc[:,:-1].median())

In [6]:
auto_price['price'] = auto_price['price'].apply(pd.to_numeric, errors='coerce')
auto_price['price'] = auto_price['price'].fillna(auto_price['price'].mean())

In [7]:
auto_price['num_of_doors'].value_counts()

four    114
two      89
Name: num_of_doors, dtype: int64

In [8]:
auto_price['num_of_doors'] = auto_price['num_of_doors'].fillna('four')

In [9]:
auto_price.head()

Unnamed: 0,symboling,normalized_losses,make,fuel_type,aspiration,num_of_doors,body_style,drive_wheels,engine_location,wheel_base,...,engine_size,fuel_system,bore,stroke,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg,price
0,3,115,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495.0
1,3,115,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500.0
2,1,115,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500.0
3,2,164,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950.0
4,2,164,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450.0


In [10]:
auto_price.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 26 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   symboling          205 non-null    int64  
 1   normalized_losses  205 non-null    object 
 2   make               205 non-null    object 
 3   fuel_type          205 non-null    object 
 4   aspiration         205 non-null    object 
 5   num_of_doors       205 non-null    object 
 6   body_style         205 non-null    object 
 7   drive_wheels       205 non-null    object 
 8   engine_location    205 non-null    object 
 9   wheel_base         205 non-null    float64
 10  length             205 non-null    float64
 11  width              205 non-null    float64
 12  height             205 non-null    float64
 13  curb_weight        205 non-null    int64  
 14  engine_type        205 non-null    object 
 15  num_of_cylinders   205 non-null    object 
 16  engine_size        205 non

In [11]:
auto_price = auto_price.applymap(str)

In [12]:
import re
def convert_numeric(text):
    if re.match('\d*.[0-9]', text):
        value = float(text)
    else:
        value = text       
    return value
    
auto_price = auto_price.applymap(convert_numeric)

In [13]:
auto_price.head()

Unnamed: 0,symboling,normalized_losses,make,fuel_type,aspiration,num_of_doors,body_style,drive_wheels,engine_location,wheel_base,...,engine_size,fuel_system,bore,stroke,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg,price
0,3,115.0,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130.0,mpfi,3.47,2.68,9.0,111.0,5000.0,21.0,27.0,13495.0
1,3,115.0,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130.0,mpfi,3.47,2.68,9.0,111.0,5000.0,21.0,27.0,16500.0
2,1,115.0,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152.0,mpfi,2.68,3.47,9.0,154.0,5000.0,19.0,26.0,16500.0
3,2,164.0,audi,gas,std,four,sedan,fwd,front,99.8,...,109.0,mpfi,3.19,3.4,10.0,102.0,5500.0,24.0,30.0,13950.0
4,2,164.0,audi,gas,std,four,sedan,4wd,front,99.4,...,136.0,mpfi,3.19,3.4,8.0,115.0,5500.0,18.0,22.0,17450.0


In [14]:
auto_price.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 26 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   symboling          205 non-null    object 
 1   normalized_losses  205 non-null    float64
 2   make               205 non-null    object 
 3   fuel_type          205 non-null    object 
 4   aspiration         205 non-null    object 
 5   num_of_doors       205 non-null    object 
 6   body_style         205 non-null    object 
 7   drive_wheels       205 non-null    object 
 8   engine_location    205 non-null    object 
 9   wheel_base         205 non-null    float64
 10  length             205 non-null    float64
 11  width              205 non-null    float64
 12  height             205 non-null    float64
 13  curb_weight        205 non-null    float64
 14  engine_type        205 non-null    object 
 15  num_of_cylinders   205 non-null    object 
 16  engine_size        205 non

In [40]:
auto_price.select_dtypes(include=['float64']).describe()

Unnamed: 0,normalized_losses,wheel_base,length,width,height,curb_weight,engine_size,bore,stroke,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg,price
count,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0
mean,120.6,98.756585,174.049268,65.907805,53.724878,2555.565854,126.907317,3.329366,3.256098,10.142537,104.165854,5126.097561,25.219512,30.75122,13207.129353
std,31.805105,6.021776,12.337289,2.145204,2.443522,520.680204,41.642693,0.270858,0.313634,3.97204,39.529733,477.035772,6.542142,6.886443,7868.768212
min,65.0,86.6,141.1,60.3,47.8,1488.0,61.0,2.54,2.07,7.0,48.0,4150.0,13.0,16.0,5118.0
25%,101.0,94.5,166.3,64.1,52.0,2145.0,97.0,3.15,3.11,8.6,70.0,4800.0,19.0,25.0,7788.0
50%,115.0,97.0,173.2,65.5,54.1,2414.0,120.0,3.31,3.29,9.0,95.0,5200.0,24.0,30.0,10595.0
75%,137.0,102.4,183.1,66.9,55.5,2935.0,141.0,3.58,3.41,9.4,116.0,5500.0,30.0,34.0,16500.0
max,256.0,120.9,208.1,72.3,59.8,4066.0,326.0,3.94,4.17,23.0,288.0,6600.0,49.0,54.0,45400.0


In [42]:
columns = auto_price.select_dtypes(include=['float64']).columns

In [43]:
# Get z scores
z = np.abs(stats.zscore(auto_price[columns]))
print(z)

[[0.17650339 1.6907718  0.42652147 ... 0.64655303 0.54605874 0.03667351]
 [0.17650339 1.6907718  0.42652147 ... 0.64655303 0.54605874 0.41949787]
 [0.17650339 0.70859588 0.23151305 ... 0.95301169 0.69162706 0.41949787]
 ...
 [0.80687262 1.72187336 1.19854871 ... 1.10624102 1.12833203 1.05456591]
 [0.80687262 1.72187336 1.19854871 ... 0.11959362 0.54605874 1.18005077]
 [0.80687262 1.72187336 1.19854871 ... 0.95301169 0.83719538 1.19979712]]


In [44]:
# Drop columns above z=3 threshold
auto_price[columns] = auto_price[columns][(z < 3).all(axis=1)]
auto_price.shape

(205, 26)

## Data preprocessing

In [15]:
auto_price.select_dtypes(include=['object']).head()

Unnamed: 0,symboling,make,fuel_type,aspiration,num_of_doors,body_style,drive_wheels,engine_location,engine_type,num_of_cylinders,fuel_system
0,3,alfa-romero,gas,std,two,convertible,rwd,front,dohc,four,mpfi
1,3,alfa-romero,gas,std,two,convertible,rwd,front,dohc,four,mpfi
2,1,alfa-romero,gas,std,two,hatchback,rwd,front,ohcv,six,mpfi
3,2,audi,gas,std,four,sedan,fwd,front,ohc,four,mpfi
4,2,audi,gas,std,four,sedan,4wd,front,ohc,five,mpfi


In [16]:
print(auto_price['num_of_doors'].value_counts())
print('--------------\n')
print(auto_price['num_of_cylinders'].value_counts())

four    116
two      89
Name: num_of_doors, dtype: int64
--------------

four      159
six        24
five       11
eight       5
two         4
three       1
twelve      1
Name: num_of_cylinders, dtype: int64


In [17]:
updated_nums= {'num_of_doors': {'two': 2, 'four': 4},
              'num_of_cylinders': {'two':2, 'three':3, 'four':4, 'five':5, 'six':6, 'eight':8, 'twelve':12}}

In [18]:
auto_price.replace(updated_nums, inplace = True)

In [19]:
auto_price['symboling'] = auto_price['symboling'].astype(int)

In [30]:
# Categorical to binary numeric
from sklearn.preprocessing import LabelEncoder
lb = LabelEncoder()

columns = auto_price.select_dtypes(include=['object']).columns
auto_price[columns] = auto_price.select_dtypes(include=['object']).apply(lb.fit_transform)

In [33]:
auto_price.head()

Unnamed: 0,symboling,normalized_losses,make,fuel_type,aspiration,num_of_doors,body_style,drive_wheels,engine_location,wheel_base,...,engine_size,fuel_system,bore,stroke,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg,price
0,3,115.0,0,1,0,2,0,2,0,88.6,...,130.0,5,3.47,2.68,9.0,111.0,5000.0,21.0,27.0,13495.0
1,3,115.0,0,1,0,2,0,2,0,88.6,...,130.0,5,3.47,2.68,9.0,111.0,5000.0,21.0,27.0,16500.0
2,1,115.0,0,1,0,2,2,2,0,94.5,...,152.0,5,2.68,3.47,9.0,154.0,5000.0,19.0,26.0,16500.0
3,2,164.0,1,1,0,4,3,1,0,99.8,...,109.0,5,3.19,3.4,10.0,102.0,5500.0,24.0,30.0,13950.0
4,2,164.0,1,1,0,4,3,0,0,99.4,...,136.0,5,3.19,3.4,8.0,115.0,5500.0,18.0,22.0,17450.0


In [34]:
auto_price.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 26 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   symboling          205 non-null    int32  
 1   normalized_losses  205 non-null    float64
 2   make               205 non-null    int32  
 3   fuel_type          205 non-null    int32  
 4   aspiration         205 non-null    int32  
 5   num_of_doors       205 non-null    int64  
 6   body_style         205 non-null    int32  
 7   drive_wheels       205 non-null    int32  
 8   engine_location    205 non-null    int32  
 9   wheel_base         205 non-null    float64
 10  length             205 non-null    float64
 11  width              205 non-null    float64
 12  height             205 non-null    float64
 13  curb_weight        205 non-null    float64
 14  engine_type        205 non-null    int32  
 15  num_of_cylinders   205 non-null    int64  
 16  engine_size        205 non

In [35]:
# Define features X
X = np.asarray(auto_price.iloc[:,:-1])
X[0:5]

array([[3.000e+00, 1.150e+02, 0.000e+00, 1.000e+00, 0.000e+00, 2.000e+00,
        0.000e+00, 2.000e+00, 0.000e+00, 8.860e+01, 1.688e+02, 6.410e+01,
        4.880e+01, 2.548e+03, 0.000e+00, 4.000e+00, 1.300e+02, 5.000e+00,
        3.470e+00, 2.680e+00, 9.000e+00, 1.110e+02, 5.000e+03, 2.100e+01,
        2.700e+01],
       [3.000e+00, 1.150e+02, 0.000e+00, 1.000e+00, 0.000e+00, 2.000e+00,
        0.000e+00, 2.000e+00, 0.000e+00, 8.860e+01, 1.688e+02, 6.410e+01,
        4.880e+01, 2.548e+03, 0.000e+00, 4.000e+00, 1.300e+02, 5.000e+00,
        3.470e+00, 2.680e+00, 9.000e+00, 1.110e+02, 5.000e+03, 2.100e+01,
        2.700e+01],
       [1.000e+00, 1.150e+02, 0.000e+00, 1.000e+00, 0.000e+00, 2.000e+00,
        2.000e+00, 2.000e+00, 0.000e+00, 9.450e+01, 1.712e+02, 6.550e+01,
        5.240e+01, 2.823e+03, 5.000e+00, 6.000e+00, 1.520e+02, 5.000e+00,
        2.680e+00, 3.470e+00, 9.000e+00, 1.540e+02, 5.000e+03, 1.900e+01,
        2.600e+01],
       [2.000e+00, 1.640e+02, 1.000e+00, 1.000e+00, 

In [36]:
# Define target y
y = np.asarray(auto_price['price'])
y[0:5]

array([13495., 16500., 16500., 13950., 17450.])

In [37]:
# Standardize the dataset
from sklearn import preprocessing
X = preprocessing.StandardScaler().fit(X).transform(X)
X[0:5]

array([[ 1.74347043, -0.17650339, -1.94825569,  0.32879797, -0.46929532,
        -1.14165266, -3.05097525,  1.21333041, -0.12186667, -1.6907718 ,
        -0.42652147, -0.84478235, -2.0204173 , -0.01456628, -2.86510549,
        -0.35288699,  0.07444893,  0.8695675 ,  0.52048915, -1.84134525,
        -0.28834891,  0.17330945, -0.26498274, -0.64655303, -0.54605874],
       [ 1.74347043, -0.17650339, -1.94825569,  0.32879797, -0.46929532,
        -1.14165266, -3.05097525,  1.21333041, -0.12186667, -1.6907718 ,
        -0.42652147, -0.84478235, -2.0204173 , -0.01456628, -2.86510549,
        -0.35288699,  0.07444893,  0.8695675 ,  0.52048915, -1.84134525,
        -0.28834891,  0.17330945, -0.26498274, -0.64655303, -0.54605874],
       [ 0.133509  , -0.17650339, -1.94825569,  0.32879797, -0.46929532,
        -1.14165266, -0.71720687,  1.21333041, -0.12186667, -0.70859588,
        -0.23151305, -0.19056612, -0.54352748,  0.51488192,  1.88688986,
         1.50203181,  0.60404617,  0.8695675 , -2

## Validation and Model Selection

### Train/Test Dataset
Let's split dataset into 80% train and 20% test set.

In [38]:
# Split the dataset into train and test sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=0)
print ('Train set:', X_train.shape,  y_train.shape)
print ('Test set:', X_test.shape,  y_test.shape)

Train set: (164, 25) (164,)
Test set: (41, 25) (41,)
