In [1]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn import svm
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
import time
import warnings
def warn(*args, **kwargs):
    """No-op stand-in for ``warnings.warn``.

    Accepts any positional and keyword arguments, ignores them and returns
    ``None``. It only takes effect if it is assigned to ``warnings.warn``
    (see the commented-out line that follows this definition).
    """
    return None
#warnings.warn = warn  # uncomment this if you don't want to see the warnings generated by the models trained
#%matplotlib inline

## Import Data

In [2]:
# Read the automobile price dataset (CSV path is relative to the working directory).
df = pd.read_csv(filepath_or_buffer='Automobile_price_data.csv')

In [3]:
# Preview the first five rows of the raw data.
df.head(n=5)

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495
1,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
2,1,?,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
3,2,164,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
4,2,164,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450


## 1st step: 'Cleaning' - select only the desired columns

In [4]:
# CLEANING - keep only the desired columns (recommended before treating the missing values)
df2 = df[[
    'make', 'body-style', 'wheel-base', 'engine-size', 'num-of-doors', 'engine-location',
    'num-of-cylinders', 'horsepower', 'city-mpg', 'highway-mpg', 'aspiration',
    'compression-ratio',  # 'peak-rpm' is currently commented out of the selection
    'fuel-system', 'price',
]]
#df2 = df[['make', 'body-style', 'wheel-base', 'engine-size', 'horsepower', 'peak-rpm', 'highway-mpg', 'price']]
# Keep only the rows in which no cell equals the '?' character (the missing-value marker)
df2 = df2.loc[df2.ne('?').all(axis=1)]

## ALTERNATE CLEANING METHOD

## replace all '?' cells with NaN
#df2 = df2.applymap(lambda x: np.NaN if str(x)=='?' else x)

## remove all lines that contains NaN values
#df2 = df2.dropna()

#df2.head()

#### Remove anomalies manually

## Convert numeric columns to number if the cells are formatted as text

In [5]:
# Convert the numeric columns that were left as text when the CSV was read.
def _to_numeric_or_keep(column):
    """Return ``column`` converted with ``pd.to_numeric``, or unchanged if it cannot be parsed.

    This reproduces the result of ``pd.to_numeric(column, errors='ignore')``
    without relying on the ``errors='ignore'`` option, which recent pandas
    releases deprecate.
    """
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        # Genuinely textual column (e.g. 'make'): leave it as it is.
        return column

# DataFrame.apply calls the helper once per column.
df2 = df2.apply(_to_numeric_or_keep)
df2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 197 entries, 0 to 204
Data columns (total 14 columns):
make                 197 non-null object
body-style           197 non-null object
wheel-base           197 non-null float64
engine-size          197 non-null int64
num-of-doors         197 non-null object
engine-location      197 non-null object
num-of-cylinders     197 non-null object
horsepower           197 non-null int64
city-mpg             197 non-null int64
highway-mpg          197 non-null int64
aspiration           197 non-null object
compression-ratio    197 non-null float64
fuel-system          197 non-null object
price                197 non-null int64
dtypes: float64(2), int64(5), object(7)
memory usage: 23.1+ KB


In [6]:
# Bind the cleaned frame to a more descriptive name (same object, not a copy), then preview it.
dfSelectedFeaturesAndTarget = df2
dfSelectedFeaturesAndTarget.head(n=5)

Unnamed: 0,make,body-style,wheel-base,engine-size,num-of-doors,engine-location,num-of-cylinders,horsepower,city-mpg,highway-mpg,aspiration,compression-ratio,fuel-system,price
0,alfa-romero,convertible,88.6,130,two,front,four,111,21,27,std,9.0,mpfi,13495
1,alfa-romero,convertible,88.6,130,two,front,four,111,21,27,std,9.0,mpfi,16500
2,alfa-romero,hatchback,94.5,152,two,front,six,154,19,26,std,9.0,mpfi,16500
3,audi,sedan,99.8,109,four,front,four,102,24,30,std,10.0,mpfi,13950
4,audi,sedan,99.4,136,four,front,five,115,18,22,std,8.0,mpfi,17450


## Show how strongly each numeric field correlates with the price field
#### (absolute correlation coefficients are shown; the closer a value is to 1.0, the stronger the relationship with price)

In [7]:
import seaborn as sns
# Correlate only the numeric columns: the frame still holds text columns
# (make, body-style, ...), and recent pandas versions raise if those are
# passed to DataFrame.corr().
correlation_matrix = dfSelectedFeaturesAndTarget.select_dtypes(include='number').corr()
# Keep the 'price' column of the matrix as absolute values, so that strong
# negative correlations rank as high as strong positive ones.
correlations = correlation_matrix[["price"]].abs()
# Named `sorted_correlations` rather than `sorted` so the builtin sorted()
# stays usable in the rest of the notebook.
sorted_correlations = correlations.sort_values(by='price', ascending=False)
sorted_correlations.transpose()

Unnamed: 0,price,engine-size,horsepower,highway-mpg,city-mpg,wheel-base,compression-ratio
price,1.0,0.873708,0.811953,0.708659,0.692948,0.582976,0.074542


## Convert text (categorical) columns to binary indicator columns to make regression possible
#### see https://stackoverflow.com/questions/34007308/linear-regression-analysis-with-string-categorical-features-variables

In [8]:
# get_dummies replaces each text column with one indicator column per category
# (e.g. make_audi, fuel-system_mpfi); the numeric columns are kept as they are.
allColumns = pd.get_dummies(data=dfSelectedFeaturesAndTarget)
allColumns.head(n=5)

Unnamed: 0,wheel-base,engine-size,horsepower,city-mpg,highway-mpg,compression-ratio,price,make_alfa-romero,make_audi,make_bmw,...,aspiration_std,aspiration_turbo,fuel-system_1bbl,fuel-system_2bbl,fuel-system_4bbl,fuel-system_idi,fuel-system_mfi,fuel-system_mpfi,fuel-system_spdi,fuel-system_spfi
0,88.6,130,111,21,27,9.0,13495,1,0,0,...,1,0,0,0,0,0,0,1,0,0
1,88.6,130,111,21,27,9.0,16500,1,0,0,...,1,0,0,0,0,0,0,1,0,0
2,94.5,152,154,19,26,9.0,16500,1,0,0,...,1,0,0,0,0,0,0,1,0,0
3,99.8,109,102,24,30,10.0,13950,0,1,0,...,1,0,0,0,0,0,0,1,0,0
4,99.4,136,115,18,22,8.0,17450,0,1,0,...,1,0,0,0,0,0,0,1,0,0


## Separate 'Feature Columns' and 'Target Column' into distinct dataframes

In [9]:
# Features: an independent copy of every column except 'price'.
featureColumns = allColumns.loc[:, allColumns.columns != 'price'].copy()
# Target: 'price' kept as a one-column DataFrame (list selector, not a Series).
targetColumn = allColumns.loc[:, ['price']]
featureColumns.head(n=5)

Unnamed: 0,wheel-base,engine-size,horsepower,city-mpg,highway-mpg,compression-ratio,make_alfa-romero,make_audi,make_bmw,make_chevrolet,...,aspiration_std,aspiration_turbo,fuel-system_1bbl,fuel-system_2bbl,fuel-system_4bbl,fuel-system_idi,fuel-system_mfi,fuel-system_mpfi,fuel-system_spdi,fuel-system_spfi
0,88.6,130,111,21,27,9.0,1,0,0,0,...,1,0,0,0,0,0,0,1,0,0
1,88.6,130,111,21,27,9.0,1,0,0,0,...,1,0,0,0,0,0,0,1,0,0
2,94.5,152,154,19,26,9.0,1,0,0,0,...,1,0,0,0,0,0,0,1,0,0
3,99.8,109,102,24,30,10.0,0,1,0,0,...,1,0,0,0,0,0,0,1,0,0
4,99.4,136,115,18,22,8.0,0,1,0,0,...,1,0,0,0,0,0,0,1,0,0


## Separate the train set from the test set

In [10]:
# Hold out 25% of the rows for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    featureColumns,
    targetColumn,
    test_size=0.25,
    random_state=5,
)
# Plain 1-D arrays of the prices, extracted from the one-column target frames.
Y_train_values = Y_train.loc[:, 'price'].values
Y_test_values = Y_test.loc[:, 'price'].values
print("Size of the train set = ", len(X_train))
print("Size of the test set  = ", len(X_test))

Size of the train set =  147
Size of the test set  =  50


In [11]:
def batch_regression(X_train, Y_train, X_test, Y_test, verbose = True, regressors = None):
    """Fit a collection of estimators and collect their scores and fit times.

    Parameters
    ----------
    X_train, Y_train : training features and targets, passed to each
        estimator's ``fit`` and ``score``.
    X_test, Y_test : held-out features and targets, passed to ``score``.
    verbose : bool, default True
        When true, print one line per estimator with its fit time.
    regressors : dict or None, default None
        Mapping of ``name -> estimator`` (each exposing ``fit`` and ``score``).
        When ``None``, the module-level ``dict_regressors`` is used, which
        must be defined before the call.

    Returns
    -------
    dict
        ``name -> {'model', 'train_score', 'test_score', 'train_time'}``,
        where the scores are whatever the estimator's ``score`` method
        returns and ``train_time`` is the elapsed fit time in seconds.
    """
    if regressors is None:
        # Backward-compatible default: the notebook-level dictionary.
        regressors = dict_regressors

    dict_models = {}
    for regressor_name, regressor in list(regressors.items()):
        # time.clock() was removed in Python 3.8; perf_counter() is the
        # standard-library clock intended for measuring short durations.
        t_start = time.perf_counter()
        regressor.fit(X_train, Y_train)
        t_diff = time.perf_counter() - t_start

        train_score = regressor.score(X_train, Y_train)
        test_score = regressor.score(X_test, Y_test)

        # NOTE(review): 'model' holds the estimator's *name*, not the fitted
        # estimator object -- kept as-is for compatibility; confirm the intent.
        dict_models[regressor_name] = {'model': regressor_name, 'train_score': train_score, 'test_score': test_score, 'train_time': t_diff}
        if verbose:
            print("trained {c} in {f:.2f} s".format(c=regressor_name, f=t_diff))
    return dict_models

def display_dict_models(dict_models, sort_by='test_score'):
    """Display the results of ``batch_regression`` as a table, sorted descending.

    Parameters
    ----------
    dict_models : dict
        Mapping of ``name -> {'train_score', 'test_score', 'train_time', ...}``
        as returned by ``batch_regression``.
    sort_by : str, default 'test_score'
        Column to sort by (descending); one of 'regressor', 'train_score',
        'test_score' or 'train_time'.

    The table is rendered through ``display`` (provided by IPython inside a
    notebook); nothing is returned.
    """
    # Build the frame in one step and keep the scores numeric: storing them
    # as strings would make sort_values order them lexicographically
    # (e.g. '5e-05' above '0.9', and negative scores in the wrong order).
    rows = [
        {
            'regressor': name,
            'train_score': result['train_score'],
            'test_score': result['test_score'],
            'train_time': result['train_time'],
        }
        for name, result in dict_models.items()
    ]
    df_ = pd.DataFrame(rows, columns=['regressor', 'train_score', 'test_score', 'train_time'])

    display(df_.sort_values(by=sort_by, ascending=False))

In [12]:
# Estimators to compare, keyed by the label shown in the results table.
dict_regressors = {
    "Linear": linear_model.LinearRegression(),
    "Ridge": linear_model.Ridge(),
    "BayesianRidge": linear_model.BayesianRidge(),
    "Lasso": linear_model.Lasso(alpha=0.01, max_iter=50000),
    "ElasticNet" : linear_model.ElasticNet(),
    "LassoLars" : linear_model.LassoLars(max_iter=5, alpha=1.0),
    "ARDRegression" : linear_model.ARDRegression(),
    "TheilSenRegressor" : linear_model.TheilSenRegressor(),
    "HuberRegressor" : linear_model.HuberRegressor(),
    # Degree-3 polynomial expansion followed by an ordinary least-squares fit.
    "Polynomial" : Pipeline([('poly', PolynomialFeatures(degree=3)),('linear', linear_model.LinearRegression(fit_intercept=False))]),
    # NOTE(review): Perceptron is a classifier in scikit-learn, so its score
    # is presumably an accuracy and not comparable with the regressors'
    # scores -- confirm whether it belongs in this comparison.
    "Perceptron" : linear_model.Perceptron(max_iter=10),
    "Lars" : linear_model.Lars(n_nonzero_coefs=20, eps=2e-5),
    "SGDRegression" : linear_model.SGDRegressor(),
    "PassiveAggressiveRegressor": linear_model.PassiveAggressiveRegressor()
}
# Score against the 1-D test targets so that train and test targets have the
# same shape (previously the one-column DataFrame Y_test was passed here).
dict_models = batch_regression(X_train, Y_train_values, X_test, Y_test_values)
display_dict_models(dict_models)

trained Linear in 0.02 s
trained Ridge in 0.00 s
trained BayesianRidge in 0.01 s
trained Lasso in 0.23 s
trained ElasticNet in 0.00 s
trained LassoLars in 0.00 s
trained ARDRegression in 0.47 s
trained TheilSenRegressor in 5.82 s
trained HuberRegressor in 0.04 s
trained Polynomial in 0.80 s
trained Perceptron in 0.04 s
trained Lars in 0.01 s
trained SGDRegression in 0.02 s
trained PassiveAggressiveRegressor in 0.00 s




Unnamed: 0,regressor,train_score,test_score,train_time
7,TheilSenRegressor,0.9412457095745252,0.910710082527898,5.818663
11,Lars,0.9302715200230984,0.9033575743112788,0.005399
3,Lasso,0.9527396486104523,0.8968217652088575,0.230751
0,Linear,0.952739659799792,0.8967306341282779,0.016506
6,ARDRegression,0.9465872938085385,0.8922280720091705,0.468862
1,Ridge,0.9459704516633344,0.8892170904785522,0.001807
2,BayesianRidge,0.945598345749561,0.8886625121315499,0.011784
8,HuberRegressor,0.8949677013014942,0.8276150853594803,0.039647
5,LassoLars,0.7727945698126081,0.794196386400881,0.002183
4,ElasticNet,0.8558082973523125,0.7615128990714789,0.002925
