# Import Required Packages

In [88]:
from pathlib import Path

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge, LassoCV, BayesianRidge
import statsmodels.formula.api as sm
import matplotlib.pylab as plt
from dmba import regressionSummary, exhaustive_search
from dmba import backward_elimination, forward_selection, stepwise_selection
from dmba import adjusted_r2_score, AIC_score, BIC_score

# Create dummy variables for categorical variables

In [89]:
# Load the Toyota Corolla listing used for the dummy-variable walkthrough below.
data_df = pd.read_csv(filepath_or_buffer='DataToyotaCorolla.csv')

In [90]:
# Expand Fuel_Type into one indicator column per distinct value, then display the result.
dummy = pd.get_dummies(data_df.loc[:, 'Fuel_Type'])
dummy

Unnamed: 0,CNG,Diesel,Petrol
0,0,1,0
1,0,1,0
2,0,1,0
3,0,1,0
4,0,1,0
...,...,...,...
1431,0,0,1
1432,0,0,1
1433,0,0,1
1434,0,0,1


In [91]:
# Expand Color into one indicator column per distinct value, then display the result.
dummy2 = pd.get_dummies(data_df.loc[:, 'Color'])
dummy2

Unnamed: 0,Beige,Black,Blue,Green,Grey,Red,Silver,Violet,White,Yellow
0,0,0,1,0,0,0,0,0,0,0
1,0,0,0,0,0,0,1,0,0,0
2,0,0,1,0,0,0,0,0,0,0
3,0,1,0,0,0,0,0,0,0,0
4,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
1431,0,0,1,0,0,0,0,0,0,0
1432,0,0,0,0,1,0,0,0,0,0
1433,0,0,1,0,0,0,0,0,0,0
1434,0,0,0,0,1,0,0,0,0,0


# Merge the dummy variables with the file 

In [92]:
# Append the Fuel_Type indicator columns to the listing.
# Indicator columns left behind by an earlier run of this cell are dropped
# first (errors='ignore' makes that a no-op on the first run), so re-running
# the cell does not keep stacking duplicate indicator columns onto data_df.
data_df = pd.concat(
    [data_df.drop(columns=dummy.columns, errors='ignore'), dummy],
    axis=1,
)
data_df.head()

Unnamed: 0,Id,Model,Price,Age_08_04,Mfg_Month,Mfg_Year,KM,Fuel_Type,HP,Met_Color,...,Mistlamps,Sport_Model,Backseat_Divider,Metallic_Rim,Radio_cassette,Parking_Assistant,Tow_Bar,CNG,Diesel,Petrol
0,1,TOYOTA Corolla 2.0 D4D HATCHB TERRA 2/3-Doors,13500,23,10,2002,46986,Diesel,90,1,...,0,0,1,0,0,0,0,0,1,0
1,2,TOYOTA Corolla 2.0 D4D HATCHB TERRA 2/3-Doors,13750,23,10,2002,72937,Diesel,90,1,...,0,0,1,0,0,0,0,0,1,0
2,3,�TOYOTA Corolla 2.0 D4D HATCHB TERRA 2/3-Doors,13950,24,9,2002,41711,Diesel,90,1,...,0,0,1,0,0,0,0,0,1,0
3,4,TOYOTA Corolla 2.0 D4D HATCHB TERRA 2/3-Doors,14950,26,7,2002,48000,Diesel,90,0,...,0,0,1,0,0,0,0,0,1,0
4,5,TOYOTA Corolla 2.0 D4D HATCHB SOL 2/3-Doors,13750,30,3,2002,38500,Diesel,90,0,...,1,0,1,0,0,0,0,0,1,0


# Read the data and select columns for regression analysis 

In [93]:
# Target column and candidate predictor columns for the price regression,
# followed by the training data they are selected from.
outcome = 'Price'
predictors = [
    'Age_08_04', 'KM', 'Fuel_Type', 'HP', 'Automatic', 'Doors',
    'Quarterly_Tax', 'Mfr_Guarantee', 'Guarantee_Period', 'Airco',
    'Automatic_airco', 'CD_Player', 'Powered_Windows', 'Sport_Model',
    'Tow_Bar',
]
car_df = pd.read_csv(filepath_or_buffer='TrainingDataToyotaCorolla.csv')

In [94]:
# Show the loaded training frame as this cell's output.
car_df

Unnamed: 0,Id,Model,Price,Age_08_04,Mfg_Month,Mfg_Year,KM,Fuel_Type,Fuel_Type_Diesel,Fuel_Type_Petrol,...,Color,Color_Blue,Color_Silver,Color_Black,Color_Red,Color_Grey,Color_Green,Color_Beige,Color_Violet,Color_White
0,1,TOYOTA Corolla 2.0 D4D HATCHB TERRA 2/3-Doors,13500,23,10,2002,46986,Diesel,1,0,...,Blue,1,0,0,0,0,0,0,0,0
1,2,TOYOTA Corolla 2.0 D4D HATCHB TERRA 2/3-Doors,13750,23,10,2002,72937,Diesel,1,0,...,Silver,0,1,0,0,0,0,0,0,0
2,3,�TOYOTA Corolla 2.0 D4D HATCHB TERRA 2/3-Doors,13950,24,9,2002,41711,Diesel,1,0,...,Blue,1,0,0,0,0,0,0,0,0
3,4,TOYOTA Corolla 2.0 D4D HATCHB TERRA 2/3-Doors,14950,26,7,2002,48000,Diesel,1,0,...,Black,0,0,1,0,0,0,0,0,0
4,5,TOYOTA Corolla 2.0 D4D HATCHB SOL 2/3-Doors,13750,30,3,2002,38500,Diesel,1,0,...,Black,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1431,1438,TOYOTA Corolla 1.3 16V HATCHB G6 2/3-Doors,7500,69,12,1998,20544,Petrol,0,1,...,Blue,1,0,0,0,0,0,0,0,0
1432,1439,TOYOTA Corolla 1.3 16V HATCHB LINEA TERRA 2/3-...,10845,72,9,1998,19000,Petrol,0,1,...,Grey,0,0,0,0,1,0,0,0,0
1433,1440,TOYOTA Corolla 1.3 16V HATCHB LINEA TERRA 2/3-...,8500,71,10,1998,17016,Petrol,0,1,...,Blue,1,0,0,0,0,0,0,0,0
1434,1441,TOYOTA Corolla 1.3 16V HATCHB LINEA TERRA 2/3-...,7250,70,11,1998,16916,Petrol,0,1,...,Grey,0,0,0,0,1,0,0,0,0


# Partition data into predictors (x) and output (y)

In [95]:
# y is the price column; x holds the selected predictors with the categorical
# Fuel_Type expanded into indicator columns (drop_first=True omits the first level).
y = car_df.loc[:, outcome]
x = pd.get_dummies(car_df.loc[:, predictors], drop_first=True)

In [96]:
# Show the predictor matrix, including the Fuel_Type indicator columns.
x

Unnamed: 0,Age_08_04,KM,HP,Automatic,Doors,Quarterly_Tax,Mfr_Guarantee,Guarantee_Period,Airco,Automatic_airco,CD_Player,Powered_Windows,Sport_Model,Tow_Bar,Fuel_Type_Diesel,Fuel_Type_Petrol
0,23,46986,90,0,3,210,0,3,0,0,0,1,0,0,1,0
1,23,72937,90,0,3,210,0,3,1,0,1,0,0,0,1,0
2,24,41711,90,0,3,210,1,3,0,0,0,0,0,0,1,0
3,26,48000,90,0,3,210,1,3,0,0,0,0,0,0,1,0
4,30,38500,90,0,3,210,1,3,1,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1431,69,20544,86,0,3,69,1,3,1,0,0,1,1,0,0,1
1432,72,19000,86,0,3,69,0,3,0,0,0,0,1,0,0,1
1433,71,17016,86,0,3,69,0,3,0,0,0,0,0,0,0,1
1434,70,16916,86,0,3,69,1,3,0,0,0,0,0,0,0,1


In [97]:
# Show the target series (Price).
y

0       13500
1       13750
2       13950
3       14950
4       13750
        ...  
1431     7500
1432    10845
1433     8500
1434     7250
1435     6950
Name: Price, Length: 1436, dtype: int64

# Build the linear model based on the training data

In [98]:
# Fit a linear regression of price on every candidate predictor.
# fit() returns the estimator itself, so it can be bound directly; the bare
# name on the last line keeps the estimator repr as the cell output.
car_lm = LinearRegression().fit(x, y)
car_lm

LinearRegression()

# Print coefficients and the performance measures

In [99]:
# Intercept (B0) first, then one row per predictor with its coefficient (B1, B2, B3...).
print('intercept ', car_lm.intercept_)
print(pd.DataFrame({
    'Predictor': x.columns,
    'coefficient': car_lm.coef_,
}))

# Error metrics of the fitted model on the data it was trained on.
fitted_prices = car_lm.predict(x)
regressionSummary(y, fitted_prices)

intercept  9674.40868255198
           Predictor  coefficient
0          Age_08_04  -110.049091
1                 KM    -0.017991
2                 HP    34.123638
3          Automatic   551.370127
4              Doors   163.268567
5      Quarterly_Tax    16.551847
6      Mfr_Guarantee   198.909133
7   Guarantee_Period    81.505197
8              Airco   154.205911
9    Automatic_airco  3093.550967
10         CD_Player   301.241898
11   Powered_Windows   452.320250
12       Sport_Model   365.989573
13           Tow_Bar  -245.090036
14  Fuel_Type_Diesel  1890.588161
15  Fuel_Type_Petrol  1863.418403

Regression statistics

                      Mean Error (ME) : -0.0000
       Root Mean Squared Error (RMSE) : 1206.2978
            Mean Absolute Error (MAE) : 905.7848
          Mean Percentage Error (MPE) : -0.9865
Mean Absolute Percentage Error (MAPE) : 8.9553


# Find the top 4 predictors (the first four variables added by forward selection, i.e. the largest reductions in AIC score)

In [100]:
# The initial model is the constant model - this requires special handling
# in train_model and score_model
def train_model(variables):
    """Fit a linear regression on the given subset of predictor columns.

    Reads the notebook-level ``x`` (predictor frame) and ``y`` (target).

    Parameters
    ----------
    variables : sequence of column labels to select from ``x``.

    Returns
    -------
    The fitted ``LinearRegression``, or ``None`` when ``variables`` is empty
    (the constant model has nothing to fit).
    """
    has_predictors = len(variables) > 0
    if not has_predictors:
        return None
    # fit() hands back the estimator it was called on.
    return LinearRegression().fit(x[variables], y)

def score_model(model, variables):
    """Return the AIC score of ``model`` on the notebook-level ``x`` / ``y``.

    Parameters
    ----------
    model : fitted estimator from ``train_model``; it is passed through to
        ``AIC_score`` unchanged (including in the empty-``variables`` case).
    variables : sequence of column labels of ``x`` the model was fitted on.

    Returns
    -------
    The value of ``AIC_score``. With no variables, the prediction is the mean
    of ``y`` repeated for every row and ``df=1`` is passed explicitly.
    """
    if len(variables) > 0:
        fitted = model.predict(x[variables])
        return AIC_score(y, fitted, model)
    # Constant model: predict the mean of y for every observation.
    baseline = [y.mean()] * len(y)
    return AIC_score(y, baseline, model, df=1)

# Run forward selection over every column of x, logging each step, and keep
# both the chosen model and the list of variables it ended up with.
selection_result = forward_selection(x.columns, train_model, score_model, verbose=True)
best_model, best_variables = selection_result

print(best_variables)

Variables: Age_08_04, KM, HP, Automatic, Doors, Quarterly_Tax, Mfr_Guarantee, Guarantee_Period, Airco, Automatic_airco, CD_Player, Powered_Windows, Sport_Model, Tow_Bar, Fuel_Type_Diesel, Fuel_Type_Petrol
Start: score=27617.54, constant
Step: score=25518.97, add Age_08_04
Step: score=25121.15, add Automatic_airco
Step: score=24964.73, add HP
Step: score=24874.81, add KM
Step: score=24674.15, add Quarterly_Tax
Step: score=24623.00, add Powered_Windows
Step: score=24595.98, add Guarantee_Period
Step: score=24576.79, add Sport_Model
Step: score=24556.97, add Doors
Step: score=24545.80, add Automatic
Step: score=24537.72, add Tow_Bar
Step: score=24529.29, add CD_Player
Step: score=24523.07, add Mfr_Guarantee
Step: score=24517.67, add Fuel_Type_Diesel
Step: score=24490.47, add Fuel_Type_Petrol
Step: score=24488.93, add Airco
Step: score=24488.93, add None
['Age_08_04', 'Automatic_airco', 'HP', 'KM', 'Quarterly_Tax', 'Powered_Windows', 'Guarantee_Period', 'Sport_Model', 'Doors', 'Automatic',

# Rebuild the model using the 4 predictors only

In [59]:
# Reload the training data and keep only the four variables that forward
# selection added first. They are listed in the order they were added
# (Age_08_04, Automatic_airco, HP, KM), which is also the order used when the
# test-set predictor matrix is built later on. Keeping one consistent order
# means the columns seen at fit time and at predict time line up; with KM and
# HP swapped, predict() would receive the columns in a different order than
# the model was fitted on.
car_df = pd.read_csv('TrainingDataToyotaCorolla.csv')
predictors = ['Age_08_04', 'Automatic_airco', 'HP', 'KM']
outcome = 'Price'

In [78]:
# Target series and predictor matrix for the reduced model.
y = car_df.loc[:, outcome]
x = car_df.loc[:, predictors]

In [79]:
# Show the reduced predictor matrix.
x

Unnamed: 0,Age_08_04,Automatic_airco,HP,KM
0,23,0,90,46986
1,23,0,90,72937
2,24,0,90,41711
3,26,0,90,48000
4,30,0,90,38500
...,...,...,...,...
1431,69,0,86,20544
1432,72,0,86,19000
1433,71,0,86,17016
1434,70,0,86,16916


In [80]:
# Refit the linear regression on the reduced predictor matrix.
# fit() returns the estimator itself; the bare name on the last line keeps
# the estimator repr as the cell output.
car_lm = LinearRegression().fit(x, y)
car_lm

LinearRegression()

# Read the testing data

In [81]:
# Load the testing data and display it.
test_df = pd.read_csv(filepath_or_buffer='TestingDataToyotaCorolla.csv')
test_df

Unnamed: 0,Id,Model,Price,Age_08_04,Mfg_Month,Mfg_Year,KM,Fuel_Type,HP,Met_Color,...,Powered_Windows,Power_Steering,Radio,Mistlamps,Sport_Model,Backseat_Divider,Metallic_Rim,Radio_cassette,Parking_Assistant,Tow_Bar
0,1,TOYOTA Corolla 2.0 D4D HATCHB TERRA 2/3-Doors,13500,23,10,2002,46986,Diesel,90,1,...,1,1,0,0,0,1,0,0,0,0
1,2,TOYOTA Corolla 2.0 D4D HATCHB TERRA 2/3-Doors,13750,23,10,2002,72937,Diesel,90,1,...,0,1,0,0,0,1,0,0,0,0


In [82]:
# Target column, and the four variables that forward selection added first.
outcome = 'Price'
predictors = [
    'Age_08_04',
    'Automatic_airco',
    'HP',
    'KM',
]

In [83]:
# Split the testing data into the predictor matrix and the actual prices.
valid_x = test_df[predictors]
# Use `outcome` (the target column name defined next to `predictors`);
# the previous `outcome1` was never defined and raised a NameError.
valid_y = test_df[outcome]

# Use the developed model (car_lm) to predict validation data

In [85]:
# Predict prices for the testing rows and tabulate them next to the actual
# prices; the residual is actual minus predicted.
car_lm_pred = car_lm.predict(valid_x)
residual = valid_y - car_lm_pred

result = pd.DataFrame({
    'Predicted': car_lm_pred,
    'Actual': valid_y,
    'Residual': residual,
})
print(result)

      Predicted  Actual     Residual
0  14969.359477   13500 -1469.359477
1  14668.647138   13750  -918.647138


# Regression summary based on validation data


In [86]:
# Print the error metrics (ME, RMSE, MAE, MPE, MAPE) for the reduced model's
# predictions against the actual prices of the testing data.
regressionSummary(valid_y, car_lm_pred)


Regression statistics

                      Mean Error (ME) : -1194.0033
       Root Mean Squared Error (RMSE) : 1225.3428
            Mean Absolute Error (MAE) : 1194.0033
          Mean Percentage Error (MPE) : -8.7826
Mean Absolute Percentage Error (MAPE) : 8.7826
