In [1]:
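# ebay_car_price_exploration_with_logistic_regression_v001.ipynb
# Explores eBay Toyota Corolla listings: OLS regressions of price on mileage and model year,
# followed by logit regressions of the sale indicator.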

In [2]:
import numpy as np
import pandas as pd
import statsmodels
import statsmodels.api as sm
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Remote location of the eBay Toyota Corolla listings (CSV file).
dataset_location = (
    'https://firebasestorage.googleapis.com/v0/b/acst001.appspot.com/o/'
    'Dataset_eBay_Toyota_Corolla_v001.csv'
    '?alt=media&token=0d7c808c-823b-4eb8-ae94-9a279c5d98c4'
)

# Load the listings into a DataFrame.
toyota_corolla_df = pd.read_csv(filepath_or_buffer=dataset_location)

# To read a copy stored in the working directory instead, use:
# toyota_corolla_df = pd.read_csv('Dataset_eBay_Toyota_Corolla_v001.csv')

In [4]:
# Peek at the first two listings to check the column layout.
toyota_corolla_df.head(n=2)

Unnamed: 0,record_id,kilometers,miles,price_usd,year,maker,model
0,736,25999,16155.0,13100.0,2006.0,Toyota,Corolla
1,845,321869,200000.0,325.0,1991.0,Toyota,Corolla


In [5]:
# Peek at the last two listings.
toyota_corolla_df.tail(n=2)

Unnamed: 0,record_id,kilometers,miles,price_usd,year,maker,model
831,146385,106469,66157.0,4299.0,2001.0,Toyota,Corolla
832,146673,257495,160000.0,560.0,1995.0,Toyota,Corolla


In [6]:
# Show five randomly chosen listings. The seed is fixed so that the same
# rows are displayed on every Restart Kernel -> Run All.
toyota_corolla_df.sample(n=5, random_state=42)

Unnamed: 0,record_id,kilometers,miles,price_usd,year,maker,model
359,63093,205996,128000.0,3401.03,1997.0,Toyota,Corolla
785,139136,56890,35350.0,7899.0,2001.0,Toyota,Corolla
344,62206,59546,37000.0,7311.0,2004.0,Toyota,Corolla
664,117031,265220,164800.0,1925.0,1997.0,Toyota,Corolla
656,116001,279434,173632.0,255.0,1994.0,Toyota,Corolla


In [7]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
toyota_corolla_df.describe()

Unnamed: 0,record_id,kilometers,miles,price_usd,year
count,833.0,833.0,833.0,833.0,833.0
mean,74156.343337,172716.007203,107320.762305,4236.381885,1995.776711
std,42072.439847,109435.855132,68000.294588,3833.479195,7.219471
min,736.0,161.0,100.0,50.0,1969.0
25%,37385.0,82881.0,51500.0,1200.0,1991.0
50%,74805.0,169786.0,105500.0,2749.98,1996.0
75%,111422.0,252667.0,157000.0,7000.0,2002.0
max,146673.0,600990.0,373438.0,16505.0,2006.0


# Regressions without indicator variables

In [8]:
# Baseline specification: price explained by kilometers alone,
# fitted with the HC0 covariance estimator.
print('Regressing the price on the number of kilometers driven:\n')

ols_model = smf.ols(formula='price_usd ~ kilometers', data=toyota_corolla_df)
results_ols = ols_model.fit(cov_type='HC0')

# Extract the slope and its confidence bounds once, then report them.
kilometers_coef = results_ols.params['kilometers']
kilometers_ci = results_ols.conf_int().loc['kilometers']

print(' ')
print('coefficient of kilometers - mean:\n', kilometers_coef)
print('coefficient of kilometers - confidence interval:\n',
      kilometers_ci[0], kilometers_ci[1])
print(' ')

# Last expression: rich display of the full regression table.
results_ols.summary()

Regressing the price on the number of kilometers driven:

 
coefficient of kilometers - mean:
 -0.026112883668289524
coefficient of kilometers - confidence interval:
 -0.028134697158200973 -0.024091070178378075
 


0,1,2,3
Dep. Variable:,price_usd,R-squared:,0.556
Model:,OLS,Adj. R-squared:,0.555
Method:,Least Squares,F-statistic:,640.8
Date:,"Mon, 15 Aug 2022",Prob (F-statistic):,2.98e-105
Time:,21:14:02,Log-Likelihood:,-7717.1
No. Observations:,833,AIC:,15440.0
Df Residuals:,831,BIC:,15450.0
Df Model:,1,,
Covariance Type:,HC0,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,8746.4949,221.922,39.413,0.000,8311.537,9181.453
kilometers,-0.0261,0.001,-25.314,0.000,-0.028,-0.024

0,1,2,3
Omnibus:,19.474,Durbin-Watson:,1.683
Prob(Omnibus):,0.0,Jarque-Bera (JB):,20.503
Skew:,0.349,Prob(JB):,3.53e-05
Kurtosis:,3.322,Cond. No.,382000.0


In [9]:
# Adds the model year as a linear control next to kilometers,
# fitted with the HC0 covariance estimator.
print('Regressing the price on the number of kilometers driven and the model year:\n')

ols_model = smf.ols(formula='price_usd ~ kilometers + year', data=toyota_corolla_df)
results_ols = ols_model.fit(cov_type='HC0')

# Extract the slope and its confidence bounds once, then report them.
kilometers_coef = results_ols.params['kilometers']
kilometers_ci = results_ols.conf_int().loc['kilometers']

print(' ')
print('coefficient of kilometers - mean:\n', kilometers_coef)
print('coefficient of kilometers - confidence interval:\n',
      kilometers_ci[0], kilometers_ci[1])
print(' ')

# Last expression: rich display of the full regression table.
results_ols.summary()

Regressing the price on the number of kilometers driven and the model year:

 
coefficient of kilometers - mean:
 -0.017459071201082383
coefficient of kilometers - confidence interval:
 -0.019459952823817333 -0.015458189578347433
 


0,1,2,3
Dep. Variable:,price_usd,R-squared:,0.695
Model:,OLS,Adj. R-squared:,0.694
Method:,Least Squares,F-statistic:,543.7
Date:,"Mon, 15 Aug 2022",Prob (F-statistic):,1.24e-151
Time:,21:14:03,Log-Likelihood:,-7561.1
No. Observations:,833,AIC:,15130.0
Df Residuals:,830,BIC:,15140.0
Df Model:,2,,
Covariance Type:,HC0,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-4.665e+05,3.22e+04,-14.483,0.000,-5.3e+05,-4.03e+05
kilometers,-0.0175,0.001,-17.102,0.000,-0.019,-0.015
year,237.3825,16.088,14.755,0.000,205.851,268.914

0,1,2,3
Omnibus:,97.58,Durbin-Watson:,1.689
Prob(Omnibus):,0.0,Jarque-Bera (JB):,142.743
Skew:,0.833,Prob(JB):,1.0100000000000001e-31
Kurtosis:,4.156,Cond. No.,68100000.0


In [10]:
# Same price ~ kilometers regression, restricted to listings whose
# model year equals 2000.
print('Regressing the price on the number of kilometers driven for model year 2000:\n')

is_model_year_2000 = toyota_corolla_df['year'] == 2000
ols_model = smf.ols(formula='price_usd ~ kilometers',
                    data=toyota_corolla_df.loc[is_model_year_2000])
results_ols = ols_model.fit(cov_type='HC0')

# Extract the slope and its confidence bounds once, then report them.
kilometers_coef = results_ols.params['kilometers']
kilometers_ci = results_ols.conf_int().loc['kilometers']

print(' ')
print('coefficient of kilometers - mean:\n', kilometers_coef)
print('coefficient of kilometers - confidence interval:\n',
      kilometers_ci[0], kilometers_ci[1])
print(' ')

# Last expression: rich display of the full regression table.
results_ols.summary()

Regressing the price on the number of kilometers driven for model year 2000:

 
coefficient of kilometers - mean:
 -0.01826702781763866
coefficient of kilometers - confidence interval:
 -0.026443240199669003 -0.010090815435608318
 


0,1,2,3
Dep. Variable:,price_usd,R-squared:,0.325
Model:,OLS,Adj. R-squared:,0.299
Method:,Least Squares,F-statistic:,19.17
Date:,"Mon, 15 Aug 2022",Prob (F-statistic):,0.000173
Time:,21:14:03,Log-Likelihood:,-240.87
No. Observations:,28,AIC:,485.7
Df Residuals:,26,BIC:,488.4
Df Model:,1,,
Covariance Type:,HC0,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,6621.9338,747.435,8.860,0.000,5156.988,8086.880
kilometers,-0.0183,0.004,-4.379,0.000,-0.026,-0.010

0,1,2,3
Omnibus:,6.438,Durbin-Watson:,1.469
Prob(Omnibus):,0.04,Jarque-Bera (JB):,5.085
Skew:,-1.03,Prob(JB):,0.0787
Kurtosis:,3.336,Cond. No.,413000.0


In [11]:
# Same price ~ kilometers regression, restricted to listings whose
# model year equals 2005.
print('Regressing the price on the number of kilometers driven for model year 2005:\n')

is_model_year_2005 = toyota_corolla_df['year'] == 2005
ols_model = smf.ols(formula='price_usd ~ kilometers',
                    data=toyota_corolla_df.loc[is_model_year_2005])
results_ols = ols_model.fit(cov_type='HC0')

# Extract the slope and its confidence bounds once, then report them.
kilometers_coef = results_ols.params['kilometers']
kilometers_ci = results_ols.conf_int().loc['kilometers']

print(' ')
print('coefficient of kilometers - mean:\n', kilometers_coef)
print('coefficient of kilometers - confidence interval:\n',
      kilometers_ci[0], kilometers_ci[1])
print(' ')

# Last expression: rich display of the full regression table.
results_ols.summary()

Regressing the price on the number of kilometers driven for model year 2005:

 
coefficient of kilometers - mean:
 0.03043441541948763
coefficient of kilometers - confidence interval:
 -0.004008161204910696 0.06487699204388596
 


0,1,2,3
Dep. Variable:,price_usd,R-squared:,0.034
Model:,OLS,Adj. R-squared:,0.019
Method:,Least Squares,F-statistic:,2.999
Date:,"Mon, 15 Aug 2022",Prob (F-statistic):,0.0882
Time:,21:14:03,Log-Likelihood:,-600.7
No. Observations:,65,AIC:,1205.0
Df Residuals:,63,BIC:,1210.0
Df Model:,1,,
Covariance Type:,HC0,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,9538.4403,666.318,14.315,0.000,8232.481,1.08e+04
kilometers,0.0304,0.018,1.732,0.083,-0.004,0.065

0,1,2,3
Omnibus:,6.515,Durbin-Watson:,0.936
Prob(Omnibus):,0.038,Jarque-Bera (JB):,6.079
Skew:,-0.744,Prob(JB):,0.0479
Kurtosis:,3.173,Cond. No.,71600.0


# Regressions with fixed effects

In [12]:
# Year fixed effects: C(year) enters the model year as a categorical
# term, so each year gets its own indicator in the formula.
ols_model = smf.ols(formula='price_usd ~ kilometers + C(year)',
                    data=toyota_corolla_df)
results_ols = ols_model.fit(cov_type='HC0')

kilometers_coef = results_ols.params['kilometers']

print('')
print('Regression with year fixed effects')
print('')
print('coefficient of kilometers:\n', kilometers_coef)

# Last expression: rich display of the full regression table.
results_ols.summary()


Regression with year fixed effects

coefficient of kilometers:
 -0.006791604874303556


0,1,2,3
Dep. Variable:,price_usd,R-squared:,0.857
Model:,OLS,Adj. R-squared:,0.85
Method:,Least Squares,F-statistic:,108.8
Date:,"Mon, 15 Aug 2022",Prob (F-statistic):,3.95e-269
Time:,21:14:03,Log-Likelihood:,-7246.4
No. Observations:,833,AIC:,14570.0
Df Residuals:,795,BIC:,14750.0
Df Model:,37,,
Covariance Type:,HC0,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,7162.4419,8.329,859.974,0.000,7146.118,7178.766
C(year)[T.1970.0],-3000.5037,26.145,-114.763,0.000,-3051.747,-2949.260
C(year)[T.1971.0],-3478.8217,1595.888,-2.180,0.029,-6606.705,-350.938
C(year)[T.1972.0],-4040.8189,27.068,-149.286,0.000,-4093.871,-3987.767
C(year)[T.1974.0],-3940.6697,704.278,-5.595,0.000,-5321.030,-2560.309
C(year)[T.1975.0],-3945.7101,55.176,-71.511,0.000,-4053.854,-3837.567
C(year)[T.1976.0],-799.2842,2509.293,-0.319,0.750,-5717.408,4118.839
C(year)[T.1977.0],-3248.5093,1063.040,-3.056,0.002,-5332.030,-1164.989
C(year)[T.1978.0],-4666.1173,782.474,-5.963,0.000,-6199.738,-3132.496

0,1,2,3
Omnibus:,72.339,Durbin-Watson:,1.621
Prob(Omnibus):,0.0,Jarque-Bera (JB):,407.451
Skew:,0.028,Prob(JB):,3.34e-89
Kurtosis:,6.426,Cond. No.,36100000.0


# Regressions with interactions of indicator variables

In [13]:
# Interaction model, compact notation: `kilometers*C(year)` in the
# formula requests kilometers, the year indicators and their products.
ols_model = smf.ols(formula='price_usd ~ kilometers*C(year)',
                    data=toyota_corolla_df)
results_ols = ols_model.fit(cov_type='HC0')

print('')
print('Regression with interactions between kilometers and year indicators')

# Last expression: rich display of the full regression table.
results_ols.summary()


Regression with interactions between kilometers and year indicators


0,1,2,3
Dep. Variable:,price_usd,R-squared:,0.872
Model:,OLS,Adj. R-squared:,0.861
Method:,Least Squares,F-statistic:,5599000000000000.0
Date:,"Mon, 15 Aug 2022",Prob (F-statistic):,0.0
Time:,21:14:03,Log-Likelihood:,-7198.2
No. Observations:,833,AIC:,14530.0
Df Residuals:,764,BIC:,14860.0
Df Model:,68,,
Covariance Type:,HC0,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,1586.7928,108.040,14.687,0.000,1375.038,1798.548
C(year)[T.1970.0],-7.685e-06,1.52e-07,-50.692,0.000,-7.98e-06,-7.39e-06
C(year)[T.1971.0],5825.9822,108.040,53.924,0.000,5614.227,6037.737
C(year)[T.1972.0],-4.952e-06,1.48e-07,-33.402,0.000,-5.24e-06,-4.66e-06
C(year)[T.1974.0],-748.3216,759.792,-0.985,0.325,-2237.487,740.844
C(year)[T.1975.0],-6.152e-06,8.16e-08,-75.398,0.000,-6.31e-06,-5.99e-06
C(year)[T.1976.0],8353.8206,108.040,77.321,0.000,8142.066,8565.575
C(year)[T.1977.0],-2735.3762,108.040,-25.318,0.000,-2947.131,-2523.621
C(year)[T.1978.0],-476.1269,904.653,-0.526,0.599,-2249.215,1296.961

0,1,2,3
Omnibus:,95.294,Durbin-Watson:,1.563
Prob(Omnibus):,0.0,Jarque-Bera (JB):,771.073
Skew:,-0.067,Prob(JB):,3.66e-168
Kurtosis:,7.711,Cond. No.,1.19e+16


In [14]:
# Interaction model, expanded notation: the main effects and the
# `kilometers:C(year)` products are spelled out term by term.
ols_model = smf.ols(
    formula='price_usd ~ kilometers + C(year) + kilometers:C(year)',
    data=toyota_corolla_df,
)
results_ols = ols_model.fit(cov_type='HC0')

print('')
print('Regression with interactions between kilometers and year indicators')

# Last expression: rich display of the full regression table.
results_ols.summary()


Regression with interactions between kilometers and year indicators


0,1,2,3
Dep. Variable:,price_usd,R-squared:,0.872
Model:,OLS,Adj. R-squared:,0.861
Method:,Least Squares,F-statistic:,5599000000000000.0
Date:,"Mon, 15 Aug 2022",Prob (F-statistic):,0.0
Time:,21:14:03,Log-Likelihood:,-7198.2
No. Observations:,833,AIC:,14530.0
Df Residuals:,764,BIC:,14860.0
Df Model:,68,,
Covariance Type:,HC0,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,1586.7928,108.040,14.687,0.000,1375.038,1798.548
C(year)[T.1970.0],-7.685e-06,1.52e-07,-50.692,0.000,-7.98e-06,-7.39e-06
C(year)[T.1971.0],5825.9822,108.040,53.924,0.000,5614.227,6037.737
C(year)[T.1972.0],-4.952e-06,1.48e-07,-33.402,0.000,-5.24e-06,-4.66e-06
C(year)[T.1974.0],-748.3216,759.792,-0.985,0.325,-2237.487,740.844
C(year)[T.1975.0],-6.152e-06,8.16e-08,-75.398,0.000,-6.31e-06,-5.99e-06
C(year)[T.1976.0],8353.8206,108.040,77.321,0.000,8142.066,8565.575
C(year)[T.1977.0],-2735.3762,108.040,-25.318,0.000,-2947.131,-2523.621
C(year)[T.1978.0],-476.1269,904.653,-0.526,0.599,-2249.215,1296.961

0,1,2,3
Omnibus:,95.294,Durbin-Watson:,1.563
Prob(Omnibus):,0.0,Jarque-Bera (JB):,771.073
Skew:,-0.067,Prob(JB):,3.66e-168
Kurtosis:,7.711,Cond. No.,1.19e+16


In [18]:
# Second dataset: Toyota Corolla listings "with a bid" (per the file name),
# loaded from its remote CSV location.
bid_dataset_location = (
    'https://firebasestorage.googleapis.com/v0/b/acst001.appspot.com/o/'
    'Dataset_eBay_Toyota_Corolla_with_a_bid_v001.csv'
    '?alt=media&token=de1ce403-3e2d-40f0-bd3e-21094da3c7da'
)
toyota_corolla_with_a_bid_df = pd.read_csv(bid_dataset_location)

In [19]:
# Display the bid dataset; the recorded output elides the middle rows.
# NOTE(review): consider .head() / .sample() here to keep the output short.
toyota_corolla_with_a_bid_df

Unnamed: 0,record_id,sell,price_usd,kilometers,miles,year,maker,model
0,21,0,7600.00,89257,55462.0,2003.0,Toyota,Corolla
1,63,0,10300.00,15772,9800.0,2005.0,Toyota,Corolla
2,65,0,9095.00,15772,9800.0,2005.0,Toyota,Corolla
3,245,0,10000.00,61155,38000.0,2004.0,Toyota,Corolla
4,627,0,3170.00,183465,114000.0,1980.0,Toyota,Corolla
...,...,...,...,...,...,...,...,...
1624,145828,0,10000.00,63793,39639.0,2005.0,Toyota,Corolla
1625,146015,1,1525.00,332007,206300.0,1994.0,Toyota,Corolla
1626,146132,1,536.09,144841,90000.0,1978.0,Toyota,Corolla
1627,146385,1,4299.00,106469,66157.0,2001.0,Toyota,Corolla


In [36]:
# Logit model of the `sell` indicator on kilometers and model year.
logit_model = smf.logit(formula='sell ~ kilometers + year',
                        data=toyota_corolla_with_a_bid_df)
results_logit = logit_model.fit()

print('')
print('Logit regression of the indicator of a successful sale.')

# Last expression: rich display of the full regression table.
results_logit.summary()

Optimization terminated successfully.
         Current function value: 0.632219
         Iterations 5

Logit regression of the indicator of a successful sale.


0,1,2,3
Dep. Variable:,sell,No. Observations:,1629.0
Model:,Logit,Df Residuals:,1626.0
Method:,MLE,Df Model:,2.0
Date:,"Mon, 15 Aug 2022",Pseudo R-squ.:,0.08756
Time:,21:30:50,Log-Likelihood:,-1029.9
converged:,True,LL-Null:,-1128.7
Covariance Type:,nonrobust,LLR p-value:,1.1970000000000001e-43

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,73.5957,18.152,4.055,0.000,38.019,109.172
kilometers,5.229e-06,6.77e-07,7.729,0.000,3.9e-06,6.55e-06
year,-0.0372,0.009,-4.103,0.000,-0.055,-0.019


In [29]:
# Logit model of the `sell` indicator, now also controlling for the price.
logit_model = smf.logit(formula='sell ~ price_usd + kilometers + year',
                        data=toyota_corolla_with_a_bid_df)
results_logit = logit_model.fit()

print('')
print('Logit regression of the indicator of a successful sale.')

# Last expression: rich display of the full regression table.
results_logit.summary()

Optimization terminated successfully.
         Current function value: 0.632119
         Iterations 6

Logit regression of the indicator of a successful sale.


0,1,2,3
Dep. Variable:,sell,No. Observations:,1629.0
Model:,Logit,Df Residuals:,1625.0
Method:,MLE,Df Model:,3.0
Date:,"Mon, 15 Aug 2022",Pseudo R-squ.:,0.08771
Time:,21:21:57,Log-Likelihood:,-1029.7
converged:,True,LL-Null:,-1128.7
Covariance Type:,nonrobust,LLR p-value:,1.1460000000000001e-42

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,66.5223,21.882,3.040,0.002,23.634,109.410
price_usd,-1.449e-05,2.53e-05,-0.573,0.567,-6.4e-05,3.51e-05
kilometers,4.927e-06,8.54e-07,5.766,0.000,3.25e-06,6.6e-06
year,-0.0336,0.011,-3.055,0.002,-0.055,-0.012


In [31]:
# Logit model of the `sell` indicator with the model year entered as a
# categorical term, C(year), instead of a linear control.
# NOTE(review): the summary recorded for this cell reports `converged: False`
# and very large standard errors on the year indicators; confirm the fit
# before interpreting these coefficients.
logit_model = smf.logit(formula='sell ~ price_usd + kilometers + C(year)',
                        data=toyota_corolla_with_a_bid_df)
results_logit = logit_model.fit()

print('')
print('Logit regression of the indicator of a successful sale.')

# Last expression: rich display of the full regression table.
results_logit.summary()

Logit regression of the indicator of a successful sale.


0,1,2,3
Dep. Variable:,sell,No. Observations:,1629.0
Model:,Logit,Df Residuals:,1589.0
Method:,MLE,Df Model:,39.0
Date:,"Mon, 15 Aug 2022",Pseudo R-squ.:,0.1372
Time:,21:23:56,Log-Likelihood:,-973.82
converged:,False,LL-Null:,-1128.7
Covariance Type:,nonrobust,LLR p-value:,7.168e-44

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,14.0296,1628.575,0.009,0.993,-3177.920,3205.979
C(year)[T.1970.0],13.5058,1.26e+06,1.07e-05,1.000,-2.47e+06,2.47e+06
C(year)[T.1971.0],-14.6100,1628.576,-0.009,0.993,-3206.560,3177.340
C(year)[T.1972.0],-14.4415,1628.576,-0.009,0.993,-3206.392,3177.509
C(year)[T.1974.0],-14.4897,1628.576,-0.009,0.993,-3206.439,3177.460
C(year)[T.1975.0],-14.4486,1628.576,-0.009,0.993,-3206.399,3177.502
C(year)[T.1976.0],-14.7883,1628.576,-0.009,0.993,-3206.738,3177.162
C(year)[T.1977.0],-15.0789,1628.576,-0.009,0.993,-3207.029,3176.871
C(year)[T.1978.0],-14.0303,1628.576,-0.009,0.993,-3205.980,3177.919
