In [22]:
# Import all the necessary packages.
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.stats.api as sms
import sklearn
import matplotlib.pyplot as plt

from sklearn import linear_model
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.linear_model import LinearRegression
from statsmodels.formula.api import ols

# Note: Indicates situations that aren’t necessarily exceptions.
import warnings  
warnings.filterwarnings('ignore')  

In [2]:
# Load the CSV file (house_price.csv).
vc = pd.read_csv('ecommerce_data.csv')  

# Print the DataFrame.
vc.head() 

Unnamed: 0,Sale,por_OS,por_NON,recc,avg_no_it,age,dis,diff_reg,tax,bk,lowstat,Median_s
0,0.63,18.0,2.31,0,6.575,65.2,4.09,1,296,396.9,4.98,24.0
1,2.73,0.0,7.07,0,6.421,78.9,4.9671,2,242,396.9,9.14,21.6
2,2.73,0.0,7.07,0,7.185,61.1,4.9671,2,242,392.83,4.03,34.7
3,3.24,0.0,2.18,0,6.998,45.8,6.0622,3,222,394.63,2.94,33.4
4,6.91,0.0,2.18,0,7.147,54.2,6.0622,3,222,396.9,5.33,36.2


In [3]:
# View information on the DataFrame
vc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 12 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Sale       506 non-null    float64
 1   por_OS     506 non-null    float64
 2   por_NON    506 non-null    float64
 3   recc       506 non-null    int64  
 4   avg_no_it  506 non-null    float64
 5   age        506 non-null    float64
 6   dis        506 non-null    float64
 7   diff_reg   506 non-null    int64  
 8   tax        506 non-null    int64  
 9   bk         506 non-null    float64
 10  lowstat    506 non-null    float64
 11  Median_s   506 non-null    float64
dtypes: float64(9), int64(3)
memory usage: 47.6 KB


In [4]:
# Define the dependent variable.
y = vc['Median_s']  

# Define the independent variables.
X = vc[['avg_no_it', 'tax']] 

In [5]:
# Fit the regression model.
mlr = linear_model.LinearRegression()
mlr.fit(X, y) 

LinearRegression()

In [6]:
# Call the predictions for X (array).
mlr.predict(X) 

array([26.63108645, 26.25540212, 32.36181073, 31.18391582, 32.37482535,
       26.64407278, 21.89365442, 23.17248345, 18.8484428 , 21.82971297,
       24.81098314, 21.86967638, 20.91055461, 21.4534628 , 22.62838696,
       20.53430443, 21.34156526, 21.78116273, 17.51307086, 19.67908752,
       18.42423654, 21.5813457 , 22.99605031, 20.36645812, 21.25364576,
       18.6560243 , 20.36645812, 22.23674558, 25.81746685, 27.24815682,
       19.56718998, 22.43656261, 21.46145548, 19.4712778 , 22.62838696,
       21.76901102, 21.03368433, 21.10561847, 22.03276951, 27.48776043,
       30.91662076, 29.18737937, 24.38377784, 24.71947046, 23.5845097 ,
       20.49134199, 21.32258085, 23.27279512, 18.22941315, 19.85192747,
       22.57891721, 23.79380478, 26.95890662, 22.85866105, 18.4003434 ,
       33.12673155, 24.82726558, 29.19079572, 23.38427535, 21.6418708 ,
       20.15523206, 21.95358538, 25.86999928, 28.31575979, 32.12616101,
       23.70386238, 19.68354362, 20.28418302, 18.0142615 , 20.34

In [7]:
# Print the R-squared value.
print("R-squared: ", mlr.score(X,y))  

# Print the intercept.
print("Intercept: ", mlr.intercept_) 

# Print the coefficients.
print("Coefficients:")  

# Map a similar index of multiple containers (to be used as a single entity).
list(zip(X, mlr.coef_))  

R-squared:  0.5605639377690896
Intercept:  -21.233093360562155
Coefficients:


[('avg_no_it', 7.992681419323305), ('tax', -0.015836826081673555)]

In [8]:
# Create a variable 'New_Rooms' and define it as 5.7.
New_items_sold = 5.75

# Create 'New_Distance' and define it as 15.2.
New_tax = 15.2  

# Print the predicted value. 
print ("Predicted Value: \n", mlr.predict([[New_items_sold ,New_tax]]))  

Predicted Value: 
 [24.48410504]


In [9]:
# Split the data in 'train' (80%) and 'test' (20%) sets.
X_train, X_test, Y_train, Y_test = sklearn.model_selection.train_test_split(X, y,
                                                                            test_size = 0.20,
                                                                            random_state = 42)

In [10]:
# Training the model using the 'statsmodel' OLS library.
# Fit the model with the added constant.
model = sm.OLS(Y_train, sm.add_constant(X_train)).fit()

# Set the predicted response vector.
Y_pred = model.predict(sm.add_constant(X_test)) 

# Call a summary of the model.
print_model = model.summary()

# Print the summary.
print(print_model)  

                            OLS Regression Results                            
Dep. Variable:               Median_s   R-squared:                       0.578
Model:                            OLS   Adj. R-squared:                  0.576
Method:                 Least Squares   F-statistic:                     275.1
Date:                Thu, 15 Jun 2023   Prob (F-statistic):           6.00e-76
Time:                        12:22:13   Log-Likelihood:                -1300.6
No. Observations:                 404   AIC:                             2607.
Df Residuals:                     401   BIC:                             2619.
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const        -23.2468      3.125     -7.438      0.0

In [11]:
# Print the multi.score of the model based on the train data set
print(mlr.score(X_train, Y_train)*100)

57.787137012992495


In [12]:
# Specify the model.
mlr = LinearRegression()  

# Fit the model. We can only fit the model with the training data set.
mlr.fit(X_train, Y_train) 

LinearRegression()

In [17]:
# Call the predictions for X in the test set.
y_pred_mlr = mlr.predict(X_train)  

# Print the predictions.
print("Prediction for test set: {}".format(y_pred_mlr))  

Prediction for test set: [10.24006404 20.33317294 22.01668643 16.87520947 19.44461154 25.60794054
 23.33488521 24.79711236 17.28212077 23.0951393  25.60196244 24.71494112
 31.36960383 17.74716225 38.17623117 18.38165958 17.94673374 25.81704555
 17.96717649 25.39123159 15.6627799  19.90424664 24.28836247 22.36925879
 25.29910875 33.11060074 21.45189672 38.26831113 20.38462466 22.52690137
 22.30959924 20.49385281 13.42993115 19.549198   23.51758008 29.29659524
 27.3933891  21.08056104 21.62864564 23.76808874 19.46723043 17.23795854
 12.12514291 24.36608497 21.26089754 19.39141607 20.15878239 44.34093947
 19.64054543 18.10424768 26.63018922 21.19141503 20.24416243 21.17190847
 15.40534622 19.32498157 31.34703854 13.38739836 24.78967993 19.92552912
 22.11022071 25.15120337 18.73954184 27.30879518 42.30717982 17.55616307
 20.15878239 23.07661181 15.16523225 24.42712381 20.90997246 28.02060766
 19.64884974 18.66690413 21.83915138 26.42024449 30.05139433 27.51120028
 11.0798108   6.93494779 1

In [18]:
# Print the R-squared value.
print(mlr.score(X_test, Y_test)*100)  

46.4835937616622


In [19]:
# Add a constant.
x_temp = sm.add_constant(X_train)  

# Create an empty DataFrame. 
vif = pd.DataFrame() 

# Calculate the 'vif' for each value.
vif["VIF Factor"] = [variance_inflation_factor(x_temp.values, 
                                               i) for i in range(x_temp.values.shape[1])]  


# Create the feature columns.
vif['features'] = x_temp.columns  

# Print the values to one decimal point.
print(vif.round(1))  

   VIF Factor   features
0       107.0      const
1         1.1  avg_no_it
2         1.1        tax


In [24]:
# Determine heteroscedasticity.
model = sms.het_breuschpagan(model.resid, model.model.exog) 

In [25]:
terms = ['LM stat', 'LM Test p-value', 'F-stat', 'F-test p-value']
print(dict(zip(terms, model)))

{'LM stat': 24.217653594287857, 'LM Test p-value': 5.510656487587344e-06, 'F-stat': 12.785321886624901, 'F-test p-value': 4.142460297790806e-06}


In [16]:
# Call the ‘metrics.mean_absolute_error’ function.  
print('Mean Absolute Error (Final):', metrics.mean_absolute_error(Y_test, Y_pred))  

# Call the ‘metrics.mean_squared_error’ function.
print('Mean Square Error (Final):', metrics.mean_squared_error(Y_test, Y_pred))  

Mean Absolute Error (Final): 3.8350138001428546
Mean Square Error (Final): 39.245605306020515
