# Model Building
To run the model, just click `Run All` in the notebook. The test_MSE is the result printed in the output of cell 4.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
from statsmodels.formula.api import ols 
from statsmodels.graphics.gofplots import ProbPlot


### Helper Functions



In [2]:
# Compute MSEs
def comp_mse(model, temp_data):
    """Return the mean squared error of ``model`` on ``temp_data``.

    Parameters
    ----------
    model : object exposing ``predict(temp_data)``
        Fitted model used to generate predictions.
    temp_data : DataFrame-like
        Passed unchanged to ``model.predict``; must also provide the
        observed target under the ``'Withdraw'`` key.

    Returns
    -------
    The mean of the squared differences between ``temp_data['Withdraw']``
    and the model's predictions.
    """
    residuals = temp_data['Withdraw'] - model.predict(temp_data)
    return np.mean(residuals ** 2)

# Perform Preprocessing feature engineering
def Preprocess_data(X):
    """Append Downtown interaction features to the predictor frame.

    Parameters
    ----------
    X : pandas.DataFrame
        Predictors. Must contain the columns ``'Downtown'``, ``'Weekday'``,
        ``'Center'`` and ``'Shops'``; all columns must support multiplication
        by ``X['Downtown']``.
        NOTE(review): the ``1 - Weekday`` term suggests Weekday (and likely
        Downtown / Center) are 0/1 indicators -- confirm against the data.

    Returns
    -------
    pandas.DataFrame
        A new frame (``X`` is not modified) whose columns are, in order:
        one ``<col>_DT`` column per non-Downtown column of ``X`` (that column
        multiplied row-wise by ``Downtown``), then
        ``'Downtown_NotWeekday_Center'``, then ``'Shops_DWC'``, then the
        original columns of ``X``.
    """
    # Interact every non-Downtown predictor with Downtown. The suffix is
    # applied to *all* of these columns (not just a fixed list of names) so an
    # interaction column can never share a label with its raw counterpart once
    # the two frames are concatenated below.
    X_temp = X.drop(columns=['Downtown']).mul(X['Downtown'], axis=0).add_suffix('_DT')

    # Three-way term: Downtown * (1 - Weekday) * Center.
    X_temp['Downtown_NotWeekday_Center'] = X['Downtown'] * (1 - X['Weekday']) * X['Center']
    # The three-way term above scaled by the raw Shops value.
    X_temp['Shops_DWC'] = X_temp['Downtown_NotWeekday_Center'] * X['Shops']

    return pd.concat([X_temp, X], axis=1)

### Read and Perform Preprocessing on the Data 

In [3]:
# Load the training sample and the test set
data = pd.read_csv('ATM_sample.csv')
ATM_test = pd.read_csv('ATM_test.csv')

# Target: the Withdraw column of each set
y_train = data['Withdraw']
y_test = ATM_test['Withdraw']

# Inputs: every remaining column, passed through the feature-engineering helper
X_train = Preprocess_data(data.drop(columns=['Withdraw']))
X_test = Preprocess_data(ATM_test.drop(columns=['Withdraw']))

# Re-attach the target so predictors and response live in a single frame
train = pd.concat([X_train, y_train], axis=1)
test = pd.concat([X_test, y_test], axis=1)

### Build the Model and compute the test_MSE
 

In [4]:
# Specify the regression, fit it on the training frame, then report the test-set MSE
formula = "Withdraw ~ Shops + ATMs + Downtown + Weekday + Center + High + Shops_DT + Weekday_DT + Center_DT + Shops_DWC" 
model = ols(formula=formula, data=train).fit()
test_mse = comp_mse(model, test)
print(f"Test MSE on ATMs_Test data: {test_mse}")

Test MSE on ATMs_Test data: 0.2710619060994234


In [5]:
# Show the fitted model's summary (fit statistics and coefficient table) as the cell output.
model.summary()

0,1,2,3
Dep. Variable:,Withdraw,R-squared:,1.0
Model:,OLS,Adj. R-squared:,1.0
Method:,Least Squares,F-statistic:,5108000.0
Date:,"Thu, 07 Nov 2024",Prob (F-statistic):,0.0
Time:,15:43:43,Log-Likelihood:,-16857.0
No. Observations:,22000,AIC:,33740.0
Df Residuals:,21989,BIC:,33820.0
Df Model:,10,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,18.8478,0.128,147.204,0.000,18.597,19.099
Shops,0.0182,0.001,14.325,0.000,0.016,0.021
ATMs,-1.0002,0.002,-508.916,0.000,-1.004,-0.996
Downtown,-47.8601,0.244,-196.119,0.000,-48.338,-47.382
Weekday,-2.2209,0.014,-157.390,0.000,-2.249,-2.193
Center,3.5883,0.021,169.201,0.000,3.547,3.630
High,0.9982,0.008,130.464,0.000,0.983,1.013
Shops_DT,0.0918,0.001,71.130,0.000,0.089,0.094
Weekday_DT,0.2247,0.017,13.065,0.000,0.191,0.258

0,1,2,3
Omnibus:,79.112,Durbin-Watson:,1.998
Prob(Omnibus):,0.0,Jarque-Bera (JB):,85.711
Skew:,0.114,Prob(JB):,2.44e-19
Kurtosis:,3.204,Cond. No.,86100.0
