In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
from sklearn.model_selection import LeaveOneOut, train_test_split
from sklearn.metrics import mean_squared_error
from scipy.stats import f

In [2]:
# Step 1: Data Preparation
# Load dataset
data = pd.read_csv('biele_WM_new.csv')

In [3]:
# Select relevant variables and update original dataset to data
columns = ["mietekalt", "qm_Preis", "wohnflaeche", "alter", "objektzustand", "zimmeranzahl", "balkon", "einbaukueche", "keller", "ausstattung_kat", "aufzug", "gaestewc", "garten", "heizungsart", "barrierefrei", "Einstellungsmonat"]
data = data[columns]

In [4]:
# Rename variables for clarity
data.rename(columns = {'mietekalt': 'netrent', 
                     'qm_Preis': 'rent_per_sqm', 
                     'wohnflaeche': 'area', 
                     'alter': 'age', 
                     'objektzustand': 'condition_cat', 
                     'zimmeranzahl': 'rooms', 
                     'balkon': 'balcony', 
                     'einbaukueche': 'kitchen', 
                     'keller': 'basement', 
                     'ausstattung_kat': 'appointments_cat', 
                     'aufzug': 'lift', 
                     'gaestewc': 'guesttoilet', 
                     'garten': 'garden', 
                     'heizungsart': 'heating_cat', 
                     'barrierefrei': 'barrierfree', 
                     'Einstellungsmonat': 'month'}, inplace= True)

In [5]:
# Drop rows with missing values
data = data.dropna()

In [6]:
# Encode categorical variables
data = pd.get_dummies(data,
                      columns = ['condition_cat', 'appointments_cat', 'heating_cat', 'month'],
                      drop_first = True, dtype = int)

In [114]:
# Convert all boolean columns to integers
# bool_cols = data.select_dtypes(include=['bool']).columns
# data[bool_cols] = data[bool_cols].astype(int)

In [7]:
# Define dependent and independent variables
dependent_var = 'netrent'
independent_vars = [col for col in data.columns if col != dependent_var]

In [None]:
### MODEL WITH statsmodels ###

In [9]:
# Split data into training and test sets
train_data, test_data = train_test_split(data, test_size = 0.2, random_state = 42)

In [10]:
# Fit the linear regression model
X_train = train_data[independent_vars]
y_train = train_data[dependent_var]
X_train = sm.add_constant(X_train)
model = sm.OLS(y_train, X_train).fit()

In [11]:
# Evaluate the model
X_test = test_data[independent_vars]
y_test = test_data[dependent_var]
X_test = sm.add_constant(X_test)
y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
print('Mean Squared Error:', mse)
print(model.summary())

Mean Squared Error: 2423.918056286244
                            OLS Regression Results                            
Dep. Variable:                netrent   R-squared:                       0.958
Model:                            OLS   Adj. R-squared:                  0.957
Method:                 Least Squares   F-statistic:                     999.4
Date:                Tue, 14 Jan 2025   Prob (F-statistic):               0.00
Time:                        21:00:44   Log-Likelihood:                -10038.
No. Observations:                1896   AIC:                         2.016e+04
Df Residuals:                    1853   BIC:                         2.040e+04
Df Model:                          42                                         
Covariance Type:            nonrobust                                         
                         coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------

In [134]:
# Check multicollinearity using VIF
vif_data = pd.DataFrame()
vif_data['feature'] = X_train.columns
vif_data['VIF'] = [variance_inflation_factor(X_train.values, i) for i in range(X_train.shape[1])]
print(vif_data)

               feature         VIF
0                const  306.717906
1         rent_per_sqm    1.674070
2                 area    3.470259
3                  age    1.725567
4                rooms    3.010518
5              balcony    1.322052
6              kitchen    1.256021
7             basement    1.153164
8                 lift    1.731689
9          guesttoilet    1.431099
10              garden    1.134603
11         barrierfree    1.734035
12   condition_cat_2.0    2.243021
13   condition_cat_3.0    2.592412
14   condition_cat_4.0    1.857249
15   condition_cat_5.0    3.530378
16   condition_cat_6.0    3.580483
17   condition_cat_7.0    7.815057
18   condition_cat_8.0    1.208144
19  appointments_cat_1    1.512348
20  appointments_cat_2    2.128014
21     heating_cat_2.0    1.615827
22     heating_cat_3.0   25.852799
23     heating_cat_4.0   25.856335
24     heating_cat_5.0   17.865142
25     heating_cat_6.0   23.083291
26     heating_cat_7.0    1.475653
27     heating_cat_8

In [None]:
### MODEL BY HAND ###

In [9]:
# Extract values of dependent and independent variables
X = data[independent_vars].values
y= data[dependent_var].values

In [10]:
# Add intercept
intercept = np.ones((X.shape[0], 1))  # Create a column of ones
X_with_intercept = np.hstack((intercept, X))  # Combine intercept with X

In [11]:
# X_1: Design matrix (including intercept column)
# y_1: Dependent variable (netrent)
X_1 = np.array(X_with_intercept)
y_1 = np.array(y)

# Estimate beta_hat
beta_hat_1 = np.linalg.inv(X_1.T @ X_1) @ X_1.T @ y_1

# Compute the hat matrix (H)
H_1 = X_1 @ np.linalg.inv(X_1.T @ X_1) @ X_1.T

# Residuals (epsilon_hat)
epsilon_hat_1 = (np.eye(len(y_1)) - H_1) @ y_1

# Fitted values (y_hat)
y_hat_1 = H_1 @ y_1

# Estimate sigma^2 (residual variance)
n, k = X_1.shape
sigma2_hat_1 = (1 / (n - k)) * (epsilon_hat_1.T @ epsilon_hat_1)

# Covariance matrix of beta_hat
Cov_beta_hat_1 = sigma2_hat_1 * np.linalg.inv(X_1.T @ X_1)

# Standard errors of beta_hat
sd_beta_hat_1 = np.sqrt(np.diag(Cov_beta_hat_1))

# Print results
print("Beta_hat:", beta_hat_1)
print("Residual variance (sigma^2):", sigma2_hat_1)
print("Covariance matrix of beta_hat:\n", Cov_beta_hat_1)
print("Standard errors of beta_hat:", sd_beta_hat_1)

Beta_hat: [-5.23948384e+02  6.10658948e+01  8.59321431e+00  5.53620883e-02
 -2.54468096e+00 -8.06528052e-01 -2.70222358e+00  7.98397364e+00
  1.26679156e+01 -2.41513406e+00 -6.90621988e+00 -5.81377010e-01
 -3.72479491e+01 -1.85609140e+01 -4.83964599e+01 -3.20319804e+01
 -3.40961045e+01 -3.61420108e+01 -2.68937018e+01 -4.35903126e+00
  4.50005655e+00 -8.95673158e+00  2.14116961e+01  2.13354685e+01
  2.63116797e+01  1.79557514e+01  6.29993033e+01  1.34671687e+01
  2.01176338e+01  7.45775517e+01  9.09234023e+00  1.64397137e+01
  5.78458433e+00  4.18994603e+00  1.90566502e+00  5.52617826e+00
  2.91243946e-01  1.06345318e+00 -5.51686719e+00  3.05773568e+00
 -1.99978245e-01 -1.95134315e+00 -2.52185488e+00]
Residual variance (sigma^2): 2368.123936743671
Covariance matrix of beta_hat:
 [[ 3.15628328e+02 -4.43861665e+00 -8.02369551e-02 ... -1.33214374e+01
  -9.85959626e+00 -8.62537925e+00]
 [-4.43861665e+00  4.52350495e-01  5.24957574e-03 ... -1.24591685e-01
  -1.57741243e-01 -1.69724940e-01]
 

In [24]:
# Comparison of coefficients and standard deviations
coeff_comparison = np.vstack([model.params, beta_hat_1])
stddev_comparison = np.vstack([model.bse, sd_beta_hat_1])

# Display results
print("Comparison of Coefficients:")
print(pd.DataFrame(coeff_comparison, index=["Statsmodels", "By-Hand"], 
                   columns=['intercept'] + independent_vars))

print("\nComparison of Standard Deviations:")
print(pd.DataFrame(stddev_comparison, index=["Statsmodels", "By-Hand"], 
                   columns=['intercept'] + independent_vars))

Comparison of Coefficients:
              intercept  rent_per_sqm      area       age     rooms   balcony  \
Statsmodels -513.838123     59.746158  8.543418  0.060895 -1.851352 -1.005545   
By-Hand     -523.948384     61.065895  8.593214  0.055362 -2.544681 -0.806528   

              kitchen  basement       lift  guesttoilet  ...   month_3  \
Statsmodels -4.804845  7.112504  12.967450    -0.617056  ...  5.310323   
By-Hand     -2.702224  7.983974  12.667916    -2.415134  ...  4.189946   

              month_4   month_5   month_6   month_7   month_8   month_9  \
Statsmodels  3.229679  9.167424 -0.235941  2.143417 -4.993112  5.806109   
By-Hand      1.905665  5.526178  0.291244  1.063453 -5.516867  3.057736   

             month_10  month_11  month_12  
Statsmodels  2.893099  0.113196 -2.547708  
By-Hand     -0.199978 -1.951343 -2.521855  

[2 rows x 43 columns]

Comparison of Standard Deviations:
             intercept  rent_per_sqm      area       age     rooms   balcony  \
Statsmod

In [None]:
### HYPOTHESIS TESTS FOR THE RESTRICTED MODEL ###

In [14]:
# List of variables to exclude
exclude_vars = ['age', 'rooms', 'balcony', 'guesttoilet', 'barrierfree']

# Restrict independent_vars by excluding the specified variables
restricted_independent_vars = [col for col in independent_vars if col not in exclude_vars]

In [21]:
X_2 = data[restricted_independent_vars]
y_2 = data[dependent_var]
# Add intercept to X
X_2 = sm.add_constant(X_2)

In [25]:
# Fit the reduced model
model_reduced = sm.OLS(y_2, X_2).fit()
print(model_reduced.summary())

                            OLS Regression Results                            
Dep. Variable:                netrent   R-squared:                       0.959
Model:                            OLS   Adj. R-squared:                  0.959
Method:                 Least Squares   F-statistic:                     1488.
Date:                Tue, 14 Jan 2025   Prob (F-statistic):               0.00
Time:                        21:29:51   Log-Likelihood:                -12556.
No. Observations:                2371   AIC:                         2.519e+04
Df Residuals:                    2333   BIC:                         2.541e+04
Df Model:                          37                                         
Covariance Type:            nonrobust                                         
                         coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------
const               -525.5022     17

In [26]:
# Perform F-test
SSE_full = np.sum(model.resid ** 2)             # Residual Sum of Squares for full model
SSE_reduced = np.sum(model_reduced.resid ** 2)  # Residual Sum of Squares for reduced model

n = len(y_2)               # Number of observations
p_full = X_train.shape[1]  # Number of parameters in full model
p_reduced = X_2.shape[1]   # Number of parameters in reduced model
r = p_full - p_reduced     # Number of restrictions

# Calculate F-statistic
F_H0 = ((SSE_reduced - SSE_full) / r) / (SSE_full / (n - p_full))
print(f"F-statistic: {F_H0}")

# Calculate critical value
alpha = 0.05
F_critical = f.ppf(1 - alpha, dfn=r, dfd=n - p_full)
print(f"Critical value (alpha={alpha}): {F_critical}")

# Calculate p-value
p_value = 1 - f.cdf(F_H0, dfn=r, dfd=n - p_full)
print(f"P-value: {p_value}")

# Interpretation of results
if p_value < alpha:
    print("Reject the null hypothesis: The variables are jointly significantly different than zero.")
else:
    print("Fail to reject the null hypothesis: The variables are jointly not significant.")

F-statistic: 118.59883423846998
Critical value (alpha=0.05): 2.217940762779465
P-value: 1.1102230246251565e-16
Reject the null hypothesis: The variables are jointly significantly different than zero.
