In [79]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
from sklearn.model_selection import LeaveOneOut, train_test_split
from sklearn.metrics import mean_squared_error

In [122]:
# Step 1: Data preparation
# Read the source CSV into a DataFrame. The path is relative, so it is
# resolved against the notebook's current working directory.
csv_path = 'biele_WM_new.csv'
data = pd.read_csv(csv_path)

In [123]:
# Restrict the dataset to the variables used in the analysis. `data` is
# rebound to the narrowed frame, so every other source column is discarded.
# Note: this cell cannot be re-run once the rename cell has executed, because
# the source column names listed here no longer exist in `data` at that point.
columns = [
    "mietekalt",
    "qm_Preis",
    "wohnflaeche",
    "alter",
    "objektzustand",
    "zimmeranzahl",
    "balkon",
    "einbaukueche",
    "keller",
    "ausstattung_kat",
    "aufzug",
    "gaestewc",
    "garten",
    "heizungsart",
    "barrierefrei",
    "Einstellungsmonat",
]
data = data[columns]

In [124]:
# Rename variables for clarity: map each source column name to the English
# name used throughout the rest of the notebook.
column_renames = {
    'mietekalt': 'netrent',
    'qm_Preis': 'rent_per_sqm',
    'wohnflaeche': 'area',
    'alter': 'age',
    'objektzustand': 'condition_cat',
    'zimmeranzahl': 'rooms',
    'balkon': 'balcony',
    'einbaukueche': 'kitchen',
    'keller': 'basement',
    'ausstattung_kat': 'appointments_cat',
    'aufzug': 'lift',
    'gaestewc': 'guesttoilet',
    'garten': 'garden',
    'heizungsart': 'heating_cat',
    'barrierefrei': 'barrierfree',
    'Einstellungsmonat': 'month',
}
# Rebind `data` to the renamed frame rather than passing inplace=True:
# the result is the same, but the frame produced by the column selection
# is not mutated in place, and the step can be chained with others.
data = data.rename(columns=column_renames)

In [125]:
# Keep only complete cases: a row is discarded as soon as any one of its
# columns holds a missing value.
data = data.dropna(axis=0, how='any')

In [126]:
# Encode categorical variables as integer indicator (dummy) columns.
# drop_first=True leaves out the first level of each variable, so that level
# acts as the reference category and the dummies of one variable do not sum
# to a constant column.
categorical_cols = ['condition_cat', 'appointments_cat', 'heating_cat', 'month']
data = pd.get_dummies(data, columns=categorical_cols, drop_first=True, dtype=int)

In [114]:
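# Convert all boolean columns to integers
# (Disabled: the get_dummies call above passes dtype=int, so the dummy
# columns it creates are already integers. Kept for reference only.)
# bool_cols = data.select_dtypes(include=['bool']).columns
# data[bool_cols] = data[bool_cols].astype(int)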

In [127]:
# Target and predictors: every column of `data` other than the target is
# used as a regressor.
# NOTE(review): `rent_per_sqm` is among the predictors. If it was computed as
# netrent / area (presumably - verify against the data source), it leaks the
# target into the features and inflates R-squared; confirm before relying on
# the fit statistics.
dependent_var = 'netrent'
independent_vars = data.columns[data.columns != dependent_var].tolist()

In [128]:
# Hold out a fifth of the rows as a test set. The fixed seed makes the
# random split reproducible across runs.
test_fraction = 0.2
split_seed = 42
train_data, test_data = train_test_split(
    data, test_size=test_fraction, random_state=split_seed
)

In [129]:
# Fit an ordinary least squares regression of the target on all predictors,
# using the training split only.
y_train = train_data[dependent_var]
# add_constant inserts a 'const' column so the model estimates an intercept.
X_train = sm.add_constant(train_data[independent_vars])
model = sm.OLS(endog=y_train, exog=X_train).fit()

In [133]:
# Evaluate the model on the held-out test split.
y_test = test_data[dependent_var]
# has_constant='add' forces the intercept column to be added. With the
# default ('skip'), add_constant silently leaves it out whenever some column
# of the test split is already constant (e.g. a rare dummy that is all zeros
# in the hold-out), and predict then fails on a column-count mismatch.
X_test = sm.add_constant(test_data[independent_vars], has_constant='add')
# Select the columns by the fitted coefficient names so the test design
# matrix has exactly the columns, in the order, that the model was fit on.
X_test = X_test[model.params.index]
y_pred = model.predict(X_test)

# Out-of-sample mean squared error, in squared units of the target.
mse = mean_squared_error(y_test, y_pred)
print('Mean Squared Error:', mse)
# The regression summary describes the fit on the training split.
print(model.summary())

Mean Squared Error: 2423.918056286244
                            OLS Regression Results                            
Dep. Variable:                netrent   R-squared:                       0.958
Model:                            OLS   Adj. R-squared:                  0.957
Method:                 Least Squares   F-statistic:                     999.4
Date:                Sun, 12 Jan 2025   Prob (F-statistic):               0.00
Time:                        19:02:00   Log-Likelihood:                -10038.
No. Observations:                1896   AIC:                         2.016e+04
Df Residuals:                    1853   BIC:                         2.040e+04
Df Model:                          42                                         
Covariance Type:            nonrobust                                         
                         coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------

In [134]:
# Multicollinearity check: compute the variance inflation factor (VIF) for
# each column of the training design matrix.
# The 'const' row belongs to the intercept column added by add_constant; its
# VIF is normally not interpreted as a collinearity signal.
design = X_train.values
vif_data = pd.DataFrame({
    'feature': X_train.columns,
    'VIF': [variance_inflation_factor(design, idx) for idx in range(design.shape[1])],
})
print(vif_data)

               feature         VIF
0                const  306.717906
1         rent_per_sqm    1.674070
2                 area    3.470259
3                  age    1.725567
4                rooms    3.010518
5              balcony    1.322052
6              kitchen    1.256021
7             basement    1.153164
8                 lift    1.731689
9          guesttoilet    1.431099
10              garden    1.134603
11         barrierfree    1.734035
12   condition_cat_2.0    2.243021
13   condition_cat_3.0    2.592412
14   condition_cat_4.0    1.857249
15   condition_cat_5.0    3.530378
16   condition_cat_6.0    3.580483
17   condition_cat_7.0    7.815057
18   condition_cat_8.0    1.208144
19  appointments_cat_1    1.512348
20  appointments_cat_2    2.128014
21     heating_cat_2.0    1.615827
22     heating_cat_3.0   25.852799
23     heating_cat_4.0   25.856335
24     heating_cat_5.0   17.865142
25     heating_cat_6.0   23.083291
26     heating_cat_7.0    1.475653
27     heating_cat_8