In [44]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import statsmodels
import statsmodels.api as sm
from statsmodels.formula.api import ols
import missingno as msno
from sklearn.impute import KNNImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from datetime import datetime
from sklearn.neighbors import NearestNeighbors
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [45]:
read = pd.read_csv('..\data\interim\\rentals_processing.csv', parse_dates = ['yearConstructed', 'date'])

In [46]:
rentals = read.copy()

In [47]:
rentals.shape

(266220, 33)

In [48]:
rentals.head()

Unnamed: 0,state,serviceCharge,heatingType,telekomTvOffer,newlyConst,balcony,telekomUploadSpeed,yearConstructed,firingTypes,hasKitchen,...,rent_per_sqm,area_km2,population_2019,population_per_km2,gdp_per_capita_2018,hdi,total_state_listings,total_state_sqm,listings_per_1000capita,listings_per_100sqm
0,Nordrhein_Westfalen,245.0,central_heating,ONE_YEAR_FREE,False,False,10.0,1965-01-01,oil,False,...,6.918605,34085,17932651,526,39678,0.936,62069,4615660.71,3.461228,1.344748
1,Nordrhein_Westfalen,95.0,self_contained_central_heating,ONE_YEAR_FREE,False,False,40.0,1953-01-01,gas,False,...,5.0,34085,17932651,526,39678,0.936,62069,4615660.71,3.461228,1.344748
2,Nordrhein_Westfalen,200.0,central_heating,ONE_YEAR_FREE,False,False,40.0,1951-01-01,oil,False,...,7.696047,34085,17932651,526,39678,0.936,62069,4615660.71,3.461228,1.344748
3,Nordrhein_Westfalen,215.0,gas_heating,ONE_YEAR_FREE,True,True,2.4,2018-01-01,gas,False,...,11.17931,34085,17932651,526,39678,0.936,62069,4615660.71,3.461228,1.344748
4,Nordrhein_Westfalen,121.0,central_heating,ONE_YEAR_FREE,False,True,40.0,1914-01-01,gas,False,...,5.061538,34085,17932651,526,39678,0.936,62069,4615660.71,3.461228,1.344748


In [49]:
rentals.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 266220 entries, 0 to 266219
Data columns (total 33 columns):
 #   Column                   Non-Null Count   Dtype         
---  ------                   --------------   -----         
 0   state                    266220 non-null  object        
 1   serviceCharge            259490 non-null  float64       
 2   heatingType              221782 non-null  object        
 3   telekomTvOffer           233938 non-null  object        
 4   newlyConst               266220 non-null  bool          
 5   balcony                  266220 non-null  bool          
 6   telekomUploadSpeed       233208 non-null  float64       
 7   yearConstructed          209427 non-null  datetime64[ns]
 8   firingTypes              209806 non-null  object        
 9   hasKitchen               266220 non-null  bool          
 10  cellar                   266220 non-null  bool          
 11  rent                     266220 non-null  float64       
 12  livingSpace     

- Goal: Prepare the data for modelling
- No missing values, all numbers, all scaled
    1. Find features with missing values
    2. Determine relevance of feature for modelling. Drop features you feel are not important
    3. Impute numerical missing values
    4. Convert categorical variables and datetime variables to numerical variables
    5. Impute categorical missing values
    6. Scale all data

In [50]:
rentals['yearBuilt'] = rentals.yearConstructed.dt.year

In [51]:
X = rentals.drop(columns = ['rent_per_sqm', 'date', 'yearConstructed'])
y = rentals[['rent_per_sqm']]

In [52]:
encoder = OneHotEncoder(handle_unknown = 'ignore')

In [53]:
imp = SimpleImputer(missing_values = np.nan, strategy = 'median')

In [54]:
scaler = StandardScaler(with_mean = False)

In [55]:
lr = LinearRegression(normalize = False)

In [56]:
ridge = Ridge(normalize = False)

In [57]:
lasso = Lasso(normalize = False)

In [58]:
steps = [('encoding', encoder), ('imputation', imp), ('scaling', scaler), ('ridge_regress', ridge)]

In [59]:
pipeline = Pipeline(steps)

In [60]:
parameters = {'ridge_regress__alpha': np.linspace(0, 1, num = 10, endpoint = True)}

In [61]:
cv = GridSearchCV(pipeline, param_grid = parameters)

In [62]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 77)

In [66]:
X_train.head()

Unnamed: 0,state,serviceCharge,heatingType,telekomTvOffer,newlyConst,balcony,telekomUploadSpeed,firingTypes,hasKitchen,cellar,...,area_km2,population_2019,population_per_km2,gdp_per_capita_2018,hdi,total_state_listings,total_state_sqm,listings_per_1000capita,listings_per_100sqm,yearBuilt
144538,Baden_Württemberg,250.0,central_heating,ONE_YEAR_FREE,False,False,40.0,gas,False,False,...,35752,11100394,310,47290,0.953,15961,1342806.7,1.437877,1.18863,2006.0
91127,Sachsen,150.0,central_heating,ONE_YEAR_FREE,False,False,40.0,gas,False,True,...,18416,4077937,221,31453,0.93,57673,3837202.43,14.14269,1.502996,1995.0
236448,Sachsen_Anhalt,187.11,gas_heating,,False,True,,gas,False,True,...,20446,2208321,108,28800,0.908,19909,1254374.6,9.015447,1.587165,1955.0
209597,Bayern,70.0,central_heating,ONE_YEAR_FREE,False,True,40.0,gas,True,False,...,70552,13124737,185,48323,0.947,21490,1718920.03,1.637366,1.250204,1966.0
59635,Nordrhein_Westfalen,95.0,central_heating,ONE_YEAR_FREE,False,True,40.0,gas,False,True,...,34085,17932651,526,39678,0.936,62069,4615660.71,3.461228,1.344748,1968.0


In [63]:
cv.fit(X_train, y_train)

GridSearchCV(estimator=Pipeline(steps=[('encoding',
                                        OneHotEncoder(handle_unknown='ignore')),
                                       ('imputation',
                                        SimpleImputer(strategy='median')),
                                       ('scaling',
                                        StandardScaler(with_mean=False)),
                                       ('ridge_regress', Ridge())]),
             param_grid={'ridge_regress__alpha': array([0.        , 0.11111111, 0.22222222, 0.33333333, 0.44444444,
       0.55555556, 0.66666667, 0.77777778, 0.88888889, 1.        ])})

In [64]:
cv.score(X_test, y_test)

0.45419000183060576

In [65]:
cv.best_params_

{'ridge_regress__alpha': 0.7777777777777777}

In [10]:
dummy = pd.get_dummies(X_train)
dummy.shape

(186354, 105)

In [11]:
imputer = SimpleImputer(strategy = 'median')

In [12]:
imputer.fit(dummy)

SimpleImputer(strategy='median')

In [13]:
dummyt = imputer.transform(dummy)

In [14]:
model = LinearRegression()

In [15]:
model.fit(dummyt, y_train)

LinearRegression()

In [16]:
dummy2 = pd.get_dummies(X_test)

In [17]:
dummy2t = imputer.transform(dummy2)

In [18]:
y_pred = model.predict(dummy2t)

In [19]:
model.score(dummy2t, y_test)

0.7115011875980928

In [20]:
model.score(dummyt, y_train)

0.8516594752770796

In [21]:
cv = cross_val_score(model, dummyt, y_train, cv = 5)

In [22]:
print(cv)

[0.86649227 0.84040335 0.85932501 0.83467726 0.84550675]


In [23]:
print(cv.mean())

0.8492809277907609


In [24]:
cv2 = cross_val_score(model, dummy2t, y_test, cv = 5)

In [25]:
print(cv2)

[0.45462894 0.91169377 0.85705471 0.86415    0.83736547]


In [26]:
cv2.mean()

0.7849785781740504