## Proces modelowania

In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random

### Sztuczny zbiór danych do testów

In [37]:
n = 150
wynajem = random.sample(range(1000, 10000), n)
df = pd.DataFrame(wynajem, columns=['wynajem'])
df['powierzchnia'] = random.choices(range(20, 100), k=n)
df['l_pokoi'] = random.choices(range(1, 6), k=n)
df['pietro'] = random.choices(range(1, 15), k=n)
data_od = ['2022-12-21','2022-12-31','2023-01-01','2022-12-11']
df['data_od'] = random.choices(data_od, k=n)
obsluga_zdalna = ['tak','nie']
df['obsluga_zdalna'] = random.choices(obsluga_zdalna, k=n)
df['czynsz'] = random.choices(range(300, 1500), k=n)
df['kaucja'] = random.choices(range(1000, 8000), k=n)
rodzaj_zabudowy = ['blok', 'apartamentowiec', 'kamienica']
df['rodzaj_zabudowy'] = random.choices(rodzaj_zabudowy, k=n)
bot = ['balkon', 'taras', 'brak', 'ogród']
df['bot'] = random.choices(bot, k=n)
wykonczenie = ['do zamieszkania', 'do zamieszkania', 'do zamieszkania', 'stan developerski']
df['wykonczenie'] = random.choices(wykonczenie, k=n)

In [38]:
df.head()

Unnamed: 0,wynajem,powierzchnia,l_pokoi,pietro,data_od,obsluga_zdalna,czynsz,kaucja,rodzaj_zabudowy,bot,wykonczenie
0,2324,32,5,2,2022-12-31,tak,379,3432,blok,taras,stan developerski
1,3030,87,3,4,2022-12-21,tak,1270,5666,blok,ogród,do zamieszkania
2,9652,77,4,10,2023-01-01,nie,623,2091,apartamentowiec,ogród,stan developerski
3,1064,36,2,1,2022-12-31,tak,673,4246,kamienica,brak,stan developerski
4,1586,74,4,3,2022-12-21,nie,1283,5352,apartamentowiec,taras,do zamieszkania


#### Dummy coding

In [40]:
dfm = pd.get_dummies(df)

## Modelowanie

#### Pakiet Scikit-Learn

In [53]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn import preprocessing

X = dfm.drop('wynajem', axis=1)
y = dfm['wynajem']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)

skmodel = LinearRegression()

skmodel.fit(X_train, y_train)

LinearRegression()

In [54]:
print("y = "+str(skmodel.intercept_)+" + X * "+str(skmodel.coef_))
print("\nR-squared = "+str(skmodel.score(X, y)))

y = 4178.506152607888 + X * [ 2.32894722e+01  3.49425567e+01  1.83590205e+01 -9.80503576e-01
  4.87742412e-02  4.84537235e+02  1.95890188e+02 -7.89362007e+02
  1.08934584e+02  2.25956119e+02 -2.25956119e+02  2.77615842e+02
 -5.09070101e+02  2.31454259e+02  4.29626052e+02  2.14511784e+02
  1.25866814e+02 -7.70004650e+02  6.46014761e+01 -6.46014761e+01]

R-squared = 0.09513127063228799


In [56]:
predictions = skmodel.predict(X_test)

print('mean_squared_error : ', mean_squared_error(y_test, predictions))
print('mean_absolute_error : ', mean_absolute_error(y_test, predictions))

mean_squared_error :  7844727.781320942
mean_absolute_error :  2447.284861671714


#### Pakiet statsmodels

In [57]:
import statsmodels.api as sm

X = dfm.drop('wynajem', axis=1)
y = dfm['wynajem']

X = sm.add_constant(X)

model = sm.OLS(y, X).fit()

In [58]:
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                wynajem   R-squared:                       0.134
Model:                            OLS   Adj. R-squared:                  0.038
Method:                 Least Squares   F-statistic:                     1.388
Date:                Sun, 11 Dec 2022   Prob (F-statistic):              0.162
Time:                        12:26:38   Log-Likelihood:                -1386.8
No. Observations:                 150   AIC:                             2806.
Df Residuals:                     134   BIC:                             2854.
Df Model:                          15                                         
Covariance Type:            nonrobust                                         
                                      coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------------------
const     