In [9]:
import pandas as pd
import numpy as np
pd.set_option("display.max_columns", None)

from sklearn import linear_model
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
from scipy import stats
from sklearn import preprocessing
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score, mean_absolute_error

# original test

In [10]:
# read data
# Load the purged Taipei transaction table.
df = pd.read_csv('1_taipei_purged.csv')

# Columns that are not used as regression features in the baseline model.
unused_cols = ['address', 'latitude', 'longitude', 'style', 'district']
df = df.drop(columns=unused_cols)

# Collapse `date` into a running month count relative to 2013:
# (date // 100 - 2013) * 12 + date % 100.
# NOTE(review): presumably `date` is encoded as YYYYMM -- confirm against the CSV.
yyyymm = df['date']
df['date'] = yyyymm % 100 + (yyyymm // 100 - 2013) * 12

In [11]:
# train
# 5-fold cross-validated linear regression of `total_price` on every
# remaining column except the two price columns (`total_price`, `avg`).
X = df.drop(['total_price', 'avg'], axis=1)
y = df[['total_price']]

model = linear_model.LinearRegression()

folds = 5
kf = KFold(n_splits=folds, shuffle=True, random_state=1)

# original
# Running sums of the *unrounded* per-fold metrics; rounding happens only
# when printing so the reported mean carries no accumulated rounding error.
r2_sum, mae_sum = 0, 0

for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Fit the scaler on the training fold only, then apply it to both folds,
    # so statistics of the held-out rows never influence preprocessing.
    scaler = preprocessing.StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    r2 = float(r2_score(y_test, y_pred))
    mae = float(mean_absolute_error(y_test, y_pred))
    r2_sum += r2
    mae_sum += mae

    print(f'R2: {round(r2, 4)}, MAE: {round(mae, 4)}')

# Mean of the per-fold scores.
print(f'\nR2: {round(r2_sum/folds, 4)}, MAE: {round(mae_sum/folds, 4)}')

R2: 0.5265, MAE: 515.3778
R2: 0.5966, MAE: 509.4179
R2: 0.6275, MAE: 509.8817
R2: 0.4654, MAE: 505.6145
R2: 0.5538, MAE: 508.4188

R2: 0.554, MAE: 509.7421


In [12]:
# show significant
# Fit an ordinary least squares model with statsmodels on the features taken
# straight from `df`, and print the coefficient table with its p-values.
X = df.drop(columns=['total_price', 'avg'])
y = df.loc[:, ['total_price']]

# Prepend a constant column so the fitted model includes an intercept term
# (reported as `const` in the summary).
X2 = sm.add_constant(X)
est = sm.OLS(y, X2)
est2 = est.fit()
print(est2.summary())

# A coefficient is treated as significant when its P>|t| is below 0.05
# (thresholds considered: 0.05 0.01 0.005)

                            OLS Regression Results                            
Dep. Variable:            total_price   R-squared:                       0.611
Model:                            OLS   Adj. R-squared:                  0.611
Method:                 Least Squares   F-statistic:                 7.639e+04
Date:                Thu, 26 May 2022   Prob (F-statistic):               0.00
Time:                        09:37:18   Log-Likelihood:            -3.9338e+06
No. Observations:              486774   AIC:                         7.868e+06
Df Residuals:                  486763   BIC:                         7.868e+06
Df Model:                          10                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
const          -269.0767      7.717    -34.870

# join data

In [21]:
# read data and join
# Load the transactions again and enrich them with district-level population
# statistics (pop.csv) and financial indicators (AllFinancialCols.csv).
df = pd.read_csv('1_taipei_purged.csv')
n_transactions = len(df)  # remembered to verify the joins keep every row

dfp = pd.read_csv('pop.csv')
dfp = dfp[['year', 'district', 'pop', 'sex_ratio', 'in', 'out', 'growth', 'marry_rate']]

dff = pd.read_csv('AllFinancialCols.csv')
dff = dff[['date', 'load_archi', 'load_house', 'M1B', 'income_rate']]

# Build the merge keys on the transaction side.
# NOTE(review): the [3:5] slice presumably strips a 3-character prefix so the
# value matches `district` in pop.csv -- confirm against the raw data.
df['district'] = df['district'].str[3:5]
df['year'] = df['date'] // 100

# Both merges are inner joins: a transaction without a matching key would be
# dropped silently, and duplicate keys on the right side would multiply rows.
df = pd.merge(df, dfp, on=['year', 'district'])
df = pd.merge(df, dff, on='date')

# Guard the invariant that the joined frame describes exactly the same
# transactions as the input, so results stay comparable with the baseline.
assert len(df) == n_transactions, (
    f'join changed the row count: {n_transactions} -> {len(df)}; '
    'check for unmatched or duplicated keys in pop.csv / AllFinancialCols.csv'
)

df = df.drop(['address', 'latitude', 'longitude', 'style', 'district', 'management', 'year'], axis=1)
# Collapse `date` into a running month count relative to 2013
# (date // 100 is used as the year above; date % 100 is taken as the month).
df.loc[:, 'date'] = df.loc[:, 'date'] % 100 + (df.loc[:, 'date'] // 100 - 2013 ) * 12

In [23]:
# train
# 5-fold cross-validated linear regression of `total_price` on every
# remaining column except the two price columns (`total_price`, `avg`).
X = df.drop(['total_price', 'avg'], axis=1)
y = df[['total_price']]

model = linear_model.LinearRegression()

folds = 5
kf = KFold(n_splits=folds, shuffle=True, random_state=1)

# original
# Running sums of the *unrounded* per-fold metrics; rounding happens only
# when printing so the reported mean carries no accumulated rounding error.
r2_sum, mae_sum = 0, 0

for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Fit the scaler on the training fold only, then apply it to both folds,
    # so statistics of the held-out rows never influence preprocessing.
    scaler = preprocessing.StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    r2 = float(r2_score(y_test, y_pred))
    mae = float(mean_absolute_error(y_test, y_pred))
    r2_sum += r2
    mae_sum += mae

    print(f'R2: {round(r2, 4)}, MAE: {round(mae, 4)}')

# Mean of the per-fold scores.
print(f'\nR2: {round(r2_sum/folds, 4)}, MAE: {round(mae_sum/folds, 4)}')

R2: 0.7578, MAE: 381.5039
R2: 0.7399, MAE: 384.2538
R2: 0.4237, MAE: 374.2506
R2: 0.6949, MAE: 385.6649
R2: 0.7393, MAE: 382.55

R2: 0.6711, MAE: 381.6446


In [24]:
# show significant
# Fit an ordinary least squares model with statsmodels on the features taken
# straight from `df`, and print the coefficient table with its p-values.
X = df.drop(columns=['total_price', 'avg'])
y = df.loc[:, ['total_price']]

# Prepend a constant column so the fitted model includes an intercept term
# (reported as `const` in the summary).
X2 = sm.add_constant(X)
est = sm.OLS(y, X2)
est2 = est.fit()
print(est2.summary())

# A coefficient is treated as significant when its P>|t| is below 0.05
# (thresholds considered: 0.05 0.01 0.005)

                            OLS Regression Results                            
Dep. Variable:            total_price   R-squared:                       0.742
Model:                            OLS   Adj. R-squared:                  0.742
Method:                 Least Squares   F-statistic:                 7.382e+04
Date:                Thu, 26 May 2022   Prob (F-statistic):               0.00
Time:                        09:41:00   Log-Likelihood:            -3.8334e+06
No. Observations:              486774   AIC:                         7.667e+06
Df Residuals:                  486754   BIC:                         7.667e+06
Df Model:                          19                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
const          2692.8598     92.370     29.153