# 선형회귀모델

---

## 1. 분석 데이터 준비

In [1]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd

In [2]:
# Load the house-price dataset; the path is relative to the working directory.
data = pd.read_csv('data/house_price.csv')
# Preview the first rows to confirm the columns were parsed as expected.
data.head()

Unnamed: 0,housing_age,income,bedrooms,households,rooms,house_value
0,23,6.777,0.141112,2.442244,8.10396,500000
1,49,6.0199,0.160984,2.726688,5.752412,500000
2,35,5.1155,0.249061,1.902676,3.888078,500000
3,32,4.7109,0.231383,1.913669,4.508393,500000
4,21,4.5625,0.255583,3.092664,4.667954,500000


In [3]:
# Separate the feature columns (X) from the target (y).
# Features are the columns at positions 1-4 of `data`; the add_constant output
# shown later in this notebook lists them as income, bedrooms, households and
# rooms, so the column at position 0 (housing_age) is not used as a feature.
X=data[data.columns[1:5]]
# The double brackets keep y as a one-column DataFrame rather than a Series.
y=data[["house_value"]]

In [4]:
# Split the data into training and test sets.
from sklearn.model_selection import train_test_split
# The split ratio is left at the library default; random_state=42 fixes the
# shuffle so the same split is produced on every run.
X_train, X_test, y_train, y_test=train_test_split(X, y, random_state=42)

In [5]:
# Normalization (min-max scaling of the features).
from sklearn.preprocessing import MinMaxScaler
scaler=MinMaxScaler()
# Fit on the training features only, so the scaling parameters are learned
# without looking at the test set.
scaler.fit(X_train)
# Apply the same fitted transform to both splits.
X_scaled_train=scaler.transform(X_train)
X_scaled_test=scaler.transform(X_test)

## 2. Statsmodels 적용
- 통계분석을 위한 Python 패키지

In [7]:
import statsmodels.api as sm
# Add an explicit intercept column to the (unscaled) features; the preview
# below shows it as a `const` column of 1.0 values.
x_train_new = sm.add_constant(X_train)
x_test_new = sm.add_constant(X_test)
# Preview the augmented training features.
x_train_new.head()

Unnamed: 0,const,income,bedrooms,households,rooms
17235,1.0,2.0577,0.185449,3.945455,6.372727
14220,1.0,4.0,0.171566,2.741497,6.363946
3280,1.0,5.8904,0.154485,2.969325,6.65184
15279,1.0,0.9393,0.24146,3.257256,4.51847
14727,1.0,2.7143,0.194977,2.679287,6.385301


In [8]:
# Fit ordinary least squares of y_train on the constant-augmented training
# features (target first, design matrix second).
multi_model = sm.OLS(y_train, x_train_new).fit()
# Print the full regression report (coefficients, R-squared, F-statistic, ...).
print(multi_model.summary())

                            OLS Regression Results                            
Dep. Variable:            house_value   R-squared:                       0.546
Model:                            OLS   Adj. R-squared:                  0.545
Method:                 Least Squares   F-statistic:                     3980.
Date:                Mon, 29 Nov 2021   Prob (F-statistic):               0.00
Time:                        14:52:02   Log-Likelihood:            -1.6570e+05
No. Observations:               13266   AIC:                         3.314e+05
Df Residuals:                   13261   BIC:                         3.315e+05
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const      -2.849e+04   8884.093     -3.206      0.0

## 3. Scikit-learn 적용

In [9]:
from sklearn.linear_model import LinearRegression
model=LinearRegression()
# Unlike the statsmodels fit, this model is trained on the min-max scaled features.
model.fit(X_scaled_train, y_train)
# Keep the training predictions; they are reused for the RMSE computation later.
pred_train=model.predict(X_scaled_train)
# Score on the training data (the value displayed below agrees with the
# R-squared reported by the OLS summary).
model.score(X_scaled_train, y_train)

0.5455724996358273

In [10]:
# Predict on the scaled test features; pred_test is reused by the metric cells below.
pred_test=model.predict(X_scaled_test)
# Score on the held-out test data.
model.score(X_scaled_test, y_test)

0.562684388358716

In [11]:
# RMSE (Root Mean Squared Error)
import numpy as np
from sklearn.metrics import mean_squared_error 
# Compute MSE on each split first; RMSE is its square root, taken below.
MSE_train = mean_squared_error(y_train, pred_train)
MSE_test = mean_squared_error(y_test, pred_test)
# Report RMSE for the training split, then for the test split.
print("훈련   데이터 RMSE:", np.sqrt(MSE_train))
print("테스트 데이터 RMSE:", np.sqrt(MSE_test))

훈련   데이터 RMSE: 64340.33927728243
테스트 데이터 RMSE: 63220.79672157402


In [12]:
# Other linear-model evaluation metric: MAE (Mean Absolute Error)
from sklearn.metrics import mean_absolute_error
# MAE of the test-set predictions.
mean_absolute_error(y_test, pred_test)

47230.874701637375

In [13]:
# 기타 선형 모델평가지표 : MAPE (Mean Absolute Percentage Error)
def MAPE(y_test, y_pred):
    """Return the Mean Absolute Percentage Error, in percent.

    Computes ``mean(|(y_test - y_pred) / y_test|) * 100``.

    Args:
        y_test: Actual target values (scalar, NumPy array, or pandas object).
        y_pred: Predicted values, broadcastable against ``y_test``.

    Returns:
        The MAPE as a percentage. For a pandas DataFrame input the mean is
        taken per column, so the result is a Series with one entry per column.

    Note:
        Entries of ``y_test`` equal to zero cause a division by zero.
    """
    # Use the y_pred argument; the previous version ignored it and read the
    # notebook-global pred_test, which silently gave wrong results for any
    # other predictions.
    return np.mean(np.abs((y_test - y_pred) / y_test)) * 100
MAPE(y_test, pred_test)

house_value    30.571439
dtype: float64

In [14]:
# 기타 선형 모델평가지표 : MPE (Mean Percentage Error)
def MAE(y_test, y_pred):
    """Return the Mean Percentage Error (MPE), in percent.

    Despite its name, this function computes the *signed* percentage error
    ``mean((y_test - y_pred) / y_test) * 100``, i.e. MPE rather than the mean
    absolute error. The name is kept so the existing call keeps working.

    Args:
        y_test: Actual target values (scalar, NumPy array, or pandas object).
        y_pred: Predicted values, broadcastable against ``y_test``.

    Returns:
        The MPE as a percentage; negative when predictions are, on average,
        above the actual values. For a pandas DataFrame input the mean is
        taken per column, so the result is a Series with one entry per column.

    Note:
        Entries of ``y_test`` equal to zero cause a division by zero.
    """
    # Use the y_pred argument; the previous version ignored it and read the
    # notebook-global pred_test.
    return np.mean((y_test - y_pred) / y_test) * 100
MAE(y_test, pred_test)

house_value   -12.37266
dtype: float64