In [1]:
import pandas as pd
from sklearn.datasets import fetch_california_housing

In [8]:
# Load the California housing dataset; later cells read its DESCR, data,
# feature_names and target attributes.
caldata = fetch_california_housing()

In [9]:
# Print the description text bundled with the dataset.
print(caldata.DESCR)

.. _california_housing_dataset:

California Housing dataset
--------------------------

**Data Set Characteristics:**

    :Number of Instances: 20640

    :Number of Attributes: 8 numeric, predictive attributes and the target

    :Attribute Information:
        - MedInc        median income in block
        - HouseAge      median house age in block
        - AveRooms      average number of rooms
        - AveBedrms     average number of bedrooms
        - Population    block population
        - AveOccup      average house occupancy
        - Latitude      house block latitude
        - Longitude     house block longitude

    :Missing Attribute Values: None

This dataset was obtained from the StatLib repository.
http://lib.stat.cmu.edu/datasets/

The target variable is the median house value for California districts.

This dataset was derived from the 1990 U.S. census, using one row per census
block group. A block group is the smallest geographical unit for which the U.S.
Census Bur

In [11]:
# 1. Build the X block: one column per feature, labelled with the dataset's
#    feature names (capital X by convention because it is a matrix).
dfX = pd.DataFrame(data=caldata.data, columns=caldata.feature_names)

In [12]:
# Display the feature frame to check its contents.
dfX

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25
...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32


In [None]:
# 2. Build the y block (lowercase y because each target value is a scalar) - the variable to be analysed

In [14]:
# Wrap the target array in a single-column frame named "Price" so it can be
# placed next to the feature columns.
dfy = pd.DataFrame({"Price": caldata.target})
dfy

Unnamed: 0,Price
0,4.526
1,3.585
2,3.521
3,3.413
4,3.422
...,...
20635,0.781
20636,0.771
20637,0.923
20638,0.847


In [15]:
# Combine the two frames so the analysis can work from a single table:
# "It is convenient to keep the independent and dependent variables bound
# together in one DataFrame."
df = pd.concat(objs=[dfX, dfy], axis="columns")
df

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,Price
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422
...,...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09,0.781
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21,0.771
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22,0.923
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32,0.847


In [16]:
# 분석을 위해 statsmodels를 import
import statsmodels.api as sm

In [20]:
# Model Price as a linear combination of every feature column: the feature
# names are joined with " + " to form the right-hand side of the formula.
formula = f"Price ~ {' + '.join(caldata.feature_names)}"
formula

'Price ~ MedInc + HouseAge + AveRooms + AveBedrms + Population + AveOccup + Latitude + Longitude'

### modelling 시작

In [25]:
# Set up an ordinary least squares model from the formula string, using df as
# the data source.
# NOTE(review): this cell's execution count (25) is higher than those of the
# fit and summary cells that follow (22, 23), so the notebook was run out of
# order - re-run top to bottom to confirm the displayed results.
model = sm.OLS.from_formula(formula, data=df)

In [22]:
# Fit the model and keep the fitted results object.
result = model.fit()

In [23]:
# Print the regression summary table as plain text.
print(result.summary())

                            OLS Regression Results                            
Dep. Variable:                  Price   R-squared:                       0.606
Model:                            OLS   Adj. R-squared:                  0.606
Method:                 Least Squares   F-statistic:                     3970.
Date:                Sat, 16 May 2020   Prob (F-statistic):               0.00
Time:                        12:30:28   Log-Likelihood:                -22624.
No. Observations:               20640   AIC:                         4.527e+04
Df Residuals:                   20631   BIC:                         4.534e+04
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept    -36.9419      0.659    -56.067      0.0

### 분석 끝

In [None]:
## The condition number is too large, so the predictors are scaled in the next cell before refitting.

In [26]:
# Rebuild the formula with every predictor wrapped in scale(...).
# Author's rationale: the fit with the plain formula reportedly had a very
# large condition number, and scaling the predictors is meant to reduce it
# (the condition number itself is not visible in the truncated summary above).
formula = "Price ~ " + " + ".join(map("scale({})".format, caldata.feature_names))
formula

'Price ~ scale(MedInc) + scale(HouseAge) + scale(AveRooms) + scale(AveBedrms) + scale(Population) + scale(AveOccup) + scale(Latitude) + scale(Longitude)'

In [27]:
# Rebuild the OLS model, this time from the scale(...) formula.
model = sm.OLS.from_formula(formula, data=df)

In [28]:
# Refit using the model built from the scaled formula.
result = model.fit()

In [29]:
# Print the summary table for the scaled fit.
print(result.summary())

                            OLS Regression Results                            
Dep. Variable:                  Price   R-squared:                       0.606
Model:                            OLS   Adj. R-squared:                  0.606
Method:                 Least Squares   F-statistic:                     3970.
Date:                Sat, 16 May 2020   Prob (F-statistic):               0.00
Time:                        12:45:02   Log-Likelihood:                -22624.
No. Observations:               20640   AIC:                         4.527e+04
Df Residuals:                   20631   BIC:                         4.534e+04
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                        coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------
Intercept             2.0686      0.00

In [None]:
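# Visualize the results after the analysis.
# Then check whether there are any outliers.
# Analyze why each outlier is there.
# If a clear reason for an outlier being at that position is identified, it may be removed.
# If the reason cannot be identified, keep the outlier.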
