# 캘리포니아 집 값 데이터

> 캘리포니아 집 값 예측

- 평균과 중위값
- 치우쳐진 데이터
- 내가 만들어보는 "앙상블" 모델

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import (
    GradientBoostingRegressor,
    RandomForestClassifier,
    RandomForestRegressor,
)
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import r2_score, mean_squared_error

#pip install catboost xgboost lightgbm
#pip install numpy<2 #2버전보다 낮은 최신버전 넘파이 에러뜨면 사용x
# 만약 CatBoostRegressor 오류 발생시 numpy 다시 업데이트 후에 계속 진행

from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

## 데이터 불러오기

In [32]:
# Alternative data source, kept for reference: assemble the frame from
# scikit-learn's California housing dataset instead of the local CSV.
# X = fetch_california_housing(as_frame=True)['data']
# y = fetch_california_housing(as_frame=True)['target']
# df = pd.concat([X,y], axis=1)
# df.head()
# Load the housing data from the CSV file under ./data.
df = pd.read_csv('./data/housing.csv')

## 데이터 전처리
### NA값 처리 - 중위값

In [33]:
# Count the missing values in each column.
df.isna().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64

In [27]:
# df.describe()

In [34]:
# Impute missing 'total_bedrooms' values with the column median.
# Assign the result back to the column instead of calling
# fillna(..., inplace=True) on the selected Series: the chained in-place form
# operates on an intermediate object and, per the pandas FutureWarning, will
# no longer modify df starting with pandas 3.0.
df['total_bedrooms'] = df['total_bedrooms'].fillna(df['total_bedrooms'].median())

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['total_bedrooms'].fillna(df['total_bedrooms'].median(), inplace=True)


### 특정 속성이 과도한 지표를 형성하고 있어서, 정리

In [35]:
# Ratio feature: bedrooms as a fraction of all rooms in the block.
df['bed_per_room'] = df['total_bedrooms'].div(df['total_rooms'])

In [28]:
# df.isnull().sum()

In [29]:
# sns.heatmap(df.corr(numeric_only=True), annot=True)

In [30]:
# sns.histplot(df['median_house_value'], kde=True)

### 데이터가 한쪽으로 치우쳐져 있는지 확인 후 정리

In [36]:
# Skewness (not kurtosis) of the target column. A large absolute value means
# the distribution is strongly lopsided; the author's rule of thumb is that a
# value around 0.6 is acceptable.
df['median_house_value'].skew()

0.9777632739098341

In [41]:
# Features: every column of df except the target.
X = df.drop(columns=['median_house_value'])
# Target: natural log of the median house value. Models are therefore fitted
# and scored in log units.
y = np.log(df.loc[:, 'median_house_value'])

In [38]:
# df.hist(bins=30) #data가 한쪽으로 치우쳐있음

In [None]:
## Skew가 너무 높음 => 데이터가 한쪽으로 치우쳐짐
## 방이 너무 많음 (특정 속성이 과도한 지표를 형성하고 있어서 정리)

In [39]:
# df['bed_per_room'] = df['total_bedrooms'] / df['total_rooms']
# df.info()



In [42]:
# Handy recipe: build a per-feature skewness table for the numeric columns.
from scipy.stats import skew

numeric_features = list(X.select_dtypes(np.number).columns)
skew_df = pd.DataFrame({"Feature": numeric_features})
# Measure skewness on X itself (the frame the column names come from) so the
# table always describes the data that is about to be transformed.
skew_df["Skew"] = [skew(X[feature]) for feature in numeric_features]
# Absolute skewness, so left- and right-skewed features are treated alike.
skew_df["ABS_Skew"] = skew_df["Skew"].abs()
# Flag features whose absolute skewness exceeds 0.5.
skew_df["Skewed"] = skew_df["ABS_Skew"] > 0.5
skew_df


Unnamed: 0,Feature,Skew,ABS_Skew,Skewed
0,longitude,-0.29778,0.29778,False
1,latitude,0.465919,0.465919,False
2,housing_median_age,0.060326,0.060326,False
3,total_rooms,4.147042,4.147042,True
4,total_bedrooms,3.480888,3.480888,True
5,population,4.9355,4.9355,True
6,households,3.41019,3.41019,True
7,median_income,1.646537,1.646537,True
8,bed_per_room,6.316445,6.316445,True


In [43]:
# Names of the features whose absolute skewness is above 0.5.
is_skewed = skew_df["ABS_Skew"] > 0.5
skew_col = skew_df.loc[is_skewed, "Feature"].to_numpy()
skew_col

array(['total_rooms', 'total_bedrooms', 'population', 'households',
       'median_income', 'bed_per_room'], dtype=object)

In [44]:
# Replace every skewed feature with its natural log in a single
# column-wise operation.
skewed_columns = list(skew_col)
X[skewed_columns] = np.log(X[skewed_columns])

In [45]:
# Display the feature frame to inspect the log-transformed columns.
X

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity,bed_per_room
0,-122.23,37.88,41.0,6.779922,4.859812,5.774552,4.836282,2.119287,NEAR BAY,-1.920110
1,-122.22,37.86,21.0,8.867709,7.008505,7.783641,7.037028,2.116424,NEAR BAY,-1.859204
2,-122.24,37.85,52.0,7.290975,5.247024,6.206576,5.176150,1.982022,NEAR BAY,-2.043951
3,-122.25,37.85,52.0,7.149917,5.459586,6.324359,5.389072,1.730434,NEAR BAY,-1.690331
4,-122.25,37.85,52.0,7.394493,5.634790,6.336826,5.556828,1.347086,NEAR BAY,-1.759704
...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,7.417580,5.924256,6.739337,5.799093,0.444878,INLAND,-1.493325
20636,-121.21,39.49,18.0,6.546785,5.010635,5.874931,4.736198,0.938756,INLAND,-1.536150
20637,-121.22,39.43,17.0,7.720462,6.184149,6.914731,6.070738,0.530628,INLAND,-1.536313
20638,-121.32,39.43,18.0,7.528332,6.013715,6.608001,5.855072,0.624440,INLAND,-1.514617


## 문자열을 숫자로 변경

In [None]:
# Replace the string categories in 'ocean_proximity' with integer codes.
# NOTE(review): integer codes imply an ordering between categories that
# LinearRegression and KNN will treat as numeric distance. A one-hot encoding
# may suit those models better — confirm the intent.
encoder = LabelEncoder()
X['ocean_proximity'] = encoder.fit_transform(X['ocean_proximity'])

## 데이터 분리

In [25]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## 회귀 모델 모음

In [None]:
# Baseline: ordinary least-squares linear regression.
lr = LinearRegression().fit(X_train, y_train)
predict_lr = lr.predict(X_test)

# Score on the held-out rows (same units as y_test).
r2 = r2_score(y_test, predict_lr)
rmse = np.sqrt(mean_squared_error(y_test, predict_lr))
print(rmse, r2)  # an R^2 closer to 1 is better

In [None]:
# k-nearest-neighbours regressor with default settings.
knn = KNeighborsRegressor().fit(X_train, y_train)
predict_knn = knn.predict(X_test)

# Score on the held-out rows (same units as y_test).
r2 = r2_score(y_test, predict_knn)
rmse = np.sqrt(mean_squared_error(y_test, predict_knn))
print(rmse, r2)


In [None]:
# Random-forest regressor.
# RandomForestRegressor must be imported from sklearn.ensemble; the notebook's
# import cell originally pulled in only RandomForestClassifier, so this cell
# raised NameError.
# The seed matches the train/test split so repeated runs give the same forest
# and the same scores.
rf = RandomForestRegressor(random_state=42)
rf.fit(X_train, y_train)
predict_rf = rf.predict(X_test)

# Score on the held-out rows (same units as y_test).
rmse = np.sqrt(mean_squared_error(y_test, predict_rf))
r2 = r2_score(y_test, predict_rf)
print(rmse, r2)

In [None]:
# XGBoost regressor with default hyper-parameters.
xg = XGBRegressor()
xg.fit(X_train, y_train)
predict_xg = xg.predict(X_test)

# Score on the held-out rows (same units as y_test).
r2 = r2_score(y_test, predict_xg)
rmse = np.sqrt(mean_squared_error(y_test, predict_xg))
print(rmse, r2)

In [None]:
# Gradient-boosting regressor with default hyper-parameters.
gb = GradientBoostingRegressor()
# Fit gb itself: the cell previously re-fitted rf here, leaving gb unfitted
# so that gb.predict() failed.
gb.fit(X_train, y_train)
predict_gb = gb.predict(X_test)

# Score on the held-out rows (same units as y_test).
rmse = np.sqrt(mean_squared_error(y_test, predict_gb))
r2 = r2_score(y_test, predict_gb)
print(rmse, r2)

In [None]:
# The blend below needs CatBoost and LightGBM predictions, but no earlier cell
# fits those models, so predict_cat / predict_lgb were undefined. Fit them here
# so the cell runs on its own after the train/test split.
cat = CatBoostRegressor(verbose=0)  # verbose=0 silences per-iteration logging
cat.fit(X_train, y_train)
predict_cat = cat.predict(X_test)

lgb = LGBMRegressor()
lgb.fit(X_train, y_train)
predict_lgb = lgb.predict(X_test)

# Hand-made ensemble: equal-weight average of the four tree-based models.
# The weights sum to 1, so the blend stays on the same scale as each model.
f_predict = (
    0.25 * predict_cat +
    0.25 * predict_lgb +
    0.25 * predict_rf +
    0.25 * predict_xg
)

In [None]:
# Score the blended prediction on the held-out rows.
r2 = r2_score(y_test, f_predict)
rmse = np.sqrt(mean_squared_error(y_test, f_predict))
print(rmse, r2)