In [158]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

데이터 준비

In [159]:
houseDF = pd.read_csv('../data/cali_housing.csv')
houseDF

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY
...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,INLAND
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,INLAND
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0,INLAND
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,INLAND


## 결측치 처리

In [160]:
houseDF.shape

(20640, 10)

In [161]:
houseDF.isnull().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64

In [162]:
houseDF.dropna(inplace=True)

In [163]:
houseDF.shape

(20433, 10)

### 컬럼 뜻

1. 경도(longitude): 집이 얼마나 서쪽에 있는지를 나타내는 척도; 높은 값은 더 서쪽에 있음
2. 위도: 집이 얼마나 북쪽에 있는지를 나타내는 척도; 높은 값은 더 북쪽에 있음
3. housingMedianAge: 블록 내 주택의 평균 연령; 낮은 숫자는 새 건물임
4. 총 객실: 한 블록 내 총 객실 수
5. 총 침실: 한 블록 내 총 침실 수
6. 인구: 한 블록 내 거주하는 총 인원 수
7. 가구수 : 한 블록에 한 가구 단위로 거주하는 사람들의 모임인 총 가구수
8. 중위소득: 주택 한 블록 내 가구의 중위소득(수만 미국 달러 단위로 측정)
9. medianHouseValue: 블록 내 가구에 대한 medianhouseValue(미국 달러로 측정됨)
10. 해양근접 : 바다/바다와 함께 집의 위치

In [164]:
target = houseDF[['median_house_value']]

In [165]:
corrDF = houseDF.iloc[:, 0:-1]
corrDF.corr()['median_house_value'].sort_values()
# 거 다 너무 수치가 낮은거 아니요

latitude             -0.144638
longitude            -0.045398
population           -0.025300
total_bedrooms        0.049686
households            0.064894
housing_median_age    0.106432
total_rooms           0.133294
median_income         0.688355
median_house_value    1.000000
Name: median_house_value, dtype: float64

In [166]:
houseDF['ocean_proximity'].value_counts()

ocean_proximity
<1H OCEAN     9034
INLAND        6496
NEAR OCEAN    2628
NEAR BAY      2270
ISLAND           5
Name: count, dtype: int64

In [167]:
ocean_label = list(houseDF['ocean_proximity'].unique())

<1H OCEAN   : 해변 1시간 거리
INLAND      : 내륙
NEAR OCEAN  : 해변 근처
NEAR BAY    : 만 근처
ISLAND      : 섬

In [168]:
# 범주형 데이터 원핫 인코딩...

from sklearn.preprocessing import LabelEncoder

In [169]:
lencoder = LabelEncoder()
lencoder.fit(ocean_label)

In [170]:
lencoder.transform(ocean_label)

array([3, 0, 1, 4, 2])

In [171]:
from sklearn.preprocessing import OneHotEncoder
ohEncoder = OneHotEncoder()

In [172]:
arr_ocean = np.array(ocean_label).reshape(-1,1)
print(arr_ocean.shape)

ohEncoder.fit(arr_ocean)

(5, 1)


In [173]:
onehot = ohEncoder.transform(arr_ocean)
onehot.toarray()

array([[0., 0., 0., 1., 0.],
       [1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 1.],
       [0., 0., 1., 0., 0.]])

##### 저걸 ocean_proximity 대신 DF에 합쳐서 넣고 선형회귀분석

In [174]:
house_DF = pd.get_dummies(houseDF, columns=['ocean_proximity'])
house_DF

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,False,False,False,True,False
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,False,False,False,True,False
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,False,False,False,True,False
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,False,False,False,True,False
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,False,True,False,False,False
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,False,True,False,False,False
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0,False,True,False,False,False
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,False,True,False,False,False


In [175]:
house_DF.iloc[:, [0,1,2,3,4,5,6,7,9,10,11,12,13]]

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,False,False,False,True,False
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,False,False,False,True,False
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,False,False,False,True,False
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,False,False,False,True,False
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,False,True,False,False,False
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,False,True,False,False,False
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,False,True,False,False,False
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,False,True,False,False,False


In [176]:
house_DF[['median_house_value']]

Unnamed: 0,median_house_value
0,452600.0
1,358500.0
2,352100.0
3,341300.0
4,342200.0
...,...
20635,78100.0
20636,77100.0
20637,92300.0
20638,84700.0


In [177]:
target = house_DF[['median_house_value']]
feature = house_DF.iloc[:, [0,1,2,3,4,5,6,7,9,10,11,12,13]]

In [178]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()

In [179]:
model.fit(feature,target)

In [180]:
model.coef_[0]

array([-2.68129893e+04, -2.54821848e+04,  1.07252004e+03, -6.19326372e+00,
        1.00556290e+02, -3.79690829e+01,  4.96173261e+01,  3.92595729e+04,
       -2.27883447e+04, -6.20726449e+04,  1.30113596e+05, -2.67423963e+04,
       -1.85102104e+04])

In [181]:
for i in range(0, len(model.coef_)):
    print(f'기울기 : ',model.coef_[i], '절편 : ',model.intercept_[i])

기울기 :  [-2.68129893e+04 -2.54821848e+04  1.07252004e+03 -6.19326372e+00
  1.00556290e+02 -3.79690829e+01  4.96173261e+01  3.92595729e+04
 -2.27883447e+04 -6.20726449e+04  1.30113596e+05 -2.67423963e+04
 -1.85102104e+04] 절편 :  -2247165.7714087036


In [182]:
model.score(target, feature)
# 예 ?

ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- median_house_value
Feature names seen at fit time, yet now missing:
- households
- housing_median_age
- latitude
- longitude
- median_income
- ...


In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [None]:
feature

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,False,False,False,True,False
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,False,False,False,True,False
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,False,False,False,True,False
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,False,False,False,True,False
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,False,True,False,False,False
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,False,True,False,False,False
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,False,True,False,False,False
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,False,True,False,False,False


In [None]:
# # y = ax + b
# pre_fea = model.coef_[0] * feature+ model.intercept_

# pre_jumsu = pre_fea.values.reshape(-1)    # 1차원화

In [None]:
# mse = mean_squared_error(target, pre_fea)
# rmse = mean_squared_error(target, pre_fea, squared=False)
# mae = mean_absolute_error(target, pre_fea)
# r2 = r2_score(target, pre_fea)

# # 손실/비용함수 값은 0에 가까울 수록
# ## 결정계수 값은 1에 가까울 수록 성능좋은 모델
# print(f'mse : {mse}')
# print(f'rmse : {rmse}')
# print(f'mae : {mae}')
# print(f'r2 : {r2}')

In [None]:
from sklearn.model_selection import train_test_split

Feature shape: (20433, 13)
Target shape: (20433, 1)


In [None]:
X_train, X_test, y_train, y_test = train_test_split(feature, target, test_size=0.2, random_state=42)

model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [184]:
print(f" {X_train.shape}")
print(f" {y_pred.shape}")

 (16346, 13)
 (4087, 1)


In [185]:
# 성능지표 => 오차계산과 결정 계수 계산
pre_fea = model.predict(feature)

# 손실/비용 계산 함수 ==> 정답과 예측값
mse = mean_squared_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred, squared=False)
mae = mean_absolute_error(y_test, y_pred)

# 얼마나 정답에 가깝게 값을 예측했느냐를 나타내는 지표, => 정답과 예측값 제공 
r2 = r2_score(y_test, y_pred)

In [186]:
print(f'mse : {mse}')
print(f'rmse : {rmse}')
print(f'mae : {mae}')
print(f'r2 : {r2}')

mse : 4802173538.604161
rmse : 69297.71669113032
mae : 50413.433308100364
r2 : 0.6488402154431991


뭔가 잘못한것같은데 아주 많이 잘못한것같은데 ??????????