#### 1. California Housing 데이터셋 load

In [2]:
import pandas as pd
from sklearn.datasets import fetch_california_housing

# Fetch the California Housing dataset; as_frame=True makes the returned
# bunch expose pandas objects (data.data / data.target) instead of arrays.
data = fetch_california_housing(as_frame=True)

# Build a single frame: the feature columns in their declared order,
# followed by the regression target stored under the name 'target'.
housing_df = data.data[data.feature_names].assign(target=data.target)
housing_df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,target
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


#### 2. HouseAge, AveRooms와 target간의 상관관계 구하기

In [10]:
# Columns of interest: the two candidate features plus the target.
feature_analysis = ['HouseAge', 'AveRooms', 'target']

# Pairwise correlation matrix over just those columns; only the column
# describing each variable's correlation with the target is printed.
correlation_matrix = housing_df.loc[:, feature_analysis].corr()
print(correlation_matrix.loc[:, 'target'])

HouseAge    0.105623
AveRooms    0.151948
target      1.000000
Name: target, dtype: float64


#### 3. HouseAge와 AveRooms를 이용한 다항식 feature 생성
###### 다음 다항식으로 새로운 피처를 생성해 봅시다: $\text{HouseAge} \times \text{AveRooms}^2$. 그리고 새롭게 생성한 피처를 포함하여 target과의 상관관계를 구해 출력합니다.

In [9]:
# Hand-crafted polynomial term: HouseAge * AveRooms * AveRooms.
# The left-to-right multiplication order is kept as-is so the floating-point
# result is unchanged.
house_age = housing_df['HouseAge']
ave_rooms = housing_df['AveRooms']
housing_df['new_HouseAge_AveRooms'] = house_age * ave_rooms * ave_rooms

# Compare the new term against the two raw features it was built from.
features_analysis = ['HouseAge', 'AveRooms', 'new_HouseAge_AveRooms', 'target']

# Correlation of each selected column with the target.
correlation_matrix = housing_df.loc[:, features_analysis].corr()
print(correlation_matrix.loc[:, 'target'])


HouseAge                 0.105623
AveRooms                 0.151948
new_HouseAge_AveRooms    0.034937
target                   1.000000
Name: target, dtype: float64


#### 4. PolynomialFeatures를 사용한 다항 특성 생성

In [18]:
from sklearn.preprocessing import PolynomialFeatures
# Remove the hand-made feature from the previous section so it does not get
# expanded as well. errors='ignore' keeps this cell re-runnable: on a second
# run (or when the previous section was skipped) the column is already
# absent, and a plain drop would raise KeyError.
housing_df = housing_df.drop(columns=['new_HouseAge_AveRooms'], errors='ignore')

# Feature matrix only: the target must not take part in the expansion.
housing_df_x = housing_df.drop(columns=['target'])

# Degree-2 polynomial expansion without the constant (bias) column.
poly = PolynomialFeatures(degree=2, include_bias=False)

# Expand every feature column into its degree-1 and degree-2 terms.
features_poly = poly.fit_transform(housing_df_x)

# Human-readable names for the generated columns, derived from the input
# column names (e.g. 'MedInc^2', 'MedInc HouseAge').
features_names = poly.get_feature_names_out(input_features=housing_df_x.columns)

# Wrap the expanded array in a DataFrame. Reusing the source index keeps the
# rows label-aligned with housing_df, which matters when the target column is
# attached later by index alignment.
housing_poly_df = pd.DataFrame(
    features_poly,
    columns=features_names,
    index=housing_df_x.index,
)

display(features_names)
display(housing_poly_df)

array(['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population',
       'AveOccup', 'Latitude', 'Longitude', 'MedInc^2', 'MedInc HouseAge',
       'MedInc AveRooms', 'MedInc AveBedrms', 'MedInc Population',
       'MedInc AveOccup', 'MedInc Latitude', 'MedInc Longitude',
       'HouseAge^2', 'HouseAge AveRooms', 'HouseAge AveBedrms',
       'HouseAge Population', 'HouseAge AveOccup', 'HouseAge Latitude',
       'HouseAge Longitude', 'AveRooms^2', 'AveRooms AveBedrms',
       'AveRooms Population', 'AveRooms AveOccup', 'AveRooms Latitude',
       'AveRooms Longitude', 'AveBedrms^2', 'AveBedrms Population',
       'AveBedrms AveOccup', 'AveBedrms Latitude', 'AveBedrms Longitude',
       'Population^2', 'Population AveOccup', 'Population Latitude',
       'Population Longitude', 'AveOccup^2', 'AveOccup Latitude',
       'AveOccup Longitude', 'Latitude^2', 'Latitude Longitude',
       'Longitude^2'], dtype=object)

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedInc^2,MedInc HouseAge,...,Population^2,Population AveOccup,Population Latitude,Population Longitude,AveOccup^2,AveOccup Latitude,AveOccup Longitude,Latitude^2,Latitude Longitude,Longitude^2
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23,69.308955,341.3332,...,103684.0,822.888889,12197.36,-39358.06,6.530864,96.804444,-312.365556,1434.8944,-4630.0724,14940.1729
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22,68.913242,174.3294,...,5764801.0,5065.730228,90901.86,-293450.22,4.451433,79.878612,-257.864868,1433.3796,-4627.2492,14937.7284
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24,52.669855,377.3848,...,246016.0,1389.920904,18773.60,-60631.04,7.852660,106.065537,-342.548249,1432.6225,-4626.7840,14942.6176
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,31.844578,293.4412,...,311364.0,1421.753425,21120.30,-68215.50,6.492025,96.439726,-311.486301,1432.6225,-4627.1625,14945.0625
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,14.793254,200.0024,...,319225.0,1232.528958,21385.25,-69071.25,4.758799,82.568533,-266.684363,1432.6225,-4627.1625,14945.0625
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09,2.434536,39.0075,...,714025.0,2163.712121,33360.60,-102321.05,6.556703,101.092727,-310.063788,1558.6704,-4780.6332,14662.7881
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21,6.537226,46.0224,...,126736.0,1111.719298,14058.44,-43150.76,9.751924,123.319649,-378.515439,1559.4601,-4786.5829,14691.8641
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22,2.890000,28.9000,...,1014049.0,2341.914550,39706.01,-122068.54,5.408579,91.699792,-281.913487,1554.7249,-4779.7046,14694.2884
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32,3.486436,33.6096,...,549081.0,1573.297994,29217.63,-89898.12,4.508017,83.718138,-257.587736,1554.7249,-4783.6476,14718.5424


#### 5. 생성된 feature들과 target 간의 상관관계 분석

In [28]:
# Attach the target to the expanded feature frame (aligned on the index).
housing_poly_df['target'] = housing_df['target']

# Full correlation matrix, then rank every column by the magnitude of its
# correlation with the target, strongest first. Taking abs() discards the
# sign, so strongly negative correlations rank alongside positive ones.
correlation_matrix = housing_poly_df.corr()
target_corr_strength = correlation_matrix['target'].abs()
target_corr_strength.sort_values(ascending=False)

target                  1.000000
MedInc Longitude        0.689376
MedInc                  0.688075
MedInc Latitude         0.674713
MedInc^2                0.624514
MedInc HouseAge         0.589142
MedInc AveRooms         0.555438
MedInc AveBedrms        0.510058
MedInc Population       0.276269
HouseAge AveRooms       0.210048
AveRooms Longitude      0.153274
AveRooms                0.151948
Latitude^2              0.147865
Latitude                0.144160
AveRooms Latitude       0.121177
HouseAge^2              0.119955
HouseAge Longitude      0.107602
HouseAge                0.105623
Latitude Longitude      0.104524
HouseAge Latitude       0.088341
AveBedrms Latitude      0.063047
AveRooms Population     0.059930
HouseAge AveBedrms      0.053226
AveBedrms               0.046701
Longitude               0.045967
Longitude^2             0.045776
AveBedrms Longitude     0.044955
AveBedrms Population    0.036423
Population Latitude     0.031860
AveBedrms AveOccup      0.029738
AveRooms^2