In [1]:
from sklearn.datasets import fetch_california_housing
import pandas as pd

In [2]:
# Fetch the California housing data as a (features, target) tuple.
# NOTE(review): both X and y are reassigned in the next cell (DataFrame / Series),
# so this call looks redundant — confirm nothing relies on it before removing.
X, y = fetch_california_housing(return_X_y=True)

In [3]:
# Load the dataset and wrap it in labelled pandas objects:
# X holds one named column per feature, y holds the regression target.
california = fetch_california_housing()
X = pd.DataFrame(
    data=california.data,
    columns=california.feature_names,
)
y = pd.Series(
    data=california.target,
    name='Target',
)

In [4]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split identical across re-runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=40,
)

In [5]:
# Display the training features as the cell output (pandas abbreviates the
# middle rows of a long frame with "...").
X_train

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
9226,2.6941,11.0,5.599379,1.085404,2074.0,3.220497,36.93,-120.04
11826,4.8051,21.0,5.857143,1.053391,1874.0,2.704185,39.00,-121.04
4553,1.1979,33.0,2.020725,1.031088,435.0,2.253886,34.06,-118.24
10787,3.1305,25.0,4.310078,1.124031,985.0,1.908915,33.64,-117.92
4104,4.3333,39.0,5.791367,1.000000,511.0,1.838129,34.13,-118.35
...,...,...,...,...,...,...,...,...
11532,1.7675,26.0,3.847093,1.082556,1829.0,1.312994,33.77,-118.09
16065,4.4375,49.0,5.439506,1.004938,1052.0,2.597531,37.75,-122.48
14501,3.0450,16.0,2.708804,1.056433,648.0,1.462754,32.86,-117.23
14555,6.7120,15.0,7.844291,1.010381,1180.0,4.083045,32.96,-117.13


In [6]:
# Simple linear regression uses a single predictor: median income (MedInc).
# A list selector keeps the result two-dimensional (a one-column DataFrame).
X_train_simple = X_train.loc[:, ['MedInc']]
X_test_simple = X_test.loc[:, ['MedInc']]

In [7]:
from sklearn.linear_model import LinearRegression

# Fit on the MedInc-only training features, then predict for the held-out rows.
# fit() returns the estimator itself, so construction and fitting can be chained.
model_simple = LinearRegression().fit(X_train_simple, y_train)
y_pred_simple = model_simple.predict(X_test_simple)

# 평가 지표

In [8]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Score the single-feature model on the test split.
# MSE is computed once and reused for RMSE.
mse_simple = mean_squared_error(y_test, y_pred_simple)

print("단순 선형 회귀 중간 소득만 사용")
print("MAE: ", mean_absolute_error(y_test, y_pred_simple))
print("MSE: ", mse_simple)
# RMSE is on the same scale as the target, so it reflects real-world error
# most directly and is the most commonly reported metric.
print("RMSE: ", np.sqrt(mse_simple))
print("R2: ", r2_score(y_test, y_pred_simple))

단순 선형 회귀 중간 소득만 사용
MAE:  0.6371452810114605
MSE:  0.738424317887508
RMSE:  0.8593161920314942
R2:  0.46511862831417505


# 다중선형회귀

In [9]:
# Multiple linear regression: fit on every feature column of the training split,
# then predict for the held-out rows. fit() returns the estimator itself.
model_multiple = LinearRegression().fit(X_train, y_train)
y_pred_multiple = model_multiple.predict(X_test)

In [10]:
# Score the all-features model on the test split.
# MSE is computed once and reused for RMSE.
mse_multiple = mean_squared_error(y_test, y_pred_multiple)

print("다중 선형 회귀 평가")
print("MAE: ", mean_absolute_error(y_test, y_pred_multiple))
print("MSE: ", mse_multiple)
# RMSE is on the same scale as the target, so it reflects real-world error
# most directly and is the most commonly reported metric.
print("RMSE: ", np.sqrt(mse_multiple))
print("R2: ", r2_score(y_test, y_pred_multiple))

다중 선형 회귀 평가
MAE:  0.5389572480554763
MSE:  0.5417517275769406
RMSE:  0.7360378574346164
R2:  0.6075794091011186


# 다항 회귀 모델 - 중간 소득만 가져와서 사용, 다항 차수: 2

In [11]:
from sklearn.preprocessing import PolynomialFeatures

# Expand the single MedInc column into degree-2 polynomial terms.
# include_bias=False omits the constant column; LinearRegression fits its own
# intercept by default.
poly = PolynomialFeatures(degree=2, include_bias=False)

# Fit the transformer on the training split only...
X_train_poly = poly.fit_transform(X_train[['MedInc']])
# ...and reuse that fitted transformer on the test split. Calling fit_transform
# here would refit on held-out data, which is a leakage pattern.
X_test_poly = poly.transform(X_test[['MedInc']])

In [12]:
# Polynomial regression: an ordinary linear model fitted on the expanded
# polynomial features. fit() returns the estimator itself.
model_poly = LinearRegression().fit(X_train_poly, y_train)
y_pred_poly = model_poly.predict(X_test_poly)

In [13]:
# Score the polynomial model on the test split.
# MSE is computed once and reused for RMSE.
mse_poly = mean_squared_error(y_test, y_pred_poly)

print("다항 회귀 평가")
print("MAE: ", mean_absolute_error(y_test, y_pred_poly))
print("MSE: ", mse_poly)
# RMSE is on the same scale as the target, so it reflects real-world error
# most directly and is the most commonly reported metric.
print("RMSE: ", np.sqrt(mse_poly))
print("R2: ", r2_score(y_test, y_pred_poly))

다항 회귀 평가
MAE:  0.6381703765909709
MSE:  0.7359182368364805
RMSE:  0.8578567694181124
R2:  0.46693392073840645
