<a href="https://colab.research.google.com/github/chanlenium/Android-Mobile-App/blob/main/DataAnalytics/MultipleLinearRegression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

< 다중 선형회귀분석(Regression) 알고리즘 >
- 특정 변수가 다른 변수에 어떤 영향을 미치는지를 수학적 모형으로 설명, 예측하는 기법
- 실제값과 회귀모델이 예측한 값의 차이를 잔차(오차)라고 하며, 최소제곱법을 이용하여 잔차 제곱의 합(RSS, Residual Sum of Squares)이 최소가 되는 모델을 만듦
- 하나의 독립변수가 아닌 여러 개의 독립변수를 사용한 회귀분석

(분석 목표 예) 캘리포니아 인구·가구 통계 데이터셋에서 주택중위가치(median_house_value)에 영향을 주는 변수를 찾아보고, 다중선형회귀모델을 생성 및 평가함
- 종속변수 : median_house_value

In [2]:
# 분석에 필요한 패키지 임포트
import numpy as np
import pandas as pd
import sklearn
import matplotlib.pyplot as plt

# 선형 회귀모델을 위한 패키지
from sklearn.linear_model import LinearRegression

# 학습/테스트 데이터셋 분리를 위한 패키지
from sklearn.model_selection import train_test_split

In [3]:
# Load the housing dataset from the CSV file hosted on GitHub
df = pd.read_csv("https://raw.githubusercontent.com/ageron/handson-ml/master/datasets/housing/housing.csv")
# Last expression of the cell: display the loaded DataFrame
df

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY
...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,INLAND
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,INLAND
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0,INLAND
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,INLAND


In [4]:
# Use DataFrame.info() to check whether the dataset contains missing values
# (a column whose non-null count is below the row count has gaps)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [5]:
# Show descriptive statistics for the DataFrame
df.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,20640.0,20640.0,20640.0,20640.0,20433.0,20640.0,20640.0,20640.0,20640.0
mean,-119.569704,35.631861,28.639486,2635.763081,537.870553,1425.476744,499.53968,3.870671,206855.816909
std,2.003532,2.135952,12.585558,2181.615252,421.38507,1132.462122,382.329753,1.899822,115395.615874
min,-124.35,32.54,1.0,2.0,1.0,3.0,1.0,0.4999,14999.0
25%,-121.8,33.93,18.0,1447.75,296.0,787.0,280.0,2.5634,119600.0
50%,-118.49,34.26,29.0,2127.0,435.0,1166.0,409.0,3.5348,179700.0
75%,-118.01,37.71,37.0,3148.0,647.0,1725.0,605.0,4.74325,264725.0
max,-114.31,41.95,52.0,39320.0,6445.0,35682.0,6082.0,15.0001,500001.0


In [7]:
# Data preprocessing
# - total_bedrooms has missing values, so every row containing a missing
#   value is removed
# - "ocean_proximity" is categorical, so it is excluded from the analysis

# Drop incomplete rows first, then drop the "ocean_proximity" column
df = df.dropna(axis="index").drop(columns=["ocean_proximity"])
# Last expression of the cell: display the cleaned DataFrame
df

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0
...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0


In [10]:
# Correlation analysis: pairwise Pearson correlation between the columns
corr = df.corr(method = "pearson")
# Last expression of the cell: display the correlation matrix
corr

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
longitude,1.0,-0.924616,-0.109357,0.04548,0.069608,0.10027,0.056513,-0.01555,-0.045398
latitude,-0.924616,1.0,0.011899,-0.036667,-0.066983,-0.108997,-0.071774,-0.079626,-0.144638
housing_median_age,-0.109357,0.011899,1.0,-0.360628,-0.320451,-0.295787,-0.302768,-0.118278,0.106432
total_rooms,0.04548,-0.036667,-0.360628,1.0,0.93038,0.857281,0.918992,0.197882,0.133294
total_bedrooms,0.069608,-0.066983,-0.320451,0.93038,1.0,0.877747,0.979728,-0.007723,0.049686
population,0.10027,-0.108997,-0.295787,0.857281,0.877747,1.0,0.907186,0.005087,-0.0253
households,0.056513,-0.071774,-0.302768,0.918992,0.979728,0.907186,1.0,0.013434,0.064894
median_income,-0.01555,-0.079626,-0.118278,0.197882,-0.007723,0.005087,0.013434,1.0,0.688355
median_house_value,-0.045398,-0.144638,0.106432,0.133294,0.049686,-0.0253,0.064894,0.688355,1.0


In [11]:
# Prepare the modelling dataset
# Target (dependent variable): median_house_value
y = df["median_house_value"]
# Features (independent variables): every column except the target
X = df.drop(columns=["median_house_value"])

In [12]:
# Split the dataset into training and test sets at a 7:3 ratio
# (random_state is fixed so the split is the same on every run)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state = 11)

In [13]:
# Print the shape of each split, one per line, in this order:
# training features, test features, training target, test target
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape, sep="\n")

(14303, 8)
(6130, 8)
(14303,)
(6130,)


In [14]:
# Run the analysis
# Create the LinearRegression estimator
lr = LinearRegression()
# Fit the model on the training split
lr.fit(X_train, y_train)

In [15]:
# Print the fitted regression coefficients (lr.coef_) and the
# intercept (lr.intercept_) of the regression equation
print("기울기 a: ", lr.coef_)
print("y절편 b: ", lr.intercept_)

기울기 a:  [-4.32286140e+04 -4.31271391e+04  1.10248793e+03 -8.21710211e+00
  1.10718936e+02 -4.28091235e+01  6.30396069e+01  4.02477126e+04]
y절편 b:  -3620819.2191371867


In [16]:
# Predict on the test set with the fitted lr model
pred = lr.predict(X_test)
# Last expression of the cell: display the predicted values
pred

array([ 83510.70774273, 381090.1670114 ,  93263.48250749, ...,
       155189.62954657, 288858.19332176, 131820.83173612])

In [17]:
# Model performance on the test set
# Linear regression is evaluated with the coefficient of determination (R^2);
# the larger the value, the better the model's predictive ability.
# r2_score() receives the true target values of the test set (y_test) as
# y_true and the values predicted by the model (pred) as y_pred.
from sklearn.metrics import r2_score

score = r2_score(y_true=y_test, y_pred=pred)
print(score)

0.6193365273168104


In [None]:
# Model performance on the training set
# Linear regression is evaluated with the coefficient of determination (R^2);
# the larger the value, the better the model's predictive ability.
# The same metric is computed on the training data so it can be compared
# with the test-set score.
#
# Dedicated names (pred_train / score_train) are used on purpose: assigning
# to `pred` / `score` here would overwrite the test-set results, and
# re-running the test-set scoring cell afterwards would pair y_test with
# training predictions of a different length.
from sklearn.metrics import r2_score

# Predict on the data the model was fitted on
pred_train = lr.predict(X_train)

# Evaluate the model on the training set
score_train = r2_score(y_train, pred_train)
print(score_train)

0.7122320393751094
