## 회귀 문제: 보스턴 주택 가격 예측

### 1. 데이터프레임 생성

In [18]:
# [+] SparkSession 설정
import findspark
findspark.init()

from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local").appName("housing-prices-regression").getOrCreate()

In [19]:
# [+] 주택 가격 데이터에 대한 DataFrame 생성
path = './data/'
file = 'boston.csv'

house_df = spark.read.csv(path + file, inferSchema=True, header=True)

In [20]:
# [+] 데이터프레임 출력
house_df.show()

+-------+----+-----+----+-----+-----+-----+------+---+---+-------+------+-----+----+---------+
|   CRIM|  ZN|INDUS|CHAS|  NOX|   RM|  AGE|   DIS|RAD|TAX|PTRATIO|     B|LSTAT|MEDV|CAT. MEDV|
+-------+----+-----+----+-----+-----+-----+------+---+---+-------+------+-----+----+---------+
|0.00632|18.0| 2.31|   0|0.538|6.575| 65.2|  4.09|  1|296|   15.3| 396.9| 4.98|24.0|        0|
|0.02731| 0.0| 7.07|   0|0.469|6.421| 78.9|4.9671|  2|242|   17.8| 396.9| 9.14|21.6|        0|
|0.02729| 0.0| 7.07|   0|0.469|7.185| 61.1|4.9671|  2|242|   17.8|392.83| 4.03|34.7|        1|
|0.03237| 0.0| 2.18|   0|0.458|6.998| 45.8|6.0622|  3|222|   18.7|394.63| 2.94|33.4|        1|
|0.06905| 0.0| 2.18|   0|0.458|7.147| 54.2|6.0622|  3|222|   18.7| 396.9| 5.33|36.2|        1|
|0.02985| 0.0| 2.18|   0|0.458| 6.43| 58.7|6.0622|  3|222|   18.7|394.12| 5.21|28.7|        0|
|0.08829|12.5| 7.87|   0|0.524|6.012| 66.6|5.5605|  5|311|   15.2| 395.6|12.43|22.9|        0|
|0.14455|12.5| 7.87|   0|0.524|6.172| 96.1|5.9505|

In [21]:
# [+] 데이터프레임 스키마 출력
house_df.printSchema()

root
 |-- CRIM: double (nullable = true)
 |-- ZN: double (nullable = true)
 |-- INDUS: double (nullable = true)
 |-- CHAS: integer (nullable = true)
 |-- NOX: double (nullable = true)
 |-- RM: double (nullable = true)
 |-- AGE: double (nullable = true)
 |-- DIS: double (nullable = true)
 |-- RAD: integer (nullable = true)
 |-- TAX: integer (nullable = true)
 |-- PTRATIO: double (nullable = true)
 |-- B: double (nullable = true)
 |-- LSTAT: double (nullable = true)
 |-- MEDV: double (nullable = true)
 |-- CAT. MEDV: integer (nullable = true)



In [23]:
# 캐싱하기: DataFrame 을 메모리에 저장
house_df.cache()

DataFrame[CRIM: double, ZN: double, INDUS: double, CHAS: int, NOX: double, RM: double, AGE: double, DIS: double, RAD: int, TAX: int, PTRATIO: double, B: double, LSTAT: double, MEDV: double, CAT. MEDV: int]

In [22]:
# [+] 기본 통계 출력: 보기 불편하다..
house_df.describe().show()

+-------+------------------+------------------+------------------+------------------+-------------------+------------------+------------------+-----------------+-----------------+------------------+------------------+------------------+------------------+------------------+-------------------+
|summary|              CRIM|                ZN|             INDUS|              CHAS|                NOX|                RM|               AGE|              DIS|              RAD|               TAX|           PTRATIO|                 B|             LSTAT|              MEDV|          CAT. MEDV|
+-------+------------------+------------------+------------------+------------------+-------------------+------------------+------------------+-----------------+-----------------+------------------+------------------+------------------+------------------+------------------+-------------------+
|  count|               506|               506|               506|               506|                506|          

In [None]:
# pandas DataFrame 의 형태로 변환하여 출력
house_df.describe().toPandas().transpose()        # 행과 열 바꿈. 전치행렬

Unnamed: 0,0,1,2,3,4
summary,count,mean,stddev,min,max
CRIM,506,3.6135235573122535,8.601545105332491,0.00632,88.9762
ZN,506,11.363636363636363,23.32245299451514,0.0,100.0
INDUS,506,11.136778656126504,6.860352940897589,0.46,27.74
CHAS,506,0.0691699604743083,0.2539940413404101,0,1
NOX,506,0.5546950592885372,0.11587767566755584,0.385,0.871
RM,506,6.284634387351787,0.7026171434153232,3.561,8.78
AGE,506,68.57490118577078,28.148861406903595,2.9,100.0
DIS,506,3.795042687747034,2.10571012662761,1.1296,12.1265
RAD,506,9.549407114624506,8.707259384239366,1,24


### 2. 탐색적 데이터 분석(EDA: Exploratory Data Analysis)

In [24]:
# [+] EDA 를 위해 pandas DataFrame 으로 변환
house_pdf = house_df.toPandas()

In [25]:
house_pdf.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV,CAT. MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0,0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6,0
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7,1
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4,1
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2,1


In [28]:
# seaborn 시각화 라이브러리 임포트 및 설정
import seaborn as sns
sns.set()  # set plot 스타일 적용
sns.set_style("whitegrid")

In [29]:
# [+] 변수간 상관계수 출력
house_pdf.corr()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV,CAT. MEDV
CRIM,1.0,-0.200469,0.406583,-0.055892,0.420972,-0.219247,0.352734,-0.37967,0.625505,0.582764,0.289946,-0.385064,0.455621,-0.388305,-0.151987
ZN,-0.200469,1.0,-0.533828,-0.042697,-0.516604,0.311991,-0.569537,0.664408,-0.311948,-0.314563,-0.391679,0.17552,-0.412995,0.360445,0.365296
INDUS,0.406583,-0.533828,1.0,0.062938,0.763651,-0.391676,0.644779,-0.708027,0.595129,0.72076,0.383248,-0.356977,0.6038,-0.483725,-0.366276
CHAS,-0.055892,-0.042697,0.062938,1.0,0.091203,0.091251,0.086518,-0.099176,-0.007368,-0.035587,-0.121515,0.048788,-0.053929,0.17526,0.108631
NOX,0.420972,-0.516604,0.763651,0.091203,1.0,-0.302188,0.73147,-0.76923,0.611441,0.668023,0.188933,-0.380051,0.590879,-0.427321,-0.232502
RM,-0.219247,0.311991,-0.391676,0.091251,-0.302188,1.0,-0.240265,0.205246,-0.209847,-0.292048,-0.355501,0.128069,-0.613808,0.69536,0.641265
AGE,0.352734,-0.569537,0.644779,0.086518,0.73147,-0.240265,1.0,-0.747881,0.456022,0.506456,0.261515,-0.273534,0.602339,-0.376955,-0.191196
DIS,-0.37967,0.664408,-0.708027,-0.099176,-0.76923,0.205246,-0.747881,1.0,-0.494588,-0.534432,-0.232471,0.291512,-0.496996,0.249929,0.118887
RAD,0.625505,-0.311948,0.595129,-0.007368,0.611441,-0.209847,0.456022,-0.494588,1.0,0.910228,0.464741,-0.444413,0.488676,-0.381626,-0.197924
TAX,0.582764,-0.314563,0.72076,-0.035587,0.668023,-0.292048,0.506456,-0.534432,0.910228,1.0,0.460853,-0.441808,0.543993,-0.468536,-0.273687


In [30]:
# [+] 목표 변수와의 상관계수 출력
house_pdf.corr(method = 'person')['MEDV']

ValueError: method must be either 'pearson', 'spearman', 'kendall', or a callable, 'person' was supplied

In [27]:
# VectorAssember: DataFrame의 컬럼들을 특징 벡터(feature vector)로 변환
from pyspark.ml.feature import VectorAssembler

In [None]:
# 특징 벡터화 하고자 하는 변수들을 선택
vectorAssembler = VectorAssembler(inputCols=[
    'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM',
    'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT'],
    outputCol = 'features')


In [None]:
# [+] 특징 벡터화된 새로운 DataFrame으로 변환
vhouse_df = vectorAssembler.transform(house_df)

In [None]:
vhouse_df.toPandas()

In [None]:
# [+] 데이터프레임에서 'features'와 'MEDV' 컬럼 선택
vhouse_df = vhouse_df.select(['features', 'MEDV'])

In [None]:
vhouse_df.show()

In [None]:
# [+] 훈련/테스트 데이터셋을 7대3 비율로 구성
train_df, test_df = vhouse_df.randomSplit([0.7, 0.3])

### 3. 선형회귀 모델 학습

In [None]:
# 선형회귀 학습 알고리즘 임포트
from pyspark.ml.regression import LinearRegression

#### 선형회귀 모델 매개변수 설정
+ ```featureCol```: 특징 컬럼
+ ```labelCol```: 레이블 컬럼
+ ```maxIter```: 최대 학습 횟수
+ ```regParam```: 정규화 매개변수
+ ```elasticNetParam```: 정규화 매개변수2

In [None]:
# 선형회귀 학습 알고리즘 객체 생성 및 매개변수 설정
lr = LinearRegression(
    featuresCol = 'features', 
    labelCol='MEDV', 
    maxIter=10, 
    regParam=0.3, 
    elasticNetParam=0.8)

In [None]:
# [+] 모델 학습
model = lr.fit(train_df)

In [None]:
# 회귀계수(가중치)와 intercept(bias) 값 출력
print("Coefficients: " + str(model.coefficients))
print("Intercept: " + str(model.intercept))

In [None]:
# 모델 학습결과 출력
trainingSummary = model.summary
print("RMSE: %f" % trainingSummary.rootMeanSquaredError)
print("r2: %f" % trainingSummary.r2)

In [None]:
train_df.describe().show()

In [None]:
# train 데이터에 대한 모델 예측값 생성 (모델의 학습 성능을 평가하기 위함)
predictions = model.transform(train_df)

In [None]:
y_train = predictions.toPandas()['MEDV']
y_hat_train = predictions.toPandas()['prediction']

In [None]:
# 산점도 시각화
from matplotlib import pyplot as plt

# scatter plot
fig, ax = plt.subplots()
ax.scatter(y_train, y_hat_train, 10)   # 10: marker size
ax.plot([y_train.min(), y_train.max()], [y_train.min(), y_train.max()], '--', lw=1, color='black')
ax.set_xlabel('Observations')
ax.set_ylabel('Predictions')
plt.show()

In [None]:
# [+] test 데이터에 대한 모델 예측값 생성
predictions = model.transform(test_df)

In [None]:
predictions.select("prediction","MEDV","features").show()

In [None]:
# 모델 성능 측정(결정계수)을 위한 evaluator 생성
from pyspark.ml.evaluation import RegressionEvaluator

r2_evaluator = RegressionEvaluator(predictionCol="prediction", \
                 labelCol="MEDV", metricName="r2")

In [None]:
# [+] 결정계수 측정 및 출력
r2 = r2.evaluator.evaluate(predictions)
print("R Squared (R2) on test data = %g" % r2)

In [None]:
# 모델 성능 측정(RMSE)을 위한 evaluator 생성
from pyspark.ml.evaluation import RegressionEvaluator

rmse_evaluator = RegressionEvaluator(predictionCol="prediction", \
                 labelCol="MEDV", metricName="rmse")

In [None]:
# RMSE 측정 및 출력

rmse = rmse_evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

In [None]:
# 산점도 시각화
y_train = predictions.toPandas()['MEDV']
y_hat_train = predictions.toPandas()['prediction']

fig, ax = plt.subplots()
ax.scatter(y_train, y_hat_train, 10)   # 10: marker size
ax.plot([y_train.min(), y_train.max()], [y_train.min(), y_train.max()], '--', lw=1, color='black')
ax.set_xlabel('Observations')
ax.set_ylabel('Predictions')
plt.show()