# 24. 고급 분석과 머신러닝

## 24.4 MLlib 실제로 사용하기
+ 범주형 레이블 1, 범주형 변수 1, 수치형 변수 2 데이터로 실습

In [1]:
# 세션 생성
from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .appName("Machine Learning examples") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()

spark.conf.set('spark.sql.shuffle.partitions', 5)

In [2]:
df = spark.read.json("../BookSamples/data/simple-ml/")
df.orderBy("value2").show()

+-----+----+------+------------------+
|color| lab|value1|            value2|
+-----+----+------+------------------+
|green|good|     1|14.386294994851129|
|green| bad|    16|14.386294994851129|
| blue| bad|     8|14.386294994851129|
| blue| bad|     8|14.386294994851129|
| blue| bad|    12|14.386294994851129|
|green| bad|    16|14.386294994851129|
|green|good|    12|14.386294994851129|
|  red|good|    35|14.386294994851129|
|  red|good|    35|14.386294994851129|
|  red| bad|     2|14.386294994851129|
|  red| bad|    16|14.386294994851129|
|  red| bad|    16|14.386294994851129|
| blue| bad|     8|14.386294994851129|
|green|good|     1|14.386294994851129|
|green|good|    12|14.386294994851129|
| blue| bad|     8|14.386294994851129|
|  red|good|    35|14.386294994851129|
| blue| bad|    12|14.386294994851129|
|  red| bad|    16|14.386294994851129|
|green|good|    12|14.386294994851129|
+-----+----+------+------------------+
only showing top 20 rows



### 24.4.1 변환자를 사용해서 피처 엔지니러잉 수행
+ 모든 입력변수는 Double, Vector[Double] 타입으로 구성 필요
+ RFormula를 사용하여 쉽게 변환 가능

In [3]:
from pyspark.ml.feature import RFormula

supervised = RFormula(formula="lab ~ . + color:value1 + color:value2")
fittedRF = supervised.fit(df)
preparedDF = fittedRF.transform(df)
preparedDF.show()

+-----+----+------+------------------+--------------------+-----+
|color| lab|value1|            value2|            features|label|
+-----+----+------+------------------+--------------------+-----+
|green|good|     1|14.386294994851129|(10,[1,2,3,5,8],[...|  1.0|
| blue| bad|     8|14.386294994851129|(10,[2,3,6,9],[8....|  0.0|
| blue| bad|    12|14.386294994851129|(10,[2,3,6,9],[12...|  0.0|
|green|good|    15| 38.97187133755819|(10,[1,2,3,5,8],[...|  1.0|
|green|good|    12|14.386294994851129|(10,[1,2,3,5,8],[...|  1.0|
|green| bad|    16|14.386294994851129|(10,[1,2,3,5,8],[...|  0.0|
|  red|good|    35|14.386294994851129|(10,[0,2,3,4,7],[...|  1.0|
|  red| bad|     1| 38.97187133755819|(10,[0,2,3,4,7],[...|  0.0|
|  red| bad|     2|14.386294994851129|(10,[0,2,3,4,7],[...|  0.0|
|  red| bad|    16|14.386294994851129|(10,[0,2,3,4,7],[...|  0.0|
|  red|good|    45| 38.97187133755819|(10,[0,2,3,4,7],[...|  1.0|
|green|good|     1|14.386294994851129|(10,[1,2,3,5,8],[...|  1.0|
| blue| ba

+ 데이터의 임의분할

In [4]:
train, test = preparedDF.randomSplit([0.7, 0.3])

### 24.4.2 추정자

In [5]:
from pyspark.ml.classification import LogisticRegression

lr = LogisticRegression(labelCol='label', featuresCol='features')

In [6]:
# 옵션 검토
print(lr.explainParams())

aggregationDepth: suggested depth for treeAggregate (>= 2). (default: 2)
elasticNetParam: the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty. (default: 0.0)
family: The name of family which is a description of the label distribution to be used in the model. Supported options: auto, binomial, multinomial (default: auto)
featuresCol: features column name. (default: features, current: features)
fitIntercept: whether to fit an intercept term. (default: True)
labelCol: label column name. (default: label, current: label)
lowerBoundsOnCoefficients: The lower bounds on coefficients if fitting under bound constrained optimization. The bound matrix must be compatible with the shape (1, number of features) for binomial regression, or (number of classes, number of features) for multinomial regression. (undefined)
lowerBoundsOnIntercepts: The lower bounds on intercepts if fitting under bound constrained optimization. The

In [7]:
fittedLR = lr.fit(train)

In [8]:
fittedLR.transform(train).select("label", "prediction").show()

+-----+----------+
|label|prediction|
+-----+----------+
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
+-----+----------+
only showing top 20 rows



### 24.4.3 워크플로를 파이프라인으로 만들기

+ 변환자 객체나 모델 객체가 다른 파이프라인에서 재사용되지 않도록 함

In [9]:
from pyspark.ml import Pipeline

train, test = df.randomSplit([0.7, 0.3])
rForm = RFormula()
lr = LogisticRegression().setLabelCol("label").setFeaturesCol("features")

stages = [rForm, lr]
pipeline = Pipeline().setStages(stages)

### 24.4.4 모델 학습 및 평가

+ 다양한 하이퍼파라미터 테스트 사례
    + 두 개 버전의 RFormual
    + 세 개 다른 옵션의 ElasticNet 파라미터
    + 두 개 서로 다른 옵션의 일반화 파라미터

In [10]:
from pyspark.ml.tuning import ParamGridBuilder

params = ParamGridBuilder()\
    .addGrid(rForm.formula, [
        "lab ~ . + color:value1",
        "lab ~ . + color:value1 + color:value2"])\
    .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])\
    .addGrid(lr.regParam, [0.1, 0.2])\
    .build()

In [11]:
# 평가

from pyspark.ml.evaluation import BinaryClassificationEvaluator

evaluator = BinaryClassificationEvaluator()\
    .setMetricName("areaUnderROC")\
    .setRawPredictionCol("prediction")\
    .setLabelCol("label")

In [12]:
# 튜닝

from pyspark.ml.tuning import TrainValidationSplit

tvs = TrainValidationSplit()\
    .setTrainRatio(0.75)\
    .setEstimatorParamMaps(params)\
    .setEstimator(pipeline)\
    .setEvaluator(evaluator)

In [13]:
# 드디어 학습

tvsFitted = tvs.fit(train)

In [14]:
# 최종 예측 평가

tvsFitted.transform(test).show()
evaluator.evaluate(tvsFitted.transform(test))

+-----+----+------+------------------+--------------------+-----+--------------------+--------------------+----------+
|color| lab|value1|            value2|            features|label|       rawPrediction|         probability|prediction|
+-----+----+------+------------------+--------------------+-----+--------------------+--------------------+----------+
| blue| bad|     8|14.386294994851129|(7,[2,3,6],[8.0,1...|  0.0|[1.97638440531048...|[0.87829520961714...|       0.0|
| blue| bad|    12|14.386294994851129|(7,[2,3,6],[12.0,...|  0.0|[2.33092400938994...|[0.91140597436592...|       0.0|
| blue| bad|    12|14.386294994851129|(7,[2,3,6],[12.0,...|  0.0|[2.33092400938994...|[0.91140597436592...|       0.0|
| blue| bad|    12|14.386294994851129|(7,[2,3,6],[12.0,...|  0.0|[2.33092400938994...|[0.91140597436592...|       0.0|
| blue| bad|    12|14.386294994851129|(7,[2,3,6],[12.0,...|  0.0|[2.33092400938994...|[0.91140597436592...|       0.0|
| blue| bad|    12|14.386294994851129|(7,[2,3,6]

0.8421052631578947

### 24.4.5 모델 저장 및 적용

+ 특정 알고리즘에 대한 '모델' 버전을 사용하여 디스크에 저장된 모델을 불러와야 함
    + CrossValidator는 CrossValidatorModel이 저장된 버전을 읽어야 하는 식

In [15]:
tvsFitted.bestModel.write().overwrite().save('/tmp/model')

In [16]:
from pyspark.ml import PipelineModel

bestModel = PipelineModel.load('/tmp/model')

In [17]:
evaluator.evaluate(bestModel.transform(test)) # 결과 동일

0.8421052631578947