### Iris 데이터 세트를 로딩하고 Pandas DataFrame으로 변환한 후 scikit-learn으로 학습 및 예측 수행.

In [5]:
from pyspark.sql import SparkSession

# Create (or reuse) the SparkSession that the Spark cells below rely on.
# NOTE(review): the app name "stock" looks carried over from another notebook — confirm.
spark = (
    SparkSession.builder
    .appName("stock")
    .getOrCreate()
)



In [2]:
from sklearn.datasets import load_iris
import pandas as pd
import numpy as np

# Load the Iris dataset and pull out the feature matrix and the label vector.
iris = load_iris()
iris_data, iris_label = iris.data, iris.target

# Show the container types and shapes, then the dataset's own feature names.
print(type(iris_data), type(iris_label), iris_data.shape, iris_label.shape)
print(iris.feature_names)

<class 'numpy.ndarray'> <class 'numpy.ndarray'> (150, 4) (150,)
['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']


In [3]:


# Convert the iris data set from NumPy arrays into a pandas DataFrame.
iris_columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']

# Build the feature frame and attach the label vector as the 'target' column.
iris_pdf = pd.DataFrame(iris_data, columns=iris_columns).assign(target=iris_label)

# Per-class row counts, followed by a preview of the first rows.
print(iris_pdf['target'].value_counts())
display(iris_pdf.head())

0    50
1    50
2    50
Name: target, dtype: int64


Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples as a test set; random_state=11 keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(iris_data, iris_label, 
                                                    test_size=0.2, random_state=11)
# Create the scikit-learn DecisionTreeClassifier object.
dt_clf = DecisionTreeClassifier(random_state=11, max_depth=5)

# Train. Calling fit() trains the DecisionTreeClassifier object itself.
dt_clf.fit(X_train, y_train)

# Predict on the test data set with the trained DecisionTreeClassifier object.
pred = dt_clf.predict(X_test)
print('테스트 데이터로 예측된 값:', pred)

### Spark DataFrame 생성 후 ML 알고리즘 학습 및 예측 수행. 
* iris_sdf DataFrame을 randomSplit()을 이용하여 train용과 test용 DataFrame으로 분할
* VectorAssembler를 이용하여 모든 feature 컬럼들을 하나의 feature vector로 변환
* Estimator 객체를 생성하고, fit() 메소드를 호출하여 ML Model 반환
* ML Model을 이용하여 테스트 DataFrame에 예측 수행.

In [6]:
# Convert the pandas DataFrame into a Spark DataFrame, then show its type
# and a five-row preview.
iris_sdf = spark.createDataFrame(iris_pdf)
print(type(iris_sdf))
display(iris_sdf.limit(5))

<class 'pyspark.sql.dataframe.DataFrame'>


DataFrame[sepal_length: double, sepal_width: double, petal_length: double, petal_width: double, target: bigint]

In [7]:
# Split the iris_sdf DataFrame into train and test DataFrames with randomSplit()
# (weights 0.8 / 0.2, fixed seed).
train_sdf, test_sdf = iris_sdf.randomSplit([0.8, 0.2], seed=42)
train_sdf.cache()

# Row counts in order: full data set, train split, test split.
print(*(sdf.count() for sdf in (iris_sdf, train_sdf, test_sdf)))

150 116 34


In [None]:
# Render the training split with the notebook's display() helper.
display(train_sdf)

In [9]:
# Use VectorAssembler to merge all feature columns into a single feature vector.
from pyspark.ml.feature import VectorAssembler

iris_columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']

# Arguments: the list of input columns and the single column that will hold the vector.
# Note that the constructor argument is outputCol, not outputCols.
vec_assembler = VectorAssembler(
    inputCols=iris_columns,
    outputCol='features',
)

# Calling transform() on the VectorAssembler combines all feature columns into one
# feature vector. The DataFrame it returns is the input DataFrame with the column
# named by outputCol added.
# transform() must be given a Spark DataFrame; the result is also a Spark DataFrame.
train_feature_vector_df = vec_assembler.transform(train_sdf)

print(type(train_feature_vector_df))
train_feature_vector_df.show()

<class 'pyspark.sql.dataframe.DataFrame'>
+------------+-----------+------------+-----------+------+-----------------+
|sepal_length|sepal_width|petal_length|petal_width|target|         features|
+------------+-----------+------------+-----------+------+-----------------+
|         4.3|        3.0|         1.1|        0.1|     0|[4.3,3.0,1.1,0.1]|
|         4.4|        2.9|         1.4|        0.2|     0|[4.4,2.9,1.4,0.2]|
|         4.6|        3.4|         1.4|        0.3|     0|[4.6,3.4,1.4,0.3]|
|         4.7|        3.2|         1.3|        0.2|     0|[4.7,3.2,1.3,0.2]|
|         4.8|        3.0|         1.4|        0.1|     0|[4.8,3.0,1.4,0.1]|
|         4.9|        3.0|         1.4|        0.2|     0|[4.9,3.0,1.4,0.2]|
|         5.0|        3.4|         1.5|        0.2|     0|[5.0,3.4,1.5,0.2]|
|         5.0|        3.6|         1.4|        0.2|     0|[5.0,3.6,1.4,0.2]|
|         5.1|        3.5|         1.4|        0.2|     0|[5.1,3.5,1.4,0.2]|
|         5.1|        3.5|        

In [10]:

# NOTE: this rebinds DecisionTreeClassifier, which an earlier cell imported from sklearn.tree.
from pyspark.ml.classification import DecisionTreeClassifier

# Train with a decision tree.
# Create the Decision Tree estimator object.
# The vectorized feature column and the label column must be passed as arguments.
dt = DecisionTreeClassifier(featuresCol='features', labelCol='target', maxDepth=5)

# Call fit() on the ML algorithm object to train on the train feature-vector data set; the result is returned as an ML Model.
# In scikit-learn, calling fit() trains the algorithm object itself, whereas in Spark ML the trained ML Model must be taken from the return value.
# Input goes in, output comes back:
# DF -> transform -> DF
# DF -> fit -> Estimator Model (transformation object)
dt_model = dt.fit(train_feature_vector_df)  # scikit-learn analogue: dt.fit(X_train, y_label)
print('DecisionTree Estimator type:', type(dt), 'DecisionTree Estimator Model type:', type(dt_model))
print(dt)
print(dt_model)

DecisionTree Estimator type: <class 'pyspark.ml.classification.DecisionTreeClassifier'> DecisionTree Estimator Model type: <class 'pyspark.ml.classification.DecisionTreeClassificationModel'>
DecisionTreeClassifier_178786fb861f
DecisionTreeClassificationModel: uid=DecisionTreeClassifier_178786fb861f, depth=5, numNodes=17, numClasses=3, numFeatures=4


In [11]:
# Convert the test data into feature vectors.
# Reuse the Transformer object already created for the training data on the test data.
test_feature_vector_df = vec_assembler.transform(test_sdf)

print(type(test_feature_vector_df))
display(test_feature_vector_df)

<class 'pyspark.sql.dataframe.DataFrame'>


DataFrame[sepal_length: double, sepal_width: double, petal_length: double, petal_width: double, target: bigint, features: vector]

In [13]:
# Predict by passing the test DataFrame (original columns + feature vector) to the
# trained model's transform() method.
# The estimator was constructed with featuresCol='features', so the trained model
# already knows which column holds the feature vector.
predictions = dt_model.transform(test_feature_vector_df)
print(type(predictions))
predictions

<class 'pyspark.sql.dataframe.DataFrame'>


DataFrame[sepal_length: double, sepal_width: double, petal_length: double, petal_width: double, target: bigint, features: vector, rawPrediction: vector, probability: vector, prediction: double]

In [14]:
from pyspark.ml.classification import LogisticRegression

# Create the ML algorithm object.
lr = LogisticRegression(featuresCol='features', labelCol='target', maxIter=10)

# Call fit() on the ML algorithm object to train on the train feature-vector data set; the result is returned as an ML Model.
# In scikit-learn, calling fit() trains the algorithm object itself, whereas in Spark ML the trained ML Model must be taken from the return value.
lr_model = lr.fit(train_feature_vector_df)

# Predict on the test feature vectors; this rebinds `predictions` to the logistic-regression output.
predictions = lr_model.transform(test_feature_vector_df)
print(type(predictions))
display(predictions)

<class 'pyspark.sql.dataframe.DataFrame'>


DataFrame[sepal_length: double, sepal_width: double, petal_length: double, petal_width: double, target: bigint, features: vector, rawPrediction: vector, probability: vector, prediction: double]

In [17]:
# Show the label, the feature vector and the model's output columns side by side.
predictions.select('target', 'features', 'rawPrediction', "probability", "prediction").show()

+------+-----------------+--------------------+--------------------+----------+
|target|         features|       rawPrediction|         probability|prediction|
+------+-----------------+--------------------+--------------------+----------+
|     0|[4.6,3.1,1.5,0.2]|[17.6803464458088...|[0.99995249938593...|       0.0|
|     0|[4.8,3.4,1.6,0.2]|[18.4506828497453...|[0.99997095853479...|       0.0|
|     0|[4.9,3.1,1.5,0.1]|[17.0314517575985...|[0.99969295723121...|       0.0|
|     0|[5.4,3.7,1.5,0.2]|[18.1605393730324...|[0.99978570374799...|       0.0|
|     0|[4.6,3.6,1.0,0.2]|[22.5809416212887...|[0.99999963120943...|       0.0|
|     0|[5.0,3.0,1.6,0.2]|[14.9838352256527...|[0.99757909666550...|       0.0|
|     0|[5.0,3.2,1.2,0.2]|[17.5999856002554...|[0.99978302841134...|       0.0|
|     0|[5.4,3.4,1.5,0.4]|[14.927251823191,...|[0.99604838067429...|       0.0|
|     0|[4.4,3.2,1.3,0.2]|[19.8466177256402...|[0.99999676934328...|       0.0|
|     0|[5.0,3.5,1.3,0.3]|[18.5901186461

In [18]:
# Evaluate the multiclass classification model.
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Accuracy evaluator that compares the 'target' label column with the 'prediction' column.
evaluator_accuracy = MulticlassClassificationEvaluator(labelCol='target', predictionCol='prediction', metricName='accuracy')
accuracy = evaluator_accuracy.evaluate(predictions)
print('정확도:', accuracy)

정확도: 1.0


### Pipeline 적용하기
* Pipeline은 여러개의 개별적인 Transformer의 변환 작업, Estimator의 학습 작업을 일련의 Process 연결을 통해 간단한 API 처리로 구현할 수 있게 만들어 줌
* Pipeline은 개별 변환 및 학습 작업을 Stage로 각각 정의하여 Pipeline에 등록한 뒤 Pipeline의 fit() 메소드를 호출하여 연결된 Stage 작업을 수행. 수행 결과로 PipelineModel이 반환되며, 이 PipelineModel에서 예측을 위한 변환 및 예측 작업을 transform() 메소드로 수행.

In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import DecisionTreeClassifier

iris_columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
# First stage: a VectorAssembler object for feature vectorization.
stage_1 = VectorAssembler(inputCols=iris_columns, outputCol='features')
# Second stage: a decision tree Estimator for training.
stage_2 = DecisionTreeClassifier(featuresCol='features', labelCol='target', maxDepth=5)

# Build the pipeline: feature vectorization -> training.
pipeline = Pipeline(stages=[stage_1, stage_2])

# Calling fit(train_sdf) on a Pipeline that contains an Estimator applies the transformation to the training data, trains the Estimator, and returns a PipelineModel object.
pipeline_model = pipeline.fit(train_sdf)  # equivalent to: train_sdf_vectorized = stage_1.transform(train_sdf); estimator_model = stage_2.fit(train_sdf_vectorized)

print(type(pipeline), type(pipeline_model))

<class 'pyspark.ml.pipeline.Pipeline'> <class 'pyspark.ml.pipeline.PipelineModel'>


In [None]:
# The PipelineModel holds both the feature-vectorization object and the fitted estimator model, so it can transform and then predict on the test data.
predictions = pipeline_model.transform(test_sdf)  # equivalent to: test_sdf_vectorized = stage_1.transform(test_sdf); estimator_model.transform(test_sdf_vectorized)
evaluator_accuracy = MulticlassClassificationEvaluator(labelCol='target', predictionCol='prediction', metricName='accuracy')
accuracy = evaluator_accuracy.evaluate(predictions)
print('정확도:', accuracy)

display(predictions)

정확도: 0.9393939393939394


sepal_length,sepal_width,petal_length,petal_width,target,features,rawPrediction,probability,prediction
4.6,3.1,1.5,0.2,0,"Map(vectorType -> dense, length -> 4, values -> List(4.6, 3.1, 1.5, 0.2))","Map(vectorType -> dense, length -> 3, values -> List(36.0, 0.0, 0.0))","Map(vectorType -> dense, length -> 3, values -> List(1.0, 0.0, 0.0))",0.0
4.8,3.4,1.6,0.2,0,"Map(vectorType -> dense, length -> 4, values -> List(4.8, 3.4, 1.6, 0.2))","Map(vectorType -> dense, length -> 3, values -> List(36.0, 0.0, 0.0))","Map(vectorType -> dense, length -> 3, values -> List(1.0, 0.0, 0.0))",0.0
4.9,3.1,1.5,0.1,0,"Map(vectorType -> dense, length -> 4, values -> List(4.9, 3.1, 1.5, 0.1))","Map(vectorType -> dense, length -> 3, values -> List(36.0, 0.0, 0.0))","Map(vectorType -> dense, length -> 3, values -> List(1.0, 0.0, 0.0))",0.0
5.4,3.7,1.5,0.2,0,"Map(vectorType -> dense, length -> 4, values -> List(5.4, 3.7, 1.5, 0.2))","Map(vectorType -> dense, length -> 3, values -> List(36.0, 0.0, 0.0))","Map(vectorType -> dense, length -> 3, values -> List(1.0, 0.0, 0.0))",0.0
4.6,3.6,1.0,0.2,0,"Map(vectorType -> dense, length -> 4, values -> List(4.6, 3.6, 1.0, 0.2))","Map(vectorType -> dense, length -> 3, values -> List(36.0, 0.0, 0.0))","Map(vectorType -> dense, length -> 3, values -> List(1.0, 0.0, 0.0))",0.0
5.0,3.0,1.6,0.2,0,"Map(vectorType -> dense, length -> 4, values -> List(5.0, 3.0, 1.6, 0.2))","Map(vectorType -> dense, length -> 3, values -> List(36.0, 0.0, 0.0))","Map(vectorType -> dense, length -> 3, values -> List(1.0, 0.0, 0.0))",0.0
5.0,3.2,1.2,0.2,0,"Map(vectorType -> dense, length -> 4, values -> List(5.0, 3.2, 1.2, 0.2))","Map(vectorType -> dense, length -> 3, values -> List(36.0, 0.0, 0.0))","Map(vectorType -> dense, length -> 3, values -> List(1.0, 0.0, 0.0))",0.0
5.4,3.4,1.5,0.4,0,"Map(vectorType -> dense, length -> 4, values -> List(5.4, 3.4, 1.5, 0.4))","Map(vectorType -> dense, length -> 3, values -> List(36.0, 0.0, 0.0))","Map(vectorType -> dense, length -> 3, values -> List(1.0, 0.0, 0.0))",0.0
5.7,3.8,1.7,0.3,0,"Map(vectorType -> dense, length -> 4, values -> List(5.7, 3.8, 1.7, 0.3))","Map(vectorType -> dense, length -> 3, values -> List(36.0, 0.0, 0.0))","Map(vectorType -> dense, length -> 3, values -> List(1.0, 0.0, 0.0))",0.0
4.4,3.2,1.3,0.2,0,"Map(vectorType -> dense, length -> 4, values -> List(4.4, 3.2, 1.3, 0.2))","Map(vectorType -> dense, length -> 3, values -> List(36.0, 0.0, 0.0))","Map(vectorType -> dense, length -> 3, values -> List(1.0, 0.0, 0.0))",0.0


In [None]:
# The stages attribute holds the per-stage objects of pipeline_model as a list.
pipeline_model.stages

Out[31]: [VectorAssembler_7516dd401d71,
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_dcf63cd285ce, depth=4, numNodes=11, numClasses=3, numFeatures=4]

In [None]:
# Individual stage objects can be taken out of the PipelineModel's stages attribute.
vector_assembler = pipeline_model.stages[0]
# The last stage is the fitted tree model; this rebinds the notebook-level dt_model.
dt_model = pipeline_model.stages[-1]

print(dt_model)
print(vector_assembler)

DecisionTreeClassificationModel: uid=DecisionTreeClassifier_dcf63cd285ce, depth=4, numNodes=11, numClasses=3, numFeatures=4
VectorAssembler_7516dd401d71


In [None]:
# Reproduce the pipeline's prediction by hand with the stage objects taken from
# pipeline_model.stages: the VectorAssembler first, then the fitted tree model.
# Uses vector_assembler (the pipeline's own stage) rather than the standalone
# vec_assembler built in an earlier cell, so this section does not depend on it.
test_feature_vector_df = vector_assembler.transform(test_sdf)
predictions = dt_model.transform(test_feature_vector_df)

# Accuracy should match the value obtained from pipeline_model.transform(test_sdf).
accuracy = evaluator_accuracy.evaluate(predictions)
print('정확도:', accuracy)

display(predictions)


정확도: 0.9393939393939394


sepal_length,sepal_width,petal_length,petal_width,target,features,rawPrediction,probability,prediction
4.6,3.1,1.5,0.2,0,"Map(vectorType -> dense, length -> 4, values -> List(4.6, 3.1, 1.5, 0.2))","Map(vectorType -> dense, length -> 3, values -> List(36.0, 0.0, 0.0))","Map(vectorType -> dense, length -> 3, values -> List(1.0, 0.0, 0.0))",0.0
4.8,3.4,1.6,0.2,0,"Map(vectorType -> dense, length -> 4, values -> List(4.8, 3.4, 1.6, 0.2))","Map(vectorType -> dense, length -> 3, values -> List(36.0, 0.0, 0.0))","Map(vectorType -> dense, length -> 3, values -> List(1.0, 0.0, 0.0))",0.0
4.9,3.1,1.5,0.1,0,"Map(vectorType -> dense, length -> 4, values -> List(4.9, 3.1, 1.5, 0.1))","Map(vectorType -> dense, length -> 3, values -> List(36.0, 0.0, 0.0))","Map(vectorType -> dense, length -> 3, values -> List(1.0, 0.0, 0.0))",0.0
5.4,3.7,1.5,0.2,0,"Map(vectorType -> dense, length -> 4, values -> List(5.4, 3.7, 1.5, 0.2))","Map(vectorType -> dense, length -> 3, values -> List(36.0, 0.0, 0.0))","Map(vectorType -> dense, length -> 3, values -> List(1.0, 0.0, 0.0))",0.0
4.6,3.6,1.0,0.2,0,"Map(vectorType -> dense, length -> 4, values -> List(4.6, 3.6, 1.0, 0.2))","Map(vectorType -> dense, length -> 3, values -> List(36.0, 0.0, 0.0))","Map(vectorType -> dense, length -> 3, values -> List(1.0, 0.0, 0.0))",0.0
5.0,3.0,1.6,0.2,0,"Map(vectorType -> dense, length -> 4, values -> List(5.0, 3.0, 1.6, 0.2))","Map(vectorType -> dense, length -> 3, values -> List(36.0, 0.0, 0.0))","Map(vectorType -> dense, length -> 3, values -> List(1.0, 0.0, 0.0))",0.0
5.0,3.2,1.2,0.2,0,"Map(vectorType -> dense, length -> 4, values -> List(5.0, 3.2, 1.2, 0.2))","Map(vectorType -> dense, length -> 3, values -> List(36.0, 0.0, 0.0))","Map(vectorType -> dense, length -> 3, values -> List(1.0, 0.0, 0.0))",0.0
5.4,3.4,1.5,0.4,0,"Map(vectorType -> dense, length -> 4, values -> List(5.4, 3.4, 1.5, 0.4))","Map(vectorType -> dense, length -> 3, values -> List(36.0, 0.0, 0.0))","Map(vectorType -> dense, length -> 3, values -> List(1.0, 0.0, 0.0))",0.0
5.7,3.8,1.7,0.3,0,"Map(vectorType -> dense, length -> 4, values -> List(5.7, 3.8, 1.7, 0.3))","Map(vectorType -> dense, length -> 3, values -> List(36.0, 0.0, 0.0))","Map(vectorType -> dense, length -> 3, values -> List(1.0, 0.0, 0.0))",0.0
4.4,3.2,1.3,0.2,0,"Map(vectorType -> dense, length -> 4, values -> List(4.4, 3.2, 1.3, 0.2))","Map(vectorType -> dense, length -> 3, values -> List(36.0, 0.0, 0.0))","Map(vectorType -> dense, length -> 3, values -> List(1.0, 0.0, 0.0))",0.0
