In [1]:
from pyspark.sql import SparkSession

In [2]:
# Create (or reuse, if one already exists) the Spark session used by every
# DataFrame operation below.
spark = (
    SparkSession.builder
    .appName("logistic_regression")
    .getOrCreate()
)

In [4]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.classification import LogisticRegression

In [5]:
# Toy training set: four (label, features) rows, where each label is a float
# class (0.0 / 1.0) and each feature vector is a dense 3-element vector.
training = spark.createDataFrame(
    data=[
        (1.0, Vectors.dense([0.0, 1.1, 0.1])),
        (0.0, Vectors.dense([2.0, 1.0, -1.0])),
        (0.0, Vectors.dense([2.0, 1.3, 1.0])),
        (1.0, Vectors.dense([0.0, 1.2, -0.5])),
    ],
    schema=["label", "features"],
)

In [6]:
# Print the training rows as a text table to sanity-check the data.
training.show()

+-----+--------------+
|label|      features|
+-----+--------------+
|  1.0| [0.0,1.1,0.1]|
|  0.0|[2.0,1.0,-1.0]|
|  0.0| [2.0,1.3,1.0]|
|  1.0|[0.0,1.2,-0.5]|
+-----+--------------+



In [7]:
# Three rows to score with the fitted model, laid out the same way as the
# training data: a float label and a dense 3-element feature vector.
test = spark.createDataFrame(
    data=[
        (1.0, Vectors.dense([-1.0, 1.5, 1.3])),
        (0.0, Vectors.dense([3.0, 2.0, -0.1])),
        (1.0, Vectors.dense([0.0, 2.2, -1.5])),
    ],
    schema=["label", "features"],
)

In [8]:
# Print the test rows as a text table to sanity-check the data.
test.show()

+-----+--------------+
|label|      features|
+-----+--------------+
|  1.0|[-1.0,1.5,1.3]|
|  0.0|[3.0,2.0,-0.1]|
|  1.0|[0.0,2.2,-1.5]|
+-----+--------------+



In [9]:
## Create the (logistic) regression model
# maxIter and regParam are hand-picked values; per the Spark ML docs they set
# the maximum number of iterations and the regularization strength.
lr = LogisticRegression(maxIter=30, regParam=0.1)

In [10]:
model = lr.fit(training) # train the model on the training DataFrame

In [11]:
# Prediction: apply the fitted model to the test DataFrame. As the schema
# printed below shows, the result keeps `label` and `features` and adds the
# `rawPrediction`, `probability` and `prediction` columns.
prediction = model.transform(test)

In [16]:
# Inspect the columns of the transformed DataFrame: the two input columns
# plus the columns added by the model.
prediction.printSchema()

root
 |-- label: double (nullable = true)
 |-- features: vector (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [18]:
# Show the true label next to the predicted class for each test row,
# leaving out the rawPrediction / probability vector columns.
(
    prediction
    .select("label", "features", "prediction")
    .show()
)

+-----+--------------+----------+
|label|      features|prediction|
+-----+--------------+----------+
|  1.0|[-1.0,1.5,1.3]|       1.0|
|  0.0|[3.0,2.0,-0.1]|       0.0|
|  1.0|[0.0,2.2,-1.5]|       1.0|
+-----+--------------+----------+

