### 로지스틱(분류) : 0 또는 1 

In [5]:
from pyspark.sql import  SQLContext 
from pyspark import SparkContext
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import VectorAssembler
import matplotlib.pyplot as plt
import matplotlib
import pandas as pd
# Global matplotlib settings for this notebook.
# NOTE(review): 'Malgun Gothic' is presumably chosen so Korean text renders
# in plots; confirm the font is installed on the target machine.
matplotlib.rcParams.update({
    'font.family': 'Malgun Gothic',
    # Draw negative tick labels with an ASCII hyphen instead of the Unicode
    # minus sign.
    'axes.unicode_minus': False,
})


In [2]:
# Reuse the running SparkContext when this cell is executed again: calling
# SparkContext('local') a second time in the same kernel raises
# "ValueError: Cannot run multiple SparkContexts at once".
from pyspark import SparkConf

# Keep the original 'local' master for the case where a context is created.
sc = SparkContext.getOrCreate(SparkConf().setMaster('local'))
# SQLContext is the entry point the following cells use to read DataFrames.
sqlctx = SQLContext(sc)

In [7]:
# Load the Titanic CSV: the first row supplies the column names and Spark
# infers each column's type from the data.
titanic_df = (
    sqlctx.read
    .options(header=True, inferSchema=True)
    .csv('../RDD/data/titanic1.csv')
)

# Preview the first five rows.
titanic_df.show(5)

+--------+------+----+-----+-----+-------+-----------+-----+---------+--------------+-------------+
|Survived|Pclass| Age|SibSp|Parch|   Fare|Family_Size|Alone|Sex_index|Embarked_index|Initial_index|
+--------+------+----+-----+-----+-------+-----------+-----+---------+--------------+-------------+
|       0|     3|22.0|    1|    0|   7.25|          1|    0|      0.0|           0.0|          0.0|
|       1|     1|38.0|    1|    0|71.2833|          1|    0|      1.0|           1.0|          2.0|
|       1|     3|26.0|    0|    0|  7.925|          0|    1|      1.0|           0.0|          1.0|
|       1|     1|35.0|    1|    0|   53.1|          1|    0|      1.0|           0.0|          2.0|
|       0|     3|35.0|    0|    0|   8.05|          0|    1|      0.0|           0.0|          0.0|
+--------+------+----+-----+-----+-------+-----------+-----+---------+--------------+-------------+
only showing top 5 rows



In [8]:
## 특성데이터 : Pclass, Age, Sex_index, Family_Size

In [9]:
# Pack the four chosen feature columns into a single vector column named
# 'features', which is the column the classifier below is pointed at.
f = VectorAssembler(
    outputCol='features',
    inputCols=['Pclass', 'Age', 'Sex_index', 'Family_Size'],
)

# transform() keeps every original column and appends 'features'.
v_df = f.transform(titanic_df)
v_df.show(5)


+--------+------+----+-----+-----+-------+-----------+-----+---------+--------------+-------------+------------------+
|Survived|Pclass| Age|SibSp|Parch|   Fare|Family_Size|Alone|Sex_index|Embarked_index|Initial_index|          features|
+--------+------+----+-----+-----+-------+-----------+-----+---------+--------------+-------------+------------------+
|       0|     3|22.0|    1|    0|   7.25|          1|    0|      0.0|           0.0|          0.0|[3.0,22.0,0.0,1.0]|
|       1|     1|38.0|    1|    0|71.2833|          1|    0|      1.0|           1.0|          2.0|[1.0,38.0,1.0,1.0]|
|       1|     3|26.0|    0|    0|  7.925|          0|    1|      1.0|           0.0|          1.0|[3.0,26.0,1.0,0.0]|
|       1|     1|35.0|    1|    0|   53.1|          1|    0|      1.0|           0.0|          2.0|[1.0,35.0,1.0,1.0]|
|       0|     3|35.0|    0|    0|   8.05|          0|    1|      0.0|           0.0|          0.0|[3.0,35.0,0.0,0.0]|
+--------+------+----+-----+-----+-------+------

In [12]:
# Total number of rows in the assembled DataFrame, before it is split.
v_df.count()

891

In [20]:
# Randomly split the rows: roughly 80% for training and 20% for testing.
# The weights are approximate, so the exact row counts vary slightly.
# A fixed seed makes the split repeatable, so the fitted coefficients and the
# predictions below can be reproduced (given the same input and partitioning).
train_df, test_df = v_df.randomSplit([0.8, 0.2], seed=42)

In [21]:
# Number of rows that ended up in the training split.
train_df.count()

717

In [22]:
# 'Survived' is a 0/1 label, so this is a classification task: fit a
# logistic-regression model on the assembled feature vector.
lr = LogisticRegression(
    labelCol='Survived',
    featuresCol='features',
    regParam=0.01,   # regularization strength
    maxIter=100,     # upper bound on optimizer iterations
)
lrModel = lr.fit(train_df)

In [23]:
# Learned weights, one per entry of the assembled feature vector
# (Pclass, Age, Sex_index, Family_Size). The label "기울기(w)" means "slope (w)".
print("기울기(w)",lrModel.coefficients )
# Learned bias term. The label "절편(b)" means "intercept (b)".
print("절편(b)", lrModel.intercept )

기울기(w) [-1.1253005370969382,-0.034971757520521854,2.45857233744818,-0.17821004430889856]
절편(b) 2.316547140952024


In [None]:
# z = w1*x1 + w2*x2 + w3*x3 + w4*x4 + b
# P(y=1) = 1 / (1 + exp(-z))

In [26]:
# Test the trained model on the held-out split. transform() appends the
# rawPrediction, probability and prediction columns to the test rows.
lr_predict = lrModel.transform(test_df)
lr_predict.show(5)

+--------+------+----+-----+-----+------+-----------+-----+---------+--------------+-------------+------------------+--------------------+--------------------+----------+
|Survived|Pclass| Age|SibSp|Parch|  Fare|Family_Size|Alone|Sex_index|Embarked_index|Initial_index|          features|       rawPrediction|         probability|prediction|
+--------+------+----+-----+-----+------+-----------+-----+---------+--------------+-------------+------------------+--------------------+--------------------+----------+
|       0|     1|19.0|    3|    2| 263.0|          5|    0|      0.0|           0.0|          0.0|[1.0,19.0,0.0,5.0]|[0.36426701057932...|[0.59007296313283...|       0.0|
|       0|     1|24.0|    0|    0|  79.2|          0|    1|      0.0|           1.0|          0.0|[1.0,24.0,0.0,0.0]|[-0.3519244233625...|[0.41291583134965...|       1.0|
|       0|     1|25.0|    1|    2|151.55|          3|    0|      1.0|           0.0|          2.0|[1.0,25.0,1.0,3.0]|[-2.2408948703635...|[0.0961

In [33]:
# Each probability vector sums to 1 and is ordered
# [P(label = 0), P(label = 1)].
# show() is called with its defaults spelled out: at most 20 rows, and long
# cells (such as the probability vectors) are truncated.
(
    lr_predict
    .select('Pclass', 'Age', 'probability', 'prediction')
    .show(n=20, truncate=True)
)

+------+----+--------------------+----------+
|Pclass| Age|         probability|prediction|
+------+----+--------------------+----------+
|     1|19.0|[0.59007296313283...|       0.0|
|     1|24.0|[0.41291583134965...|       1.0|
|     1|25.0|[0.09613775358681...|       1.0|
|     1|33.0|[0.49070641905923...|       1.0|
|     1|33.0|[0.49070641905923...|       1.0|
|     1|33.0|[0.49070641905923...|       1.0|
|     1|33.0|[0.49070641905923...|       1.0|
|     1|37.0|[0.56977138438258...|       0.0|
|     1|39.0|[0.54305608416285...|       0.0|
|     1|46.0|[0.60287391215915...|       0.0|
|     1|46.0|[0.64466373800669...|       0.0|
|     1|49.0|[0.70657133437710...|       0.0|
|     1|50.0|[0.67602228665720...|       0.0|
|     1|51.0|[0.68363403428324...|       0.0|
|     1|54.0|[0.66757296488834...|       0.0|
|     1|58.0|[0.76737277506629...|       0.0|
|     1|61.0|[0.71950964759565...|       0.0|
|     1|65.0|[0.77904808627513...|       0.0|
|     2|21.0|[0.69986831876199...|