In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Create (or reuse) the SparkSession for the first classification example.
spark = (
    SparkSession.builder
    .appName('classificationExample1')
    .getOrCreate()
)
spark

# 타이타닉 데이터를 이용한 생존 여부 예측
## 로지스틱 회귀

In [2]:
# Load the Titanic CSV: the first row is the header and column types are inferred.
csv_reader = spark.read.options(header=True, inferSchema=True)
data = csv_reader.csv('learning_spark_data/titanic.csv')
data.show(n=5)

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|Gender| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| NULL|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| NULL|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   53.1| C123|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male|35.0|    0|    0|          373450|   8.05| NULL|       S|
+-----------+--------+------+--------------------+------+----+-----+-----+------

In [3]:
from pyspark.sql.functions import col, sum, when, isnan

# Count missing values (NULL, plus NaN where applicable) in every column.
# NOTE: the import above makes `sum` refer to Spark's aggregate function, not
# Python's built-in sum, for the rest of the notebook.
#
# NaN can only occur in floating-point columns, so `isnan` is applied to those
# columns only. Calling it on string columns such as Name or Ticket relies on an
# implicit string-to-double cast, which can fail when ANSI SQL mode is enabled.
float_columns = {name for name, dtype in data.dtypes if dtype in ('float', 'double')}


def _missing_condition(column_name):
    """Return a boolean Column that is true where `column_name` is NULL (or NaN for float columns)."""
    condition = col(column_name).isNull()
    if column_name in float_columns:
        condition = condition | isnan(col(column_name))
    return condition


null_counts = data.select(
    [
        sum(when(_missing_condition(c), 1).otherwise(0)).alias(c)
        for c in data.columns
    ]
)

null_counts.show()

+-----------+--------+------+----+------+---+-----+-----+------+----+-----+--------+
|PassengerId|Survived|Pclass|Name|Gender|Age|SibSp|Parch|Ticket|Fare|Cabin|Embarked|
+-----------+--------+------+----+------+---+-----+-----+------+----+-----+--------+
|          0|       0|     0|   0|     0|177|    0|    0|     0|   0|  687|       2|
+-----------+--------+------+----+------+---+-----+-----+------+----+-----+--------+



In [4]:
# Feature selection: keep the label (Survived) and the columns used as model inputs.
selected_columns = ['Survived', 'Pclass', 'Gender', 'Age', 'SibSp', 'Parch', 'Fare']
data_1 = data.select(*selected_columns)
data_1.show(n=3)

+--------+------+------+----+-----+-----+-------+
|Survived|Pclass|Gender| Age|SibSp|Parch|   Fare|
+--------+------+------+----+-----+-----+-------+
|       0|     3|  male|22.0|    1|    0|   7.25|
|       1|     1|female|38.0|    1|    0|71.2833|
|       1|     3|female|26.0|    0|    0|  7.925|
+--------+------+------+----+-----+-----+-------+
only showing top 3 rows



In [8]:
# Missing Age values will be filled with the median.
# approxQuantile returns one value per requested probability; 0.5 is the median,
# computed here with a relative error of 0.01.
age_quantiles = data_1.approxQuantile("Age", [0.5], 0.01)
median_age = age_quantiles[0]
median_age

28.0

In [6]:
# Replace NULL ages with the median age computed earlier.
age_fill_values = {'Age': median_age}
data_1 = data_1.na.fill(age_fill_values)
data_1.show(n=10)

+--------+------+------+----+-----+-----+-------+
|Survived|Pclass|Gender| Age|SibSp|Parch|   Fare|
+--------+------+------+----+-----+-----+-------+
|       0|     3|  male|22.0|    1|    0|   7.25|
|       1|     1|female|38.0|    1|    0|71.2833|
|       1|     3|female|26.0|    0|    0|  7.925|
|       1|     1|female|35.0|    1|    0|   53.1|
|       0|     3|  male|35.0|    0|    0|   8.05|
|       0|     3|  male|28.0|    0|    0| 8.4583|
|       0|     1|  male|54.0|    0|    0|51.8625|
|       0|     3|  male| 2.0|    3|    1| 21.075|
|       1|     3|female|27.0|    0|    2|11.1333|
|       1|     2|female|14.0|    1|    0|30.0708|
+--------+------+------+----+-----+-----+-------+
only showing top 10 rows



In [7]:
# Encode the categorical Gender column as a numeric index with StringIndexer.
from pyspark.sql.functions import when
from pyspark.ml.feature import StringIndexer, VectorAssembler

indexer = StringIndexer(inputCol='Gender', outputCol='SexIndexer')

# Make the cell safe to re-run: this cell reassigns `data_1`, so on a second run
# the frame already contains 'SexIndexer' and StringIndexer refuses to write to
# an existing output column. drop() is a no-op when the column is absent.
gender_source = data_1.drop('SexIndexer')
data_1 = indexer.fit(gender_source).transform(gender_source)
data_1.show(5)

+--------+------+------+----+-----+-----+-------+----------+
|Survived|Pclass|Gender| Age|SibSp|Parch|   Fare|SexIndexer|
+--------+------+------+----+-----+-----+-------+----------+
|       0|     3|  male|22.0|    1|    0|   7.25|       0.0|
|       1|     1|female|38.0|    1|    0|71.2833|       1.0|
|       1|     3|female|26.0|    0|    0|  7.925|       1.0|
|       1|     1|female|35.0|    1|    0|   53.1|       1.0|
|       0|     3|  male|35.0|    0|    0|   8.05|       0.0|
+--------+------+------+----+-----+-----+-------+----------+
only showing top 5 rows



In [10]:
# Assemble the numeric input columns into a single 'features' vector column.
assembler = VectorAssembler(
    inputCols=['Pclass', 'SexIndexer', 'Age', 'SibSp', 'Parch', 'Fare'],
    outputCol='features'
)

# Make the cell safe to re-run: this cell reassigns `data_1`, so on a second run
# the frame already contains 'features' and VectorAssembler refuses to write to
# an existing output column. drop() is a no-op when the column is absent.
data_1 = assembler.transform(data_1.drop('features'))
data_1.select('features', 'Survived').show(5)

+--------------------+--------+
|            features|Survived|
+--------------------+--------+
|[3.0,0.0,22.0,1.0...|       0|
|[1.0,1.0,38.0,1.0...|       1|
|[3.0,1.0,26.0,0.0...|       1|
|[1.0,1.0,35.0,1.0...|       1|
|[3.0,0.0,35.0,0.0...|       0|
+--------------------+--------+
only showing top 5 rows



In [11]:
# Split into training (80%) and test (20%) sets; the fixed seed keeps the split repeatable.
split_weights = [0.8, 0.2]
train_data, test_data = data_1.randomSplit(weights=split_weights, seed=42)

In [12]:
# Build the logistic regression estimator: inputs come from 'features',
# the label is the 'Survived' column.
lr = LogisticRegression(
    featuresCol='features',
    labelCol='Survived',
)

# Fit on the training split, then score the held-out test split.
lr_model = lr.fit(train_data)
predict = lr_model.transform(test_data)

# Compare the true label with the predicted label for a few test rows.
predict.select('features', 'Survived', 'prediction').show(n=5)

+--------------------+--------+----------+
|            features|Survived|prediction|
+--------------------+--------+----------+
|[1.0,1.0,50.0,0.0...|       0|       1.0|
|[1.0,0.0,21.0,0.0...|       0|       1.0|
|[1.0,0.0,24.0,0.0...|       0|       1.0|
|[1.0,0.0,28.0,0.0...|       0|       1.0|
|[1.0,0.0,28.0,0.0...|       0|       1.0|
+--------------------+--------+----------+
only showing top 5 rows



In [14]:
from pyspark.sql.functions import expr

# Flag each test row: 1 when the prediction equals the true label, otherwise 0.
is_correct = expr('case when Survived = prediction then 1 else 0 end')
comp = predict.withColumn('correct', is_correct)

# Number of misclassified test rows.
comp.filter('correct=0').count()

27

In [16]:
# Keep only the rows where the prediction differs from the true label.
misclassified = predict.where(col('Survived') != col('prediction'))
misclassified.select('features', 'Survived', 'prediction').show()

+--------------------+--------+----------+
|            features|Survived|prediction|
+--------------------+--------+----------+
|[1.0,1.0,50.0,0.0...|       0|       1.0|
|[1.0,0.0,21.0,0.0...|       0|       1.0|
|[1.0,0.0,24.0,0.0...|       0|       1.0|
|[1.0,0.0,28.0,0.0...|       0|       1.0|
|[1.0,0.0,28.0,0.0...|       0|       1.0|
|[1.0,0.0,28.0,0.0...|       0|       1.0|
|[1.0,0.0,29.0,0.0...|       0|       1.0|
|[3.0,1.0,9.0,1.0,...|       0|       1.0|
|[3.0,1.0,14.0,0.0...|       0|       1.0|
|[3.0,1.0,22.0,0.0...|       0|       1.0|
|[3.0,1.0,28.0,0.0...|       0|       1.0|
|[3.0,1.0,28.0,1.0...|       0|       1.0|
|[1.0,0.0,40.0,0.0...|       1|       0.0|
|[1.0,0.0,42.0,0.0...|       1|       0.0|
|[1.0,0.0,45.0,0.0...|       1|       0.0|
|[2.0,0.0,2.0,1.0,...|       1|       0.0|
|[3.0,1.0,33.0,3.0...|       1|       0.0|
|[3.0,1.0,38.0,1.0...|       1|       0.0|
|[3.0,1.0,63.0,0.0...|       1|       0.0|
|[3.0,0.0,6.0,0.0,...|       1|       0.0|
+----------

In [17]:
# Accuracy = mean of the 0/1 'correct' flag over the test rows.
accuracy_row = comp.selectExpr('avg(correct) as accuracy').first()
accuracy_row['accuracy']

0.8137931034482758

In [18]:
# Evaluate the Titanic model with the area under the ROC curve, computed from
# the 'rawPrediction' scores against the 'Survived' label on the test predictions.
# The evaluator is bound to `evaluator` rather than `eval` so that Python's
# built-in eval() is not shadowed in the notebook namespace.
evaluator = BinaryClassificationEvaluator(
    labelCol='Survived',
    rawPredictionCol='rawPrediction',
    metricName='areaUnderROC'
)

auc = evaluator.evaluate(predict)
auc

0.865632318501171

AUROC: x축을 FPR(거짓 양성 비율), y축을 TPR(참 양성 비율)로 그린 ROC 곡선의 아래 면적 => 1에 가까울수록 모델 성능이 좋다

In [19]:
# Stop the SparkSession used for the Titanic example; a new session is created for the next example.
spark.stop()

# libsvm 파일 형식의 처리
텍스트 파일 형식으로, 0이 아닌 값만 기록해 희소 데이터를 간결하게 표현한다 -> 메모리 사용량, 처리속도 개선 => 머신러닝에서 사용

형식: 레이블 인덱스:값 인덱스:값 ... (인덱스는 특성(feature)의 번호)

In [20]:
from pyspark.sql import SparkSession
# Create (or reuse) the SparkSession for the libsvm classification example.
session_builder = SparkSession.builder.appName('classificationExample2')
spark = session_builder.getOrCreate()
spark

In [21]:
# Read the sample dataset with the libsvm data source, then count its rows.
data2 = spark.read.load(
    'learning_spark_data/sample_libsvm_data.txt',
    format='libsvm',
)
data2.count()

100

In [22]:
# Preview the first 3 rows: a numeric label and a sparse feature vector per row.
data2.show(3)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(692,[127,128,129...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[124,125,126...|
+-----+--------------------+
only showing top 3 rows



In [23]:
# Split the libsvm data into 70% training and 30% test rows with a fixed seed.
libsvm_split_weights = [0.7, 0.3]
train_data, test_data = data2.randomSplit(weights=libsvm_split_weights, seed=42)

In [24]:
# Row counts of the training and test splits, shown as a (train, test) tuple.
train_data.count(), test_data.count()

(65, 35)

In [28]:
# Logistic regression with elastic-net regularisation, limited to 10 iterations.
lr = LogisticRegression(
    maxIter=10,
    regParam=0.2,
    elasticNetParam=0.7,
)

# Fit on the training split and score the test split.
lrmodel = lr.fit(train_data)
pred = lrmodel.transform(test_data)
pred.show(n=5)

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(692,[100,101,102...|[0.61110663774844...|[0.64819319976427...|       0.0|
|  0.0|(692,[123,124,125...|[1.88762878149153...|[0.86848492926088...|       0.0|
|  0.0|(692,[123,124,125...|[1.48203827311578...|[0.81488025243698...|       0.0|
|  0.0|(692,[124,125,126...|[1.32973716405211...|[0.79079715531589...|       0.0|
|  0.0|(692,[124,125,126...|[1.14364775038872...|[0.75834874070092...|       0.0|
+-----+--------------------+--------------------+--------------------+----------+
only showing top 5 rows



In [29]:
# Evaluate the libsvm model with the area under the ROC curve.
# The evaluator is bound to `evaluator` rather than `eval` so that Python's
# built-in eval() is not shadowed in the notebook namespace.
# labelCol and rawPredictionCol are spelled out with the evaluator's default
# values, matching the 'label' and 'rawPrediction' columns present in `pred`.
evaluator = BinaryClassificationEvaluator(
    labelCol='label',
    rawPredictionCol='rawPrediction',
    metricName='areaUnderROC'
)

auc = evaluator.evaluate(pred)
auc

1.0

In [31]:
# Stop the SparkSession created for the libsvm example.
spark.stop()