In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from modules.my_pyspark import *

In [3]:
spark = MyPySpark(session=True)

# 1. Đọc dữ liệu

In [4]:
data_path = r"./data/titanic.csv"

In [5]:
data = spark.readFile(data_path, 'csv')

In [6]:
data.show(3, False)

+-----------+--------+------+---------------------------------------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|Name                                               |Sex   |Age |SibSp|Parch|Ticket          |Fare   |Cabin|Embarked|
+-----------+--------+------+---------------------------------------------------+------+----+-----+-----+----------------+-------+-----+--------+
|1          |0       |3     |Braund, Mr. Owen Harris                            |male  |22.0|1    |0    |A/5 21171       |7.25   |null |S       |
|2          |1       |1     |Cumings, Mrs. John Bradley (Florence Briggs Thayer)|female|38.0|1    |0    |PC 17599        |71.2833|C85  |C       |
|3          |1       |3     |Heikkinen, Miss. Laina                             |female|26.0|0    |0    |STON/O2. 3101282|7.925  |null |S       |
+-----------+--------+------+---------------------------------------------------+------+----+-----+-----+----------------+--

In [7]:
data.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



In [8]:
data.describe().show()

+-------+-----------------+-------------------+------------------+--------------------+------+------------------+------------------+-------------------+------------------+-----------------+-----+--------+
|summary|      PassengerId|           Survived|            Pclass|                Name|   Sex|               Age|             SibSp|              Parch|            Ticket|             Fare|Cabin|Embarked|
+-------+-----------------+-------------------+------------------+--------------------+------+------------------+------------------+-------------------+------------------+-----------------+-----+--------+
|  count|              891|                891|               891|                 891|   891|               714|               891|                891|               891|              891|  204|     889|
|   mean|            446.0| 0.3838383838383838| 2.308641975308642|                null|  null| 29.69911764705882|0.5230078563411896|0.38159371492704824|260318.54916792738| 32.20420

> **Nhận xét**
> * Có missing values
> * Predictor variables: `Pclass`, `Sex`, `Age`, `SibSp`, `Parch`, `Fare`, `Embarked`
> * Target variable: `Survived`

In [9]:
predictor_features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
target_feature = 'Survived'

* Chọn các thuộc tính cần thiết và xóa các sample chứa missing value

In [10]:
final_data = data.select(predictor_features + [target_feature]).dropna()

In [11]:
final_data.count()

712

# 2. Chuẩn hóa dữ liệu

In [12]:
from pyspark.ml.feature import VectorAssembler, VectorIndexer, OneHotEncoder, StringIndexer
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline

In [13]:
new_predictor_features = ['Pclass', 'oh_Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'oh_Embarked']

In [14]:
indexer = StringIndexer(inputCols=['Sex', 'Embarked'], outputCols=['idx_Sex', 'idx_Embarked'])
encoder = OneHotEncoder(inputCols=['idx_Sex', 'idx_Embarked'], outputCols=['oh_Sex', 'oh_Embarked'])
assembler = VectorAssembler(inputCols=new_predictor_features, outputCol='features')
log_model = LogisticRegression(featuresCol='features', labelCol=target_feature)

* Áp dụng Pipeline

In [15]:
pipeline = Pipeline(stages=[indexer, encoder, assembler, log_model])

# 3. Tách dữ liệu thành train và test data

In [16]:
train_data, test_data = final_data.randomSplit((.7, .3))

# 4. Build model

In [17]:
fit_model = pipeline.fit(train_data)

# 5. Đánh giá model

In [18]:
result = fit_model.transform(test_data)

In [19]:
result.select('Survived', 'prediction').show(truncate=False)

+--------+----------+
|Survived|prediction|
+--------+----------+
|0       |1.0       |
|1       |1.0       |
|1       |1.0       |
|1       |1.0       |
|1       |1.0       |
|1       |1.0       |
|1       |1.0       |
|1       |1.0       |
|0       |1.0       |
|1       |1.0       |
|1       |1.0       |
|1       |1.0       |
|1       |1.0       |
|1       |1.0       |
|1       |1.0       |
|1       |1.0       |
|1       |1.0       |
|1       |1.0       |
|1       |1.0       |
|1       |1.0       |
+--------+----------+
only showing top 20 rows



In [20]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [21]:
evaluator = BinaryClassificationEvaluator(rawPredictionCol='prediction', labelCol='Survived')

In [22]:
auc = evaluator.evaluate(result)

In [23]:
auc

0.7715572715572716