In [1]:
import findspark
findspark.init()

from pyspark import SparkContext
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession

import warnings
warnings.filterwarnings('ignore')

from pyspark.sql.functions import *

In [2]:
sc = SparkContext()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/05/28 14:51:48 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/05/28 14:51:48 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [3]:
spark = SparkSession(sc)

In [4]:
data = spark.read.csv('/Users/tranhuonggiang/Documents/BI_DA/KHTN/KHTN_BigData in ML/b7/flights.csv', inferSchema = True, header = True)

In [5]:
data.show(3)

+---+---+---+-------+------+---+----+------+--------+-----+
|mon|dom|dow|carrier|flight|org|mile|depart|duration|delay|
+---+---+---+-------+------+---+----+------+--------+-----+
| 11| 20|  6|     US|    19|JFK|2153|  9.48|     351|   NA|
|  0| 22|  2|     UA|  1107|ORD| 316| 16.33|      82|   30|
|  2| 20|  4|     UA|   226|SFO| 337|  6.17|      82|   -8|
+---+---+---+-------+------+---+----+------+--------+-----+
only showing top 3 rows



**Chuẩn hoá dữ liệu**

In [6]:
#import required function
from pyspark.sql.functions import round

In [7]:
#convert mile to km 
data = data.withColumn('km', round(data.mile*1.60934,0))

In [8]:
#create label column with idicating whether flight delay(1) or not(0)
data = data.withColumn('label', (data.delay >0).cast('integer'))

In [9]:
data.show(3)

+---+---+---+-------+------+---+----+------+--------+-----+------+-----+
|mon|dom|dow|carrier|flight|org|mile|depart|duration|delay|    km|label|
+---+---+---+-------+------+---+----+------+--------+-----+------+-----+
| 11| 20|  6|     US|    19|JFK|2153|  9.48|     351|   NA|3465.0| null|
|  0| 22|  2|     UA|  1107|ORD| 316| 16.33|      82|   30| 509.0|    1|
|  2| 20|  4|     UA|   226|SFO| 337|  6.17|      82|   -8| 542.0|    0|
+---+---+---+-------+------+---+----+------+--------+-----+------+-----+
only showing top 3 rows



In [10]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder

In [11]:
#Create an indexer
indexer = StringIndexer(inputCol='carrier', outputCol='carrier_idex')
#Indexer identifiers categories in the data 
indexer_model = indexer.fit(data)
#Indexer creates a new column with numeric index values
data_indexed = indexer_model.transform(data)
#Repeat the process for other categorical feature 
data_indexed = StringIndexer(inputCol='org',
                             outputCol='org_idx').fit(data_indexed).transform(data_indexed)


#Create an encoder 
encoder = OneHotEncoder(inputCol='carrier_idex',
                        outputCol='carrier_vec',
                        dropLast=True) #default 
encoder = encoder.fit(data_indexed)
data_indexed = encoder.transform(data_indexed)
data_indexed = OneHotEncoder(inputCol='org_idx',
                             outputCol='org_vec',
                             dropLast=True).fit(data_indexed).transform(data_indexed)

In [12]:
data_indexed.show(3)

+---+---+---+-------+------+---+----+------+--------+-----+------+-----+------------+-------+-------------+-------------+
|mon|dom|dow|carrier|flight|org|mile|depart|duration|delay|    km|label|carrier_idex|org_idx|  carrier_vec|      org_vec|
+---+---+---+-------+------+---+----+------+--------+-----+------+-----+------------+-------+-------------+-------------+
| 11| 20|  6|     US|    19|JFK|2153|  9.48|     351|   NA|3465.0| null|         6.0|    2.0|(8,[6],[1.0])|(7,[2],[1.0])|
|  0| 22|  2|     UA|  1107|ORD| 316| 16.33|      82|   30| 509.0|    1|         0.0|    0.0|(8,[0],[1.0])|(7,[0],[1.0])|
|  2| 20|  4|     UA|   226|SFO| 337|  6.17|      82|   -8| 542.0|    0|         0.0|    1.0|(8,[0],[1.0])|(7,[1],[1.0])|
+---+---+---+-------+------+---+----+------+--------+-----+------+-----+------------+-------+-------------+-------------+
only showing top 3 rows



**Chuyển đổi dữ liệu thành vector**

In [14]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [15]:
data_indexed.columns

['mon',
 'dom',
 'dow',
 'carrier',
 'flight',
 'org',
 'mile',
 'depart',
 'duration',
 'delay',
 'km',
 'label',
 'carrier_idex',
 'org_idx',
 'carrier_vec',
 'org_vec']

In [16]:
assembler = VectorAssembler (
    inputCols = ['mon','dom','dow','carrier_vec','org_vec','km','depart','duration'],
    outputCol = 'features')

In [17]:
data_pre = assembler.transform(data_indexed)

In [18]:
data_pre.show(2)

+---+---+---+-------+------+---+----+------+--------+-----+------+-----+------------+-------+-------------+-------------+--------------------+
|mon|dom|dow|carrier|flight|org|mile|depart|duration|delay|    km|label|carrier_idex|org_idx|  carrier_vec|      org_vec|            features|
+---+---+---+-------+------+---+----+------+--------+-----+------+-----+------------+-------+-------------+-------------+--------------------+
| 11| 20|  6|     US|    19|JFK|2153|  9.48|     351|   NA|3465.0| null|         6.0|    2.0|(8,[6],[1.0])|(7,[2],[1.0])|(21,[0,1,2,9,13,1...|
|  0| 22|  2|     UA|  1107|ORD| 316| 16.33|      82|   30| 509.0|    1|         0.0|    0.0|(8,[0],[1.0])|(7,[0],[1.0])|(21,[1,2,3,11,18,...|
+---+---+---+-------+------+---+----+------+--------+-----+------+-----+------------+-------+-------------+-------------+--------------------+
only showing top 2 rows



In [19]:
final_data = data_pre.select('features', 'label')

In [20]:
final_data.count()

50000

In [21]:
final_data = final_data.na.drop()
final_data.count()

47022

**Lọc dữ liệu có label null**

In [22]:
new_data = data_pre.select('features','label').filter(data_pre.label.isNull())

In [23]:
new_data.show(3,False)

+--------------------------------------------------------------------+-----+
|features                                                            |label|
+--------------------------------------------------------------------+-----+
|(21,[0,1,2,9,13,18,19,20],[11.0,20.0,6.0,1.0,1.0,3465.0,9.48,351.0])|null |
|(21,[0,1,2,4,11,18,19,20],[4.0,2.0,5.0,1.0,1.0,415.0,8.92,65.0])    |null |
|(21,[1,2,3,11,18,19,20],[8.0,2.0,1.0,1.0,538.0,11.08,85.0])         |null |
+--------------------------------------------------------------------+-----+
only showing top 3 rows



In [24]:
new_data.count()

2978

**Xây dựng model**

In [25]:
train_data, test_data = final_data.randomSplit([0.8,0.2])

In [26]:
from pyspark.ml.classification import  DecisionTreeClassifier

In [28]:
#create a classifier model object
tree = DecisionTreeClassifier(featuresCol = 'features',
                      labelCol = 'label',
                      predictionCol = 'prediction')

In [29]:
#fit model to data
tree_model = tree.fit(train_data)

**Đánh giá kết quả**

In [30]:
#check test dataset
test_model = tree_model.transform(test_data)

In [31]:
#inspect results
test_model.select('label', 'prediction', 'probability').show(3,False)

+-----+----------+---------------------------------------+
|label|prediction|probability                            |
+-----+----------+---------------------------------------+
|1    |1.0       |[0.2188055642715197,0.7811944357284802]|
|1    |1.0       |[0.3677397942250249,0.6322602057749751]|
|1    |1.0       |[0.3677397942250249,0.6322602057749751]|
+-----+----------+---------------------------------------+
only showing top 3 rows



In [32]:
#create a confusion matrix 
test_model.groupBy('label', 'prediction').count().show()

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|    1|       0.0|  820|
|    0|       0.0| 1117|
|    1|       1.0| 5091|
|    0|       1.0| 2308|
+-----+----------+-----+



In [33]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator

In [34]:
#Find weighted precision
multi_evaluator = MulticlassClassificationEvaluator()
acc_ = multi_evaluator.evaluate(test_model, {multi_evaluator.metricName: "accuracy"})
acc_

0.6649528706083976

In [35]:
#Find AUC
binary_evaluator = BinaryClassificationEvaluator()
auc = binary_evaluator.evaluate(test_model, {binary_evaluator.metricName: "areaUnderROC"})

In [36]:
auc

0.6231899452585615

> Model is not good

**Lưu và load model**

In [37]:
#save model
tree_model.save('tree_model_Flights_50k_new')

                                                                                

In [39]:
from pyspark.ml.classification import DecisionTreeClassificationModel
#load model from 
tree_model2 = DecisionTreeClassificationModel.load('tree_model_Flights_50k_new')

**Dự đoán mới**

In [41]:
#predict new values (asssuming select test_data null)
unlabeled_data = new_data.select('features')

In [44]:
predictions = tree_model2.transform(unlabeled_data)
predictions.show(5,False)

+--------------------------------------------------------------------+---------------+---------------------------------------+----------+
|features                                                            |rawPrediction  |probability                            |prediction|
+--------------------------------------------------------------------+---------------+---------------------------------------+----------+
|(21,[0,1,2,9,13,18,19,20],[11.0,20.0,6.0,1.0,1.0,3465.0,9.48,351.0])|[168.0,292.0]  |[0.3652173913043478,0.6347826086956522]|1.0       |
|(21,[0,1,2,4,11,18,19,20],[4.0,2.0,5.0,1.0,1.0,415.0,8.92,65.0])    |[1349.0,981.0] |[0.5789699570815451,0.4210300429184549]|0.0       |
|(21,[1,2,3,11,18,19,20],[8.0,2.0,1.0,1.0,538.0,11.08,85.0])         |[3285.0,6782.0]|[0.3263136982219132,0.6736863017780869]|1.0       |
|(21,[0,1,3,11,18,19,20],[5.0,8.0,1.0,1.0,378.0,14.48,79.0])         |[3285.0,6782.0]|[0.3263136982219132,0.6736863017780869]|1.0       |
|(21,[0,1,2,9,14,18,19,20],[1.0,13