In [1]:
import findspark
findspark.init()

from pyspark import SparkContext
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession

import warnings
warnings.filterwarnings('ignore')

from pyspark.sql.functions import *

In [2]:
sc = SparkContext()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/05/28 16:06:16 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/05/28 16:06:16 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
23/05/28 16:06:16 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
23/05/28 16:06:16 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.


In [3]:
spark = SparkSession(sc)

In [4]:
data = spark.read.csv('/Users/tranhuonggiang/Documents/BI_DA/KHTN/KHTN_BigData in ML/b7/flights.csv', inferSchema = True, header = True)

In [5]:
data.show(3)

+---+---+---+-------+------+---+----+------+--------+-----+
|mon|dom|dow|carrier|flight|org|mile|depart|duration|delay|
+---+---+---+-------+------+---+----+------+--------+-----+
| 11| 20|  6|     US|    19|JFK|2153|  9.48|     351|   NA|
|  0| 22|  2|     UA|  1107|ORD| 316| 16.33|      82|   30|
|  2| 20|  4|     UA|   226|SFO| 337|  6.17|      82|   -8|
+---+---+---+-------+------+---+----+------+--------+-----+
only showing top 3 rows



**Chuẩn hoá dữ liệu**

In [6]:
#import required function
from pyspark.sql.functions import round

In [7]:
#convert mile to km 
data = data.withColumn('km', round(data.mile*1.60934,0))

In [8]:
#create label column with idicating whether flight delay(1) or not(0)
data = data.withColumn('label', (data.delay >0).cast('integer'))

In [9]:
data.show(3)

+---+---+---+-------+------+---+----+------+--------+-----+------+-----+
|mon|dom|dow|carrier|flight|org|mile|depart|duration|delay|    km|label|
+---+---+---+-------+------+---+----+------+--------+-----+------+-----+
| 11| 20|  6|     US|    19|JFK|2153|  9.48|     351|   NA|3465.0| null|
|  0| 22|  2|     UA|  1107|ORD| 316| 16.33|      82|   30| 509.0|    1|
|  2| 20|  4|     UA|   226|SFO| 337|  6.17|      82|   -8| 542.0|    0|
+---+---+---+-------+------+---+----+------+--------+-----+------+-----+
only showing top 3 rows



In [10]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder

In [11]:
#Create an indexer
indexer = StringIndexer(inputCol='carrier', outputCol='carrier_idex')
#Indexer identifiers categories in the data 
indexer_model = indexer.fit(data)
#Indexer creates a new column with numeric index values
data_indexed = indexer_model.transform(data)
#Repeat the process for other categorical feature 
data_indexed = StringIndexer(inputCol='org',
                             outputCol='org_idx').fit(data_indexed).transform(data_indexed)


#Create an encoder 
encoder = OneHotEncoder(inputCol='carrier_idex',
                        outputCol='carrier_vec',
                        dropLast=True) #default 
encoder = encoder.fit(data_indexed)
data_indexed = encoder.transform(data_indexed)
data_indexed = OneHotEncoder(inputCol='org_idx',
                             outputCol='org_vec',
                             dropLast=True).fit(data_indexed).transform(data_indexed)

In [12]:
data_indexed.show(3)

+---+---+---+-------+------+---+----+------+--------+-----+------+-----+------------+-------+-------------+-------------+
|mon|dom|dow|carrier|flight|org|mile|depart|duration|delay|    km|label|carrier_idex|org_idx|  carrier_vec|      org_vec|
+---+---+---+-------+------+---+----+------+--------+-----+------+-----+------------+-------+-------------+-------------+
| 11| 20|  6|     US|    19|JFK|2153|  9.48|     351|   NA|3465.0| null|         6.0|    2.0|(8,[6],[1.0])|(7,[2],[1.0])|
|  0| 22|  2|     UA|  1107|ORD| 316| 16.33|      82|   30| 509.0|    1|         0.0|    0.0|(8,[0],[1.0])|(7,[0],[1.0])|
|  2| 20|  4|     UA|   226|SFO| 337|  6.17|      82|   -8| 542.0|    0|         0.0|    1.0|(8,[0],[1.0])|(7,[1],[1.0])|
+---+---+---+-------+------+---+----+------+--------+-----+------+-----+------------+-------+-------------+-------------+
only showing top 3 rows



**Chuyển đổi dữ liệu thành vector**

In [13]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [14]:
data_indexed.columns

['mon',
 'dom',
 'dow',
 'carrier',
 'flight',
 'org',
 'mile',
 'depart',
 'duration',
 'delay',
 'km',
 'label',
 'carrier_idex',
 'org_idx',
 'carrier_vec',
 'org_vec']

In [15]:
assembler = VectorAssembler (
    inputCols = ['mon','dom','dow','carrier_vec','org_vec','km','depart','duration'],
    outputCol = 'features')

In [16]:
data_pre = assembler.transform(data_indexed)

In [17]:
data_pre.show(2)

+---+---+---+-------+------+---+----+------+--------+-----+------+-----+------------+-------+-------------+-------------+--------------------+
|mon|dom|dow|carrier|flight|org|mile|depart|duration|delay|    km|label|carrier_idex|org_idx|  carrier_vec|      org_vec|            features|
+---+---+---+-------+------+---+----+------+--------+-----+------+-----+------------+-------+-------------+-------------+--------------------+
| 11| 20|  6|     US|    19|JFK|2153|  9.48|     351|   NA|3465.0| null|         6.0|    2.0|(8,[6],[1.0])|(7,[2],[1.0])|(21,[0,1,2,9,13,1...|
|  0| 22|  2|     UA|  1107|ORD| 316| 16.33|      82|   30| 509.0|    1|         0.0|    0.0|(8,[0],[1.0])|(7,[0],[1.0])|(21,[1,2,3,11,18,...|
+---+---+---+-------+------+---+----+------+--------+-----+------+-----+------------+-------+-------------+-------------+--------------------+
only showing top 2 rows



In [18]:
final_data = data_pre.select('features', 'label')

In [19]:
final_data.count()

50000

In [20]:
final_data = final_data.na.drop()
final_data.count()

47022

**Lọc dữ liệu có label null**

In [21]:
new_data = data_pre.select('features','label').filter(data_pre.label.isNull())

In [22]:
new_data.show(3,False)

+--------------------------------------------------------------------+-----+
|features                                                            |label|
+--------------------------------------------------------------------+-----+
|(21,[0,1,2,9,13,18,19,20],[11.0,20.0,6.0,1.0,1.0,3465.0,9.48,351.0])|null |
|(21,[0,1,2,4,11,18,19,20],[4.0,2.0,5.0,1.0,1.0,415.0,8.92,65.0])    |null |
|(21,[1,2,3,11,18,19,20],[8.0,2.0,1.0,1.0,538.0,11.08,85.0])         |null |
+--------------------------------------------------------------------+-----+
only showing top 3 rows



In [23]:
new_data.count()

2978

**Xây dựng model**

In [24]:
train_data, test_data = final_data.randomSplit([0.8,0.2])

In [25]:
from pyspark.ml.classification import  GBTClassifier

In [26]:
#create a classifier model object
gbt = GBTClassifier(featuresCol = 'features',
                      labelCol = 'label',
                      predictionCol = 'prediction')

In [27]:
#fit model to data
gpt_model = gbt.fit(train_data)

In [28]:
#Find the number of trees and the relative importance of features
print('Number of trees: ', gpt_model.getNumTrees)
print('Relative importance of features: ', gpt_model.featureImportances)

Number of trees:  20
Relative importance of features:  (21,[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20],[0.2183203312026856,0.1598002163434585,0.15668450490728394,0.0034402340625058286,0.017843039044266512,0.016421958640689488,0.004912015250655779,0.009659747442116489,0.009744692395054686,0.015839736932323264,0.007429634384642113,0.03432201354423174,0.034418840787433264,0.018564547243840858,0.025584305616522272,0.01072227864065958,0.01198647405065771,0.002400113292571108,0.06162538543044953,0.1357091264469043,0.04457080434104748])


**Đánh giá kết quả**

In [29]:
#check test dataset
gpt_test_model = gpt_model.transform(test_data)

In [30]:
#inspect results
gpt_test_model.select('label', 'prediction', 'probability').show(3,False)

+-----+----------+----------------------------------------+
|label|prediction|probability                             |
+-----+----------+----------------------------------------+
|1    |1.0       |[0.09699336527190347,0.9030066347280965]|
|1    |1.0       |[0.15914056374497942,0.8408594362550206]|
|1    |1.0       |[0.15231615782700253,0.8476838421729975]|
+-----+----------+----------------------------------------+
only showing top 3 rows



23/05/28 16:09:48 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
23/05/28 16:09:48 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.VectorBLAS


In [31]:
#create a confusion matrix 
gpt_test_model.groupBy('label', 'prediction').count().show()

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|    1|       0.0|  738|
|    0|       0.0| 1147|
|    1|       1.0| 5187|
|    0|       1.0| 2318|
+-----+----------+-----+



In [32]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator

In [33]:
#Find weighted precision
multi_evaluator = MulticlassClassificationEvaluator()
acc_ = multi_evaluator.evaluate(gpt_test_model, {multi_evaluator.metricName: "accuracy"})
acc_

0.6745473908413205

In [34]:
#Find AUC
binary_evaluator = BinaryClassificationEvaluator()
auc = binary_evaluator.evaluate(gpt_test_model, {binary_evaluator.metricName: "areaUnderROC"})
auc

0.7046681157567234

> Model is not good, but the best in 3 Tree models

**Lưu và load model**

In [35]:
#save model
gpt_model.save('gpt_model_Flights_50k_new')

                                                                                

In [36]:
from pyspark.ml.classification import GBTClassificationModel
#load model from 
gpt_model2 = GBTClassificationModel.load('gpt_model_Flights_50k_new')

**Dự đoán mới**

In [37]:
#predict new values (asssuming select test_data null)
unlabeled_data = new_data.select('features')

In [38]:
predictions = gpt_model2.transform(unlabeled_data)
predictions.show(5,False)

+--------------------------------------------------------------------+--------------------------------------------+----------------------------------------+----------+
|features                                                            |rawPrediction                               |probability                             |prediction|
+--------------------------------------------------------------------+--------------------------------------------+----------------------------------------+----------+
|(21,[0,1,2,9,13,18,19,20],[11.0,20.0,6.0,1.0,1.0,3465.0,9.48,351.0])|[-0.24488620151218093,0.24488620151218093]  |[0.37994718515355796,0.620052814846442] |1.0       |
|(21,[0,1,2,4,11,18,19,20],[4.0,2.0,5.0,1.0,1.0,415.0,8.92,65.0])    |[0.050274769712135785,-0.050274769712135785]|[0.5251162275598901,0.47488377244010993]|0.0       |
|(21,[1,2,3,11,18,19,20],[8.0,2.0,1.0,1.0,538.0,11.08,85.0])         |[-0.41951020795530897,0.41951020795530897]  |[0.301741135829808,0.698258864170192]   |1.0 