# Demo Tree Model

### Dataset: flights.csv
- You'll build a regression model to predict flight delay or not 
- With 'mon', 'dom', 'dow', 'carrier_idx', 'org_idx', 'km', 'depart', 'duration' as a predictor

First thing to do is start a Spark Session

In [1]:
import findspark
findspark.init()

In [2]:
import pyspark

In [3]:
from pyspark import SparkContext
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession

In [4]:
spark = SparkSession.builder.appName('lr_demo').getOrCreate()

In [5]:
# Use Spark to read flights.csv file.
data = spark.read.csv("./data/flights.csv",inferSchema=True,header=True)

In [6]:
# Print the Schema of the DataFrame
data.printSchema()

root
 |-- mon: integer (nullable = true)
 |-- dom: integer (nullable = true)
 |-- dow: integer (nullable = true)
 |-- carrier: string (nullable = true)
 |-- flight: integer (nullable = true)
 |-- org: string (nullable = true)
 |-- mile: integer (nullable = true)
 |-- depart: double (nullable = true)
 |-- duration: integer (nullable = true)
 |-- delay: string (nullable = true)



In [7]:
data.show(3)

+---+---+---+-------+------+---+----+------+--------+-----+
|mon|dom|dow|carrier|flight|org|mile|depart|duration|delay|
+---+---+---+-------+------+---+----+------+--------+-----+
| 11| 20|  6|     US|    19|JFK|2153|  9.48|     351|   NA|
|  0| 22|  2|     UA|  1107|ORD| 316| 16.33|      82|   30|
|  2| 20|  4|     UA|   226|SFO| 337|  6.17|      82|   -8|
+---+---+---+-------+------+---+----+------+--------+-----+
only showing top 3 rows



In [8]:
data.head()

Row(mon=11, dom=20, dow=6, carrier='US', flight=19, org='JFK', mile=2153, depart=9.48, duration=351, delay='NA')

In [9]:
# for item in data.head():
#     print(item)

In [10]:
data.count()

50000

In [11]:
# Remove the 'flight' column
data = data.drop('flight')

In [12]:
# Number of records with missing 'delay' values
data.filter('delay IS NULL').count()

0

In [13]:
# Remove records with missing 'delay' values
data = data.filter('delay IS NOT NULL')

In [14]:
# Remove records with missing values in any column and get the number of remaining rows
data = data.na.drop()
data.count()

50000

In [15]:
# Import the required function
from pyspark.sql.functions import round

In [16]:
# Convert 'mile' to 'km' and drop 'mile' column
data = data.withColumn('km', round(data.mile * 1.60934, 0))

In [17]:
# Create 'label' column indicating whether flight delayed (1) or not (0)
data = data.withColumn('label', (data.delay >= 15).cast('integer'))
# Check first five records
data.show(3)

+---+---+---+-------+---+----+------+--------+-----+------+-----+
|mon|dom|dow|carrier|org|mile|depart|duration|delay|    km|label|
+---+---+---+-------+---+----+------+--------+-----+------+-----+
| 11| 20|  6|     US|JFK|2153|  9.48|     351|   NA|3465.0| null|
|  0| 22|  2|     UA|ORD| 316| 16.33|      82|   30| 509.0|    1|
|  2| 20|  4|     UA|SFO| 337|  6.17|      82|   -8| 542.0|    0|
+---+---+---+-------+---+----+------+--------+-----+------+-----+
only showing top 3 rows



## Categories data

In [18]:
from pyspark.ml.feature import StringIndexer

In [19]:
# Create an indexer
indexer = StringIndexer(inputCol='carrier', outputCol='carrier_idx')

# Indexer identifies categories in the data
indexer_model = indexer.fit(data)

# Indexer creates a new column with numeric index values
data_indexed = indexer_model.transform(data)

# Repeat the process for the other categorical feature
data_indexed = StringIndexer(inputCol='org', outputCol='org_idx').fit(data_indexed).transform(data_indexed)

In [20]:
data_indexed.show(3)

+---+---+---+-------+---+----+------+--------+-----+------+-----+-----------+-------+
|mon|dom|dow|carrier|org|mile|depart|duration|delay|    km|label|carrier_idx|org_idx|
+---+---+---+-------+---+----+------+--------+-----+------+-----+-----------+-------+
| 11| 20|  6|     US|JFK|2153|  9.48|     351|   NA|3465.0| null|        6.0|    2.0|
|  0| 22|  2|     UA|ORD| 316| 16.33|      82|   30| 509.0|    1|        0.0|    0.0|
|  2| 20|  4|     UA|SFO| 337|  6.17|      82|   -8| 542.0|    0|        0.0|    1.0|
+---+---+---+-------+---+----+------+--------+-----+------+-----+-----------+-------+
only showing top 3 rows



## Setting Up DataFrame for Machine Learning 

## Assembling columns

In [21]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [22]:
data_indexed.columns

['mon',
 'dom',
 'dow',
 'carrier',
 'org',
 'mile',
 'depart',
 'duration',
 'delay',
 'km',
 'label',
 'carrier_idx',
 'org_idx']

In [23]:
# Create an assembler object
assembler = VectorAssembler(inputCols=[
    'mon', 'dom', 'dow', 'carrier_idx', 'org_idx', 'km', 'depart', 'duration'
], outputCol='features')

In [24]:
data_pre = assembler.transform(data_indexed)

In [25]:
# Check the resulting column
data_pre.select('features', 'label').show(2, truncate=False)

+-----------------------------------------+-----+
|features                                 |label|
+-----------------------------------------+-----+
|[11.0,20.0,6.0,6.0,2.0,3465.0,9.48,351.0]|null |
|[0.0,22.0,2.0,0.0,0.0,509.0,16.33,82.0]  |1    |
+-----------------------------------------+-----+
only showing top 2 rows



In [26]:
data_pre.show(3, False)

+---+---+---+-------+---+----+------+--------+-----+------+-----+-----------+-------+-----------------------------------------+
|mon|dom|dow|carrier|org|mile|depart|duration|delay|km    |label|carrier_idx|org_idx|features                                 |
+---+---+---+-------+---+----+------+--------+-----+------+-----+-----------+-------+-----------------------------------------+
|11 |20 |6  |US     |JFK|2153|9.48  |351     |NA   |3465.0|null |6.0        |2.0    |[11.0,20.0,6.0,6.0,2.0,3465.0,9.48,351.0]|
|0  |22 |2  |UA     |ORD|316 |16.33 |82      |30   |509.0 |1    |0.0        |0.0    |[0.0,22.0,2.0,0.0,0.0,509.0,16.33,82.0]  |
|2  |20 |4  |UA     |SFO|337 |6.17  |82      |-8   |542.0 |0    |0.0        |1.0    |[2.0,20.0,4.0,0.0,1.0,542.0,6.17,82.0]   |
+---+---+---+-------+---+----+------+--------+-----+------+-----+-----------+-------+-----------------------------------------+
only showing top 3 rows



In [27]:
final_data = data_pre.select("features","label")
final_data.count()

50000

In [28]:
final_data = final_data.na.drop()
final_data.count()

47022

In [29]:
final_data.show(5)

+--------------------+-----+
|            features|label|
+--------------------+-----+
|[0.0,22.0,2.0,0.0...|    1|
|[2.0,20.0,4.0,0.0...|    0|
|[9.0,13.0,1.0,1.0...|    0|
|[5.0,2.0,1.0,0.0,...|    0|
|[7.0,2.0,6.0,1.0,...|    1|
+--------------------+-----+
only showing top 5 rows



In [30]:
train_data,test_data = final_data.randomSplit([0.8,0.2])

In [31]:
train_data.describe().show()

+-------+------------------+
|summary|             label|
+-------+------------------+
|  count|             37544|
|   mean|0.5117728531855956|
| stddev|0.4998680378530724|
|    min|                 0|
|    max|                 1|
+-------+------------------+



In [32]:
test_data.describe().show()

+-------+------------------+
|summary|             label|
+-------+------------------+
|  count|              9478|
|   mean|0.5094956741928677|
| stddev|0.4999361982425833|
|    min|                 0|
|    max|                 1|
+-------+------------------+



# Decision Tree
- ...

In [33]:
from pyspark.ml.classification import DecisionTreeClassifier

In [34]:
tree = DecisionTreeClassifier(featuresCol='features', labelCol='label', predictionCol='prediction')

In [35]:
tree_model = tree.fit(train_data)

In [36]:
test_model = tree_model.transform(test_data)

In [37]:
test_model.select('label', 'prediction', 'probability').show(3, False)

+-----+----------+----------------------------------------+
|label|prediction|probability                             |
+-----+----------+----------------------------------------+
|1    |1.0       |[0.39353176874642243,0.6064682312535775]|
|1    |1.0       |[0.32442233965392187,0.6755776603460781]|
|1    |1.0       |[0.32442233965392187,0.6755776603460781]|
+-----+----------+----------------------------------------+
only showing top 3 rows



# Triển khai random forest

In [38]:
from pyspark.ml.classification import RandomForestClassifier

In [39]:
rfc = RandomForestClassifier(featuresCol='features', labelCol='label', predictionCol='prediction')

In [40]:
rfc_model = rfc.fit(train_data)

_Xem số cây trog rừng_

In [41]:
rfc_model.getNumTrees

20

_Xem vai trò của các thuộc tính xem thuộc tính nào quan trong hơn_

In [42]:
rfc_model.featureImportances

SparseVector(8, {0: 0.1907, 1: 0.0166, 2: 0.0148, 3: 0.0672, 4: 0.2573, 5: 0.0324, 6: 0.3756, 7: 0.0453})

> * Dựa trên các này để chọn feature selection

* Đánh giá kết quả

In [43]:
rfc_test_model = rfc_model.transform(test_data)

In [44]:
rfc_test_model.select('label', 'prediction', 'probability').show(3, False)

+-----+----------+---------------------------------------+
|label|prediction|probability                            |
+-----+----------+---------------------------------------+
|1    |1.0       |[0.402455822191823,0.5975441778081769] |
|1    |1.0       |[0.3622928439402501,0.6377071560597499]|
|1    |1.0       |[0.3622928439402501,0.6377071560597499]|
+-----+----------+---------------------------------------+
only showing top 3 rows



# Đánh giá két quả

In [45]:
rfc_test_model.groupBy('label', 'prediction').count().show()

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|    1|       0.0| 1448|
|    0|       0.0| 2560|
|    1|       1.0| 3381|
|    0|       1.0| 2089|
+-----+----------+-----+



In [47]:
TN = rfc_test_model.filter('prediction = 0 and label = prediction').count()
TP = rfc_test_model.filter('prediction = 1 and label = prediction').count()
FN = rfc_test_model.filter('prediction = 0 and label != prediction').count()
FP = rfc_test_model.filter('prediction = 1 and label != prediction').count()



In [48]:
accuracy = (TN + TP) / (TN + TP + FN + FP)

In [49]:
accuracy

0.6268200042202996

# Lưu và load model

In [50]:
rfc_model.save('./data/rfc_model_flight_50k')

In [51]:
from pyspark.ml.classification import RandomForestClassificationModel

In [54]:
rfc_model2 = RandomForestClassificationModel.load('./data/rfc_model_flight_50k')

In [55]:
unlabeled_data = test_data.select('features')

In [56]:
predictions = rfc_model2.transform(unlabeled_data)

In [57]:
predictions.show(3, False)

+-------------------------------------+-------------------------------------+---------------------------------------+----------+
|features                             |rawPrediction                        |probability                            |prediction|
+-------------------------------------+-------------------------------------+---------------------------------------+----------+
|(8,[1,5,6,7],[6.0,378.0,21.33,69.0]) |[8.04911644383646,11.950883556163538]|[0.402455822191823,0.5975441778081769] |1.0       |
|(8,[1,5,6,7],[6.0,2438.0,12.0,244.0])|[7.245856878805,12.754143121194996]  |[0.3622928439402501,0.6377071560597499]|1.0       |
|(8,[1,5,6,7],[6.0,2971.0,11.5,291.0])|[7.245856878805,12.754143121194996]  |[0.3622928439402501,0.6377071560597499]|1.0       |
+-------------------------------------+-------------------------------------+---------------------------------------+----------+
only showing top 3 rows



# Boosting - gradient boosted trees

In [59]:
from pyspark.ml.classification import GBTClassifier

In [60]:
gbt = GBTClassifier(featuresCol='features', labelCol='label', predictionCol='prediction')

In [61]:
gbt_model = gbt.fit(train_data)

In [62]:
gbt_model.getNumTrees

20

In [63]:
gbt_model.featureImportances

SparseVector(8, {0: 0.2055, 1: 0.1465, 2: 0.1449, 3: 0.0904, 4: 0.1591, 5: 0.0617, 6: 0.1538, 7: 0.0382})

_Đánh giá kết quả_

In [64]:
gbt_test_model = gbt_model.transform(test_data)

In [65]:
gbt_test_model.select('label', 'prediction', 'probability').show(3, False)

+-----+----------+----------------------------------------+
|label|prediction|probability                             |
+-----+----------+----------------------------------------+
|1    |1.0       |[0.25586921312132144,0.7441307868786786]|
|1    |1.0       |[0.3445452533501472,0.6554547466498528] |
|1    |1.0       |[0.33729927509134944,0.6627007249086505]|
+-----+----------+----------------------------------------+
only showing top 3 rows



In [67]:
gbt_test_model.groupBy('label', 'prediction').count().show()

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|    1|       0.0| 1410|
|    0|       0.0| 2808|
|    1|       1.0| 3419|
|    0|       1.0| 1841|
+-----+----------+-----+



_Lưu model_

In [69]:
gbt_model.save('gbt_model_flight')

In [70]:
from pyspark.ml.classification import GBTClassificationModel

In [71]:
gbt_model2 = GBTClassificationModel.load('gbt_model_flight')

In [72]:
unlabeled_data = test_data.select('features')

In [73]:
predictions = gbt_model2.transform(unlabeled_data)

In [74]:
predictions.select('features', 'probability', 'prediction').show(3, False)

+-------------------------------------+----------------------------------------+----------+
|features                             |probability                             |prediction|
+-------------------------------------+----------------------------------------+----------+
|(8,[1,5,6,7],[6.0,378.0,21.33,69.0]) |[0.25586921312132144,0.7441307868786786]|1.0       |
|(8,[1,5,6,7],[6.0,2438.0,12.0,244.0])|[0.3445452533501472,0.6554547466498528] |1.0       |
|(8,[1,5,6,7],[6.0,2971.0,11.5,291.0])|[0.33729927509134944,0.6627007249086505]|1.0       |
+-------------------------------------+----------------------------------------+----------+
only showing top 3 rows



# So sánh

In [75]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [76]:
dtc_predictions = tree_model.transform(test_data)
rfc_predictions = rfc_model.transform(test_data)
gbt_predictions = gbt_model.transform(test_data)

In [77]:
acc_evaluator = MulticlassClassificationEvaluator(labelCol='label', predictionCol='prediction', metricName='accuracy')

In [78]:
dtc_acc = acc_evaluator.evaluate(dtc_predictions)
rfc_acc = acc_evaluator.evaluate(rfc_predictions)
gbt_acc = acc_evaluator.evaluate(gbt_predictions)


In [79]:
dtc_acc, rfc_acc, gbt_acc

(0.6254484068368854, 0.6268200042202996, 0.6569951466554126)

In [80]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [81]:
evaluator = BinaryClassificationEvaluator()

In [82]:
dtc_acc_2 = evaluator.evaluate(dtc_predictions)
rfc_acc_2 = evaluator.evaluate(rfc_predictions)
gbt_acc_2 = evaluator.evaluate(gbt_predictions)

In [84]:
dtc_acc_2,rfc_acc_2,gbt_acc_2,

(0.6359242603826519, 0.6762839108257399, 0.7151133845264553)