In [1]:
# findspark.init() is called before any pyspark import so that the pyspark
# package can be located and imported.
import findspark
findspark.init()

from pyspark import SparkContext
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession

# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings that may be useful.
import warnings
warnings.filterwarnings('ignore')

# NOTE(review): wildcard import; pyspark.sql.functions exports names such as
# sum/min/max/abs/round that shadow the Python builtins. None of the visible
# cells appears to use anything from it.
from pyspark.sql.functions import *

In [2]:
# Reuse the running SparkContext if one already exists (e.g. when this cell is
# re-executed); otherwise create a new one. Calling SparkContext() a second
# time in the same kernel raises an error, getOrCreate() does not.
sc = SparkContext.getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/05/28 08:03:51 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Wrap the existing SparkContext in a SparkSession, the entry point used below
# for reading data into DataFrames.
spark = SparkSession(sparkContext=sc)

In [5]:
import os

# Location of the flights CSV. Set the FLIGHTS_CSV environment variable to run
# the notebook on another machine; the fallback is the original local path.
flights_csv_path = os.environ.get(
    'FLIGHTS_CSV',
    '/Users/tranhuonggiang/Documents/BI_DA/KHTN/KHTN_BigData in ML/b7/flights.csv',
)

# header=True takes the column names from the first row; inferSchema=True lets
# Spark guess each column's type from the file contents.
# NOTE(review): the preview below shows the literal string "NA" in `delay`;
# consider nullValue='NA' if that column is ever needed as a number.
data = spark.read.csv(flights_csv_path, inferSchema=True, header=True)

In [6]:
# Preview the first three rows of the raw flights data.
data.show(n=3)

+---+---+---+-------+------+---+----+------+--------+-----+
|mon|dom|dow|carrier|flight|org|mile|depart|duration|delay|
+---+---+---+-------+------+---+----+------+--------+-----+
| 11| 20|  6|     US|    19|JFK|2153|  9.48|     351|   NA|
|  0| 22|  2|     UA|  1107|ORD| 316| 16.33|      82|   30|
|  2| 20|  4|     UA|   226|SFO| 337|  6.17|      82|   -8|
+---+---+---+-------+------+---+----+------+--------+-----+
only showing top 3 rows



**Chuyển đổi dữ liệu thành vector**

In [7]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [8]:
# Pack the single predictor column 'mile' into a vector column named
# 'features', the input column the regression model is configured to read.
assembler = VectorAssembler(inputCols=['mile'], outputCol='features')

In [9]:
# Append the assembled 'features' vector column to the raw data.
data_pre = assembler.transform(data)

In [10]:
# Check that the new 'features' column was added as expected.
data_pre.show(n=2)

+---+---+---+-------+------+---+----+------+--------+-----+--------+
|mon|dom|dow|carrier|flight|org|mile|depart|duration|delay|features|
+---+---+---+-------+------+---+----+------+--------+-----+--------+
| 11| 20|  6|     US|    19|JFK|2153|  9.48|     351|   NA|[2153.0]|
|  0| 22|  2|     UA|  1107|ORD| 316| 16.33|      82|   30| [316.0]|
+---+---+---+-------+------+---+----+------+--------+-----+--------+
only showing top 2 rows



In [11]:
# Keep only the model input vector and the regression target ('duration').
final_data = data_pre.select('features', 'duration')

In [12]:
# Row count before dropping nulls, for comparison with the count afterwards.
final_data.count()

50000

In [13]:
# Discard every row that contains a null value, then count again to see how
# many rows (if any) were removed.
final_data = final_data.dropna()
final_data.count()

50000

In [14]:
# Split roughly 80% / 20% into training and test sets. A fixed seed makes the
# split, and therefore every metric computed below, reproducible across runs.
train_data, test_data = final_data.randomSplit([0.8, 0.2], seed=42)

In [15]:
# Summary statistics (count, mean, stddev, min, max) for the training split.
train_data.describe().show()

+-------+------------------+
|summary|          duration|
+-------+------------------+
|  count|             40111|
|   mean|152.18187030989006|
| stddev| 87.29411140629854|
|    min|                30|
|    max|               560|
+-------+------------------+



In [16]:
# Same summary for the test split, to compare its distribution with training.
test_data.describe().show()

+-------+-----------------+
|summary|         duration|
+-------+-----------------+
|  count|             9889|
|   mean|150.0782687834968|
| stddev|86.01128932400309|
|    min|               30|
|    max|              560|
+-------+-----------------+



**Xây dựng model**

In [17]:
from pyspark.ml.regression import LinearRegression

In [18]:
# Configure (but do not yet train) a linear regression estimator: inputs are
# read from 'features', the target from 'duration', and the model's output is
# written to 'prediction'.
lr = LinearRegression(
    featuresCol='features',
    labelCol='duration',
    predictionCol='prediction',
)

In [19]:
# Fit the estimator on the training split; the result is the trained model.
lrModel = lr.fit(train_data)

23/05/28 08:16:47 WARN Instrumentation: [ff6a60e8] regParam is zero, which might cause numerical instability and overfitting.
23/05/28 08:16:47 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
23/05/28 08:16:47 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.VectorBLAS
23/05/28 08:16:47 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.lapack.JNILAPACK


In [20]:
# Report the fitted coefficient vector and the intercept.
summary_line = "Coefficients: {} Intercept: {}".format(
    lrModel.coefficients,
    lrModel.intercept,
)
print(summary_line)

Coefficients: [0.12167042364354663] Intercept: 44.449894646576816


**Đánh giá kết quả**

In [24]:
# Apply the fitted model to the test split; this adds a 'prediction' column.
test_model = lrModel.transform(test_data)

In [25]:
# Put predicted and actual durations side by side for the first five rows.
predicted_vs_actual = test_model.select("prediction", "duration")
predicted_vs_actual.show(n=5)

+------------------+--------+
|        prediction|duration|
+------------------+--------+
|52.601813030694444|      44|
|52.601813030694444|      44|
|52.601813030694444|      44|
|52.601813030694444|      46|
|52.601813030694444|      46|
+------------------+--------+
only showing top 5 rows



In [26]:
# Evaluate the fitted model on the held-out test split. `test_results` was not
# defined anywhere in the notebook, so this cell failed with NameError on a
# fresh kernel; computing it here makes the cell self-contained.
test_results = lrModel.evaluate(test_data)

# Error metrics are in the units of 'duration' (squared for MSE); R2 is the
# share of variance in 'duration' explained by the model.
print('RMSE: {}'.format(test_results.rootMeanSquaredError))
print('MSE: {}'.format(test_results.meanSquaredError))
print('R2: {}'.format(test_results.r2))

RMSE: 16.664566132915972
MSE: 277.70776439832997
R2: 0.9624576870410794


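> Good result: on the test set the model explains about 96% of the variance in `duration` (R² ≈ 0.96), with an RMSE of about 16.7.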

**Lưu và load model**

In [27]:
# Persist the fitted model to the 'lrModel_Flights_50k' directory. Going
# through the writer with overwrite() lets this cell be re-run; a plain
# save() fails when the target path already exists.
lrModel.write().overwrite().save('lrModel_Flights_50k')

                                                                                

In [28]:
from pyspark.ml.regression import LinearRegressionModel
# Load the model back from the directory it was saved to above.
lfModel2 = LinearRegressionModel.load('lrModel_Flights_50k')

In [29]:
# Predict new values: simulate unlabeled input by keeping only the 'features'
# column of test_data.
unlabeled_data = test_data.select('features')

In [30]:
# Score the unlabeled rows with the reloaded model and preview five of them.
predictions = lfModel2.transform(unlabeled_data)
predictions.show(n=5)

+--------+------------------+
|features|        prediction|
+--------+------------------+
|  [67.0]|52.601813030694444|
|  [67.0]|52.601813030694444|
|  [67.0]|52.601813030694444|
|  [67.0]|52.601813030694444|
|  [67.0]|52.601813030694444|
+--------+------------------+
only showing top 5 rows

