In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from modules.my_spark_regression import *
from modules.my_pyspark import *
from modules.my_drawer import MyDrawer

In [3]:
spark = MyPySpark(session=True, sql=True)
drawer = MyDrawer()

# 3. Xây dựng model

## 3.1. Chuẩn bị & chuẩn hóa dữ liệu, xác định input, output

* Đọc dữ liệu

In [4]:
file_path = r'./data/Ecommerce_Customers.csv'

In [5]:
data = spark.readFile(file_path)

In [6]:
data.printSchema()

root
 |-- Email: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Avatar: string (nullable = true)
 |-- Avg Session Length: double (nullable = true)
 |-- Time on App: double (nullable = true)
 |-- Time on Website: double (nullable = true)
 |-- Length of Membership: double (nullable = true)
 |-- Yearly Amount Spent: double (nullable = true)



In [7]:
data.head()

Row(Email='mstephenson@fernandez.com', Address='835 Frank TunnelWrightmouth, MI 82180-9605', Avatar='Violet', Avg Session Length=34.49726772511229, Time on App=12.65565114916675, Time on Website=39.57766801952616, Length of Membership=4.0826206329529615, Yearly Amount Spent=587.9510539684005)

* Xác định input và output

In [8]:
data.columns

['Email',
 'Address',
 'Avatar',
 'Avg Session Length',
 'Time on App',
 'Time on Website',
 'Length of Membership',
 'Yearly Amount Spent']

In [9]:
input_features = [
    'Avg Session Length',
    'Time on App',
    'Time on Website',
    'Length of Membership'
]

In [10]:
assembler = VectorAssembler(inputCols=input_features, outputCol='features')

In [11]:
data_pre = assembler.transform(data)

In [12]:
data_pre.select('features').show(2, False)

+--------------------------------------------------------------------------+
|features                                                                  |
+--------------------------------------------------------------------------+
|[34.49726772511229,12.65565114916675,39.57766801952616,4.0826206329529615]|
|[31.92627202636016,11.109460728682564,37.268958868297744,2.66403418213262]|
+--------------------------------------------------------------------------+
only showing top 2 rows



In [13]:
final_data = data_pre.select('features', 'Yearly Amount Spent')

In [15]:
final_data.show()

+--------------------+-------------------+
|            features|Yearly Amount Spent|
+--------------------+-------------------+
|[34.4972677251122...|  587.9510539684005|
|[31.9262720263601...|  392.2049334443264|
|[33.0009147556426...| 487.54750486747207|
|[34.3055566297555...|  581.8523440352177|
|[33.3306725236463...|  599.4060920457634|
|[33.8710378793419...|   637.102447915074|
|[32.0215955013870...|  521.5721747578274|
|[32.7391429383803...|  549.9041461052942|
|[33.9877728956856...|  570.2004089636196|
|[31.9365486184489...|  427.1993848953282|
|[33.9925727749537...|  492.6060127179966|
|[33.8793608248049...|  522.3374046069357|
|[29.5324289670579...|  408.6403510726275|
|[33.1903340437226...|  573.4158673313865|
|[32.3879758531538...|  470.4527333009554|
|[30.7377203726281...|  461.7807421962299|
|[32.1253868972878...| 457.84769594494855|
|[32.3388993230671...| 407.70454754954415|
|[32.1878120459321...|  452.3156754800354|
|[32.6178560628234...|   605.061038804892|
+----------

## 3.2. Chuẩn bị train/test data

In [16]:
train_data, test_data = final_data.randomSplit((0.7, 0.3))

In [17]:
train_data.describe().show()

+-------+-------------------+
|summary|Yearly Amount Spent|
+-------+-------------------+
|  count|                358|
|   mean| 497.63883959415574|
| stddev|   79.3679312083183|
|    min| 256.67058229005585|
|    max|  765.5184619388373|
+-------+-------------------+



In [18]:
test_data.describe().show()

+-------+-------------------+
|summary|Yearly Amount Spent|
+-------+-------------------+
|  count|                142|
|   mean|   503.537426440758|
| stddev|  79.30326424083776|
|    min|  282.4712457199145|
|    max|  669.9871405017029|
+-------+-------------------+



> * Dữ liệu train và test gần như tương đương, ko có sự chênh lệch cao về mặt thống kê

## 3.3. Xây dựng model

* Tạo model Linear Regression

In [19]:
lr = LinearRegression(featuresCol='features', labelCol='Yearly Amount Spent', predictionCol='predict_Yearly Amount Spent')

* Fit model với data và gán model cho một biến nào đó

In [20]:
lrModel = lr.fit(train_data)

* In ra coefficients và intercept

In [21]:
lrModel.coefficients, lrModel.intercept

(DenseVector([25.9994, 39.0849, -0.0119, 61.3706]), -1047.252488380595)

## 3.4. Đánh giá model vs test data

In [22]:
test_results = lrModel.evaluate(test_data)

* Đánh giá phần dư

In [23]:
test_results.residuals.show()

+-------------------+
|          residuals|
+-------------------+
| -5.500441931586067|
| 10.327593112099407|
| -4.868492214528658|
| -21.89456809587739|
|-1.4123358263849468|
| 21.860225983500868|
| 3.6401176836935747|
| -4.128576681792765|
| -4.375517854487043|
| -5.365647047029654|
|  -6.92255184825575|
| 1.0879992421271254|
|-2.5856846772807103|
| 12.130763033716732|
| -5.542887580120237|
|-17.981332682782295|
|   5.14091632855434|
| -9.180900570771655|
|  5.814355031745777|
|-0.1644965762902757|
+-------------------+
only showing top 20 rows



* Đánh giá RMSE

In [25]:
test_results.rootMeanSquaredError

10.196406828635983

* Đánh giá mean squared error

In [26]:
test_results.meanSquaredError

103.96671221505449

* Đánh giá $R^2$

In [27]:
test_results.r2

0.9833512583450277

## 3.5. Đánh giá model vs test data

In [29]:
test_model = lrModel.transform(test_data)
test_model.select('predict_Yearly Amount Spent', 'Yearly Amount Spent').show()

+---------------------------+-------------------+
|predict_Yearly Amount Spent|Yearly Amount Spent|
+---------------------------+-------------------+
|          287.9716876515006|  282.4712457199145|
|          451.4531490841305|  461.7807421962299|
|         472.37039264151827|  467.5019004269896|
|         508.84162193564316|  486.9470538397658|
|          422.7389670833363|  421.3266312569514|
|          569.9208634421666|  591.7810894256675|
|         491.53583276578183|  495.1759504494754|
|          534.8952953365547|  530.7667186547619|
|         494.18800585094846|  489.8124879964614|
|         381.70254780395385|  376.3369007569242|
|         510.31043913621625|  503.3878872879605|
|         384.06433874584786|   385.152337987975|
|          558.8838258513274|  556.2981411740467|
|         380.07417041060967|  392.2049334443264|
|          662.5628115177722|  657.0199239376519|
|          565.1072644299811|  547.1259317471988|
|         325.45352970554586|  330.5944460341002|


## 3.6. Lưu trữ & tải model

* Lưu model

In [30]:
file_path1 = r'./data/lrModel_Ecommerce_Customers'

In [31]:
lrModel.save(file_path1)

* Tải model

In [32]:
from pyspark.ml.regression import LinearRegressionModel

In [33]:
lrModel2 = LinearRegressionModel.load(file_path1)

## 3.7. Dự đoán dữ liệu mới

In [34]:
unlabeled_data = test_data.select('features')
preditions = lrModel2.transform(unlabeled_data)

In [36]:
preditions.show()

+--------------------+---------------------------+
|            features|predict_Yearly Amount Spent|
+--------------------+---------------------------+
|[30.4925366965402...|          287.9716876515006|
|[30.7377203726281...|          451.4531490841305|
|[30.8364326747734...|         472.37039264151827|
|[31.1239743499119...|         508.84162193564316|
|[31.2606468698795...|          422.7389670833363|
|[31.2834474760581...|          569.9208634421666|
|[31.3584771924370...|         491.53583276578183|
|[31.4252268808548...|          534.8952953365547|
|[31.5147378578019...|         494.18800585094846|
|[31.6253601348306...|         381.70254780395385|
|[31.7242025238451...|         510.31043913621625|
|[31.8293464559211...|         384.06433874584786|
|[31.8627411090001...|          558.8838258513274|
|[31.9262720263601...|         380.07417041060967|
|[31.9453957483445...|          662.5628115177722|
|[31.9563005605233...|          565.1072644299811|
|[31.9764800614612...|         