# Khởi tạo môi trường

In [1]:
import os 

mongo = "org.mongodb.spark:mongo-spark-connector_2.12:3.0.0"

os.environ["PYSPARK_PYTHON"]="python3.7"
os.environ["PYSPARK_DRIVER_PYTHON"]="python3.7"
os.environ["PYSPARK_SUBMIT_ARGS"] = ("--packages {} pyspark-shell".format(mongo))

# Tạo phiên spark

In [2]:
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.functions import *
from pyspark.sql.types import *

spark = SparkSession \
        .builder \
        .master("local") \
        .appName("Spark") \
        .config("spark.mongodb.input.uri", "mongodb://172.16.0.2:27017/bigdata.application") \
        .config("spark.jars.packages", "org.mongodb.spark:mongo-spark-connector_2.12:3.0.0") \
        .getOrCreate()

# Đọc dữ liệu từ Mongodb

Xóa đi thuộc tính _id do mongo tự tạo

In [3]:
df = spark.read.format("mongo").load()

In [4]:
df.printSchema()

root
 |-- _id: struct (nullable = true)
 |    |-- oid: string (nullable = true)
 |-- androidVersion: integer (nullable = true)
 |-- category: string (nullable = true)
 |-- contentRating: string (nullable = true)
 |-- currentVersion: integer (nullable = true)
 |-- installs: long (nullable = true)
 |-- lastUpdate: long (nullable = true)
 |-- negative: integer (nullable = true)
 |-- positive: integer (nullable = true)
 |-- price: double (nullable = true)
 |-- ratings: long (nullable = true)
 |-- reviews: long (nullable = true)
 |-- score: double (nullable = true)
 |-- size: integer (nullable = true)
 |-- title: string (nullable = true)



In [5]:
df = df.drop('_id')

In [6]:
df.show(5)

+--------------+----------------+-------------+--------------+--------+----------+--------+--------+-----+-------+-------+---------+----+--------------------+
|androidVersion|        category|contentRating|currentVersion|installs|lastUpdate|negative|positive|price|ratings|reviews|    score|size|               title|
+--------------+----------------+-------------+--------------+--------+----------+--------+--------+-----+-------+-------+---------+----+--------------------+
|            70|    Food & Drink|     Everyone|           125|  500000|1598263608|       9|      31|  0.0|   1600|    926|3.8198757|  36|       Jumbo Extra's|
|            41|News & Magazines| Everyone 10+|           745|   10000|1586900532|      14|      26|  0.0|    190|     88|     2.97|  29|Delaware County D...|
|            43|Health & Fitness|     Everyone|           522| 1000000|1600248328|       8|      32|  0.0|  14646|   8866|4.5125256|  24|Openrider - GPS C...|
|            44|    House & Home|     Everyone

# Quan sát một số thuộc tính của dữ liệu

In [7]:
import pyspark.sql.functions as F
df.describe().select(
                    "summary",
                    F.round("installs", 4).alias("installs"),
                    F.round("negative", 4).alias("negative"),
                    F.round("positive", 4).alias("positive"),
                    F.round("price", 4).alias("price"),
                    F.round("ratings", 4).alias("ratings"),
                    F.round("reviews", 4).alias("reviews"),
                    F.round("size", 4).alias("size"),
                    F.round("score", 4).alias("score"))\
                    .show()

+-------+---------------+--------+--------+------+-----------+-----------+-------+------+
|summary|       installs|negative|positive| price|    ratings|    reviews|   size| score|
+-------+---------------+--------+--------+------+-----------+-----------+-------+------+
|  count|         3008.0|  3008.0|  3008.0|3008.0|     3008.0|     3008.0| 3008.0|3008.0|
|   mean|   2634993.8497|  7.7713|  31.754|0.1054| 59530.0851| 21788.7297|39.7743|3.9423|
| stddev|2.06761073259E7|  5.6657|  5.6946|1.8382|418068.4062|150725.1067|52.4433|0.6381|
|    min|          500.0|     1.0|     8.0|   0.0|       36.0|       29.0|    0.0| 1.017|
|    max|          1.0E9|    32.0|    39.0| 89.99| 1.380783E7|  5304892.0|  989.0|4.9375|
+-------+---------------+--------+--------+------+-----------+-----------+-------+------+



# Tạo dữ liệu cho mô hình học máy

In [8]:
df = df.select('score','size','reviews','ratings','price','positive','negative','installs');

# Feature selecting

In [9]:
feature_cols = ['size','reviews','ratings','price','positive','negative','installs']

# Tạo vector thuộc tính

In [10]:
from pyspark.ml.feature import VectorAssembler
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features") 

# Khởi tạo mô hình

Mô hình học máy nhóm em lựa chọn là: <b>Gradient-boosted tree regression

In [11]:
from pyspark.ml import Pipeline
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.evaluation import RegressionEvaluator

In [12]:
featureIndexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4)

# Tạo tập dữ liệu train, test

Dữ liệu được chia theo tỉ lệ 8:2

In [13]:
import numpy as np
rnd_seed=23
np.random.seed=rnd_seed
np.random.set_state=rnd_seed
train_data, test_data = df.randomSplit([.8,.2], seed=rnd_seed)

# Huấn luyện và kiểm thử

Đánh giá mô hình với hàm tính sai số: <b>RMSE(root-mean-square)

In [14]:
gbt = GBTRegressor(featuresCol="indexedFeatures",labelCol="score" ,maxIter=10)

In [15]:
pipeline = Pipeline(stages=[assembler, featureIndexer, gbt])

In [16]:
model = pipeline.fit(train_data)

In [17]:
predictions = model.transform(test_data)

In [18]:
predictions.select("prediction", "score", "features").show()

+------------------+---------+--------------------+
|        prediction|    score|            features|
+------------------+---------+--------------------+
| 2.664432764094531|1.0170068|[23.0,621.0,984.0...|
| 4.048884485688873|     1.59|[20.0,614.0,928.0...|
| 2.399578421695146|      1.6|[15.0,84.0,105.0,...|
|3.3655585545994953|1.6272874|[0.0,14530.0,3457...|
|2.6380712011364307|1.8461539|[32.0,34.0,39.0,0...|
| 3.193976929030702|1.9272727|[29.0,945.0,1104....|
| 4.055340827827446|1.9287671|[53.0,2244.0,3578...|
| 3.304793193200169|2.0067797|[18.0,1709.0,2920...|
|3.7256261307552054|2.0207374|[56.0,5558.0,8686...|
|3.9619794412492606| 2.068716|[22.0,57042.0,991...|
| 2.684336333177493|2.1014493|[49.0,56.0,69.0,0...|
| 2.777383871643496|     2.14|[23.0,165.0,308.0...|
| 2.757009584499807|      2.2|[29.0,258.0,439.0...|
|  4.12330435247285|2.2307692|[0.0,871.0,1511.0...|
| 3.117998467439372|     2.29|[16.0,395.0,705.0...|
| 4.099397973294209|2.3039215|[52.0,370.0,1021....|
|  4.2386726

In [19]:
evaluator = RegressionEvaluator(labelCol="score", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

Root Mean Squared Error (RMSE) on test data = 0.519705


In [20]:
gbtModel = model.stages[1]
print(gbtModel)

VectorIndexerModel: uid=VectorIndexer_868c79263843, numFeatures=7, handleInvalid=error


# Lưu mô hình

In [26]:
model.write().overwrite().save("hdfs://namenode/user/root/model/rating")

# Hàm dự doán

In [27]:
from pyspark.ml.pipeline import PipelineModel
model = PipelineModel.load("hdfs://namenode/user/root/model/rating")

In [43]:
def prediction_score(item):
    prediction = model.transform(item)
    return prediction.select('prediction').collect()[0]["prediction"]

In [29]:
size, reviews, ratings, price, positive, negative, installs = (22, 2872, 6374, 0.0, 34, 6, 100000)
feature_cols = ['size','reviews','ratings','price','positive','negative','installs']
record = [(size, reviews, ratings, price, positive, negative, installs)]
df = spark.createDataFrame(record, feature_cols)

In [44]:
prediction_score(df)

4.19760038586903