# Khởi tạo môi trường

In [None]:
import os, re
from textblob import TextBlob as model
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.ml import PipelineModel

# Maven coordinates of the MongoDB Spark connector package.
mongo = "org.mongodb.spark:mongo-spark-connector_2.12:3.0.0"

# Select the python3.7 interpreter for PySpark and hand the connector
# package to the PySpark shell through its submit arguments.
os.environ.update({
    "PYSPARK_PYTHON": "python3.7",
    "PYSPARK_DRIVER_PYTHON": "python3.7",
    "PYSPARK_SUBMIT_ARGS": f"--packages {mongo} pyspark-shell",
})

# Tạo kết nối giữa Spark và MongoDB

In [2]:
# Kafka endpoint and topic names.
# NOTE(review): neither constant is referenced in the visible cells.
KAFKA_BROKER = "kafka:9092"
KAFKA_TOPIC = "fixcer"

# Build (or reuse) the session against the standalone master, registering the
# MongoDB output URI and the connector package on the session config.
spark = (
    SparkSession.builder
    .master("spark://spark-master:7077")
    .appName("Spark")
    .config("spark.mongodb.output.uri", "mongodb://172.16.0.2:27017/bigdata.application")
    .config("spark.jars.packages", "org.mongodb.spark:mongo-spark-connector_2.12:3.0.0")
    .getOrCreate()
)

# Đọc dữ liệu từ HDFS

In [3]:
# Load the JSON input from HDFS; FAILFAST mode aborts the read on a
# malformed record.
df = (
    spark.read
    .format("json")
    .option("mode", "FAILFAST")
    .option("inferSchema", "true")
    .load("hdfs://namenode/user/root/input/")
)

# model1 = PipelineModel.load("hdfs://namenode/user/root/model")

In [4]:
# Show the schema of the freshly loaded JSON data.
df.printSchema()

root
 |-- androidVersion: string (nullable = true)
 |-- category: string (nullable = true)
 |-- comments: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- contentRating: string (nullable = true)
 |-- currentVersion: string (nullable = true)
 |-- installs: long (nullable = true)
 |-- lastUpdate: long (nullable = true)
 |-- price: double (nullable = true)
 |-- ratings: long (nullable = true)
 |-- reviews: long (nullable = true)
 |-- score: double (nullable = true)
 |-- size: string (nullable = true)
 |-- title: string (nullable = true)



In [None]:
# Explicit schema listing the thirteen application columns; every field is
# declared nullable.
# NOTE(review): the visible read call does not pass this schema.
schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ("androidVersion", StringType()),
        ("category", StringType()),
        ("comments", ArrayType(StringType())),
        ("contentRating", StringType()),
        ("currentVersion", StringType()),
        ("installs", LongType()),
        ("lastUpdate", LongType()),
        ("price", DoubleType()),
        ("ratings", LongType()),
        ("reviews", LongType()),
        ("score", DoubleType()),
        ("size", StringType()),
        ("title", StringType()),
    )
])

# Tiền xử lý dữ liệu

In [8]:
# Discard rows that contain null values, then remove duplicate rows.
df = df.na.drop().dropDuplicates()

In [9]:
# Keep only rows whose comments array holds at least 30 entries.
df = df.filter(size(df['comments']) >= 30)

In [10]:
def convertString(string):
    """Strip every non-digit from *string* and return the digits as one int.

    The remaining digits are concatenated, so ``"4.1 and up"`` yields ``41``
    and ``"25M"`` yields ``25``.

    Args:
        string: Raw text value; may be ``None`` or a non-string.

    Returns:
        The concatenated digits as an int, or 0 when the input is not a
        string or contains no digits at all.
    """
    try:
        return int(re.sub(r'\D', '', string))
    except (TypeError, ValueError):
        # TypeError: input is not a string (e.g. None).
        # ValueError: nothing but non-digits, so int('') fails.
        return 0
        
# Register convertString as an integer-returning UDF and apply it in turn to
# each of the three text columns that carry numeric information.
convert = udf(convertString, IntegerType())
for _column in ("androidVersion", "currentVersion", "size"):
    df = df.withColumn(_column, convert(col(_column)))

In [11]:
# Show the schema again after the string-to-integer column conversion.
df.printSchema()

root
 |-- androidVersion: integer (nullable = true)
 |-- category: string (nullable = true)
 |-- comments: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- contentRating: string (nullable = true)
 |-- currentVersion: integer (nullable = true)
 |-- installs: long (nullable = true)
 |-- lastUpdate: long (nullable = true)
 |-- price: double (nullable = true)
 |-- ratings: long (nullable = true)
 |-- reviews: long (nullable = true)
 |-- score: double (nullable = true)
 |-- size: integer (nullable = true)
 |-- title: string (nullable = true)



In [12]:
# Rebind `schema` to a single nullable string column named 'comment'.
# NOTE(review): not referenced by any of the visible cells; confirm it is still needed.
schema = StructType([StructField('comment', StringType(), True)])

# Gán nhãn comment

In [13]:
def to_sentiment(comments):
    """Count comments by the sign of their sentiment polarity.

    Each comment is scored with ``model(comment).polarity``. A polarity of
    zero or more counts as positive, anything below zero as negative. A
    comment that cannot be scored is counted as negative.

    Args:
        comments: Iterable of comment strings; may be ``None`` or empty.

    Returns:
        A two-element list ``[positive, negative]``; ``[0, 0]`` when there
        are no comments.
    """
    if comments is None:
        # A null array has nothing to score; keep the return shape stable
        # instead of raising on iteration.
        return [0, 0]

    positive, negative = 0, 0
    for comment in comments:
        try:
            if model(comment).polarity >= 0:
                positive += 1
            else:
                negative += 1
        except Exception:
            # Best effort: an unscorable comment counts as negative, but
            # KeyboardInterrupt/SystemExit are no longer swallowed.
            negative += 1

    return [positive, negative]

# Wrap to_sentiment as a UDF returning an array of integers; this rebinds
# the `convert` name used earlier for convertString.
convert = udf(to_sentiment, ArrayType(IntegerType()))

In [14]:
# Overwrite the comments column with the output of the `convert` UDF.
df = df.withColumn("comments", convert(col("comments")))

In [15]:
# Expose element 0 and element 1 of the comments array as the separate
# 'positive' and 'negative' columns.
df = df.withColumn('positive', df.comments[0]).withColumn('negative', df.comments[1])

In [16]:
# Remove the comments column from the DataFrame.
df = df.drop('comments')

In [17]:
# Keep only rows where the product of the two counts is positive, i.e.
# neither the positive nor the negative count is zero.
df = df.filter(df.positive*df.negative > 0)

In [18]:
# Show the schema after the comments column was replaced by the two counts.
df.printSchema()

root
 |-- androidVersion: integer (nullable = true)
 |-- category: string (nullable = true)
 |-- contentRating: string (nullable = true)
 |-- currentVersion: integer (nullable = true)
 |-- installs: long (nullable = true)
 |-- lastUpdate: long (nullable = true)
 |-- price: double (nullable = true)
 |-- ratings: long (nullable = true)
 |-- reviews: long (nullable = true)
 |-- score: double (nullable = true)
 |-- size: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- positive: integer (nullable = true)
 |-- negative: integer (nullable = true)



# Ghi vào MongoDB

In [19]:
# Append the rows through the "mongo" data source. No target is given here,
# so it presumably comes from the spark.mongodb.output.uri session setting.
df.write.format("mongo").mode("append").save()

In [20]:
# Stop the Spark session now that the write has been issued.
spark.stop()