In [1]:
import os, re
from textblob import TextBlob as model
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.ml import PipelineModel

# Maven coordinate of the MongoDB Spark connector passed to --packages below.
mongo = "org.mongodb.spark:mongo-spark-connector_2.12:3.0.0"

# Point PySpark's interpreter variables at python3.7 and hand the connector
# coordinate to the PySpark shell through PYSPARK_SUBMIT_ARGS.
os.environ.update(
    {
        "PYSPARK_PYTHON": "python3.7",
        "PYSPARK_DRIVER_PYTHON": "python3.7",
        "PYSPARK_SUBMIT_ARGS": f"--packages {mongo} pyspark-shell",
    }
)

In [2]:
# Kafka connection settings; not referenced by the cells shown in this notebook.
KAFKA_BROKER = "kafka:9092"
KAFKA_TOPIC = "fixcer"

# Destination for DataFrame writes that use the "mongo" format.
MONGO_OUTPUT_URI = "mongodb://172.16.0.12:27017/bigdata.application"

# Reuse the connector coordinate defined in the first cell (`mongo`) so the value
# given to spark.jars.packages cannot drift from the one in PYSPARK_SUBMIT_ARGS.
spark = (
    SparkSession.builder
    .master("local")
    .appName("Spark")
    .config("spark.mongodb.output.uri", MONGO_OUTPUT_URI)
    .config("spark.jars.packages", mongo)
    .getOrCreate()
)

In [67]:
# Read the input JSON file from HDFS. FAILFAST makes the read raise on a
# malformed record rather than tolerating it.
df = (
    spark.read
    .format("json")
    .options(mode="FAILFAST", inferSchema="true")
    .load("hdfs://namenode/user/root/input/data07.json")
)

# Loading the saved PipelineModel is switched off; nothing in the cells shown
# here uses it.
# model1 = PipelineModel.load("hdfs://namenode/user/root/model")

In [68]:
# Discard rows that contain a null in any column, then remove duplicate rows.
df = df.dropna().drop_duplicates()

In [69]:
# Keep only rows whose `comments` array holds at least MIN_COMMENTS entries.
MIN_COMMENTS = 30
df = df.where(size(col("comments")) >= MIN_COMMENTS)

In [70]:
def convertString(string):
    """Extract the digits of ``string`` and return them as one integer.

    Every non-digit character is removed and the remaining digits are parsed
    together, so ``"4.1 and up"`` yields ``41``.

    Args:
        string: Text to parse. May be ``None`` or a non-string value.

    Returns:
        int: The parsed number, or ``0`` when ``string`` is not text or
        contains no digits.
    """
    try:
        # Raw string: "\D" in a normal literal is an invalid escape sequence.
        return int(re.sub(r'\D', '', string))
    except (TypeError, ValueError):
        # TypeError: the input is not a str (e.g. None).
        # ValueError: nothing is left to parse after stripping non-digits.
        return 0
        
# Wrap the digit parser as a Spark UDF and overwrite each listed column with
# its integer form.
convert = udf(convertString, IntegerType())
for numeric_column in ("androidVersion", "currentVersion", "size"):
    df = df.withColumn(numeric_column, convert(col(numeric_column)))

In [71]:
def to_sentiment(comments):
    """Count how many comments score as positive and how many as negative.

    Each comment is scored with ``model(comment).polarity``. A polarity of zero
    or more counts as positive; anything below zero, or any comment that fails
    to score, counts as negative.

    Args:
        comments: Iterable of comment texts, or ``None``.

    Returns:
        list: ``[positive, negative]`` counts; ``[0, 0]`` when ``comments`` is
        ``None`` or empty.
    """
    positive, negative = 0, 0

    # A missing value is treated like an empty list instead of raising
    # a TypeError from the loop below.
    if comments is None:
        return [positive, negative]

    for comment in comments:
        try:
            is_positive = model(comment).polarity >= 0
        except Exception:
            # Best effort: a comment that cannot be scored counts as negative.
            is_positive = False

        if is_positive:
            positive += 1
        else:
            negative += 1

    return [positive, negative]

# Spark UDF wrapper around to_sentiment, declared to return an array of integers.
# Note: this rebinds `convert`, which earlier held the integer-parsing UDF.
convert = udf(f=to_sentiment, returnType=ArrayType(IntegerType()))

In [72]:
# Overwrite the `comments` column with the result of the `convert` UDF.
converted_comments = convert(col("comments"))
df = df.withColumn("comments", converted_comments)

In [73]:
# Copy the first two elements of the `comments` array into their own columns.
df = (
    df
    .withColumn('positive', col('comments').getItem(0))
    .withColumn('negative', col('comments').getItem(1))
)

In [74]:
# Drop `comments`; its first two elements were copied to `positive` and `negative`.
df = df.drop("comments")

In [75]:
# Keep rows where the product of the two counts is positive, i.e. neither
# `positive` nor `negative` is zero.
count_product = col("positive") * col("negative")
df = df.where(count_product > 0)

In [None]:
# Append the rows through the "mongo" data source; no destination is given here,
# so it comes from the session configuration (spark.mongodb.output.uri).
(
    df.write
    .format("mongo")
    .mode("append")
    .save()
)