In [1]:
import os
os.environ["PYSPARK_PYTHON"]="python3.7"
os.environ["PYSPARK_DRIVER_PYTHON"]="python3.7"

In [2]:
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.functions import *
from pyspark.sql.types import *

spark = SparkSession.builder \
            .master("spark://spark-master:7077") \
            .appName("Project") \
            .getOrCreate()

In [3]:
static = spark.read.format("json").load("hdfs://namenode/user/root/input/data10.json", multiLine = "true")

In [4]:
dataSchema = static.schema

In [5]:
streaming = spark.readStream.schema(dataSchema).option("maxFilesPerTrigger", 1)\
.json("hdfs://namenode/user/root/input/data10.json")

In [6]:
spark.conf.set("spark.sql.shuffle.partitions", 5)

In [9]:
category = streaming.groupBy("category").count()

In [10]:
query = category.writeStream.queryName("spark")\
.format("memory").outputMode("complete")\
.start()

In [12]:
spark.sql('SELECT * FROM spark').show()

+--------+-----+
|category|count|
+--------+-----+
+--------+-----+



In [5]:
df = df.na.drop().dropDuplicates()

In [6]:
df = df.filter(size(df['comments']) >= 30)

In [7]:
df.createOrReplaceTempView("dfTable")

In [8]:
!pip install textblob

Collecting textblob
  Downloading textblob-0.15.3-py2.py3-none-any.whl (636 kB)
[K     |████████████████████████████████| 636 kB 1.1 MB/s eta 0:00:01
[?25hCollecting nltk>=3.1
  Downloading nltk-3.5.zip (1.4 MB)
[K     |████████████████████████████████| 1.4 MB 16.5 MB/s eta 0:00:01
[?25hCollecting click
  Downloading click-7.1.2-py2.py3-none-any.whl (82 kB)
[K     |████████████████████████████████| 82 kB 1.9 MB/s  eta 0:00:01
[?25hCollecting joblib
  Downloading joblib-0.17.0-py3-none-any.whl (301 kB)
[K     |████████████████████████████████| 301 kB 18.8 MB/s eta 0:00:01
[?25hCollecting regex
  Downloading regex-2020.11.13-cp37-cp37m-manylinux2014_x86_64.whl (719 kB)
[K     |████████████████████████████████| 719 kB 8.1 MB/s eta 0:00:01
[?25hCollecting tqdm
  Downloading tqdm-4.53.0-py2.py3-none-any.whl (70 kB)
[K     |████████████████████████████████| 70 kB 7.2 MB/s  eta 0:00:01
[?25hBuilding wheels for collected packages: nltk
  Building wheel for nltk (setup.py) ... [?25

In [9]:
from textblob import TextBlob

def to_sentiment(comments):
    positive, negative, neutral = 0, 0, 0
    for comment in comments:
        sentiment = TextBlob(comment).polarity
        if sentiment > 0:
            positive += 1
        elif sentiment < 0:
            negative += 1
        else:
            neutral += 1
            
    return [positive, negative, neutral]

convert = udf(to_sentiment, ArrayType(IntegerType()))

In [10]:
df = df.withColumn("comments", convert(col("comments")))

In [11]:
df = df.withColumn('positive', df.comments[0]) \
        .withColumn('negative', df.comments[1]) \
        .withColumn('neutral', df.comments[2])

In [12]:
df = df.drop('comments')

In [14]:
df.printSchema()

root
 |-- androidVersion: string (nullable = true)
 |-- category: string (nullable = true)
 |-- contentRating: string (nullable = true)
 |-- currentVersion: string (nullable = true)
 |-- installs: long (nullable = true)
 |-- lastUpdate: long (nullable = true)
 |-- price: double (nullable = true)
 |-- ratings: long (nullable = true)
 |-- reviews: long (nullable = true)
 |-- score: double (nullable = true)
 |-- size: string (nullable = true)
 |-- title: string (nullable = true)
 |-- positive: integer (nullable = true)
 |-- negative: integer (nullable = true)
 |-- neutral: integer (nullable = true)



In [17]:
df.select('neutral').show()

+-------+
|neutral|
+-------+
|      0|
|     13|
|     15|
|     35|
|     34|
|     12|
|     16|
|      3|
|      6|
|      0|
|      8|
|     24|
|      6|
|     35|
|     19|
|     37|
|      9|
|     10|
|      2|
|      8|
+-------+
only showing top 20 rows



In [None]:
import re

def remove_character(size):
    """
        This UDF takes size as input and returns number of size
    """
    return re.sub(r'[^\d]', '', size)

convert_to_number = udf(remove_character, IntegerType())

In [None]:
df = df.withColumn("size", convert_to_number(col("size")))
df.show(2)

In [None]:
df.select('size').collect()