In [1]:
import os
os.environ["PYSPARK_PYTHON"]="python3.7"
os.environ["PYSPARK_DRIVER_PYTHON"]="python3.7"

In [2]:
import json
from functools import reduce
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.functions import *
from pyspark.sql.types import *

spark = SparkSession.builder \
            .master("spark://spark-master:7077") \
            .appName("Tutorial") \
            .getOrCreate()

In [3]:
df = spark.read.format("json").load("hdfs://namenode/user/root/input/data10.json", multiLine = "true")

In [4]:
df.printSchema()

root
 |-- androidVersion: string (nullable = true)
 |-- category: string (nullable = true)
 |-- comments: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- contentRating: string (nullable = true)
 |-- currentVersion: string (nullable = true)
 |-- installs: long (nullable = true)
 |-- lastUpdate: long (nullable = true)
 |-- price: double (nullable = true)
 |-- ratings: long (nullable = true)
 |-- reviews: long (nullable = true)
 |-- score: double (nullable = true)
 |-- size: string (nullable = true)
 |-- title: string (nullable = true)



In [5]:
df = df.na.drop().dropDuplicates()

In [6]:
df = df.filter(size(df['comments']) >= 30)

In [7]:
df.createOrReplaceTempView("dfTable")

In [8]:
!pip install textblob

Collecting textblob
  Downloading textblob-0.15.3-py2.py3-none-any.whl (636 kB)
[K     |████████████████████████████████| 636 kB 800 kB/s eta 0:00:01
[?25hCollecting nltk>=3.1
  Downloading nltk-3.5.zip (1.4 MB)
[K     |████████████████████████████████| 1.4 MB 5.3 MB/s eta 0:00:01
[?25hCollecting click
  Downloading click-7.1.2-py2.py3-none-any.whl (82 kB)
[K     |████████████████████████████████| 82 kB 880 kB/s eta 0:00:01
[?25hCollecting joblib
  Downloading joblib-0.17.0-py3-none-any.whl (301 kB)
[K     |████████████████████████████████| 301 kB 7.1 MB/s eta 0:00:01
[?25hCollecting regex
  Downloading regex-2020.11.13-cp37-cp37m-manylinux2014_x86_64.whl (719 kB)
[K     |████████████████████████████████| 719 kB 6.3 MB/s eta 0:00:01
[?25hCollecting tqdm
  Downloading tqdm-4.52.0-py2.py3-none-any.whl (71 kB)
[K     |████████████████████████████████| 71 kB 4.5 MB/s  eta 0:00:01
[?25hBuilding wheels for collected packages: nltk
  Building wheel for nltk (setup.py) ... [?25ldo

In [9]:
from textblob import TextBlob

def to_sentiment(comments):
    """
        This UDF takes size as input and returns number of size
    """
    positive, negative, neutral = 0, 0, 0
    for comment in comments:
        sentiment = TextBlob(comment).polarity
        if sentiment > 0:
            positive += 1
        elif sentiment < 0:
            negative += 1
        else:
            neutral += 1
            
    return (positive, negative, neutral)

convert = udf(to_sentiment, StringType())

In [16]:
df = df.withColumn("tmp", convert(col("comments")))
df.select('tmp').show(2)

+--------------------+
|                 tmp|
+--------------------+
|[Ljava.lang.Objec...|
|[Ljava.lang.Objec...|
+--------------------+
only showing top 2 rows



In [10]:
from platform import python_version

print(python_version())

3.7.3


In [11]:
import re

def remove_character(size):
    """
        This UDF takes size as input and returns number of size
    """
    return re.sub(r'[^\d]', '', size)

convert_to_number = udf(remove_character, IntegerType())

In [12]:
df = df.withColumn("size", convert_to_number(col("size")))
df.show(2)

+--------------+-----------------+--------------------+-------------+--------------+--------+----------+-----+-------+-------+-----+----+--------------------+
|androidVersion|         category|            comments|contentRating|currentVersion|installs|lastUpdate|price|ratings|reviews|score|size|               title|
+--------------+-----------------+--------------------+-------------+--------------+--------+----------+-----+-------+-------+-----+----+--------------------+
|           5.0|          Weather|[Recent updates a...| Everyone 10+|     v4.35.5.2|   10000|1602129706|  0.0|    104|     51| 3.51|null|KX Storm Team - N...|
|           4.1|Books & Reference|[no Arabic audio,...|     Everyone|         1.6.3|    5000|1559180891|  0.0|    247|     80| 4.75|null|Quran Audio - Urd...|
+--------------+-----------------+--------------------+-------------+--------------+--------+----------+-----+-------+-------+-----+----+--------------------+
only showing top 2 rows

