In [1]:
import os
# Make PySpark launch python3.7 for both the worker processes
# (PYSPARK_PYTHON) and the driver (PYSPARK_DRIVER_PYTHON).
os.environ.update({
    "PYSPARK_PYTHON": "python3.7",
    "PYSPARK_DRIVER_PYTHON": "python3.7",
})

In [2]:
!pip install textblob
!pip install pandas
!pip install numpy

Collecting textblob
  Downloading textblob-0.15.3-py2.py3-none-any.whl (636 kB)
     |████████████████████████████████| 636 kB 789 kB/s eta 0:00:01
Collecting nltk>=3.1
  Downloading nltk-3.5.zip (1.4 MB)
     |████████████████████████████████| 1.4 MB 4.4 MB/s eta 0:00:01
Collecting click
  Downloading click-7.1.2-py2.py3-none-any.whl (82 kB)
     |████████████████████████████████| 82 kB 258 kB/s eta 0:00:01
Collecting joblib
  Downloading joblib-0.17.0-py3-none-any.whl (301 kB)
     |████████████████████████████████| 301 kB 4.8 MB/s eta 0:00:01
Collecting regex
  Downloading regex-2020.11.13-cp37-cp37m-manylinux2014_x86_64.whl (719 kB)
     |████████████████████████████████| 719 kB 3.5 MB/s eta 0:00:01
Collecting tqdm
  Downloading tqdm-4.54.1-py2.py3-none-any.whl (69 kB)
     |████████████████████████████████| 69 kB 1.4 MB/s eta 0:00:01
Building wheels for collected packages: nltk
  Building wheel for nltk (setup.py) ... don

In [3]:
from functools import reduce
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.functions import *
from pyspark.sql.types import *

# Create (or reuse) the Spark session for this notebook, connected to the
# master at spark://spark-master:7077 under the app name "DataSentiment".
spark = (
    SparkSession.builder
    .master("spark://spark-master:7077")
    .appName("DataSentiment")
    .getOrCreate()
)

In [4]:
# Schema of the labelled output rows: the comment text and its integer
# sentiment label. Both columns are declared non-nullable.
schema = (
    StructType()
    .add('comment', StringType(), False)
    .add('sentiment', IntegerType(), False)
)

In [5]:
# Two-character, zero-padded suffixes ('01' .. '04') of the input files.
files = [str(number).zfill(2) for number in range(1, 5)]

In [6]:
# Display the generated file suffixes as a sanity check.
files

['01', '02', '03', '04']

In [7]:
# Label every comment in each input file with a TextBlob polarity-based
# sentiment (1 = positive polarity, -1 = zero or negative polarity) and write
# the labelled rows to data_sentiment.csv.
#
# NOTE(review): rows are appended whenever data_sentiment.csv already exists,
# so re-running this cell duplicates rows written by an earlier run. Delete
# the file first if a clean rebuild is wanted.
from textblob import TextBlob  # imported once instead of on every iteration

for file in files:
    df = spark.read.format("json").load(
        "hdfs://namenode/user/root/input/data{name}.json".format(name=file),
        multiLine="true",
    )
    df = df.na.drop().dropDuplicates()
    # Keep only the records that carry at least 30 comments.
    df = df.filter(size(df['comments']) >= 30)

    # Pull the comment arrays to the driver; TextBlob is applied locally below.
    comments = df.select('comments').collect()
    data = []
    skipped = 0

    for comment in comments:
        for item in comment[0]:
            try:
                sentiment = TextBlob(item).polarity
            except Exception:
                # Items containing special characters arrive as None and
                # cannot be scored; skip them, but keep count. Catching
                # Exception (not a bare except) lets KeyboardInterrupt
                # still stop the cell.
                skipped += 1
                continue

            if sentiment > 0:
                data.append((item, 1))
            elif sentiment <= 0:
                data.append((item, -1))

    if skipped:
        print("data{name}.json: skipped {count} comment(s) that could not be scored".format(
            name=file, count=skipped))

    df = spark.createDataFrame(data, schema)

    # Append to the CSV, emitting the header row only when the file is
    # being created by this write.
    write_header = not os.path.isfile('data_sentiment.csv')
    df.toPandas().to_csv('data_sentiment.csv', mode='a', header=write_header, index=False)