In [1]:
import os
# Point both the PySpark workers and the driver at the same interpreter;
# these variables must be set before the SparkSession is created.
os.environ.update({
    "PYSPARK_PYTHON": "python3.7",
    "PYSPARK_DRIVER_PYTHON": "python3.7",
})

In [2]:
from functools import reduce
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.functions import *
from pyspark.sql.types import *

# Create (or reuse) the session for this notebook, attached to the Spark
# master at spark://spark-master:7077 under the app name "DataSentiment".
spark = (
    SparkSession.builder
    .master("spark://spark-master:7077")
    .appName("DataSentiment")
    .getOrCreate()
)

In [3]:
# Schema of the labelled output rows: the comment text and its sentiment
# label, both declared non-nullable.
schema = (
    StructType()
    .add('comment', StringType(), False)
    .add('sentiment', IntegerType(), False)
)

In [4]:
# Zero-padded two-character suffixes of the input files: '01' .. '04'.
files = [f"{index:0>2}" for index in range(1, 5)]

In [5]:
# Display the generated file suffixes as a quick sanity check.
files

['01', '02', '03', '04']

In [6]:
from textblob import TextBlob

# Records whose 'comments' collection has fewer entries than this are dropped.
MIN_COMMENTS = 30
# Local CSV file that accumulates the labelled comments from every input file.
OUTPUT_CSV = 'data_sentiment.csv'

# For each input file: load the JSON from HDFS, drop rows with nulls and
# duplicate rows, keep records with at least MIN_COMMENTS comments, label every
# comment by its TextBlob polarity (1 when polarity > 0, otherwise -1) and
# append the (comment, sentiment) rows to OUTPUT_CSV.
#
# NOTE: rows are appended whenever OUTPUT_CSV already exists, so re-running
# this cell duplicates rows unless the file is removed first.
for file in files:
    df = spark.read.format("json").load(
        "hdfs://namenode/user/root/input/data{name}.json".format(name=file),
        multiLine="true",
    )
    df = df.na.drop().dropDuplicates()
    df = df.filter(size(df['comments']) >= MIN_COMMENTS)

    # Bring the comment collections to the driver; scoring runs locally.
    comments = df.select('comments').collect()
    data = []
    skipped = 0

    for comment in comments:
        for item in comment[0]:
            try:
                sentiment = TextBlob(item).polarity
            except Exception:
                # Best-effort: an item that cannot be scored (the original
                # note says items containing special characters arrive as
                # None) is skipped. Catching Exception rather than using a
                # bare `except` keeps kernel interrupts working.
                skipped += 1
                continue

            # Neutral polarity (exactly 0) is labelled as negative.
            data.append((item, 1 if sentiment > 0 else -1))

    if skipped:
        print("data{name}.json: skipped {count} comment(s) that could not be scored".format(
            name=file, count=skipped))

    df = spark.createDataFrame(data, schema)

    # Write the header only when the file is being created; otherwise append.
    csv_exists = os.path.isfile(OUTPUT_CSV)
    df.toPandas().to_csv(
        OUTPUT_CSV,
        mode='a' if csv_exists else 'w',
        header=not csv_exists,
        index=False,
    )

In [7]:
# Shut down the Spark session now that all input files have been processed.
spark.stop()