In [0]:
from pyspark.sql import functions as F
from pyspark.sql.types import *

In [0]:
# Load the pre-filtered input as a Spark DataFrame.
# Parquet files carry their own schema, so the CSV-only "header" option is not passed.
data_sample = spark.read.parquet("dbfs:/mnt/group12/filtered/")

In [0]:
# Spark action: the cell output is the number of rows in data_sample.
data_sample.count()

In [0]:
%pip install vaderSentiment

In [0]:
# Vader Sentiment Analysis
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Module-level analyzer instance; sentiment_vader() calls vader.polarity_scores() on it.
vader = SentimentIntensityAnalyzer()

# Sentinel stored when a row has no usable text or no compound score is available.
# It must be a float: a Python int returned from a UDF declared FloatType() is
# silently written as NULL by Spark instead of being converted.
_VADER_MISSING = -69.0


def sentiment_vader(text):
    """Return the VADER compound polarity score for ``text``.

    Args:
        text: The string to score; may be ``None``.

    Returns:
        float: The ``"compound"`` entry of ``vader.polarity_scores(text)``, or
        the sentinel ``-69.0`` when ``text`` is ``None``, is shorter than two
        characters, or the analyzer result carries no compound score.
    """
    # Guard first so the analyzer is never called on missing / trivial input.
    if text is None or len(text) < 2:
        return _VADER_MISSING
    compound = vader.polarity_scores(text).get("compound")
    if compound is None:
        return _VADER_MISSING
    # Coerce so the value always matches the FloatType() declared on the UDF.
    return float(compound)

# Spark UDF wrapper around sentiment_vader; the result column is typed as float.
sentiment_vader_udf = F.udf(
    lambda body_text: sentiment_vader(body_text),
    returnType=FloatType(),
)

In [0]:
# Add the per-row VADER score, computed from the "body" column.
data_sample_sentiment = data_sample.withColumn(
    "vader_sentiment",
    sentiment_vader_udf(F.col("body")),
)

In [0]:
%pip install liwc

In [0]:
# LIWC Linguistic Dimensions Analysis
import liwc
import re
from collections import Counter

# Load the LIWC dictionary once for the notebook. liwc_analysis() calls `parse`
# on each token and iterates the categories it yields; `category_names` is the
# collection of category names returned alongside it.
# NOTE(review): the path uses the /dbfs/ local mount rather than a dbfs:/ URI,
# presumably because liwc reads the file with plain Python I/O - confirm.
parse, category_names = liwc.load_token_parser('/dbfs/mnt/group12/liwc_en2.dic')

def liwc_analysis(text):
    """Compute LIWC category percentages for ``text``.

    The text is lower-cased and split on whitespace; every token is looked up
    with the module-level ``parse`` function and the hits per category are
    expressed as a percentage of the token count, rounded to two decimals.

    Args:
        text: The string to analyse; may be ``None``.

    Returns:
        dict[str, float]: One entry per matched category (percentage of
        tokens) plus a ``"tokens"`` entry holding the token count. When
        ``text`` is ``None`` or shorter than two characters the result is
        ``{"tokens": 0.0}``. Every value is a Python float so that it survives
        a ``MapType(StringType(), FloatType())`` UDF return type; an int value
        would be stored as NULL by Spark.
    """
    if text is None or len(text) < 2:
        return {"tokens": 0.0}

    # The liwc parser only matches lower-case entries, so normalise case first.
    # split() without an argument splits on any whitespace run, so repeated
    # spaces, tabs and newlines do not produce empty tokens that would inflate
    # the denominator.
    tokens = text.lower().split()
    token_count = len(tokens)

    category_counts = Counter(
        category for token in tokens for category in parse(token)
    )
    # category_counts is empty whenever token_count is 0, so the division
    # below can never divide by zero.
    liwc_results = {
        category: round((float(hits) / token_count) * 100, 2)
        for category, hits in category_counts.items()
    }
    liwc_results["tokens"] = float(token_count)
    return liwc_results
    
# Spark UDF wrapper around liwc_analysis; the result is a string -> float map column.
sentiment_liwc_udf = F.udf(
    lambda body_text: liwc_analysis(body_text),
    returnType=MapType(StringType(), FloatType()),
)

In [0]:
# Add the per-row LIWC category map, computed from the "body" column.
data_sample_sentiment = data_sample_sentiment.withColumn(
    "liwc_sentiment",
    sentiment_liwc_udf(F.col("body")),
)

In [0]:
# Persist the enriched frame as Parquet, one directory per created_at_month,
# replacing whatever is already at the target path.
(
    data_sample_sentiment.write
    .mode('overwrite')
    .partitionBy('created_at_month')
    .parquet("dbfs:/mnt/group12/sentiment/")
)

In [0]:
# Testing output: read back what was just written to inspect it.
# Parquet files carry their own schema, so the CSV-only "header" option is not passed.
data_output = spark.read.parquet("dbfs:/mnt/group12/sentiment/")

In [0]:
# Print the schema of the re-read output so the added sentiment columns can be checked.
data_output.printSchema()

In [0]:
# Display the first 100 LIWC maps without truncating the cell contents.
data_output.select('liwc_sentiment').show(n=100, truncate=False)