In [0]:
%pip install nltk

In [0]:
%pip install vaderSentiment

In [0]:
%pip install liwc

In [0]:
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark.sql.window import *
from itertools import islice
import requests
import nltk
import re
import liwc
import re
from collections import Counter
# LIWC lexicon loader: `parse(token)` yields the categories a token belongs
# to; `category_names` is the list of category names returned by the loader.
parse, category_names = liwc.load_token_parser('/dbfs/mnt/group12/liwc_en2.dic')
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Fetch the NLTK stop-word corpus so that stopwords.words() can load it.
nltk.download('stopwords')
from nltk.corpus import stopwords
StopWords = stopwords.words("english")

# Supplementary stop-word list, pinned to a specific gist revision.
# The timeout keeps the cell from hanging on a stalled connection and
# raise_for_status() prevents an HTTP error page from silently becoming
# the stop-word list.
stopwords_response = requests.get(
    "https://gist.githubusercontent.com/rg089/35e00abf8941d72d419224cfd5b5925d/raw/12d899b70156fd0041fa9778d657330b024b959c/stopwords.txt",
    timeout=30,
)
stopwords_response.raise_for_status()
stopwords_list = stopwords_response.content
stopwords_2 = set(stopwords_list.decode().splitlines())

# Single analyzer instance used by sentiment_vader().
vader = SentimentIntensityAnalyzer()

In [0]:
# Defining udfs

# Stopwords token filtering
def stop_words_filter(x):
    """Build the Column predicate that decides whether a token is kept.

    A token passes when it is non-null, longer than two characters and is
    found in neither stop-word collection (``StopWords`` / ``stopwords_2``).
    """
    is_stop_word = x.isin(StopWords) | x.isin(stopwords_2)
    long_enough = F.length(x) > 2
    return ~is_stop_word & x.isNotNull() & long_enough

# Text cleaning
def clean_body(x):
    """Normalise a raw message before tokenisation.

    Steps, in order: lower-case the text, strip URLs, apply the
    letter-plus-digits clean-up regex, then remove ASCII punctuation
    (apostrophes are kept).

    Args:
        x: Raw message text, or None.

    Returns:
        The cleaned text, or None when ``x`` is None.
    """
    # A null column value reaches the UDF as None; propagate it instead of
    # failing on None.lower().
    if x is None:
        return None

    # Same character set as before, written without the invalid '\(' escape.
    punc = '!"#$%&\\()*+,-./:;<=>?@[]^_`{|}~'

    cleaned = x.lower()
    # Strip only the URL itself (up to the next whitespace) plus any line
    # breaks right after it; a greedy '.*' here would also delete every word
    # that follows the URL on the same line.
    cleaned = re.sub(r'https?://\S+[\r\n]*', '', cleaned)
    # remove special line break characters
    # NOTE(review): this drops everything from a 'u', 'a' or '|' followed by
    # digits to the end of the line -- confirm that truncation is intended.
    cleaned = re.sub(r'[u|a]\d+.*', '', cleaned)
    # One pass over the string instead of one replace() per punctuation char.
    return cleaned.translate(str.maketrans('', '', punc))

def take(n, iterable):
    """Collect at most the first ``n`` items of ``iterable`` into a list."""
    head = islice(iterable, n)
    return [item for item in head]

# Count word occurrences
def word_count_agg(token_lists, top_n=50):
    """Count token occurrences across several token lists and keep the top ones.

    Args:
        token_lists: Iterable of token lists (one list per message). May be
            None, and individual entries may be None or empty; those are
            skipped.
        top_n: How many of the most frequent tokens to keep (default 50).

    Returns:
        dict mapping token -> count for the ``top_n`` most frequent tokens,
        in descending count order; tokens with equal counts keep the order
        in which they were first seen.
    """
    # A null array reaches the UDF as None; treat it as "no tokens".
    if token_lists is None:
        return {}

    word_count = Counter()
    for token_list in token_lists:
        if token_list:  # skip None / empty token arrays
            word_count.update(token_list)

    # most_common() sorts by count (descending) with a stable tie order, the
    # same result as sorting all items and slicing the first top_n.
    return dict(word_count.most_common(top_n))

# Vader sentiment analysis
def sentiment_vader(text):
    """Score ``text`` with VADER and return its compound polarity.

    Args:
        text: Message text, or None.

    Returns:
        The ``"compound"`` value from ``vader.polarity_scores(text)``, or
        None when the text is missing or shorter than two characters, or
        when no compound score is present.
    """
    # Return None (a Spark NULL) instead of a numeric sentinel such as -69:
    # a NULL is skipped by aggregations like avg(), whereas a sentinel is a
    # number that can end up averaged into the per-topic sentiment.
    if text is None or len(text) < 2:
        return None
    return vader.polarity_scores(text).get("compound")

# LIWC linguistic analysis
def liwc_analysis(text):
    """Compute LIWC category percentages for ``text``.

    Args:
        text: Message text, or None.

    Returns:
        dict mapping each matched LIWC category to the percentage of tokens
        (rounded to two decimals) that fall into it, plus a ``"tokens"``
        entry holding the token count. Percentages and the token count are
        floats, matching the float-valued map type the UDF declares.
    """
    if text is None or len(text) < 2:
        return {"tokens": 0.0}

    # Lower-case before matching: per the liwc package README the lexicon
    # only matches lower-case strings. Splitting on any whitespace avoids
    # counting the empty strings that split(' ') yields for repeated spaces.
    tokens = text.lower().split()
    token_total = len(tokens)

    category_counts = Counter(category for token in tokens for category in parse(token))
    liwc_results = dict(category_counts)
    # Turn raw hit counts into percentages of the token total. When there are
    # no tokens there are no hits either, so the division is never reached.
    for category in category_names:
        if category in liwc_results:
            liwc_results[category] = round(liwc_results[category] / token_total * 100, 2)
    liwc_results["tokens"] = float(token_total)
    return liwc_results
  
# Wrap the plain-Python helpers as Spark UDFs, declaring each return type.
sentiment_liwc_udf = F.udf(liwc_analysis, returnType=MapType(StringType(), FloatType()))
sentiment_vader_udf = F.udf(sentiment_vader, returnType=FloatType())
udf_word_count_agg = F.udf(word_count_agg, returnType=MapType(StringType(), IntegerType()))
clean_body_udf = F.udf(clean_body, returnType=StringType())


In [0]:
# Load the raw Twitter export; the first CSV row is treated as the header.
data_output = (
    spark.read
    .option("header", True)
    .csv("dbfs:/mnt/group12/twitter/data.csv")
)

In [0]:
# Show the column names and types of the loaded DataFrame.
data_output.printSchema()

In [0]:
# Keep only messages with more than five space-separated words.
data_chunk_filtered = (
    data_output
    .withColumn('word_count', F.size(F.split(F.col('text'), ' ')))
    .filter(F.col("word_count") > 5)
)

# Clean the text, then split it on spaces and drop the tokens rejected by
# stop_words_filter.
data_chunk_tokens = (
    data_chunk_filtered
    .withColumn('cleaned_body', clean_body_udf(F.col('text')))
    .withColumn('tokens', F.filter(F.split(F.col('cleaned_body'), ' '), stop_words_filter))
)

In [0]:
# Number of messages remaining after the word-count filter.
data_chunk_tokens.count()

In [0]:
# Score the original `text` column (not the cleaned one) with VADER and LIWC.
data_sample_sentiment = (
    data_chunk_tokens
    .withColumn("vader_sentiment", sentiment_vader_udf(F.col("text")))
    .withColumn("liwc_sentiment", sentiment_liwc_udf(F.col("text")))
)

In [0]:
# Data aggregation by topic
# (output label, key read from the per-message `liwc_sentiment` map).
# 'humans' used to read the 'friends' key, duplicating the 'friends' entry
# under a different label; it now reads its own key.
# NOTE(review): confirm the LIWC dictionary defines a 'humans' category --
# if it does not, this entry will simply be null.
liwc_output_categories = [
    ('negative_emotions', 'negemo'),
    ('positive_emotions', 'posemo'),
    ('anger', 'anger'),
    ('sad', 'sad'),
    ('money', 'money'),
    ('health', 'health'),
    ('social', 'social'),
    ('anxiety', 'anx'),
    ('humans', 'humans'),
    ('family', 'family'),
    ('friends', 'friends'),
    ('focusfuture', 'focusfuture'),
    ('focuspast', 'focuspast'),
    ('focuspresent', 'focuspresent'),
    ('work', 'work'),
    ('drives', 'drives'),
    ('discrepancies', 'discrep'),
    ('time', 'time'),
    ('leisure', 'leisure'),
    ('death', 'death'),
    ('religion', 'relig'),
]

# create_map takes alternating key / value columns, so flatten the pairs.
liwc_map_entries = []
for label, category in liwc_output_categories:
    liwc_map_entries.append(F.lit(label))
    liwc_map_entries.append(F.avg(f"liwc_sentiment.{category}"))

# One row per `additional_field` value: message count, mean VADER score,
# mean LIWC percentages and the most frequent tokens.
data_grouped = data_sample_sentiment.groupby("additional_field").agg(
    F.count("_id").alias("number_of_messages"),
    F.avg("vader_sentiment").alias("avg_vader_sentiment"),
    F.create_map(*liwc_map_entries).alias('liwc_sentiment_map'),
    udf_word_count_agg(F.collect_list("tokens")).alias('word_count'),
)

In [0]:
# Write output
(
    data_grouped
    .coalesce(1)  # collapse to one partition before writing
    .write
    .mode('overwrite')
    .json("dbfs:/mnt/group12/data_output/topics_twitter.json")
)

In [0]:
# List the files present under the JSON output path.
display(dbutils.fs.ls('mnt/group12/data_output/topics_twitter.json'))

path,name,size
dbfs:/mnt/group12/data_output/topics_twitter.json/_SUCCESS,_SUCCESS,0
dbfs:/mnt/group12/data_output/topics_twitter.json/_committed_4440859935142537306,_committed_4440859935142537306,117
dbfs:/mnt/group12/data_output/topics_twitter.json/_started_4440859935142537306,_started_4440859935142537306,0
dbfs:/mnt/group12/data_output/topics_twitter.json/part-00000-tid-4440859935142537306-3f5ca972-fb0a-4248-8756-5249de36167d-424672-1-c000.json,part-00000-tid-4440859935142537306-3f5ca972-fb0a-4248-8756-5249de36167d-424672-1-c000.json,6840
