In [0]:
%pip install nltk

In [0]:
from pyspark.sql.session import SparkSession
from pyspark.conf import SparkConf
from pyspark.sql import functions as F
from pyspark.sql.types import *
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import re
import requests
import numpy as np

from pyspark.ml.feature import CountVectorizer , IDF
from pyspark.ml.linalg import Vector, Vectors, SparseVector
from pyspark.ml.clustering import LDA

# Getting stopwords lists
# Two sources are combined downstream: NLTK's English list and an extra list
# downloaded from a gist pinned to a specific revision.
StopWords = stopwords.words("english")

STOPWORDS_URL = "https://gist.githubusercontent.com/rg089/35e00abf8941d72d419224cfd5b5925d/raw/12d899b70156fd0041fa9778d657330b024b959c/stopwords.txt"
# A timeout keeps the notebook from hanging on a stalled connection, and
# raise_for_status() stops an HTTP error page from being split into lines and
# silently used as the stop-word list.
stopwords_response = requests.get(STOPWORDS_URL, timeout=30)
stopwords_response.raise_for_status()
stopwords_list = stopwords_response.content  # raw bytes, one stop word per line
stopwords_2 = set(stopwords_list.decode().splitlines())

In [0]:
# Defining UDFs

# Text cleaning
# Characters deleted by clean_body: ASCII punctuation except the apostrophe.
_CLEAN_BODY_PUNCTUATION = '!"#$%&\\()*+,-./:;<=>?@[]^_`{|}~'
_CLEAN_BODY_PUNCT_TABLE = str.maketrans('', '', _CLEAN_BODY_PUNCTUATION)

def clean_body(x):
  """Normalise a message body before tokenisation.

  Steps, in order: lowercase the text; drop everything from an http(s) URL
  to the end of its line (including the trailing line breaks); drop
  everything from a `u`, `a` or `|` that is immediately followed by a digit
  to the end of its line; delete the punctuation characters listed in
  `_CLEAN_BODY_PUNCTUATION`.

  Args:
    x: the raw text, or None.

  Returns:
    The cleaned string, or None when `x` is None (so a null body yields a
    null result instead of raising inside the UDF).
  """
  if x is None:
    return None
  cleaned = x.lower()
  cleaned = re.sub(r'https?:\/\/.*[\r\n]*', '', cleaned, flags=re.MULTILINE)
  # remove special line break characters
  # NOTE(review): inside a character class `|` is a literal, so `[u|a]` also
  # matches a pipe; and `.*` removes the rest of the line, not just the
  # escape-like token. Pattern left unchanged -- confirm the intended scope.
  cleaned = re.sub(r'[u|a]\d+.*', '', cleaned, flags=re.MULTILINE)
  # One translate() pass replaces the per-character replace() loop.
  return cleaned.translate(_CLEAN_BODY_PUNCT_TABLE)

# Mapping ID of terms to words of the vocabulary
def map_termID_to_Word(termIndices, vocabulary=None):
    """Translate a sequence of term indices into their vocabulary words.

    Args:
        termIndices: iterable of integer positions into the vocabulary.
        vocabulary: optional sequence of words indexed by term id. When
            omitted, the module-level `vocabArray` is used (the notebook
            assigns it inside the per-period loop), which preserves the
            original one-argument behaviour.

    Returns:
        A list with one word per entry of `termIndices`, in the same order.

    Raises:
        NameError: if `vocabulary` is omitted and `vocabArray` has not been
            defined yet.
        IndexError: if a term id is outside the vocabulary.
    """
    if vocabulary is None:
        # Fallback to notebook-global state; pass `vocabulary` explicitly
        # whenever the caller knows which vocabulary it wants.
        vocabulary = vocabArray
    return [vocabulary[termID] for termID in termIndices]

# Stopwords tokens filtering
def stop_words_filter(x):
  """Column predicate that keeps a token only when it is worth modelling.

  A token `x` (a Column) passes when it is not null, is longer than two
  characters, and appears in neither `StopWords` nor `stopwords_2`.
  """
  is_stop_word = x.isin(StopWords) | x.isin(stopwords_2)
  is_long_enough = F.length(x) > 2
  return ~is_stop_word & x.isNotNull() & is_long_enough

# Spark UDF wrappers around the Python helpers defined in this cell.
clean_body_udf = F.udf(f=clean_body, returnType=StringType())
udf_map_termID_to_Word = F.udf(f=map_termID_to_Word, returnType=ArrayType(StringType()))

# Position and value of the largest entry of a vector, converted to plain
# Python int/float so they match the declared Spark return types.
# NOTE(review): FloatType is single precision; the weight is narrowed from
# the Python float returned by the lambda.
max_index_udf = F.udf(f=lambda vec: int(np.argmax(vec)), returnType=IntegerType())
max_value_udf = F.udf(f=lambda vec: float(np.amax(vec)), returnType=FloatType())

In [0]:
# Number of LDA topics to fit for each monthly period.
# Based on perplexity analysis
# Kept as a single mapping so every month sits next to its own topic count;
# the two parallel lists used by the rest of the notebook are derived from it
# and therefore cannot drift out of alignment.
topics_by_period = {
  "2016-11": 70,
  "2016-12": 27,
  "2017-01": 130,
  "2017-02": 100,
  "2017-03": 100,
  "2017-04": 80,
  "2017-05": 105,
  "2017-06": 120,
  "2017-07": 90,
  "2017-08": 110,
  "2017-09": 120,
  "2017-10": 145,
  "2017-11": 150,
  "2017-12": 160,
  "2018-01": 180,
  "2018-02": 160,
  "2018-03": 190,
  "2018-04": 180,
  "2018-05": 25,
}

# Insertion order of the dict is the processing order.
periods = list(topics_by_period.keys())
topics_per_period = list(topics_by_period.values())

In [0]:
# Topic discovery, one LDA model per monthly period.
# For every period: read that month's messages, tokenise them, build TF-IDF
# features, fit LDA, attach each message's dominant topic and write the result
# partitioned by month.

# The two configuration lists are consumed pairwise; fail before any work is
# done if they do not line up.
if len(periods) != len(topics_per_period):
    raise ValueError(
        "periods and topics_per_period must have the same length, got "
        + str(len(periods)) + " and " + str(len(topics_per_period))
    )

sentiment_base_path = "dbfs:/mnt/group12/sentiment/created_at_month="
topic_discovery_path = "dbfs:/mnt/group12/topic_discovery/"
max_iterations = 100
lda_seed = 42  # arbitrary, fixed so that re-running the notebook reproduces the topics

# For each period
for i, (period, num_topics) in enumerate(zip(periods, topics_per_period)):
    print (period, num_topics)

    # Read the period data and clean text
    data_chunk = spark.read.parquet(sentiment_base_path + period + "/")
    data_chunk_tokens = (
        data_chunk
        .withColumn('cleaned_body', clean_body_udf(F.col('body')))
        .withColumn('tokens', F.filter(F.split(F.col('cleaned_body'), ' '), stop_words_filter))
        # Keep only messages with more than two usable tokens
        .filter(F.size(F.col('tokens')) > 2)
    )

    print ('TF-IDF')
    # Text vectorization
    cv = CountVectorizer(inputCol="tokens", outputCol="raw_features", vocabSize=10000, minDF=1, maxDF=0.9)
    cvmodel = cv.fit(data_chunk_tokens)
    result_cv = cvmodel.transform(data_chunk_tokens)
    # TF-IDF
    idf = IDF(inputCol="raw_features", outputCol="features")
    idfModel = idf.fit(result_cv)
    result_tfidf = idfModel.transform(result_cv)

    # LDA scans its input once per iteration. Without caching, every pass
    # would re-read the Parquet files and re-run the Python cleaning UDF and
    # both vectorisers.
    lda_input = result_tfidf.select("id", "features").cache()
    try:
        print ('LDA')
        # Topic discovery using LDA
        lda = LDA(k=num_topics, optimizer="online", maxIter=max_iterations, seed=lda_seed)
        lda_model = lda.fit(lda_input)
        lda_data = lda_model.transform(lda_input)
        topic_description = lda_model.describeTopics()
        vocabArray = cvmodel.vocabulary

        print ('COMPRISING DATA')
        # TOPICS DESCRIPTION
        # Bind this period's vocabulary into a fresh UDF. A UDF created once
        # outside the loop that reads a global vocabulary is serialised by
        # PySpark on first use and then reused, so later periods would decode
        # their term ids with the first period's vocabulary.
        term_ids_to_words_udf = F.udf(
            lambda term_ids, vocab=vocabArray: [vocab[term_id] for term_id in term_ids],
            ArrayType(StringType()),
        )
        topic_description_mapped = topic_description.withColumn(
            "topic_desc", term_ids_to_words_udf(F.col('termIndices'))
        )

        # COMPRISING TOPIC DATA
        # Dominant topic per message: index and weight of the largest entry
        # of the topic distribution.
        message_topics = lda_data.select(
            'id',
            max_index_udf(F.col('topicDistribution')).alias('topic_discovery_id'),
            max_value_udf(F.col('topicDistribution')).alias('message_topic_weight'),
        )
        # The topic table has one row per topic, so hint a broadcast join.
        lda_data_topic = (
            message_topics
            .join(F.broadcast(topic_description_mapped), F.col('topic') == F.col('topic_discovery_id'), "inner")
            .select(
                'id',
                'topic_discovery_id',
                'topic_desc',
                'message_topic_weight',
                # Title = first five words of the topic description
                F.array_join(F.slice(F.col('topic_desc'), 1, 5), ' | ').alias('topic_discovery_title'),
                # Topic ids restart at 0 every period; prefix with the period
                F.concat(F.lit(period + '_'), F.col('topic_discovery_id')).alias('topic_discovery_unique_id'),
            )
        )

        lda_comprised_result = (
            lda_data_topic
            .join(data_chunk_tokens, ['id'])
            .withColumn('created_at_month', F.lit(period))
        )

        print ('SAVING DATA')
        # SAVING DATA
        # On the first iteration we overwrite the directory, afterwards we append
        write_mode = 'overwrite' if i == 0 else 'append'
        lda_comprised_result.write.mode(write_mode).partitionBy('created_at_month').parquet(topic_discovery_path)
    finally:
        # Release the cached features before moving on to the next period.
        lda_input.unpersist()


  