In [0]:
from pyspark.sql import functions as F
from pyspark.sql.types import *

In [0]:
# Stopwords list
import requests

# Extra stopword list downloaded from a gist; the response body is split into
# lines and de-duplicated into a set.
stopwords_response = requests.get(
    "https://gist.githubusercontent.com/rg089/35e00abf8941d72d419224cfd5b5925d/raw/12d899b70156fd0041fa9778d657330b024b959c/stopwords.txt",
    timeout=30,  # fail instead of hanging the notebook on a stalled connection
)
# Abort on 4xx/5xx so an HTTP error page is never parsed as a stopword list.
stopwords_response.raise_for_status()
stopwords_list = stopwords_response.content
stopwords_2 = set(stopwords_list.decode().splitlines())

In [0]:
# Load the parquet files for a single month of input data.
month_to_analyze = "2018-04"  # edited by hand to choose which month to process
data_chunk = (
    spark.read
    .option("header", "true")
    .parquet(f"dbfs:/mnt/group12/sentiment/created_at_month={month_to_analyze}/")
)

In [0]:
%pip install requests

In [0]:
%pip install nltk

In [0]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import re as re

from pyspark.ml.feature import CountVectorizer , IDF
from pyspark.ml.linalg import Vector, Vectors, SparseVector
from pyspark.ml.clustering import LDA

In [0]:
# Text cleaning
import re

# Characters stripped from the text. The apostrophe is not in this set, so
# contractions such as "don't" survive cleaning.
_PUNCTUATION = '!"#$%&\\()*+,-./:;<=>?@[\\]^_`{|}~'
_PUNCTUATION_TABLE = str.maketrans('', '', _PUNCTUATION)

# Matches a single URL only (up to the next whitespace), so words that follow
# a link on the same line are kept.
_URL_RE = re.compile(r'https?://\S+')

# Drops everything from a 'u', 'a' or '|' that is immediately followed by a
# digit up to the end of that line.
# NOTE(review): the original comment says this removes "special line break
# characters", but the pattern also hits ordinary text such as "a1 sauce" --
# confirm the intended pattern before tightening it.
_CODE_RE = re.compile(r'[u|a]\d+.*')


def clean_body(x):
  """Normalise one text body for tokenisation.

  Lower-cases the text, removes URLs, applies the digit-code pattern above and
  strips the punctuation characters listed in ``_PUNCTUATION``.

  Args:
    x: The raw text, or None.

  Returns:
    The cleaned string, or None when ``x`` is None (a None input would
    otherwise raise AttributeError on ``.lower()``).
  """
  if x is None:
    return None
  cleaned = x.lower()
  cleaned = _URL_RE.sub('', cleaned)
  cleaned = _CODE_RE.sub('', cleaned)
  return cleaned.translate(_PUNCTUATION_TABLE)

# Wrap clean_body as a Spark UDF that yields a string column.
clean_body_udf = F.udf(clean_body, returnType=StringType())

In [0]:
# Filtering stopwords and invalid tokens
StopWords = stopwords.words("english")

def stop_words_filter(x):
  """Column predicate that keeps a token only if it is worth modelling.

  Args:
    x: A Column holding one token (the element passed in by ``F.filter``).

  Returns:
    A boolean Column that is true when the token is not null, is longer than
    two characters, and appears in neither the NLTK English stopword list
    (``StopWords``) nor the downloaded list (``stopwords_2``).
  """
  return (~x.isin(StopWords)) & (~x.isin(stopwords_2)) & (x.isNotNull()) & (F.length(x) > 2)

data_chunk_tokens = data_chunk.withColumn('cleaned_body', clean_body_udf(F.col('body')))
# Split on runs of any whitespace rather than a single literal space, so words
# separated by newlines or tabs are not glued together into one token. Empty
# strings produced by leading whitespace are dropped by the length check above.
data_chunk_tokens = data_chunk_tokens.withColumn(
    'tokens',
    F.filter(F.split(F.col('cleaned_body'), r'\s+'), stop_words_filter),
)


In [0]:
# Keep only rows that still have more than two tokens after filtering.
data_chunk_tokens = data_chunk_tokens.where(F.size('tokens') > 2)

In [0]:
# Text vectorization: fit a vocabulary on the token lists and turn every row
# into a term-count vector stored in "raw_features".
cv = CountVectorizer(
    inputCol="tokens",
    outputCol="raw_features",
    vocabSize=8000,
    minDF=1,
    maxDF=0.9,
)
cvmodel = cv.fit(data_chunk_tokens)
result_cv = cvmodel.transform(data_chunk_tokens)

# TF-IDF: re-weight the count vectors and store them in "features".
idf = IDF().setInputCol("raw_features").setOutputCol("features")
idfModel = idf.fit(result_cv)
result_tfidf = idfModel.transform(result_cv)

In [0]:
# Perplexity computing: train one online-LDA model per candidate topic count
# and record its log-perplexity so the candidates can be compared.
topics_to_rest = [10, 20, 25, 30, 35, 40, 50, 60, 70, 80, 90, 100, 120, 140, 160, 180, 200, 250, 300, 400, 500]
likelihoods = []  # kept for later cells; not populated here
perplexities = []  # one entry per element of topics_to_rest, in the same order

# Loop-invariant settings and input, hoisted out of the loop.
max_iterations = 50
lda_input = result_tfidf.select("id", "features")

for num_topics in topics_to_rest:
    print("Num topics", num_topics)

    # k must come from the loop variable; overriding it with a constant here
    # would train the same model on every iteration.
    lda = LDA(k=num_topics, optimizer="online")
    lda.setMaxIter(max_iterations)

    lda_model = lda.fit(lda_input)
    lda_data = lda_model.transform(lda_input)

    perplexity = lda_model.logPerplexity(lda_data)
    perplexities.append(perplexity)

    print("Log perplexity", perplexity)

perplexities
