In [0]:
# Load the chunk as JSON. `DataFrameReader.json()` always selects the JSON
# source itself, so the previous `.format("binary")` call was overridden and
# only obscured how the files are actually parsed.
data_chunk = spark.read.json("/mnt/group12/data/part-0070*.bin")

In [0]:
from pyspark.sql import functions as F
from pyspark.sql.types import *

In [0]:
# Rough per-post word count: number of pieces obtained by splitting the body
# on single spaces.
data_chunk = data_chunk.withColumn(
    "word_count",
    F.size(F.split(F.col("body"), " ")),
)

In [0]:
# Preview the derived word counts.
data_chunk.select(F.col("word_count")).display()

word_count
11
16
17
19
2
1
25
14
17
29


In [0]:
# Number of posts per category title, skipping posts without one.
(
    data_chunk
    .filter(F.col("category_details.title").isNotNull())
    .groupBy("category_details.title")
    .count()
    .display()
)

title,count
Philosophy,886
Science,1116
Faith,1409
Entertainment,1877
Art,1018
Sports,454
Finance,522
Cuisine,201
AMA,16917
Music,1021


In [0]:
# Number of posts per topic title, keeping only topics with more than 500 posts.
(
    data_chunk
    .where(F.col("topic.title").isNotNull())
    .groupBy("topic.title")
    .count()
    .where(F.col("count") > 500)
    .display()
)

title,count
Memes,5794
Pettibone & Sellner Detained UK,1439
BritFam,8244
MEME WARS,554
North Korea,1717
Martin Sellner Arrested,842
Yountville Shooting,955
Deutsch,4999
Introduce Yourself,2939
South Africa,514


In [0]:
# Total score per username over posts in the News and Politics categories.
(
    data_chunk
    .filter(F.col("category_details.title").isin(["News", "Politics"]))
    .select("user.username", "score")
    .groupBy("username")
    .sum()
    .display()
)

username,sum(score)
OldDannyboy12,111
mattsixteen24,61
suthenboy,33
DDouglas,43
lkusa,11
AgeOfUltraViolence,8
WolverineTongue,454
dazzy,195
RedPillPost,87
goodman,8


In [0]:
# Keep posts that have more than 10 words, are not replies, are not emoji-only,
# belong to the Politics or News category, and carry a topic title.
# Successive where() calls are combined with AND.
data_chunk_filtered = (
    data_chunk
    .where(F.col("word_count") > 10)
    .where(F.col("is_reply") == False)
    .where(F.col("only_emoji") == False)
    .where(F.col("category_details.title").isin(["Politics", "News"]))
    .where(F.col("topic.title").isNotNull())
)

In [0]:
# Row count of the filtered DataFrame.
data_chunk_filtered.count()

In [0]:
# stuff we'll need for text processing
from nltk.corpus import stopwords
import re as re
from pyspark.ml.feature import CountVectorizer , IDF
# stuff we'll need for building the model

from pyspark.mllib.linalg import Vector, Vectors
from pyspark.mllib.clustering import LDA, LDAModel

In [0]:
# English stop-word list from NLTK.
StopWords = stopwords.words("english")


def stop_words_filter(x):
    """Return a Column predicate that keeps only informative tokens.

    ``x`` is the Column for a single array element, as supplied by
    ``F.filter``. A token is kept when it is non-null, longer than two
    characters, and not in ``StopWords``.
    """
    return (~x.isin(StopWords)) & (x.isNotNull()) & (F.length(x) > 2)


# The split pattern is a regular expression. Splitting on runs of whitespace
# (instead of one literal space) stops words separated by newlines or tabs from
# being fused into a single token that would slip past the stop-word check.
# Empty strings produced by leading whitespace are dropped by the length test.
data_chunk_tokens = data_chunk_filtered.withColumn(
    "tokens",
    F.filter(F.split(F.lower(F.col("body")), "\\s+"), stop_words_filter),
)


In [0]:
# Drop documents left with two or fewer tokens after stop-word filtering.
data_chunk_tokens = data_chunk_tokens.where(F.size("tokens") > 2)

In [0]:
# Row count of the tokenised DataFrame.
data_chunk_tokens.count()

In [0]:

# Term counts per document over a vocabulary of at most 5000 terms.
cv = CountVectorizer(
    inputCol="tokens",
    outputCol="raw_features",
    vocabSize=5000,
    minDF=10.0,
)
cvmodel = cv.fit(data_chunk_tokens)
result_cv = cvmodel.transform(data_chunk_tokens)

# Re-weight the raw counts by inverse document frequency.
idf = IDF(inputCol="raw_features", outputCol="features")
idfModel = idf.fit(result_cv)
result_tfidf = idfModel.transform(result_cv)

In [0]:
from pyspark.mllib.linalg import Vectors as MLlibVectors

num_topics = 10
max_iterations = 100
lda_seed = 42  # fix the RNG seed so reruns start from the same initialisation

# The RDD-based LDA API is fed [document id, mllib vector] pairs, so each row's
# ml vector is converted with fromML and paired with that row's id.
lda_corpus = result_tfidf.select("id", "features").rdd.map(
    lambda row: [row["id"], MLlibVectors.fromML(row["features"])]
)
lda_model = LDA.train(
    lda_corpus,
    k=num_topics,
    maxIterations=max_iterations,
    seed=lda_seed,
)

In [0]:
# Inspect the column names and types of the TF-IDF DataFrame.
result_tfidf.dtypes

In [0]:
# Vocabulary learned by the CountVectorizer; topic terms are looked up in it by index.
vocabArray = cvmodel.vocabulary
# Number of top terms to keep for each topic.
wordNumbers = 5
topic_descriptions = lda_model.describeTopics(maxTermsPerTopic=wordNumbers)
topicIndices = sc.parallelize(topic_descriptions)
def topic_render(topic, vocabulary=None, max_terms=None):
    """Translate one topic description into its list of top words.

    Args:
        topic: A sequence whose first element is the sequence of term indices
            for the topic (as in the entries of ``describeTopics()``).
        vocabulary: Index-to-word lookup. Defaults to the notebook-level
            ``vocabArray``.
        max_terms: Maximum number of words to return. Defaults to the
            notebook-level ``wordNumbers``.

    Returns:
        The words for up to ``max_terms`` leading term indices, in order.
    """
    if vocabulary is None:
        vocabulary = vocabArray
    if max_terms is None:
        max_terms = wordNumbers
    term_indices = topic[0]
    # Slice rather than index by range(max_terms): a topic that has fewer terms
    # than requested yields a shorter list instead of raising IndexError.
    return [vocabulary[term_index] for term_index in term_indices[:max_terms]]
# Resolve every topic's term indices to words, then print one topic per group.
topics_final = topicIndices.map(topic_render).collect()
for topic, topic_terms in enumerate(topics_final):
    print(f"Topic{topic}:")
    for term in topic_terms:
        print(term)
    print("\n")