In [0]:
from pyspark.sql import functions as F
from pyspark.sql.types import *

In [0]:
def conversation_parser(tree, max_depth=300):
    """Collect usernames along a nested ``conversation_parent`` chain.

    Starting at ``tree``, the username found under ``["user"]["username"]`` is
    recorded and the walk continues into ``["conversation_parent"]`` until a
    level cannot be read or ``max_depth`` levels have been visited.

    Args:
        tree: Nested, subscriptable record (e.g. a dict or a Spark ``Row``),
            or ``None``.
        max_depth: Maximum number of nesting levels to visit. Defaults to 300.

    Returns:
        list: Usernames in traversal order, outermost post first. Empty when
        ``tree`` is ``None`` or has no readable user.
    """
    users_in_conversation = []
    node = tree
    for _ in range(max_depth):
        try:
            users_in_conversation.append(node["user"]["username"])
            node = node["conversation_parent"]
        except Exception:
            # Best-effort walk: a missing field or a None parent marks the end
            # of the chain. `Exception` (rather than a bare `except`) keeps
            # KeyboardInterrupt / SystemExit propagating.
            break
    return users_in_conversation

def category_filter(category, allowed_titles=("Politics", "News")):
    """Tell whether a category record's title is one of the allowed titles.

    Args:
        category: Subscriptable record exposing a ``"title"`` entry (e.g. a
            dict or a Spark ``Row``), or ``None``.
        allowed_titles: Titles that pass the filter. Defaults to
            ``("Politics", "News")``.

    Returns:
        bool: ``True`` when ``category["title"]`` is in ``allowed_titles``;
        ``False`` otherwise, including when the title cannot be read.
    """
    try:
        return category["title"] in allowed_titles
    except Exception:
        # Missing / null category: treat as "not in an allowed category".
        # `Exception` (rather than a bare `except`) keeps KeyboardInterrupt /
        # SystemExit propagating.
        return False


# Spark UDF wrappers around the plain-Python helpers defined in this cell.
# The functions are passed directly: a `lambda x: f(x)` wrapper adds nothing
# and makes the UDF show up as "<lambda>" in query plans.
conversation_udf = F.udf(conversation_parser, ArrayType(StringType()))
category_filter_udf = F.udf(category_filter, BooleanType())

In [0]:
# Load the raw part files; they carry a .bin extension but are parsed as JSON.
# `.json()` selects the JSON data source itself, so a preceding
# `.format("binary")` would be overridden and is omitted.
data_chunk = spark.read.json("dbfs:/mnt/group12/data/part-*.bin")

In [0]:
# Keep only standalone and quote posts (no replies, no emoji-only posts, no
# reposts) whose category title is "Politics" or "News".
is_standalone_post = (
    (F.col("is_reply") == False)
    & (F.col("only_emoji") == False)
    & (F.col("repost") == False)
)
# Native column predicate instead of a Python UDF: avoids per-row Python
# serialization and stays visible to the optimizer. Rows with a null category
# or title evaluate to null here and are dropped by filter().
in_target_category = F.col("category_details.title").isin("Politics", "News")
data_chunk_filtered = data_chunk.filter(is_standalone_post & in_target_category)

In [0]:
# Project only the columns needed downstream, flattening the nested
# category / topic / user fields and extracting the conversation usernames.
data_chunk_filtered = data_chunk_filtered.select(
    F.col("category_details.title").alias("category_title"),
    "body",
    "created_at",
    "dislike_count",
    "id",
    "like_count",
    "repost_count",
    "reply_count",
    "score",
    F.col("topic.created_at").alias("topic_created_at"),
    F.col("topic.id").alias("topic_id"),
    F.col("topic.title").alias("topic_title"),
    "quote_conversation_parent",
    F.col("user.id").alias("user_id"),
    F.col("user.username").alias("username"),
    F.col("user.name").alias("user_real_name"),
    "repost",
    conversation_udf(F.col("conversation_parent")).alias("users_in_conversation"),
)

In [0]:
# Keep only messages with more than 5 words.
# The body is stripped of leading/trailing whitespace and split on runs of
# whitespace, so repeated spaces, tabs and newlines do not produce empty
# tokens that would inflate the count.
body_stripped = F.regexp_replace(F.col("body"), r"^\s+|\s+$", "")
word_count_expr = F.size(F.split(body_stripped, r"\s+"))
data_chunk_filtered = (
    data_chunk_filtered
    .withColumn("word_count", word_count_expr)
    .filter(F.col("word_count") > 5)
)

In [0]:
# Parse the `created_at` string (pattern yyyy-MM-dd'T'HH:mm:ssXXX) into a date
# column, then derive a "yyyy-MM" month key from that date.
created_at_date = F.to_date(F.col("created_at"), "yyyy-MM-dd'T'HH:mm:ssXXX")
created_at_month_key = F.date_format(F.col("created_at_dt"), "yyyy-MM")
data_chunk_filtered_date = (
    data_chunk_filtered
    .withColumn("created_at_dt", created_at_date)
    .withColumn("created_at_month", created_at_month_key)
)

In [0]:
# Add the number of usernames collected in `users_in_conversation`.
conversation_size = F.size(F.col("users_in_conversation"))
filtered_output = data_chunk_filtered_date.withColumn(
    "n_users_in_conversation", conversation_size
)

In [0]:
# Assemble the final output columns in their export order.
final_output = filtered_output.select(
    "created_at_dt",
    "created_at_month",
    "word_count",
    "category_title",
    "body",
    "created_at",
    "dislike_count",
    "id",
    "like_count",
    "repost_count",
    "reply_count",
    "score",
    "topic_created_at",
    "topic_id",
    "topic_title",
    "user_id",
    "username",
    "user_real_name",
    "repost",
    "quote_conversation_parent",
    "users_in_conversation",
    "n_users_in_conversation",
)

In [0]:
# Write the filtered data as Parquet, partitioned by `created_at_month`;
# 'overwrite' mode replaces whatever already exists at the target path.
(
    final_output.write
    .mode('overwrite')
    .partitionBy('created_at_month')
    .parquet("dbfs:/mnt/group12/filtered/")
)