In [1]:
# Spark setup - alternative configuration shared on Slack, used because the original one was not working.
from pyspark.sql import SparkSession

# Parenthesised builder chain; every option is applied in the same order as before.
spark_session = (
    SparkSession.builder
    .master("spark://192.168.2.156:7077")
    .appName("projectgroup1_reddit-dataset-100k")
    # Dynamic allocation with shuffle tracking enabled and the shuffle service disabled.
    .config("spark.dynamicAllocation.enabled", True)
    .config("spark.dynamicAllocation.shuffleTracking.enabled", True)
    .config("spark.shuffle.service.enabled", False)
    .config("spark.dynamicAllocation.executorIdleTimeout", "30s")
    .config("spark.executor.cores", 2)
    # Fixed ports for the driver and the block manager.
    .config("spark.driver.port", 9999)
    .config("spark.blockManager.port", 10005)
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/03/05 18:05:22 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
# Load the Reddit dataset from HDFS.
# For now a smaller subset is used, simply to test the pipeline.
file_path = "hdfs://192.168.2.156:9000/data/reddit/reddit_100k.json"

# Read the JSON file into a DataFrame and report how many rows came back.
# NOTE(review): the recorded output shows a `_corrupt_record` column and all-NULL rows,
# presumably lines the JSON reader could not parse - confirm against the raw file.
df = spark_session.read.json(file_path)
rows_1 = df.count()
print(f"Current amount of rows: {rows_1}")

# Preview the first two rows.
df.show(n=2)

                                                                                

Current amount of rows: 200001
+---------------+----------------+--------------------+--------------------+-----------+-------+--------------------+---------+------------+--------------------+-----------+-----+
|_corrupt_record|          author|                body|             content|content_len|     id|      normalizedBody|subreddit|subreddit_id|             summary|summary_len|title|
+---------------+----------------+--------------------+--------------------+-----------+-------+--------------------+---------+------------+--------------------+-----------+-----+
|           NULL|raysofdarkmatter|I think it should...|I think it should...|        178|c69al3r|I think it should...|     math|    t5_2qh0n|Shifting seasonal...|          8| NULL|
|               |            NULL|                NULL|                NULL|       NULL|   NULL|                NULL|     NULL|        NULL|                NULL|       NULL| NULL|
+---------------+----------------+--------------------+--------------

In [3]:
from pyspark.sql.functions import col
# Keep only the two columns we want to analyse: the comment text and its subreddit name.
df = df.select(
    [col("body"), col("subreddit")]
)

# Selecting columns does not drop rows, so the count is reported again for reference.
rows_2 = df.count()
print(f"Current amount of rows: {rows_2}")
df.show(n=5)

                                                                                

Current amount of rows: 200001
+--------------------+-----------+
|                body|  subreddit|
+--------------------+-----------+
|I think it should...|       math|
|                NULL|       NULL|
|Art is about the ...|      funny|
|                NULL|       NULL|
|Ask me what I thi...|Borderlands|
+--------------------+-----------+
only showing top 5 rows



In [4]:
# Clean the data: the preview above contains NULL rows, so drop every row
# in which either of the two columns is NULL.
df = df.na.drop(how="any", subset=["body", "subreddit"])

# Report the new row count and how many rows this step removed.
rows_3 = df.count()
rows_dropped = rows_2 - rows_3
print(f"Current amount of rows: {rows_3}, NULL rows dropped: {rows_dropped}.")
df.show(n=10)

                                                                                

Current amount of rows: 99668, NULL rows dropped: 100333.
+--------------------+--------------------+
|                body|           subreddit|
+--------------------+--------------------+
|I think it should...|                math|
|Art is about the ...|               funny|
|Ask me what I thi...|         Borderlands|
|In Mechwarrior On...|            gamingpc|
|You are talking a...|              Diablo|
|All but one of my...|   RedditLaqueristas|
|I could give a sh...|               apple|
|So you're saying ...|               apple|
|I love this idea ...|RedditFilmsProduc...|
|Theres an entire ...|       AbandonedPorn|
+--------------------+--------------------+
only showing top 10 rows



In [5]:
# Preprocess the data - in accordance with SparkNLP tutorial
from pyspark.sql.functions import regexp_replace, length  # regexp_replace: regex substitution on a column; length: string length of a column.

# Remove empty strings (i.e. containing only whitespace).
# Raw string: "\s" is not a valid Python escape sequence, so a non-raw literal
# makes Python emit an invalid-escape warning (a SyntaxWarning since Python 3.12).
df = df.filter(~col("body").rlike(r"^\s*$"))
# Replace each run of non-ASCII characters (emojis, non-English characters, ...) with a single space.
# Raw string so the \x00 / \x7F hex escapes reach the regex engine as text instead of
# embedding literal NUL / DEL characters in the pattern; the matched set is the same.
df = df.withColumn("body", regexp_replace(col("body"), r"[^\x00-\x7F]+", " "))
# Keep only comments that contain at least one run of 5 or more consecutive letters.
df = df.filter(col("body").rlike("[a-zA-Z]{5,}"))
# Remove punctuation, replacing it with an empty string so words stay intact.
# Inside the character class, `,-.` is a range that covers comma, hyphen and period.
df = df.withColumn("body", regexp_replace(col("body"), "[!\"#$%&'()*+,-./:;<=>?@\\[\\\\\\]^_`{|}~]", ""))
# Remove very short comments (10 characters or fewer) that carry little meaning.
df = df.filter(length(col("body")) > 10)
rows_4 = df.count()
# NOTE: the baseline is rows_2, the count taken before the NULL rows were dropped,
# so the reported total also includes those NULL rows.
print(f"Rows after pre-processing finalized: {rows_4}, total rows dropped during pre-processing: {rows_2 - rows_4}")
df.show(10)
# Pre-processing is complete at this point.

                                                                                

Rows after pre-processing finalized: 99556, total rows dropped during pre-processing: 100445
+--------------------+--------------------+
|                body|           subreddit|
+--------------------+--------------------+
|I think it should...|                math|
|Art is about the ...|               funny|
|Ask me what I thi...|         Borderlands|
|In Mechwarrior On...|            gamingpc|
|You are talking a...|              Diablo|
|All but one of my...|   RedditLaqueristas|
|I could give a sh...|               apple|
|So youre saying t...|               apple|
|I love this idea ...|RedditFilmsProduc...|
|Theres an entire ...|       AbandonedPorn|
+--------------------+--------------------+
only showing top 10 rows



In [6]:
spark_session.stop()  # End the current Spark session now that the notebook no longer needs it.