In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Create access point for Spark services
# Use defaultParallelism to determine default number of partitions
# Use getNumPartitions() to verify number is appropriate for data size ~1GB
# Set default parallelism to balance processing speed and hardware limits
spark = SparkSession.builder \
    .appName("reddit loader") \
    .config("spark.default.parallelism", "2") \
    .config("spark.sql.debug.maxToStringFields", "2000") \
    .getOrCreate()

# Reducing sampling ratio used to infer schema to 0.1 to reduce resources needed for schema inference.
# submissions_df = spark.read.option("samplingRatio", 0.05).json("submissions_data.ndjson")

# Create parquet files for more efficient reads and writes
# submissions_df.write.option("compression", "snappy").mode("overwrite").parquet("submissions_data.parquet")

# Read and verify num partitions is appropriate
submissions_df = spark.read.parquet("../data/submissions_data.parquet")
# submissions_df.rdd.getNumPartitions() # --> 3
submissions_df.describe().show() 
submissions_df.count()

your 131072x1 screen size is bogus. expect trouble
24/12/08 23:06:30 WARN Utils: Your hostname, BOOK-80CSR0J7NE resolves to a loopback address: 127.0.1.1; using 10.255.255.254 instead (on interface lo)
24/12/08 23:06:30 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/12/08 23:06:31 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
                                                                                

+-------+---------------+-----------+--------------------+-------------------+-----------------------------+----------------------+------------------------+--------------------+-----------------------+-----------------+---------------+-------------+---------+--------------+--------+------------------+--------------------+-------------------+----------------+---------------+-------------+---------+------------------+--------------------+----+-------+---------+--------------------+--------+-----+---------------------------+--------------------+----------------------+--------------------+---------------------+---------------+--------+-------------+----------------+----------+-----------------+-------------------+-----------+-----------------------+--------------------+---------------+---------+--------------------+-------------------+--------------+----------+-------------------+--------------------+--------------------+--------+------------------+--------------------+--------------------

839181

In [130]:
# Select the columns of data that we find most relevant (based on interest)
submissions_df = submissions_df.select(
    "author", "author_created_utc", 
    "author_fullname", "created_utc", 
    "id", "name", "num_comments", 
    "num_crossposts", "num_reports", 
    "score", "selftext", "subreddit", 
    "subreddit_id", "subreddit_subscribers", 
    "subreddit_type", "title", 
    "total_awards_received", "downs", "ups", 
    "upvote_ratio", "url")
submissions_df.show(5)

# Next let's perform some preprocessing
users = submissions_df.select(
    col("author").alias("username"),
    col("author_fullname"),
    col("author_created_utc").alias("joined_utc").cast("timestamp")
    ).dropDuplicates()
users.show(5)

                                                                                

+-------------+------------------+---------------+-----------+-----+--------+------------+--------------+-----------+-----+--------+------------------+------------+---------------------+--------------+--------------------+---------------------+-----+---+------------+--------------------+
|       author|author_created_utc|author_fullname|created_utc|   id|    name|num_comments|num_crossposts|num_reports|score|selftext|         subreddit|subreddit_id|subreddit_subscribers|subreddit_type|               title|total_awards_received|downs|ups|upvote_ratio|                 url|
+-------------+------------------+---------------+-----------+-----+--------+------------+--------------+-----------+-----+--------+------------------+------------+---------------------+--------------+--------------------+---------------------+-----+---+------------+--------------------+
|ilikebluepens|              NULL|           NULL| 1309564251|iemkn|t3_iemkn|           0|          NULL|       NULL|    4|        |A



+-------------+---------------+-------------------+
|     username|author_fullname|         joined_utc|
+-------------+---------------+-------------------+
|      CMM____|      t2_14lvxs|2017-01-19 22:49:56|
|     Zelduuhh|       t2_9zt4a|2012-12-24 18:56:36|
|OldSoulWisdom|    t2_3y320f6p|2019-06-14 16:08:47|
|  WilmerSt123|    t2_2yflf4z0|2019-05-16 10:27:57|
|    ethan2266|    t2_5eaz4r5z|2020-01-14 16:20:03|
+-------------+---------------+-------------------+
only showing top 5 rows



                                                                                

In [112]:
subreddits = submissions_df.select(
    col("subreddit_id").alias("id"),
    col("subreddit").alias("name"),
    col("subreddit_type")
).dropDuplicates().dropna()
subreddits.show(5)

subscriber_counts = submissions_df.select(
    col("subreddit_id"),
    col("subreddit_subscribers").alias("subscriber_count").cast("int"),
    col("author_created_utc").alias("date").cast("timestamp")
).dropna()
subscriber_counts.show(5)

+---------+------------------+--------------+
|       id|              name|subreddit_type|
+---------+------------------+--------------+
| t5_2r5zc|            edtech|        public|
| t5_2rawx|     climatechange|        public|
| t5_2qh0k|            cogsci|        public|
| t5_2sluh|AcademicPsychology|        public|
|t5_7ipnaj|     ChatGPTCoding|        public|
+---------+------------------+--------------+
only showing top 5 rows

+------------+----------------+-------------------+
|subreddit_id|subscriber_count|               date|
+------------+----------------+-------------------+
|    t5_2sluh|           18442|2017-05-16 18:55:05|
|    t5_2sluh|           18442|2017-02-12 06:52:13|
|    t5_2sluh|           18442|2014-10-17 09:42:41|
|    t5_2sluh|           18442|2018-04-14 20:33:44|
|    t5_2sluh|           18442|2018-03-12 18:40:44|
+------------+----------------+-------------------+
only showing top 5 rows



In [107]:
submissions = submissions_df.select(
    col("id"),
    col("title"),
    col("created_utc").cast("timestamp"),
    col("author"),
    col("score").cast("int"),
    col("num_comments").cast("int"),
    col("num_crossposts").cast("int"),
    col("num_reports").cast("int"),
    col("selftext"),
    col("upvote_ratio"),
    col("url"),
    col("total_awards_received"),
    col("downs"),
    col("ups")
)
submissions.show(5)

[Stage 569:>                                                        (0 + 1) / 1]

+-----+--------------------+-----------+-------------+-----+------------+--------------+-----------+--------+------------+--------------------+---------------------+-----+---+
|   id|               title|created_utc|       author|score|num_comments|num_crossposts|num_reports|selftext|upvote_ratio|                 url|total_awards_received|downs|ups|
+-----+--------------------+-----------+-------------+-----+------------+--------------+-----------+--------+------------+--------------------+---------------------+-----+---+
|iemkn|Sister subreddit ...|       NULL|ilikebluepens|    4|           0|          NULL|       NULL|        |        NULL|http://reddit.com...|                 NULL|    2|  6|
|iellt|Decline of Fluid ...|       NULL| inquilinekea|    5|           1|          NULL|       NULL|        |        NULL|http://www.quora....|                 NULL|    2|  7|
|ielfu|Ecological Moment...|       NULL|       drooze|    6|           3|          NULL|       NULL|        |        NUL

                                                                                

In [None]:
# For submissions, we can probably impute upvote_ratio, num_reports, and num_crossposts.
# For now, we will remove any columns that are more than 50% missing.
threshold = int(0.50 * submissions.count())
columns_to_keep = [column for column in submissions.columns
    if submissions.filter(submissions[column].isNotNull()).count() >= threshold]
submissions = submissions.select(columns_to_keep)
submissions.describe().show()



+-------+--------+--------------------+--------------------+------------------+-----------------+-------------------+--------------------+------+---------------------+
|summary|      id|               title|              author|             score|     num_comments|     num_crossposts|            selftext|   url|total_awards_received|
+-------+--------+--------------------+--------------------+------------------+-----------------+-------------------+--------------------+------+---------------------+
|  count|  839181|              839181|              839181|            839181|           839181|             762691|              839181|838159|               451933|
|   mean|Infinity|8.555564974470297...|1.357251799493589...|19.061613644732184|7.622629682988533|0.01758641441947001|   5.050050055055E11|  NULL|   0.0723138164285414|
| stddev|     NaN|3.081412198113064...|2.694542489993745...| 268.3973863642429| 52.8010183274944|0.22683376522460375|7.141849278507634E11|  NULL|    1.855232829

                                                                                

In [None]:
# Repeat the same steps with comments data
# comments_df = spark.read.option("samplingRatio", 0.05).json("comments_data.ndjson")
# comments_df.write.option("compression", "snappy").mode("overwrite").parquet("comments_data.parquet")
comments_df = spark.read.parquet("../data/comments_data.parquet")
comments_df.describe().show()
comments_df.count()

                                                                                

+-------+---------------+-----------+----------------+-------------+--------------------+-----------------------------+----------------------+------------------------+-----------------+-----------------------+-----------------+---------------+-------------+---------+--------------------+--------------------+--------------------+-------------------------------+--------------------+---------------------+------------+--------------------+-------------------+--------------------+-------------+------+--------------------+--------------------+--------------------+-----+----------+--------+-------------+----------------+-------+----------+-----------+--------------------+--------------------+--------------+-------+-------------------+--------------------+--------+------------------+------------------+------------+-----------------------+--------------+----------------+---------------------+------------------+-------------------+------------------+
|summary|approved_at_utc|approved_by|associat

1000833

In [131]:
comments_df = comments_df.select("author", "author_created_utc", "author_fullname", "body", "controversiality", "created_utc", "distinguished", "downs", "id", "name", "num_reports", "parent_id", "permalink", "replies", "score", "subreddit", "subreddit_id", "subreddit_name_prefixed", "subreddit_type", "total_awards_received", "ups")

users = users.union(comments_df.select(
    col("author").alias("username"),
    col("author_fullname"),
    col("author_created_utc").alias("joined_utc").cast("timestamp")
    )).dropDuplicates()
users.show(5)



+-------------+---------------+-------------------+
|     username|author_fullname|         joined_utc|
+-------------+---------------+-------------------+
|      CMM____|      t2_14lvxs|2017-01-19 22:49:56|
|     Zelduuhh|       t2_9zt4a|2012-12-24 18:56:36|
|OldSoulWisdom|    t2_3y320f6p|2019-06-14 16:08:47|
|  WilmerSt123|    t2_2yflf4z0|2019-05-16 10:27:57|
|    ethan2266|    t2_5eaz4r5z|2020-01-14 16:20:03|
+-------------+---------------+-------------------+
only showing top 5 rows



                                                                                

In [132]:
comments = comments_df.select(
    col("id").alias("comment_id"),
    col("parent_id"),
    col("subreddit_id"),
    col("author"),
    col("created_utc").alias("created_at").cast("timestamp"),
    col("body"),
    col("score").cast("int"),
    col("ups").cast("int"),
    col("downs").cast("int"),
    col("controversiality").cast("int"),
    col("distinguished"),
    col("num_reports").cast("int"),
    col("total_awards_received").cast("int"),
    col("permalink")
).dropna(subset=["comment_id", "body"])
comments.show(5)

[Stage 738:>                                                        (0 + 1) / 1]

+----------+---------+------------+---------+----------+--------------------+-----+---+-----+----------------+-------------+-----------+---------------------+---------+
|comment_id|parent_id|subreddit_id|   author|created_at|                body|score|ups|downs|controversiality|distinguished|num_reports|total_awards_received|permalink|
+----------+---------+------------+---------+----------+--------------------+-----+---+-----+----------------+-------------+-----------+---------------------+---------+
|   c233p63| t3_iejzu|    t5_2sluh|  amayain|      NULL|I agree that it i...|    3|  3|    0|               0|         NULL|       NULL|                 NULL|     NULL|
|   c233q0r| t3_iekel|    t5_2sluh|  amayain|      NULL|I thought I might...|    2|  2|    0|               0|         NULL|       NULL|                 NULL|     NULL|
|   c233vof| t3_iejzu|    t5_2sluh|nicson123|      NULL|Yes, the issue of...|    3|  3|    0|               0|         NULL|       NULL|                 NU

                                                                                

In [134]:
# For now, we will remove any columns that are more than 50% missing.
threshold = int(0.50 * comments.count())
columns_to_keep = [column for column in comments.columns
    if comments.filter(comments[column].isNotNull()).count() >= threshold]
comments = comments.select(columns_to_keep)
comments.describe().show()



+-------+--------------------+--------------------+------------+-------------+--------------------+------------------+--------------------+---------------------+--------------------+
|summary|          comment_id|           parent_id|subreddit_id|       author|                body|             score|    controversiality|total_awards_received|           permalink|
+-------+--------------------+--------------------+------------+-------------+--------------------+------------------+--------------------+---------------------+--------------------+
|  count|             1000833|             1000833|     1000833|      1000833|             1000833|           1000833|             1000833|               738536|              802745|
|   mean|3.043478282608696E28|4.336460094351968...|        NULL|     Infinity|5.406422518721979E75|2.5126129933765173|0.010223483837962977| 0.001618066011677...|                NULL|
| stddev|1.459600897925626...|1.1127094067500521E8|        NULL|          NaN|7.39317

                                                                                