In [2]:
import time

from pyspark.sql import SparkSession
from pyspark.sql.window import Window
from pyspark.sql.functions import col, count, rank, concat, lit, max, row_number, broadcast


# Create access point for Spark services
# Use defaultParallelism to determine default number of partitions
# Use getNumPartitions() to verify number is appropriate for data size ~1GB
# Set default parallelism to balance processing speed and hardware limits
spark = SparkSession.builder \
    .appName("reddit loader") \
    .config("spark.default.parallelism", "2") \
    .config("spark.sql.debug.maxToStringFields", "2000") \
    .getOrCreate()

# Reducing sampling ratio used to infer schema to 0.1 to reduce resources needed for schema inference.
# submissions_df = spark.read.option("samplingRatio", 0.05).json("submissions_data.ndjson")

# Create parquet files for more efficient reads and writes
# submissions_df.write.option("compression", "snappy").mode("overwrite").parquet("submissions_data.parquet")

# Read and verify num partitions is appropriate
submissions_df = spark.read.parquet("../data/submissions_data.parquet")
# submissions_df.rdd.getNumPartitions() # --> 3

# Repeat the same steps with comments data
# comments_df = spark.read.option("samplingRatio", 0.05).json("comments_data.ndjson")
# comments_df.write.option("compression", "snappy").mode("overwrite").parquet("comments_data.parquet")
comments_df = spark.read.parquet("../data/comments_data.parquet")

print(f"submissions has {submissions_df.count()} records and comments has {comments_df.count()} records. total: {submissions_df.count() + comments_df.count()}")

submissions has 839181 records and comments has 1000833 records. total: 1840014


In [3]:
# Let's go through each dataframe we loaded, view the schema that Apache Spark inferred and the statistics, and process them into normalized tables:
submissions_df.printSchema()
submissions_df.describe().show()

root
 |-- _meta: struct (nullable = true)
 |    |-- is_edited: boolean (nullable = true)
 |    |-- removal_type: string (nullable = true)
 |    |-- retrieved_2nd_on: long (nullable = true)
 |    |-- was_deleted_later: boolean (nullable = true)
 |    |-- was_initially_deleted: boolean (nullable = true)
 |-- all_awardings: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- award_sub_type: string (nullable = true)
 |    |    |-- award_type: string (nullable = true)
 |    |    |-- awardings_required_to_grant_benefits: long (nullable = true)
 |    |    |-- coin_price: long (nullable = true)
 |    |    |-- coin_reward: long (nullable = true)
 |    |    |-- count: long (nullable = true)
 |    |    |-- days_of_drip_extension: long (nullable = true)
 |    |    |-- days_of_premium: long (nullable = true)
 |    |    |-- description: string (nullable = true)
 |    |    |-- end_date: string (nullable = true)
 |    |    |-- giver_coin_reward: long (nullable = true

                                                                                

+-------+---------------+-----------+--------------------+-------------------+-----------------------------+----------------------+------------------------+--------------------+-----------------------+-----------------+---------------+-------------+---------+--------------+--------+------------------+--------------------+-------------------+----------------+---------------+-------------+---------+------------------+--------------------+----+-------+---------+--------------------+--------+-----+---------------------------+--------------------+----------------------+--------------------+---------------------+---------------+--------+-------------+----------------+----------+-----------------+-------------------+-----------+-----------------------+--------------------+---------------+---------+--------------------+-------------------+--------------+----------+-------------------+--------------------+--------------------+--------+------------------+--------------------+--------------------

In [4]:
# Select the columns of data that we find most relevant (based on interest) and that are not null
submissions_df = submissions_df.select(
    "author", "author_created_utc", 
    "author_fullname", "created_utc", 
    "id", "name", "num_comments", 
    "num_crossposts", "num_reports", 
    "score", "selftext", "subreddit", 
    "subreddit_id", "subreddit_subscribers", 
    "subreddit_type", "title", 
    "total_awards_received", "downs", "ups", 
    "upvote_ratio", "url")
submissions_df.show(5)

# We want to enforce that all ids are formatted with the correct prefix to facilitate joins: t1 refers to comments, t2 users, t3 submissions, t5 subreddits.
# The users table will hold user information: id, username, and time of join. 
# We can formally enforce that all user id and usernames are unique in Postgres, but for now we dropDuplicates().
users = submissions_df.select(
    col("author_fullname").alias("id"),
    col("author").alias("username"),
    col("author_created_utc").alias("joined_utc").cast("timestamp")
    ).dropDuplicates(["id", "username"])
users.show(5)

                                                                                

+-------------+------------------+---------------+-----------+-----+--------+------------+--------------+-----------+-----+--------+------------------+------------+---------------------+--------------+--------------------+---------------------+-----+---+------------+--------------------+
|       author|author_created_utc|author_fullname|created_utc|   id|    name|num_comments|num_crossposts|num_reports|score|selftext|         subreddit|subreddit_id|subreddit_subscribers|subreddit_type|               title|total_awards_received|downs|ups|upvote_ratio|                 url|
+-------------+------------------+---------------+-----------+-----+--------+------------+--------------+-----------+-----+--------+------------------+------------+---------------------+--------------+--------------------+---------------------+-----+---+------------+--------------------+
|ilikebluepens|              NULL|           NULL| 1309564251|iemkn|t3_iemkn|           0|          NULL|       NULL|    4|        |A

[Stage 32:>                                                         (0 + 3) / 3]

+-----------+---------------+-------------------+
|         id|       username|         joined_utc|
+-----------+---------------+-------------------+
|t2_2ev8r79a|      FGCUPsych|2018-10-14 17:14:07|
|   t2_qwu96|       tedSquee|2015-10-03 07:30:58|
|   t2_lctgx|AngelinaLawmeno|2015-02-14 09:05:23|
|  t2_15it1p|  LightSquirrel|2017-02-18 08:11:15|
|t2_47mr1qdn|    jamielearns|               NULL|
+-----------+---------------+-------------------+
only showing top 5 rows



                                                                                

In [5]:
# The subreddit dataframe identifies subreddit information. We want to enforce that id is unique.
subreddits = submissions_df.select(
    col("subreddit_id").alias("id"),
    col("subreddit").alias("name"),
    col("subreddit_type")
).dropDuplicates(["id"])
subreddits.show(5)

# With subscriber counts we want to make sure that only valid timestamps are present, so we drop all null values. TODO: impute subscriber_counts.
# We include subscriber name here for convenience.
subscriber_counts = submissions_df.select(
    col("subreddit_id"),
    col("subreddit").alias("name"),
    col("subreddit_subscribers").alias("subscriber_count").cast("int"),
    col("created_utc").alias("date").cast("long").cast("timestamp")
).dropna(subset=["date"])
subscriber_counts.show(5)

                                                                                

+--------+-------------+--------------+
|      id|         name|subreddit_type|
+--------+-------------+--------------+
|t5_2qh0k|       cogsci|        public|
|t5_2qjfk|       stocks|        public|
|t5_2qqpg|          NLP|        public|
|t5_2r5zc|       edtech|        public|
|t5_2rawx|climatechange|        public|
+--------+-------------+--------------+
only showing top 5 rows

+------------+------------------+----------------+-------------------+
|subreddit_id|              name|subscriber_count|               date|
+------------+------------------+----------------+-------------------+
|    t5_2sluh|AcademicPsychology|            NULL|2011-07-01 16:50:51|
|    t5_2sluh|AcademicPsychology|            NULL|2011-07-01 16:01:37|
|    t5_2sluh|AcademicPsychology|            NULL|2011-07-01 15:54:04|
|    t5_2sluh|AcademicPsychology|            NULL|2011-07-01 15:17:05|
|    t5_2sluh|AcademicPsychology|            NULL|2011-07-01 15:04:47|
+------------+------------------+--------------

In [6]:
# For the submissions dataframe, we format submission id to include the t3_ prefix and select relevant columns.
submissions = submissions_df.select(
    concat(lit("t3_"), col("id")).alias("id"),
    col("title"),
    col("created_utc").cast("long").cast("timestamp"),
    col("author"),
    col("score").cast("int"),
    col("num_comments").cast("int"),
    col("num_crossposts").cast("int"),
    col("num_reports").cast("int"),
    col("selftext"),
    col("upvote_ratio"),
    col("url"),
    col("total_awards_received"),
    col("downs"),
    col("ups")
)
submissions.show(5)

[Stage 39:>                                                         (0 + 1) / 1]

+--------+--------------------+-------------------+-------------+-----+------------+--------------+-----------+--------+------------+--------------------+---------------------+-----+---+
|      id|               title|        created_utc|       author|score|num_comments|num_crossposts|num_reports|selftext|upvote_ratio|                 url|total_awards_received|downs|ups|
+--------+--------------------+-------------------+-------------+-----+------------+--------------+-----------+--------+------------+--------------------+---------------------+-----+---+
|t3_iemkn|Sister subreddit ...|2011-07-01 16:50:51|ilikebluepens|    4|           0|          NULL|       NULL|        |        NULL|http://reddit.com...|                 NULL|    2|  6|
|t3_iellt|Decline of Fluid ...|2011-07-01 16:01:37| inquilinekea|    5|           1|          NULL|       NULL|        |        NULL|http://www.quora....|                 NULL|    2|  7|
|t3_ielfu|Ecological Moment...|2011-07-01 15:54:04|       drooze|

                                                                                

In [7]:
# TODO: impute upvote_ratio, num_reports, and num_crossposts.
# For now, we will remove any columns that are more than 50% missing.
threshold = int(0.50 * submissions.count())
columns_to_keep = [column for column in submissions.columns
    if submissions.filter(submissions[column].isNotNull()).count() >= threshold]
submissions = submissions.select(columns_to_keep)
submissions.describe().show()



+-------+----------+--------------------+--------------------+------------------+-----------------+-------------------+--------------------+------+---------------------+
|summary|        id|               title|              author|             score|     num_comments|     num_crossposts|            selftext|   url|total_awards_received|
+-------+----------+--------------------+--------------------+------------------+-----------------+-------------------+--------------------+------+---------------------+
|  count|    839181|              839181|              839181|            839181|           839181|             762691|              839181|838159|               451933|
|   mean|      NULL|8.555564974470297...|1.357251799493589...|19.061613644732184|7.622629682988533|0.01758641441947001|   5.050050055055E11|  NULL|   0.0723138164285414|
| stddev|      NULL|3.081412198113064...|2.694542489993745...| 268.3973863642429| 52.8010183274944|0.22683376522460375|7.141849278507634E11|  NULL|   

                                                                                

In [8]:
# Repeating the above for comments
comments_df.printSchema()
comments_df.describe().show()

root
 |-- _meta: struct (nullable = true)
 |    |-- is_edited: boolean (nullable = true)
 |    |-- removal_type: string (nullable = true)
 |    |-- retrieved_2nd_on: long (nullable = true)
 |    |-- was_deleted_later: boolean (nullable = true)
 |    |-- was_initially_deleted: boolean (nullable = true)
 |-- all_awardings: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- award_sub_type: string (nullable = true)
 |    |    |-- award_type: string (nullable = true)
 |    |    |-- awardings_required_to_grant_benefits: string (nullable = true)
 |    |    |-- coin_price: long (nullable = true)
 |    |    |-- coin_reward: long (nullable = true)
 |    |    |-- count: long (nullable = true)
 |    |    |-- days_of_drip_extension: long (nullable = true)
 |    |    |-- days_of_premium: long (nullable = true)
 |    |    |-- description: string (nullable = true)
 |    |    |-- end_date: string (nullable = true)
 |    |    |-- giver_coin_reward: long (nullable = tr



+-------+---------------+-----------+----------------+-------------+--------------------+-----------------------------+----------------------+------------------------+-----------------+-----------------------+-----------------+---------------+-------------+---------+--------------------+--------------------+--------------------+-------------------------------+--------------------+---------------------+------------+--------------------+-------------------+--------------------+-------------+------+--------------------+--------------------+--------------------+-----+----------+--------+-------------+----------------+-------+----------+-----------+--------------------+--------------------+--------------+-------+-------------------+--------------------+--------+------------------+------------------+------------+-----------------------+--------------+----------------+---------------------+------------------+-------------------+------------------+
|summary|approved_at_utc|approved_by|associat

                                                                                

In [9]:
comments_df = comments_df.select("author", "author_created_utc", "author_fullname", "body", "controversiality", "created_utc", "distinguished", "downs", "id", "name", "num_reports", "link_id", "parent_id", "permalink", "replies", "score", "subreddit", "subreddit_id", "subreddit_name_prefixed", "subreddit_type", "total_awards_received", "ups")

# We merge users from comments data with our current registry. Each user should be unique. 
users = users.union(comments_df.select(
    col("author_fullname").alias("id"),
    col("author").alias("username"),
    col("author_created_utc").alias("joined_utc").cast("timestamp")
    )).dropDuplicates()
users.show(5)

+-----------+----------------+-------------------+
|         id|        username|         joined_utc|
+-----------+----------------+-------------------+
|t2_1v4hef9h|          Iper91|2018-07-27 01:46:33|
|t2_qb41mdeu|          rccaad|2022-07-26 12:27:34|
|t2_t6n4l0nk|Senior-Storm-727|2022-10-08 07:43:01|
|  t2_128krh|        Tyler119|2016-10-19 07:59:11|
|t2_mbom6ns2|  choprakunaleth|2022-04-24 00:36:40|
+-----------+----------------+-------------------+
only showing top 5 rows



In [11]:
# We select the appropriate columns for the comments dataframe and format the id accordingly.
comments = comments_df.select(
    concat(lit("t1_"), col("id")).alias("comment_id"),
    col("parent_id"),
    col("subreddit_id"),
    col("author"),
    col("created_utc").alias("created_at").cast("long").cast("timestamp"),
    col("body"),
    col("score").cast("int"),
    col("ups").cast("int"),
    col("downs").cast("int"),
    col("controversiality").cast("int"),
    col("distinguished"),
    col("num_reports").cast("int"),
    col("total_awards_received").cast("int"),
    col("permalink"),
    col("link_id").alias("submission_id")
).dropna(subset=["comment_id", "body"])
comments.show(5)

[Stage 98:>                                                         (0 + 1) / 1]

+----------+---------+------------+---------+-------------------+--------------------+-----+---+-----+----------------+-------------+-----------+---------------------+---------+-------------+
|comment_id|parent_id|subreddit_id|   author|         created_at|                body|score|ups|downs|controversiality|distinguished|num_reports|total_awards_received|permalink|submission_id|
+----------+---------+------------+---------+-------------------+--------------------+-----+---+-----+----------------+-------------+-----------+---------------------+---------+-------------+
|t1_c233p63| t3_iejzu|    t5_2sluh|  amayain|2011-07-01 15:03:00|I agree that it i...|    3|  3|    0|               0|         NULL|       NULL|                 NULL|     NULL|     t3_iejzu|
|t1_c233q0r| t3_iekel|    t5_2sluh|  amayain|2011-07-01 15:07:30|I thought I might...|    2|  2|    0|               0|         NULL|       NULL|                 NULL|     NULL|     t3_iekel|
|t1_c233vof| t3_iejzu|    t5_2sluh|nicso

                                                                                

In [12]:
# For now, we will remove any columns that are more than 50% missing.
threshold = int(0.50 * comments.count())
columns_to_keep = [column for column in comments.columns
    if comments.filter(comments[column].isNotNull()).count() >= threshold]
comments = comments.select(columns_to_keep)
comments.describe().show()



+-------+----------+--------------------+------------+-------------+--------------------+------------------+--------------------+---------------------+--------------------+-------------+
|summary|comment_id|           parent_id|subreddit_id|       author|                body|             score|    controversiality|total_awards_received|           permalink|submission_id|
+-------+----------+--------------------+------------+-------------+--------------------+------------------+--------------------+---------------------+--------------------+-------------+
|  count|   1000833|             1000833|     1000833|      1000833|             1000833|           1000833|             1000833|               738536|              802745|      1000833|
|   mean|      NULL|4.336460094351968...|        NULL|     Infinity|5.406422518721979E75|2.5126129933765173|0.010223483837962977| 0.001618066011677...|                NULL|         NULL|
| stddev|      NULL|1.1127094067500521E8|        NULL|          N

                                                                                

In [None]:
# TODO: Load tables into postgres via df.write.jdbc()
# Test Queries Below:

In [13]:
# Query the top 10 subreddits by latest subscriber count
# Window partitioned by subreddit to get latest subscriber counts for each subreddit
# Then we order by subscriber count and get the top 10
window_spec = Window.partitionBy("subreddit_id").orderBy(col("date").desc())
top_subreddits_by_subscribers = (subscriber_counts.withColumn("row_num", row_number().over(window_spec)) \
    .filter(col("row_num") == 1).select("name", "subscriber_count", "date") \
    .orderBy(col("subscriber_count").desc()).limit(10))

# Run the query and evaluate runtime. Observe that Spark runs every action when results are requested
# without storing intermediate results. 
top_subreddits_by_subscribers.explain()
start = time.time()
top_subreddits_by_subscribers.show(10)
end = time.time()

runtime = (end - start) * 1000
print(f"Total execution time: {runtime} miliseconds") 

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- TakeOrderedAndProject(limit=10, orderBy=[subscriber_count#9071 DESC NULLS LAST], output=[name#9068,subscriber_count#9071,date#9073])
   +- Project [name#9068, subscriber_count#9071, date#9073]
      +- Filter (row_num#16808 = 1)
         +- Window [row_number() windowspecdefinition(subreddit_id#981, date#9073 DESC NULLS LAST, specifiedwindowframe(RowFrame, unboundedpreceding$(), currentrow$())) AS row_num#16808], [subreddit_id#981], [date#9073 DESC NULLS LAST]
            +- WindowGroupLimit [subreddit_id#981], [date#9073 DESC NULLS LAST], row_number(), 1, Final
               +- Sort [subreddit_id#981 ASC NULLS FIRST, date#9073 DESC NULLS LAST], false, 0
                  +- Exchange hashpartitioning(subreddit_id#981, 200), ENSURE_REQUIREMENTS, [plan_id=2530]
                     +- WindowGroupLimit [subreddit_id#981], [date#9073 DESC NULLS LAST], row_number(), 1, Partial
                        +- Sort [subreddit_id#981 ASC N



+--------------------+----------------+-------------------+
|                name|subscriber_count|               date|
+--------------------+----------------+-------------------+
|              stocks|         6185771|2023-12-31 08:30:59|
|ArtificialIntelig...|          356832|2023-12-28 23:45:58|
| ImaginaryTechnology|          260001|2023-12-20 17:28:57|
|  AcademicPsychology|          126222|2023-12-31 06:46:59|
|    StocksAndTrading|          122525|2023-12-30 08:44:58|
|              cogsci|          113613|2023-12-11 12:18:58|
|       ChatGPTCoding|           94935|2023-12-23 12:43:59|
|       climatechange|           87345|2023-12-30 17:45:59|
|             fintech|           32042|2023-12-12 13:31:59|
|    stockstobuytoday|           30874|2023-12-28 04:59:59|
+--------------------+----------------+-------------------+

Total execution time: 1121.004343032837 miliseconds


                                                                                

In [14]:
# Query the controversial comment with the most replies for each subreddit
# First get the number of replies for each controversial comment
controversial_comments = comments.filter(col("controversiality") == 1)
reply_counts = controversial_comments.alias("cc").join(comments.alias("c"), col("cc.comment_id")==col("c.parent_id"), "left") \
                .groupBy(col("cc.subreddit_id").alias("subreddit_id"), col("cc.comment_id").alias("comment_id")).agg(count("c.comment_id").alias("num_replies"))

# Rank the comments for each subreddit by creating windows and ranking the rows
window_spec = Window.partitionBy("subreddit_id").orderBy(col("num_replies").desc())
ranked_comments = reply_counts.withColumn("row_number", row_number().over(window_spec))

# Grab the top-ranked comment for each subreddit
most_engaging_controversial_comments = ranked_comments.filter(col("row_number") == 1) \
                                                      .select("comment_id", "num_replies")

# Retrieve the comment information for the most engaging comments
most_engaging_controversial_comments = broadcast(most_engaging_controversial_comments).join(
    comments, "comment_id").orderBy(col("num_replies").desc())

# Run the query and evaluate runtime. Observe that Spark runs every action when results are requested
# without storing intermediate results. 
most_engaging_controversial_comments.explain()
start = time.time()
most_engaging_controversial_comments.show(13)
end = time.time()

runtime = (end - start) * 1000
print(f"Total execution time: {runtime} miliseconds") 

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Sort [num_replies#16991L DESC NULLS LAST], true, 0
   +- Exchange rangepartitioning(num_replies#16991L DESC NULLS LAST, 200), ENSURE_REQUIREMENTS, [plan_id=2743]
      +- Project [comment_id#16967, num_replies#16991L, parent_id#17059, subreddit_id#17075, author#17009, created_at#15445, body#17026, score#15446, controversiality#15449, total_awards_received#15451, permalink#17060, submission_id#15443]
         +- BroadcastHashJoin [comment_id#16967], [comment_id#15441], Inner, BuildLeft, false
            :- BroadcastExchange HashedRelationBroadcastMode(List(input[0, string, true]),false), [plan_id=2739]
            :  +- Project [comment_id#16967, num_replies#16991L]
            :     +- Filter ((row_number#16996 = 1) AND isnotnull(comment_id#16967))
            :        +- Window [row_number() windowspecdefinition(subreddit_id#1206, num_replies#16991L DESC NULLS LAST, specifiedwindowframe(RowFrame, unboundedpreceding$(), curren



+----------+-----------+----------+------------+------------------+-------------------+--------------------+-----+----------------+---------------------+--------------------+-------------+
|comment_id|num_replies| parent_id|subreddit_id|            author|         created_at|                body|score|controversiality|total_awards_received|           permalink|submission_id|
+----------+-----------+----------+------------+------------------+-------------------+--------------------+-----+----------------+---------------------+--------------------+-------------+
|t1_kf1q6ou|         14|t1_kf1j4jb|    t5_2rawx|TransitionProof625|               NULL|China emits 26% o...|    1|               1|                    0|/r/climatechange/...|   t3_18rimou|
|t1_kax8i0d|         13|t1_kax83jo|    t5_3crzr| newExperience2020|               NULL|But why do you ne...|  -14|               1|                    0|/r/ArtificialInte...|   t3_184rjs2|
|t1_hl247bc|          9| t3_qw7l65|    t5_2tf7t|       

                                                                                

In [15]:
# Query the highest scoring comment for each subreddit and its number of replies
# Like before, we'll use a window partitioned by subreddit to get the top scoring comment for each subreddit. 
# We use group by to get num of replies and broadcast join with comments to get the data of the comments. 
highest_scoring_comments = comments.withColumn("row_number", row_number().over(Window.partitionBy("subreddit_id").orderBy(col("score").desc())))
reply_counts = highest_scoring_comments.filter(col("row_number") == 1).alias("h").join(comments.alias("c"), col("h.comment_id") == col("c.parent_id")) \
    .groupBy(col("h.subreddit_id").alias("subreddit_id"), col("h.comment_id").alias("comment_id")).agg(count("c.comment_id").alias("num_replies")) \
    .drop("subreddit_id")
top_scoring_comment_engagement = broadcast(reply_counts).join(comments, "comment_id").orderBy(col("num_replies").desc())

# Run the query and evaluate runtime. Observe that Spark runs every action when results are requested
# without storing intermediate results. 
top_scoring_comment_engagement.explain()
start = time.time()
top_scoring_comment_engagement.show(13)
end = time.time()

runtime = (end - start) * 1000
print(f"Total execution time: {runtime} miliseconds") 

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Sort [num_replies#17332L DESC NULLS LAST], true, 0
   +- Exchange rangepartitioning(num_replies#17332L DESC NULLS LAST, 200), ENSURE_REQUIREMENTS, [plan_id=3334]
      +- Project [comment_id#17307, num_replies#17332L, parent_id#17394, subreddit_id#17410, author#17344, created_at#15445, body#17361, score#15446, controversiality#15449, total_awards_received#15451, permalink#17395, submission_id#15443]
         +- BroadcastHashJoin [comment_id#17307], [comment_id#15441], Inner, BuildLeft, false
            :- BroadcastExchange HashedRelationBroadcastMode(List(input[0, string, true]),false), [plan_id=3330]
            :  +- HashAggregate(keys=[subreddit_id#1206, comment_id#15441], functions=[count(comment_id#17250)])
            :     +- Exchange hashpartitioning(subreddit_id#1206, comment_id#15441, 200), ENSURE_REQUIREMENTS, [plan_id=3327]
            :        +- HashAggregate(keys=[subreddit_id#1206, comment_id#15441], functions=



+----------+-----------+----------+------------+--------------------+-------------------+--------------------+-----+----------------+---------------------+--------------------+-------------+
|comment_id|num_replies| parent_id|subreddit_id|              author|         created_at|                body|score|controversiality|total_awards_received|           permalink|submission_id|
+----------+-----------+----------+------------+--------------------+-------------------+--------------------+-----+----------------+---------------------+--------------------+-------------+
|t1_kch9kvt|         45|t3_18dgxau|    t5_2rawx|            burrwati|               NULL|I’m a human right...|  320|               0|                    0|/r/climatechange/...|   t3_18dgxau|
|t1_c0noh6g|         31|  t3_bnr31|    t5_2qh0k|           theonusta|2010-04-07 13:48:31|&gt; Mark Jaffe, ...|  723|               0|                 NULL|                NULL|     t3_bnr31|
|t1_kcxfofc|         26|t3_18g0cha|   t5_7ipn

                                                                                