In [1]:
# imports 
import sys
from pyspark.sql import SparkSession, functions, types

In [2]:
# boilerplate: create (or reuse) the Spark session and check runtime versions
spark = SparkSession.builder.appName('reddit averages').getOrCreate()
spark.sparkContext.setLogLevel('WARN')

assert sys.version_info >= (3, 5) # make sure we have Python 3.5+

# Compare the Spark version numerically. A plain string comparison is
# lexicographic, so e.g. '10.0.0' >= '2.3' would be False.
spark_major_minor = tuple(int(part) for part in spark.version.split('.')[:2])
assert spark_major_minor >= (2, 3) # make sure we have Spark 2.3+

In [3]:
# Explicit schema for the comment records: one (field name, Spark type) pair
# per column, expanded into StructFields below.
comments_schema = types.StructType([
    types.StructField(field_name, field_type)
    for field_name, field_type in (
        ('archived', types.BooleanType()),
        ('author', types.StringType()),
        ('author_flair_css_class', types.StringType()),
        ('author_flair_text', types.StringType()),
        ('body', types.StringType()),
        ('controversiality', types.LongType()),
        ('created_utc', types.StringType()),
        ('distinguished', types.StringType()),
        ('downs', types.LongType()),
        ('edited', types.StringType()),
        ('gilded', types.LongType()),
        ('id', types.StringType()),
        ('link_id', types.StringType()),
        ('name', types.StringType()),
        ('parent_id', types.StringType()),
        ('retrieved_on', types.LongType()),
        ('score', types.LongType()),
        ('score_hidden', types.BooleanType()),
        ('subreddit', types.StringType()),
        ('subreddit_id', types.StringType()),
        ('ups', types.LongType()),
        # Disabled fields, kept for reference (not part of the schema):
        # ('year', types.IntegerType()),
        # ('month', types.IntegerType()),
    )
])

In [4]:
def main(in_directory, out_directory):
    """Compute the average comment score per subreddit and save it twice.

    Reads the JSON comment data under ``in_directory`` using the module-level
    ``comments_schema``, averages ``score`` for each ``subreddit``, displays
    the result in two orderings, and writes each ordering as CSV.

    Args:
        in_directory: Path passed to ``spark.read.json``.
        out_directory: Prefix for the output paths. The suffixes
            ``'-subreddit'`` and ``'-score'`` are appended verbatim, so a
            trailing slash (e.g. ``'output/'``) yields ``'output/-subreddit'``.

    Side effects:
        Prints both result tables via ``DataFrame.show()`` and writes two CSV
        outputs with ``mode='overwrite'``, replacing any existing output.
    """
    # Supplying the schema up front avoids the extra pass over the input that
    # schema inference needs, and pins the column types.
    comments = spark.read.json(in_directory, schema=comments_schema)

    # Name the aggregate explicitly instead of depending on the column name
    # Spark generates; the alias matches the generated name so the displayed
    # header is unchanged.
    average_column = 'avg(score)'
    averages = (
        comments
        .groupBy('subreddit')
        .agg(functions.avg(comments['score']).alias(average_column))
        .cache()  # reused by both orderings (two shows + two writes)
    )

    try:
        averages_by_subreddit = averages.orderBy('subreddit')
        averages_by_score = averages.orderBy(average_column, ascending=False)

        averages_by_subreddit.show()
        averages_by_score.show()

        averages_by_subreddit.write.csv(out_directory + '-subreddit', mode='overwrite')
        averages_by_score.write.csv(out_directory + '-score', mode='overwrite')
    finally:
        # Release the cached data once both outputs are done (or on failure),
        # since the notebook's Spark session outlives this call.
        averages.unpersist()

In [5]:
# Driver cell: run the job on the sample data set when executed directly.
if __name__ == '__main__':
    in_directory, out_directory = 'reddit-1/', 'output/'
    main(in_directory=in_directory, out_directory=out_directory)


+---------+------------------+
|subreddit|        avg(score)|
+---------+------------------+
|  Cameras|1.2222222222222223|
|Genealogy| 1.871313672922252|
|optometry|1.4701986754966887|
|    scala| 1.928939237899073|
|     xkcd| 5.272939881689366|
+---------+------------------+

+---------+------------------+
|subreddit|        avg(score)|
+---------+------------------+
|     xkcd| 5.272939881689366|
|    scala| 1.928939237899073|
|Genealogy| 1.871313672922252|
|optometry|1.4701986754966887|
|  Cameras|1.2222222222222223|
+---------+------------------+

