In [None]:
from collections import Counter

import dask.dataframe as dd
import numpy as np
import pandas as pd

# Load all Parquet files in the directory (the 'parquet-data' folder needs to
# contain all 10 Parquet files).
df = dd.read_parquet('parquet-data/')

# Inspecting the data, or what the columns are about.
# Only the first partition is computed here, not the whole dataset.
partition_1 = df.to_delayed()[0].compute()

# Column meanings:
#   body           = raw post
#   normalizedBody = body without HTML characters
#   summary        = extracted TL;DR
#   content        = normalizedBody without the TL;DR
#
# head(10) and itertuples are positional, so the preview works whatever the
# partition's index looks like and simply shows fewer rows when the partition
# holds less than 10 of them (label lookups such as partition_1['body'][i]
# would raise KeyError in both cases).
preview = partition_1[['body', 'normalizedBody', 'summary', 'content']].head(10)
for body, normalized_body, summary, content in preview.itertuples(index=False, name=None):
    # Values are unpacked into plain names first: reusing the enclosing quote
    # character inside an f-string replacement field is a SyntaxError before
    # Python 3.12.
    print(f"Body: {body} \n")
    print(f"Normalized: {normalized_body}\n")
    print(f"Summary: {summary}\n")
    print(f"Content: {content}\n\n")

## Inspecting the subreddits
# Count the posts per subreddit: keys are the unique subreddits, values are the
# number of posts in each one. Counter is a dict subclass, so dict_count can
# still be used like a plain dictionary afterwards.
dict_count = Counter()
# Walk the dataset one partition at a time so it is never loaded all at once.
# The column is selected before computing, so each step only materialises the
# 'subreddit' values instead of a full partition with every post's text.
for partition in df['subreddit'].to_delayed():
    dict_count.update(partition.compute())

# Creating a sorted frequency list out of dict_count: (subreddit, post count)
# pairs, most frequent first. The sort is stable, so subreddits with equal
# counts keep the order in which they were first encountered.
list_freq = sorted(dict_count.items(), key=lambda item: item[1], reverse=True)

# Ten most frequent subreddits. Printed explicitly: a bare expression is only
# echoed when it is the last statement of a notebook cell, and never when the
# file is run as a script.
print(list_freq[:10]) # AskReddit, leagueoflegends, AdviceAnimals, funny, pics, gaming, politics, atheism, WTF, todayilearned

# Number of unique subreddits
print(len(list_freq)) # 14 121 subreddits