In [None]:
import os

spark_version = 'spark-3.0.3'
os.environ['SPARK_VERSION']=spark_version

# Install Spark and Java
!apt-get update
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q http://www-us.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop2.7.tgz
!tar xf $SPARK_VERSION-bin-hadoop2.7.tgz
!pip install -q findspark

# Set Environment Variables
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop2.7"

In [None]:
# Initialize Spark instance
# findspark.init() is called before the pyspark imports below; presumably it is
# what makes the Spark install referenced by SPARK_HOME importable, so keep
# this order.
import findspark
findspark.init()

# Spark session entry point and the ML feature transformers used by the NLP cells.
# NOTE(review): SparkFiles is not referenced in the visible cells.
from pyspark import SparkFiles
from pyspark.sql import SparkSession
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, StopWordsRemover

# HTTP download and bz2 decompression of the comment dumps
import requests
import bz2

In [None]:
# Create the Spark session for this notebook, or reuse one that already exists

spark = (
    SparkSession.builder
    .appName("RedditNlpEtl")
    .getOrCreate()
)

In [None]:
# Build the list of monthly Reddit comment-dump URLs

url_stub = 'https://files.pushshift.io/reddit/comments/'
file_urls = []

for year in range(2006, 2007):

    # Default compression format for the year
    if year >= 2019:
        extension = '.zst'
    elif year == 2018:
        extension = '.xz'
    else:
        extension = '.bz2'

    # One file per calendar month
    for month in range(1, 13):

        # A few months switched format before the year-level default did
        if (year, month) == (2017, 12):
            extension = '.xz'
        if year == 2018 and month >= 11:
            extension = '.zst'

        # File name pattern is RC_<year>-<two-digit month><extension>
        file = f'RC_{year}-{month:02d}{extension}'

        file_urls.append(url_stub + file)

print(file_urls)

In [None]:
# Retrieve files

# List of dictionaries, one per file, holding the file name and the raw
# (still compressed) bytes that were downloaded

reddit_files = []

for url in file_urls:

    # The timeout keeps a stalled connection from hanging the notebook, and
    # raise_for_status() stops on an HTTP error so that an error page is never
    # stored as if it were archive data (which would only fail later, during
    # decompression, with a confusing message).
    response = requests.get(url, timeout=60)
    response.raise_for_status()

    # Add file name and content to dict, then add dict to list

    reddit_file_dict = {
        # Last path segment of the URL, independent of how deep the path is
        'file_name': url.rsplit('/', 1)[-1],
        'file_content': response.content,
    }

    reddit_files.append(reddit_file_dict)

    # Progress message: file name plus the first bytes of the payload

    print(reddit_file_dict['file_name'], reddit_file_dict['file_content'][0:30], sep='\t')

In [45]:
# Decompress every downloaded archive to text and preview the start of each one
for file_dict in reddit_files:
  raw_bytes = bz2.decompress(file_dict['file_content'])
  file_dict['file_unzipped'] = raw_bytes.decode('utf-8')
  preview = file_dict['file_unzipped'][:500]
  print(file_dict['file_name'], preview, sep='\t')

RC_2006-01.bz2	{"subreddit":"reddit.com","author_flair_css_class":null,"created_utc":1136074029,"score":0,"ups":0,"body":"early 2006 a probable date","controversiality":0,"link_id":"t3_22569","stickied":false,"subreddit_id":"t5_6","gilded":0,"retrieved_on":1473821517,"distinguished":null,"author_flair_text":null,"author":"jh99","parent_id":"t3_22569","edited":false,"id":"c2715"}
{"id":"c2717","edited":false,"parent_id":"t3_22542","author_flair_text":null,"author":"jpb","retrieved_on":1473821517,"distinguished"
RC_2006-02.bz2	{"created_utc":1138752114,"author_flair_css_class":null,"score":0,"ups":0,"subreddit":"reddit.com","stickied":false,"link_id":"t3_15xh","subreddit_id":"t5_6","body":"THAN the title suggests.  Whoops.","controversiality":1,"retrieved_on":1473820870,"distinguished":null,"gilded":0,"id":"c166b","edited":false,"parent_id":"t3_15xh","author":"gmcg","author_flair_text":null}
{"author_flair_text":null,"author":"joshuaknox","id":"c166d","parent_id":"t3_15tx","edited":false

In [None]:
# Peek at the start of the first decompressed file. The full text is far too
# large to be useful as cell output, so only a short prefix is displayed.
reddit_files[0]['file_unzipped'][:500]

In [42]:
# Save the first decompressed dump to disk.
# Mode 'w' (instead of 'a') makes this cell safe to re-run: the file is
# replaced rather than gaining a second copy of every record. The text is
# written in a single call instead of one character at a time, and UTF-8 is
# pinned so the bytes on disk do not depend on the platform default encoding.
with open('RC_2006-01', 'w', encoding='utf-8') as f:
  f.write(reddit_files[0]['file_unzipped'])

In [43]:
# Read the saved file back as UTF-8 text to confirm it was written
with open('RC_2006-01', 'rt', encoding='utf-8') as f:
    content = f.read()

# Print only the beginning: the complete file is too large to be a useful
# cell output. `content` still holds the full text.
print(content[:500])

Output hidden; open in https://colab.research.google.com to view.

In [44]:
# Load the saved dump into a Spark DataFrame.
#
# Building the DataFrame directly from the in-memory text with
# spark.createDataFrame did not work, so the unzipped file was written to
# disk and is read back from there instead.

dump_path = 'RC_2006-01'
reddit_df = spark.read.json(dump_path)

reddit_df.show(n=5, truncate=False, vertical=True)

-RECORD 0-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 author                 | jh99                                                                                                                                                                                                                                  
 author_flair_css_class | null                                                                                                                                                                                                                                  
 author_flair_text      | null                                                                                                                                                                                                       

In [None]:
# Sanity check: display one record vertically, without truncating values

reddit_df.show(n=1, truncate=False, vertical=True)

In [None]:
# NLP pipeline
# Step 1: tokenize the `body` column into a new `post_words` column

tokened = Tokenizer(
    inputCol="body",
    outputCol="post_words",
)
reddit_df_tokenized = tokened.transform(reddit_df)
reddit_df_tokenized.show(n=1, truncate=False, vertical=True)

In [None]:
# 2. Remove stopwords

# loadDefaultStopWords is a static method that RETURNS the word list. Calling
# it on the instance and discarding the result configured nothing, so the
# English list is passed to the remover explicitly instead.
remover = StopWordsRemover(
    inputCol='post_words',
    outputCol='post_filtered',
    stopWords=StopWordsRemover.loadDefaultStopWords('english'),
)
reddit_df_filtered = remover.transform(reddit_df_tokenized)
reddit_df_filtered.show(1, truncate=False, vertical=True)

In [None]:
# Step 3: hash the filtered words into term-frequency vectors
# numFeatures is left at its default (262,144)

hasher = HashingTF(
    inputCol='post_filtered',
    outputCol='post_hashed',
)
reddit_df_hashed = hasher.transform(reddit_df_filtered)
reddit_df_hashed.show(n=1, truncate=False, vertical=True)

In [None]:
# Step 4: TF-IDF
# Fit the IDF model on the hashed vectors, then apply it to the same rows

tfidf = IDF(
    inputCol='post_hashed',
    outputCol='post_tfidf',
)
tfidfModel = tfidf.fit(reddit_df_hashed)
reddit_df_tfidf = tfidfModel.transform(reddit_df_hashed)

In [None]:
# Inspect the output of every pipeline stage for the first few rows

cols = [
    'body',
    'post_words',
    'post_filtered',
    'post_hashed',
    'post_tfidf',
]
reddit_df_tfidf.select(*cols).show(n=5, truncate=False, vertical=True)