In [None]:
import os
import atexit
import sys

import pyspark
from pyspark.context import SparkContext
from pyspark.sql import SQLContext
import findspark
from sparkhpc import sparkjob

#Exit handler to clean up the Spark cluster if the script exits or crashes
def exitHandler(sj,sc):
    """Best-effort shutdown of the Spark context and the Spark job.

    Both shutdown steps are always attempted: a failure while stopping the
    context is reported and does not prevent the job from being stopped.

    Args:
        sj: Object exposing ``stop()`` for the submitted Spark job.
        sc: Object exposing ``stop()`` for the Spark context.

    Ordinary errors raised by either ``stop()`` call are printed and
    suppressed so that exit handling continues. Interrupt-style exceptions
    (e.g. ``KeyboardInterrupt``) are not swallowed, but the job stop is still
    attempted first thanks to the ``finally`` clause.
    """
    try:
        print('Trapped Exit cleaning up Spark Context')
        sc.stop()
    except Exception as exc:
        # Report instead of silently hiding why the context could not be stopped.
        print('Could not stop Spark Context: {!r}'.format(exc))
    finally:
        # Always try to release the cluster allocation, whatever happened above.
        try:
            print('Trapped Exit cleaning up Spark Job')
            sj.stop()
        except Exception as exc:
            print('Could not stop Spark Job: {!r}'.format(exc))

# Let findspark set up the Spark environment for this kernel.
# NOTE(review): pyspark is already imported above this call - confirm the
# import order is intentional.
findspark.init()

# Parameters for the Spark cluster.
# The total core count requested below is nodes * tasks_per_node.
nodes=3
tasks_per_node=8  # also used as the number of cores per executor
memory_per_task=1024 #1 gig per process, adjust accordingly
# Please estimate walltime carefully to keep unused Spark clusters from sitting
# idle so that others may use the resources.
# NOTE(review): "1:00" is intended to mean one hour - confirm this is the
# format sparkhpc / the scheduler expects.
walltime="1:00" #1 hour
# Must be set before the job is created below so the submission can pick it up.
os.environ['SBATCH_PARTITION']='lattice' #Set the appropriate ARC partition

# Describe the Spark job using the parameters above.
sj = sparkjob.sparkjob(
     ncores=nodes*tasks_per_node,
     cores_per_executor=tasks_per_node,
     memory_per_core=memory_per_task,
     walltime=walltime
    )

# Wait for the job to start, then obtain the Spark context from it.
sj.wait_to_start()
sc = sj.start_spark()

# Register the exit handler so the context and the job are stopped on exit.
# NOTE(review): registration happens only after start-up succeeded; if
# wait_to_start() or start_spark() raises, no cleanup hook is in place.
atexit.register(exitHandler,sj,sc)

#You need this line if you want to use SparkSQL
sqlCtx=SQLContext(sc)

INFO:sparkhpc.sparkjob:Submitted batch job 679343

INFO:sparkhpc.sparkjob:Submitted cluster 0


In [None]:
import json
from nltk.tokenize import RegexpTokenizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import re

# Process the tweets for sentiment analysis
Open the tweets, process them, then save them back in JSON format.

In [None]:
# Input: read through the Spark context; each element is later parsed as one
# JSON document by the filter/map steps.
in_filename = 'classify_rel_labeled_nonbinary.json'
in_handle = sc.textFile(in_filename)

# Output: a local UTF-8 text file, opened for writing here and closed in the
# final cell of the notebook.
out_filename = 'classify_sen_labeled_nonbinary.json'
out_handle = open(out_filename, mode='w', encoding='utf8')

In [None]:
# Words to drop from every tweet; tokens are compared against this set as-is.
stop_words = {
    'elonmusk',
    'tesla',
    'get',
    'at_tesla',
    'at_elonmusk',
}

# Matching runs of word characters discards punctuation and keeps only words.
tokenizer = RegexpTokenizer(r'\w+')

def remove_stop(tweet: str):
    """Return ``tweet`` with punctuation and stop words removed.

    The text is split with the module-level ``tokenizer``; every token that
    appears in ``stop_words`` is dropped and the remaining tokens are joined
    back together with single spaces.
    """
    kept = [word for word in tokenizer.tokenize(tweet) if word not in stop_words]
    return ' '.join(kept)

In [None]:
# processes the tweet
# https://towardsdatascience.com/the-real-world-as-seen-on-twitter-sentiment-analysis-part-one-5ac2d06b63fb
def process_tweet(tweet: str):
    """Normalize the text of a tweet.

    Steps, in order: lowercase the text, collapse every run of whitespace
    into a single space, remove punctuation and stop words via
    ``remove_stop``, and trim leading/trailing whitespace.

    Args:
        tweet: Raw tweet text.

    Returns:
        The cleaned tweet text.
    """
    tweet = tweet.lower() # convert to lowercase
    # Raw string: '\s' in a plain string literal is an invalid escape sequence.
    tweet = re.sub(r'\s+', ' ', tweet) # remove multiple whitespace
    tweet = remove_stop(tweet)
    # str.strip() returns a new string, so its result has to be used.
    return tweet.strip() # remove excess leading and trailing whitespace

In [None]:
def parse_then_process(tweetObject: str):
    """Clean the ``text`` field of a JSON-encoded tweet.

    The JSON string is decoded, its ``'text'`` value is replaced by the result
    of ``process_tweet``, and the whole object is encoded back to a JSON
    string, which is returned.
    """
    tweet_record = json.loads(tweetObject)
    cleaned_text = process_tweet(tweet_record['text'])
    tweet_record['text'] = cleaned_text
    return json.dumps(tweet_record)

In [None]:
def filter_relevant(tweetObject: str):
    """Tell whether a JSON-encoded tweet is flagged as relevant.

    Returns ``True`` when the ``'isRelevant'`` field, converted with ``int``,
    equals 1, and ``False`` otherwise.
    """
    relevance_flag = int(json.loads(tweetObject)['isRelevant'])
    return relevance_flag == 1

In [None]:
# Keep only the tweets flagged as relevant, then clean the text of each one.
filtered = in_handle.filter(filter_relevant)
processed = filtered.map(parse_then_process)

# Fetch the processed records and write one JSON document per line.
for item in processed.collect():
    print(item, file=out_handle)

In [None]:
# Close the output file so everything written to it is flushed to disk.
out_handle.close()
print('output file closed')