In [3]:
import os
import atexit
import sys

import pyspark
from pyspark.context import SparkContext
from pyspark.sql import SQLContext
import findspark
from sparkhpc import sparkjob

#Exit handler to clean up the Spark cluster if the script exits or crashes
def exitHandler(sj,sc):
    """Best-effort shutdown of the Spark context and the Spark job.

    Intended to run as an exit handler, so it must never raise: a failure
    while stopping one resource is reported and the other resource is still
    stopped.

    Parameters
    ----------
    sj : object exposing ``stop()``; the Spark job to shut down.
    sc : object exposing ``stop()``; the Spark context to shut down.
    """
    # The context is stopped before the job, matching the original order.
    for label, resource in (('Spark Context', sc), ('Spark Job', sj)):
        try:
            print('Trapped Exit cleaning up ' + label)
            resource.stop()
        except Exception as exc:
            # Catch Exception (not a bare except) so SystemExit and
            # KeyboardInterrupt still propagate, and report the failure
            # instead of silently discarding it.
            print('Failed to stop ' + label + ': ' + repr(exc))

# Start a Spark cluster through sparkhpc and create the contexts used by the
# rest of the notebook (`sj`, `sc`, `sqlCtx`).
# NOTE: the recorded output of this cell shows
# "ValueError: Cannot run multiple SparkContexts at once" -- the cell fails
# when a SparkContext already exists in the kernel.
findspark.init()

#Parameters for the Spark cluster
nodes=3
tasks_per_node=8 
memory_per_task=1024 #1 gig per process, adjust accordingly
# Please estimate walltime carefully to keep unused Spark clusters from sitting
# idle so that others may use the resources.
walltime="1:00" #1 hour
os.environ['SBATCH_PARTITION']='single' #Set the appropriate ARC partition

# Total cores requested = nodes * tasks_per_node; each executor gets
# tasks_per_node cores.
sj = sparkjob.sparkjob(
     ncores=nodes*tasks_per_node,
     cores_per_executor=tasks_per_node,
     memory_per_core=memory_per_task,
     walltime=walltime
    )

# presumably blocks until the submitted batch job is running -- TODO confirm
sj.wait_to_start()
sc = sj.start_spark()

#Register the exit handler so both the context and the job are stopped at interpreter exit
atexit.register(exitHandler,sj,sc)

#You need this line if you want to use SparkSQL
sqlCtx=SQLContext(sc)

INFO:sparkhpc.sparkjob:Submitted batch job 630667

INFO:sparkhpc.sparkjob:Submitted cluster 1


ValueError: Cannot run multiple SparkContexts at once; existing SparkContext(app=pyspark-shell, master=spark://cn004:7077) created by __init__ at /global/software/jupyterhub-spark/anaconda3/lib/python3.7/site-packages/sparkhpc-0.3.post4-py3.7.egg/sparkhpc/sparkjob.py:533 

In [4]:
import json
from nltk.tokenize import RegexpTokenizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import re
import emoji

In [5]:
# optional step: download the NLTK stopwords corpus if it is not installed in your Spark environment
#import nltk
#nltk.download('stopwords')

# Process the tweets for relevant/irrelevant classification
This notebook prepares the tweets for the Naive Bayes classification.

Open the tweets, process them, then save them back in JSON format.

In [6]:
# open files to read and write from
in_filename1 = 'LabeledData1.json'
in_filename2 = 'LabeledData2.json'
in_filename3 = 'LabeledData3.json'
out_filename = 'classify.json'

# join the 3 labeled data files together
# Each RDD must read its own file; loading in_filename1 three times would
# triple the first file and silently drop the other two.
rdd1 = sc.textFile(in_filename1)
rdd2 = sc.textFile(in_filename2)
rdd3 = sc.textFile(in_filename3)
joined = rdd1.union(rdd2)
joined = joined.union(rdd3)

# Output handle stays open across cells; it is closed after the processed
# tweets have been written.
out_handle = open(out_filename, 'w', encoding='utf8')

In [7]:
# Negation tagging: every word that follows a negation token (not / never / no)
# is prefixed with "not_" up to the next punctuation mark.
puncuation = {'"', ',', '.', '?', '!'}


def _tag_negated_span(match):
    """Prefix each whitespace-preceded word in the matched span with 'not_'."""
    return re.sub(r'(\s+)(\w+)', r'\1not_\2', match.group(0))


# adapted from https://stackoverflow.com/questions/23384351/how-to-add-tags-to-negated-words-in-strings-that-follow-not-no-and-never
def check_and_negate(tweet: str):
    """Return `tweet` with words inside a negation scope prefixed by 'not_'.

    The text is stripped, terminated with '.' when it does not already end in
    one of the `puncuation` characters, contractions ending in n't / n’t are
    expanded to ' not', and every word between a negation token and the next
    . , ? ! or " is tagged.
    """
    text = tweet.strip()
    # A closing mark guarantees the final negation scope has an end.
    if text and text[-1] not in puncuation:
        text = text + '.'
    # Expand contractions written with either a straight or a curly apostrophe.
    text = text.replace("n't", ' not')
    text = text.replace('n’t', ' not')
    # Tag everything from a negation token up to the next sentence mark.
    return re.sub(
        r'\b(?:not|never|no)\b[\'\w\s]+[.,?!"]',
        _tag_negated_span,
        text,
        flags=re.IGNORECASE,
    )

In [8]:
# Stop-word vocabulary: start from the NLTK English list and extend it below.
stop_words = set(stopwords.words('english'))

# words used in the filter
filterWords = {"tesla", "elon", "musk", "elonmusk", "tsla", "roadster", "supercharger", "powerwall", "powerpack", "modely",
               "model3", "modelx", "teslamodely", "teslamodels", "teslamodel3", "teslamodelx", "spacex",
               "teslasuv", "teslascience"}

# bigrams used in the filter
filterBigrams = {("model", "y"), ("model", "s"), ("model", "3"), ("model", "x"), ("electric", "vehicle"),
                 ("electric", "car"), ("electric", "suv"), ("electric", "supercar")}

# stop words found using the word cloud (manual extraction)
cloud = {'year', 'amp', 'us'}

# Fold the filter words, both halves of every filter bigram, and the
# word-cloud terms into the stop-word set.
stop_words.update(filterWords)
stop_words.update(part for bigram in filterBigrams for part in bigram)
stop_words.update(cloud)

# Tokenizer keeps runs of word characters only, which drops punctuation.
tokenizer = RegexpTokenizer(r'\w+')
ps = PorterStemmer()

def remove_stop_and_stem(tweet: str):
    """Tokenize `tweet`, drop stop words, stem, and keep each stem once.

    Tokens are checked against `stop_words` before stemming. Stems are
    returned in first-occurrence order, joined by single spaces, so every
    term appears at most once (binary naive Bayes features).
    """
    words = []
    seen = set()  # stems already emitted; O(1) membership instead of a list scan
    for token in tokenizer.tokenize(tweet):
        if token in stop_words:  # remove stopwords
            continue
        stemmed = ps.stem(token)  # stem once and reuse the result
        if stemmed not in seen:
            seen.add(stemmed)
            words.append(stemmed)
    return ' '.join(words)

In [9]:
# http://www.aclweb.org/anthology/W16-2610

# transform emojis into text
def transform_emojis(tweet: str):
    """Replace every emoji character in `tweet` with ' emoji_<name>'.

    The text is scanned one character at a time; characters that are not
    keys of `emoji.UNICODE_EMOJI` are copied through unchanged.

    NOTE(review): this relies on `emoji.UNICODE_EMOJI` being keyed by the
    emoji character itself -- confirm against the installed emoji version.
    """
    pieces = []
    for ch in tweet:
        if ch in emoji.UNICODE_EMOJI:
            # [1:-1] drops the first and last character of the demojized
            # name (presumably the ':' delimiters).
            pieces.append(' emoji_' + emoji.demojize(ch)[1:-1])
        else:
            pieces.append(ch)
    return ''.join(pieces)

In [10]:
# processes the tweet
# https://towardsdatascience.com/the-real-world-as-seen-on-twitter-sentiment-analysis-part-one-5ac2d06b63fb
def process_tweet(tweet: str):
    """Normalise raw tweet text into a space-separated string of features.

    Steps, in order: lowercase; drop '#' characters (the hashtag word is
    kept); map the @tesla and @elonmusk mentions to the tokens 'at_tesla' and
    'at_elonmusk'; remove every other @mention; collapse whitespace; remove
    links; convert emojis to text; tag negated words; remove stop words and
    stem; strip surrounding whitespace.

    Returns the processed text.
    """
    tweet = tweet.lower() # convert to lowercase
    tweet = re.sub('#', '', tweet) # remove hashtags
    tweet = re.sub('@tesla', 'at_tesla', tweet) # change @Tesla to at_tesla
    # Use the same at_ prefix as @tesla so the mention survives as its own
    # token instead of collapsing into the plain word 'elonmusk'.
    tweet = re.sub('@elonmusk', 'at_elonmusk', tweet) # change @elonmusk to at_elonmusk
    # Raw strings keep regex escapes such as \S and \s from being read as
    # (invalid) Python string escapes.
    tweet = re.sub(r'@\S+', '', tweet) # remove @username
    tweet = re.sub(r'\s+', ' ', tweet) # remove multiple whitespace
    # https://stackoverflow.com/questions/6038061/regular-expression-to-find-urls-within-a-string
    tweet = re.sub(r'(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?', '', tweet) # remove links
    tweet = transform_emojis(tweet) # transform emojis
    tweet = check_and_negate(tweet) # add negations
    tweet = remove_stop_and_stem(tweet) # remove stop words and stem
    # str.strip returns a new string, so the result has to be kept.
    return tweet.strip() # remove excess leading and trailing whitespace

In [11]:
def filter_text(tweet: str):
    """Return True when `tweet` is a JSON object that has a 'text' key.

    Blank lines, malformed JSON, and JSON values that are not objects
    (numbers, strings, lists, null) return False instead of raising, so one
    bad input line does not abort the whole filter.
    """
    try:
        # convert json object into python dict
        item = json.loads(tweet)
    except (TypeError, ValueError):
        # json.JSONDecodeError is a ValueError; TypeError covers non-string input.
        return False
    # Only a dict can carry a 'text' field; a bare JSON string such as "text"
    # would otherwise pass the `in` check by substring match.
    return isinstance(item, dict) and 'text' in item

In [12]:
def parse_then_process(tweetObject: str):
    """Decode a JSON tweet, clean its 'text' field, and re-encode it.

    All other fields of the decoded object are passed through unchanged.
    Returns the updated object serialised as a JSON string.
    """
    record = json.loads(tweetObject)
    cleaned_text = process_tweet(record['text'])
    record['text'] = cleaned_text
    return json.dumps(record)

In [13]:
# Keep only the records that carry a 'text' field.
text_tweets = joined.filter(filter_text)

# Clean the text of every remaining record.
processed = text_tweets.map(parse_then_process)

# Pull the results to the driver and write one JSON object per line.
for item in processed.collect():
    out_handle.write(item + '\n')

In [14]:
# Close the output file that was opened for writing in an earlier cell.
out_handle.close()
print('output file closed')

output file closed
