In [1]:
import os
import atexit
import sys
import pyspark
from pyspark.context import SparkContext
from pyspark.sql import SQLContext
import findspark
from sparkhpc import sparkjob
import pandas
from pyspark.sql import SparkSession

#Exit handler to clean up the Spark cluster if the script exits or crashes
def exitHandler(sj,sc):
    """Best-effort shutdown of the Spark context and the Spark job at exit.

    The context is stopped first, then the job. A failure while stopping one
    is reported on stdout and does not prevent the attempt to stop the other.

    Parameters:
        sj: object exposing ``stop()`` for the cluster job.
        sc: object exposing ``stop()`` for the Spark context.

    Returns:
        None. Ordinary exceptions raised by either ``stop()`` are caught and
        printed; interpreter-level exceptions (e.g. KeyboardInterrupt) still
        propagate, but only after the job stop has been attempted.
    """
    try:
        try:
            print('Trapped Exit cleaning up Spark Context')
            sc.stop()
        except Exception as err:
            # Report instead of silently swallowing, so a leaked context is visible
            print('Could not stop Spark Context:', repr(err))
    finally:
        # Runs even if stopping the context was interrupted, so the job is
        # not left holding cluster resources
        try:
            print('Trapped Exit cleaning up Spark Job')
            sj.stop()
        except Exception as err:
            print('Could not stop Spark Job:', repr(err))

findspark.init()

#Parameters for the Spark cluster
nodes=3
tasks_per_node=8
memory_per_task=1024 #1 gig per process, adjust accordingly
# Please estimate walltime carefully to keep unused Spark clusters from sitting
# idle so that others may use the resources.
walltime="60:00" #60 min
os.environ['SBATCH_PARTITION']='lattice' #Set the appropriate ARC partition

#When this cell is re-run, sc and sj still refer to the context and job created
#by the previous run. Stop them BEFORE the names are rebound below; once sj
#points at the new job object the old one can no longer be reached.
#On a fresh kernel neither name exists yet, which is the NameError case.
try:
    print('Cleaning up Spark Context')
    sc.stop()
except NameError:
    pass #nothing left over from an earlier run
except Exception as err:
    print('Could not stop previous Spark Context:', repr(err))

try:
    print('Cleaning up Spark Job')
    sj.stop()
except NameError:
    pass #nothing left over from an earlier run
except Exception as err:
    print('Could not stop previous Spark Job:', repr(err))

#Request nodes*tasks_per_node cores in total, grouped tasks_per_node per executor
sj = sparkjob.sparkjob(
     ncores=nodes*tasks_per_node,
     cores_per_executor=tasks_per_node,
     memory_per_core=memory_per_task,
     walltime=walltime
    )

sj.wait_to_start()

sc = sj.start_spark()

#Register the exit handler
atexit.register(exitHandler,sj,sc)

#You need this line if you want to use SparkSQL
sqlCtx=SQLContext(sc)

print("Running")

INFO:sparkhpc.sparkjob:Submitted batch job 609508

INFO:sparkhpc.sparkjob:Submitted cluster 1


Cleaning up Spark Job
Cleaning up Spark Context
Running


In [8]:
import json
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.stem import PorterStemmer

#Debugging note: adding the word "the" to filterWords makes rdd1 non-empty, so it can be written out
tokenizer = RegexpTokenizer(r'\w+')
ps = PorterStemmer()

#Languages a tweet may declare and still be considered
langs = {"en", "und"}

#Single words that mark a tweet as related
filterWords = {
    "tesla", "elon", "musk", "elonmusk", "tsla", "roadster", "supercharger",
    "powerwall", "powerpack", "modely", "model3", "modelx", "teslamodely",
    "teslamodels", "teslamodel3", "teslamodelx", "spacex", "teslasuv",
    "teslascience",
}

#Consecutive word pairs that mark a tweet as related
filterBigrams = {
    ("model", "y"), ("model", "s"), ("model", "3"), ("model", "x"),
    ("electric", "vehicle"), ("electric", "car"), ("electric", "suv"),
    ("electric", "supercar"),
}

#Stemmed copies of the two sets above. A tweet is checked against four sets in
#total (raw words, raw bigrams, stemmed words, stemmed bigrams); a hit in any
#one of them is enough for it to pass.
filterWordsStemmed = {ps.stem(term) for term in filterWords}
filterBigramsStemmed = {(ps.stem(left), ps.stem(right)) for left, right in filterBigrams}



In [9]:
def isRelated(jsonLine):
    """Decide whether one line of JSON describes a tweet worth keeping.

    Checks, in order:
      1. The line must parse as a JSON object that has an 'id' key.
      2. Anything posted by the account 'elonmusk' is kept regardless of content.
      3. If a 'lang' is present it must be one of ``langs``.
      4. The lower-cased 'text' is tokenized; fewer than 3 tokens is rejected.
      5. The tweet is kept if any token is in ``filterWords``, any consecutive
         token pair is in ``filterBigrams``, or the same holds for the stemmed
         tokens against ``filterWordsStemmed`` / ``filterBigramsStemmed``.

    Parameters:
        jsonLine: one line of text expected to hold a JSON document.

    Returns:
        True if the tweet passes the filter, False otherwise. Blank lines,
        malformed JSON, non-object JSON and records with a missing or
        non-string 'text' return False instead of raising, so a single bad
        record cannot fail the whole Spark job.
    """
    try:
        tweet = json.loads(jsonLine)
    except (TypeError, ValueError): #JSONDecodeError is a ValueError subclass
        return False
    if not isinstance(tweet, dict) or 'id' not in tweet:
        return False

    #Tweets from @elonmusk pass regardless of content
    user = tweet.get('user')
    if isinstance(user, dict) and user.get('screen_name') == 'elonmusk':
        return True

    #Only "en" and "und" tweets are considered; a tweet without 'lang' is not rejected here
    if 'lang' in tweet and tweet['lang'] not in langs:
        return False

    text = tweet.get('text')
    if not isinstance(text, str):
        return False

    words = tokenizer.tokenize(text.lower()) #splits the tweet into individual words
    if len(words) < 3:
        return False

    #Cheap checks on the raw tokens first; stemming only happens if these miss
    if not filterWords.isdisjoint(words):
        return True
    if any(bigram in filterBigrams for bigram in nltk.bigrams(words)):
        return True

    wordstems = [ps.stem(word) for word in words]
    if not filterWordsStemmed.isdisjoint(wordstems):
        return True
    return any(bigram in filterBigramsStemmed for bigram in nltk.bigrams(wordstems))

In [None]:

from os import listdir
from os.path import isfile, join
dataPath = './data'

#Input files are expected to be named "<number>.<extension>" (e.g. "12.json") and
#are processed in numeric order. Anything else in the directory has no integer
#stem to sort by and would make int() raise, so it is reported and left out.
allFiles = [f for f in listdir(dataPath) if isfile(join(dataPath, f))]
dataFiles = [f for f in allFiles if f.partition(".")[0].isdecimal()]
skippedFiles = [f for f in allFiles if not f.partition(".")[0].isdecimal()]
if skippedFiles:
    print("Skipping files without a numeric name:", skippedFiles)

dataFiles.sort(key=lambda x: int(x.partition(".")[0]))
print(dataFiles)

In [None]:
%%time
#Tally, one input file at a time, how many lines isRelated() keeps
counts = list()
for fileName in dataFiles:
    print("Filtering ", fileName)
    inFile = sc.textFile('./data/' + fileName)
    relatedLines = inFile.filter(isRelated)
    counts += [relatedLines.count()]

In [None]:
#Total number of related lines across all files that were counted
print(sum(n for n in counts))

In [10]:
%%time
#Filter every input file and write the surviving lines to
#./filteredData/<number>f.json, one JSON document per line.
outDir = './filteredData'
os.makedirs(outDir, exist_ok=True) #open() below does not create a missing directory

counts = [] #number of lines written per input file, in dataFiles order
for fileName in dataFiles:
    print("Filtering", fileName)
    inFile = sc.textFile(os.path.join(dataPath, fileName))
    relatedLines = inFile.filter(isRelated)
    outNum = fileName.partition(".")[0]
    outFileName = os.path.join(outDir, outNum + 'f.json')
    written = 0
    #Explicit UTF-8 so non-ASCII tweet text is written the same way whatever the locale is
    with open(outFileName, "w", encoding="utf-8") as f:
        #collect() brings every matching line of this file to the driver at once;
        #NOTE(review): assumes the filtered output of one file fits in driver memory
        for line in relatedLines.collect():
            f.write(line+"\n")
            written += 1
    counts.append(written)
print("Done")

Filtering 1.json
Filtering 2.json
Filtering 3.json
Filtering 4.json
Filtering 5.json
Filtering 6.json
Filtering 7.json
Filtering 8.json
Filtering 9.json
Filtering 10.json
Filtering 11.json
Filtering 12.json
Filtering 13.json
Filtering 15.json
Filtering 16.json
Filtering 17.json
Filtering 18.json
Filtering 19.json
Filtering 20.json
Filtering 21.json
Filtering 23.json
Filtering 25.json
Filtering 28.json
Done
CPU times: user 1.96 s, sys: 884 ms, total: 2.84 s
Wall time: 1h 24min 1s
