In [2]:
# Import packages
from pyspark import SparkContext, SparkConf

In [3]:
# Configure and create the Spark Context object 

conf = SparkConf().setAppName("Project1").setMaster("local[*]")
sc = SparkContext(conf=conf)

In [13]:

# Read the documents locally, one record correspond to one doc
XTrainOri = sc.textFile("data_for_initial_local_training/X_train_vsmall.txt").cache()
YTrainOri = sc.textFile("data_for_initial_local_training/Y_train_vsmall.txt").cache()


# To read the documents on GCP
#XTrainOri = sc.textFile("gs://uga-dsp/project1/train/X_train_vsmall.txt").cache()
#YTrainOri = sc.textFile("gs://uga-dsp/project1/train//Y_train_vsmall.txt").cache()


In [78]:
# split documents into words, remove punctuation, trainsform words to lower case

# ? still need to remove quot from words, and punctuation in the middle of a word

punctuation = sc.broadcast(".,:;'!?&_-")
XTrainCP = XTrainOri.zipWithIndex().flatMap(lambda x: map(lambda e: (e, x[1]), x[0].split()))\
                    .map(lambda s: (s[0].strip(punctuation.value).lower(),s[1])) 
                    
print(XTrainCP.take(10))

[('a', 0), ('dedicated', 0), ('quot;snow', 0), ('desk&quot', 0), ('has', 0), ('been', 0), ('set', 0), ('up', 0), ('by', 0), ('the', 0)]


In [102]:
# Remove Stopwords using the file stopwords.txt provided in project0

# ? Change source of stopwords
stopWordsFile = sc.textFile("stopwords.txt")
stopWords = sc.broadcast(stopWordsFile.flatMap(lambda s: s.split()).collect())

XTrainCPS = XTrainCP.filter(lambda s: s if s[0] not in stopWords.value and s[0] else None) \
                    
print(XTrainCPS.take(10))

[('dedicated', 0), ('quot;snow', 0), ('desk&quot', 0), ('set', 0), ('up', 0), ('new', 0), ('york', 0), ('new', 0), ('jersey', 0), ('port', 0)]


In [103]:
# count the # of (word,index of document) pairs 

XTrainCPSC = XTrainCPS.map(lambda s: (s,1))\
                        .reduceByKey( lambda a,b: a + b)
print(XTrainCPSC.take(100))    

[(('rebounding', 46), 1), (('now', 18), 1), (('quot;foreign', 1), 1), (('generally', 51), 1), (('intentions', 24), 1), (("thursday's", 35), 1), (('bounce', 8), 1), (('noting', 3), 1), (('annual', 67), 1), (("don't", 3), 1), (('snow-melters', 0), 1), (('control', 47), 1), (('improvement', 4), 1), (("quot;there's", 61), 1), (('passengers,&quot', 50), 1), (('+0.4', 19), 1), (('network', 58), 1), (('audit', 29), 2), (('49', 17), 2), (('monday', 36), 1), (('several', 12), 2), (('negotiating,&quot', 9), 1), (('term', 34), 1), (('49,744', 57), 1), (('643.75', 14), 1), (('access', 25), 1), (('paid', 50), 1), (('sickened', 12), 1), (('television', 34), 1), (('life', 51), 1), (('traditionally', 47), 1), (('buzz', 12), 1), (('declined', 38), 1), (('obstruction', 55), 1), (('wednesday', 67), 1), (('penalty', 36), 1), (('calls', 24), 1), (('prepare', 22), 1), (('two', 56), 1), (('garage', 5), 2), (('island', 61), 3), (('quot;i', 12), 1), (("i'm", 47), 1), (('7', 61), 1), (('before', 34), 2), (('gro

In [104]:
# the structure of output: (index of document, dictionary of {word: count of word})

XTrainDict = XTrainCPSC.map(lambda x: (x[0][1],(x[0][0],x[1])))\
                            .groupByKey()\
                            .map(lambda x: (x[0],dict(x[1])))


print(XTrainDict.take(2))

[(0, {'7706': 1, '250': 1, 'conditions': 1, 'quot;we': 2, 'jfk': 1, 'kennedy': 1, 'feeley': 1, 'round': 1, 'new': 3, 'almost': 1, 'f': 1, 'blowers': 1, 'including': 1, 'ground': 1, 'blast': 1, 'snow': 1, 'massive': 1, 'officer': 1, 'few': 1, 'employees': 1, "don't": 1, '171': 2, 'standby': 1, 'help': 1, 'latest': 1, 'carefully': 1, 'national': 1, 'laguadria': 1, 'operations': 2, 'temperatures': 1, 'airport': 2, 'accordingly': 1, 'prevent': 1, 'quot;snow': 2, 'key': 2, 'private': 1, 'anticipate': 1, 'air': 2, 'facility-specific': 1, 'airports': 3, 'more': 1, 'inground': 1, 'humidity': 1, 'pieces': 1, 'day': 1, 'plan': 1, 'staff': 1, 'chief': 1, 'tracks': 1, 'react': 1, '5,100': 1, 'snow-melters': 1, 'technology': 1, "york's": 1, 'tons': 1, 'year': 2, 'moving': 1, 'newsroom': 1, 'de-icing': 1, 'david': 1, 'transmitting': 1, '542': 2, 'special': 1, 'york': 1, 'updated': 1, 'data': 1, 'companies': 1, "what's": 1, 'desk&quot': 2, 'use': 1, 'sand': 1, 'reports': 2, 'set': 1, 'deploy': 1, "da