In [1]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext
from argparse import ArgumentParser
import os
import json

In [2]:
# Notebook configuration. The command-line interface is disabled while this
# runs as a notebook; the string below keeps the intended argparse options
# for reference and has no effect at runtime.
'''
parser = ArgumentParser(description='PySpark Data Processor')
parser.add_argument('--minInitialCount', type=int, default=2, metavar='M',
                    help='The minimum number of times a token must appear in a document\
                    to be considered for the global calculation')
parser.add_argument('--byteDir', type=str, default='.', metavar='bD',
                    help='The directory from which to pull binaries')
parser.add_argument('--asmDir', type=str, default='.', metavar='aD',
                    help='The directory from which to pull .asm files')
parser.add_argument('--dest', type=str, default='.', metavar='D',
                    help='The directory in which to store any output files')
'''
# Root bucket that every input path below is built from
bucket = 'gs://uga-dsp'

# Minimum per-document token count (mirrors --minInitialCount above)
minInitialCount = 2

# Input locations inside the bucket
bytesDir = bucket + "/project1/data/bytes/"
asmDir = bucket + "/project1/data/asm/"
filesDir = bucket + "/project1/files/"

# Output directory; NOTE(review): this is a relative path (no leading '/'),
# so it resolves against the kernel's working directory -- confirm intended
dest = 'home/zainmeekail/daphne-p1/outputs/'

In [3]:
# Name the application first, then fetch the SparkContext: getOrCreate hands
# back an already-running context if there is one, otherwise it starts a new
# one with this configuration.
conf = SparkConf()
conf.setAppName("FirstNotebook")
sc = SparkContext.getOrCreate(conf=conf)

In [4]:
def datasetParser(file):
    '''
    Defines parsing steps given a .txt file which specifies the hashes
    of binary/asm files to include.

    NOTE: Currently the function will only deal with binaries, reflecting
    the approach decided on by the group. This may later be modified to
    accommodate asm files as well as binaries, or asm processing may occur
    in a separate but similar function

    file -- path of the .txt listing to read
    Returns an RDD holding every whitespace-separated token found in the
    listing (one element per token).
    '''
    # split() with no argument breaks on any whitespace run, so the listing
    # may hold one entry per line or several entries on a line
    out=sc.textFile(file)\
        .flatMap(lambda x: x.split())
    return out

In [5]:
def main(dataset, func, outName, topN=40):
    '''
    Builds a token count across every document in a dataset listing and
    JSON-dumps the highest-count tokens.

    dataset -- name of the .txt listing (appended to filesDir) whose entries
               are the names of the documents to process
    func    -- per-document callable; it is handed the path of one .bytes file
               and must return an RDD of (token, count) pairs
    outName -- name of the JSON file to write (appended to dest)
    topN    -- how many of the highest-count tokens to keep (default 40)

    Returns nothing; the result is written to dest+outName as a JSON object
    mapping token -> total count.
    '''
    # Build one per-document RDD for every entry in the dataset listing
    docs = []
    for filename in datasetParser(filesDir+dataset).collect():
        print('processing: {}'.format(filename))
        docs.append(func(bytesDir+filename+'.bytes'))

    if docs:
        # One n-way union keeps the lineage flat; chaining rdd.union() once
        # per document nests the plan a level deeper for every file.
        print('merging')
        merged = sc.union(docs)

        # Sum the per-document counts to get a 'net count' across docs
        print('reducing')
        totals = merged.reduceByKey(lambda x, y: x + y)
        topTokens = dict(totals.top(topN, key=lambda x: x[1]))
    else:
        # An empty listing has nothing to merge; report it and still write a
        # (empty) result instead of failing on a missing first document.
        print('no documents listed in {}; writing an empty result'.format(dataset))
        topTokens = {}

    # JSON dump the final token counts, which now cover ALL docs
    print('dumping')
    with open(dest+outName, 'w') as f:
        json.dump(topTokens, f)


In [7]:
def wordCount(file):
    '''
    Defines ops for a single document. This function handles all the operations
    which are fully contained within a single document (e.g. word count but not IDF).
    This function specifically generates a word-count for the document.

    file -- path of the document to read
    Returns an RDD of (token, count) pairs for that document.
    '''
    # Parenthesised form prints the same text on Python 2 and is valid on
    # Python 3, matching the print(...) calls used elsewhere in the notebook
    print("Reading file: "+file)
    # Only whitespace-separated tokens of exactly two characters are counted;
    # presumably these are the byte values and longer tokens are line
    # prefixes -- TODO confirm against the .bytes format
    out=sc.textFile(file)\
        .flatMap(lambda x: x.split())\
        .filter(lambda x: len(x)==2)\
        .map(lambda x:(x,1))\
        .reduceByKey(lambda x, y: x + y)
    return out

In [8]:
# Count tokens over the small training listing and write the top tokens out
main(
    dataset="X_small_train.txt",
    func=wordCount,
    outName="X_small_train_wc.json",
)

processing: DvdM5Zpx96qKuN3cAt1y
Reading file: gs://uga-dsp/project1/data/bytes/DvdM5Zpx96qKuN3cAt1y.bytes
processing: 5QpgRV2cqU9wvjBist1a
Reading file: gs://uga-dsp/project1/data/bytes/5QpgRV2cqU9wvjBist1a.bytes
processing: 2F6ZfVCQRi3vrwcj4zxL
Reading file: gs://uga-dsp/project1/data/bytes/2F6ZfVCQRi3vrwcj4zxL.bytes
processing: 1KjZ4An78sOytkzgRL0E
Reading file: gs://uga-dsp/project1/data/bytes/1KjZ4An78sOytkzgRL0E.bytes
processing: JnHGRI2v5NuB9lpsEOCS
Reading file: gs://uga-dsp/project1/data/bytes/JnHGRI2v5NuB9lpsEOCS.bytes
processing: fvTz3rSPVQJGelO49Rn0
Reading file: gs://uga-dsp/project1/data/bytes/fvTz3rSPVQJGelO49Rn0.bytes
processing: Akbqr8c0E31QoOTwKefL
Reading file: gs://uga-dsp/project1/data/bytes/Akbqr8c0E31QoOTwKefL.bytes
processing: 7wFaWUhcDgonLtQdqpy2
Reading file: gs://uga-dsp/project1/data/bytes/7wFaWUhcDgonLtQdqpy2.bytes
processing: C1IklXQF4DJiB8stvR39
Reading file: gs://uga-dsp/project1/data/bytes/C1IklXQF4DJiB8stvR39.bytes
processing: fOoBREwIcNmD8u6Kts75
Read

processing: HmyrPzn4LNuxQetUMlfK
Reading file: gs://uga-dsp/project1/data/bytes/HmyrPzn4LNuxQetUMlfK.bytes
processing: CdBnaqstfPMA9NXroOJh
Reading file: gs://uga-dsp/project1/data/bytes/CdBnaqstfPMA9NXroOJh.bytes
processing: iWsF18Ct4BScbnKHTXEO
Reading file: gs://uga-dsp/project1/data/bytes/iWsF18Ct4BScbnKHTXEO.bytes
processing: e7FhuKwUW8LfMA2bxEDl
Reading file: gs://uga-dsp/project1/data/bytes/e7FhuKwUW8LfMA2bxEDl.bytes
processing: 4unYl0BDo2NvIZGiEJWj
Reading file: gs://uga-dsp/project1/data/bytes/4unYl0BDo2NvIZGiEJWj.bytes
processing: 6mvWVJXheHiotAd9CZrx
Reading file: gs://uga-dsp/project1/data/bytes/6mvWVJXheHiotAd9CZrx.bytes
processing: 4XvpUxtrIGC8Y2LPAmhS
Reading file: gs://uga-dsp/project1/data/bytes/4XvpUxtrIGC8Y2LPAmhS.bytes
processing: csiyQIjSzhrTfGtYKlq5
Reading file: gs://uga-dsp/project1/data/bytes/csiyQIjSzhrTfGtYKlq5.bytes
processing: i5qAV936TsMzYyWbw0HX
Reading file: gs://uga-dsp/project1/data/bytes/i5qAV936TsMzYyWbw0HX.bytes
processing: GyRxVEwf1DFl5jJrsZaM
Read

processing: 8vLGE0QNUBHaK3dt5hyl
Reading file: gs://uga-dsp/project1/data/bytes/8vLGE0QNUBHaK3dt5hyl.bytes
processing: BfZpbY7j0PIw3Orygsvq
Reading file: gs://uga-dsp/project1/data/bytes/BfZpbY7j0PIw3Orygsvq.bytes
processing: 2pdHWrsICS6VPXhfyEwl
Reading file: gs://uga-dsp/project1/data/bytes/2pdHWrsICS6VPXhfyEwl.bytes
processing: arzldLNUhfoE3cjIZ12b
Reading file: gs://uga-dsp/project1/data/bytes/arzldLNUhfoE3cjIZ12b.bytes
processing: AJ3Bb6XsL925OtWelfUK
Reading file: gs://uga-dsp/project1/data/bytes/AJ3Bb6XsL925OtWelfUK.bytes
processing: 9BrzKQYJWfIU7C2whoPT
Reading file: gs://uga-dsp/project1/data/bytes/9BrzKQYJWfIU7C2whoPT.bytes
processing: jhF9uVUQXrSw7bOt3Jkg
Reading file: gs://uga-dsp/project1/data/bytes/jhF9uVUQXrSw7bOt3Jkg.bytes
processing: 5DTUziJFLRQ17phBsOfE
Reading file: gs://uga-dsp/project1/data/bytes/5DTUziJFLRQ17phBsOfE.bytes
processing: 2zN5EbfhwyZKgGodjiQu
Reading file: gs://uga-dsp/project1/data/bytes/2zN5EbfhwyZKgGodjiQu.bytes
processing: EARfujkVMXW1wQO9gcGq
Read

processing: IrD2NqBzMT4euJQbXyv0
Reading file: gs://uga-dsp/project1/data/bytes/IrD2NqBzMT4euJQbXyv0.bytes
processing: cuZrzSw2lXa4fq5FgmOj
Reading file: gs://uga-dsp/project1/data/bytes/cuZrzSw2lXa4fq5FgmOj.bytes
processing: iEVTh37Mo4NWlAaqCFkU
Reading file: gs://uga-dsp/project1/data/bytes/iEVTh37Mo4NWlAaqCFkU.bytes
processing: FDHjobQnCUSs6atXldJL
Reading file: gs://uga-dsp/project1/data/bytes/FDHjobQnCUSs6atXldJL.bytes
processing: iBwVcOQ6hltTIaZp1SFk
Reading file: gs://uga-dsp/project1/data/bytes/iBwVcOQ6hltTIaZp1SFk.bytes
processing: 4KZRhfsL2Fx6SXrzQDAt
Reading file: gs://uga-dsp/project1/data/bytes/4KZRhfsL2Fx6SXrzQDAt.bytes
processing: 8QzroqkiRDNMZIF69E7v
Reading file: gs://uga-dsp/project1/data/bytes/8QzroqkiRDNMZIF69E7v.bytes
processing: HWK4xCdIALnTeX5N2QVs
Reading file: gs://uga-dsp/project1/data/bytes/HWK4xCdIALnTeX5N2QVs.bytes
processing: EiD3lRHWhCw4zY9SjXVI
Reading file: gs://uga-dsp/project1/data/bytes/EiD3lRHWhCw4zY9SjXVI.bytes
processing: etiXx5SkyzfIu0aqWJ7g
Read

processing: a9qc1ZCzupOiSsjHdx8e
Reading file: gs://uga-dsp/project1/data/bytes/a9qc1ZCzupOiSsjHdx8e.bytes
processing: cgOFjPAr76QExV2piI1H
Reading file: gs://uga-dsp/project1/data/bytes/cgOFjPAr76QExV2piI1H.bytes
processing: Ax8CPSOsnjuY1cDyNbqH
Reading file: gs://uga-dsp/project1/data/bytes/Ax8CPSOsnjuY1cDyNbqH.bytes
processing: f4BqH0J1xLkYKI3TEQWm
Reading file: gs://uga-dsp/project1/data/bytes/f4BqH0J1xLkYKI3TEQWm.bytes
processing: 8LiHPj7EOaFpnN0uTAW9
Reading file: gs://uga-dsp/project1/data/bytes/8LiHPj7EOaFpnN0uTAW9.bytes
processing: 4lFcSekUOy0MsrNxt6wH
Reading file: gs://uga-dsp/project1/data/bytes/4lFcSekUOy0MsrNxt6wH.bytes
processing: fnuUPBWtXN7bSsMvRDem
Reading file: gs://uga-dsp/project1/data/bytes/fnuUPBWtXN7bSsMvRDem.bytes
processing: 0i4ENysvVrgFnbaHUuJK
Reading file: gs://uga-dsp/project1/data/bytes/0i4ENysvVrgFnbaHUuJK.bytes
processing: H0ifOVETDIjMrgGWSa4k
Reading file: gs://uga-dsp/project1/data/bytes/H0ifOVETDIjMrgGWSa4k.bytes
processing: IHyNhQvYnwlsr2fpOMXL
Read