In [None]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext
from pyspark.sql.types import StructType,StructField, StringType, IntegerType, LongType, FloatType
from argparse import ArgumentParser
import os
import json
import pandas as pd

In [None]:
# Command-line parsing is disabled while this code runs as a notebook. The
# intended options are kept in the string below for reference; the constants
# that follow stand in for them.
'''
parser = ArgumentParser(description='PySpark Data Processor')
parser.add_argument('--minInitialCount', type=int, default=2, metavar='M',
                    help='The minimum number of times a token must appear in a document\
                    to be considered for the global calculation')
parser.add_argument('--byteDir', type=str, default='.', metavar='bD',
                    help='The directory from which to pull binaries')
parser.add_argument('--asmDir', type=str, default='.', metavar='aD',
                    help='The directory from which to pull .asm files')
parser.add_argument('--dest', type=str, default='.', metavar='D',
                    help='The directory in which to store any output files')
'''

# Storage bucket that every input path below is rooted at.
bucket = 'gs://uga-dsp'

# Same value as the default of the disabled --minInitialCount option.
minInitialCount = 2

# Input locations: `.bytes` files, `.asm` files, and the hash-list files.
bytesDir = bucket + "/project1/data/bytes/"
asmDir = bucket + "/project1/data/asm/"
filesDir = bucket + "/project1/files/"

# Directory that output files are written to.
dest = '/home/zainmeekail/daphne-p1/features/'

In [None]:
# SparkSession is not among the names imported at the top of the file, so it
# is imported here; without it the builder call below raises NameError.
from pyspark.sql import SparkSession

# RDD entry point; getOrCreate returns the running context if there is one.
conf = SparkConf().setAppName("FirstNotebook")
sc = SparkContext.getOrCreate(conf=conf)

# DataFrame entry point.
# NOTE(review): `sc` already exists when this runs, so the master/appName set
# on this builder are presumably not applied to the underlying context — confirm.
spark = SparkSession.builder \
    .master("local") \
    .appName("Word Count") \
    .getOrCreate()

In [None]:
def datasetParser(file):
    '''
    Reads a .txt indicator file that specifies the hashes of the binary/asm
    files to include, and splits its content on whitespace.

    NOTE: Only binaries are dealt with for now, reflecting the approach
    decided on by the group. This may later be extended to asm files, or asm
    processing may live in a separate but similar function.

    @param file: path of the indicator file, handed to `sc.textFile`
    @return: an RDD holding one element per whitespace-separated token
    '''
    lines = sc.textFile(file)
    hashes = lines.flatMap(lambda line: line.split())
    return hashes

In [None]:
def main(dataset,func, outName=None):
    '''
    Defines the main loop for computing features over a dataset.

    Every hash listed in the indicator file is passed to `func`; the returned
    rows are assembled into one DataFrame using the schema from `buildSchema`.

    @param dataset: name of the dataset indicator file; it is appended to
                    `filesDir`, so it must be given relative to that directory
    @param func: the processing function, called once per hash; it must return
                 a sequence of values matching the schema of `buildSchema`
    @param outName: the path of the output csv file (absolute path preferred)
                    -- if outName is not defined, then the output will not be
                    saved, only returned
    @return: the DataFrame of feature rows (empty when the indicator file
             lists no hashes)
    '''
    # Construct schema
    schema = buildSchema()

    # Collects the hash-list provided by the indicator file
    dataList = datasetParser(filesDir + dataset).collect()
    total = len(dataList)

    # Process every listed file into a row entry using `func`
    rows = []
    for count, filename in enumerate(dataList, start=1):
        print(f'Processing hash: {filename}\nProgress: {count}/{total} | {count/total*100:.2f}%')
        rows.append(tuple(func(filename)))

    # Build the DataFrame once from all rows. Creating a one-row DataFrame per
    # file and chaining `union` calls makes the plan grow with every file, and
    # leaves nothing to save or return when the hash list is empty.
    df = spark.createDataFrame(rows, schema=schema)

    # Save dataframe as a csv if outName is defined. index=False keeps pandas
    # from prepending its row-index column, which is not part of the schema.
    if outName is not None:
        df.toPandas().to_csv(outName, index=False)

    return df

In [None]:
def buildRow(file_hash):
    '''
    Builds the feature row for the `.bytes` file identified by `file_hash`.

    Row layout:
        [file_hash,
         count of each token 00..FF, count of '??',
         relative frequency of each token 00..FF, relative frequency of '??',
         total token count]

    @param file_hash: the hash naming the file `bytesDir + file_hash + '.bytes'`
    @return: the row as a list (516 entries)
    '''
    file = bytesDir + file_hash + '.bytes'

    # Collect the token counts once and derive everything else on the driver.
    # Running several actions on the same RDD would re-read the file each time.
    counts = wordCount(file).collectAsMap()

    # Total over every token returned by wordCount. A local sum also copes with
    # a file that yields no tokens, where RDD.reduce would raise.
    net = sum(counts.values())

    keys = [hexGen(i) for i in range(256)] + ['??']

    row = [file_hash]
    row += [safeCheck(counts, key) for key in keys]
    # Relative frequencies; all zero when the file has no tokens.
    row += [safeCheck(counts, key) / net if net else .0 for key in keys]
    row += [net]
    return row

In [None]:
def wordCount(file):
    '''
    Defines ops for a single document, i.e. operations that are fully
    contained within one document (e.g. word count but not IDF).
    Specifically, this generates the word-count for the document.

    @param file: path of the document, handed to `sc.textFile`
    @return: an RDD of (token, count) pairs, restricted to tokens that are
             exactly two characters long
    '''
    tokens = sc.textFile(file).flatMap(lambda line: line.split())
    # Only two-character tokens are counted; everything else is dropped.
    ones = tokens.filter(lambda tok: len(tok) == 2).map(lambda tok: (tok, 1))
    return ones.reduceByKey(lambda a, b: a + b)

In [None]:
def safeCheck(X, key, val=0):
    '''
    Looks up `key` in the mapping `X`.

    @param X: the mapping to read from
    @param key: the key to look up
    @param val: the value returned when `key` is absent (defaults to 0)
    @return: `X[key]` when present, otherwise `val`
    '''
    return X.get(key, val)

In [None]:
def hexGen(i):
    '''
    Formats `i` as upper-case hexadecimal, left-padded with '0', and keeps
    the last two characters (e.g. 10 -> '0A', 255 -> 'FF').

    @param i: the integer to format
    @return: a two-character string
    '''
    digits = hex(i).upper()[2:]   # drop the leading '0X'
    padded = '0' + digits
    return padded[-2:]

In [None]:
def _assembleSchema():
    '''
    Assembles the ordered list of feature column names: the 256 hex tokens
    and '??', followed by the same names prefixed with 'rel_'.

    @return: a list of 514 column names
    '''
    raw = [hexGen(i) for i in range(256)]
    raw.append('??')
    return raw + ['rel_' + name for name in raw]

In [None]:
# Broadcast the ordered column-name list; `buildSchema` reads it back through `_schema.value`.
_schema=sc.broadcast(_assembleSchema())

In [None]:
def buildSchema():
    '''
    Builds the DataFrame schema for the feature rows: a string 'hash' column,
    the first 257 names of `_schema` as LongType, the next 257 as FloatType,
    and a final LongType 'total_count' column.

    @return: the StructType describing one feature row
    '''
    names = _schema.value

    fields = [StructField('hash', StringType())]
    for idx in range(257):
        fields.append(StructField(names[idx], LongType()))
    for idx in range(257, 514):
        fields.append(StructField(names[idx], FloatType()))
    fields.append(StructField('total_count', LongType()))

    return StructType(fields)

In [None]:
# Compute the feature rows for the hashes listed in X_small_train.txt and save them as a csv under `dest`.
df=main("X_small_train.txt",buildRow,outName=dest+'X_small_train.csv')

In [None]:
# Manual re-save of the feature table to the same path. index=False keeps
# pandas from prepending its row-index column, which the read-back schema
# has no field for.
df.toPandas().to_csv(dest+'X_small_train.csv', index=False)

In [None]:
# Unbind `df`; the name is bound again further down when the csv is read back.
del df

In [None]:
# Scratch check: build the feature row for a single hash.
row=buildRow('DvdM5Zpx96qKuN3cAt1y')

In [None]:
# Schema passed to the csv readers below.
schema=buildSchema()

# Note To Self:

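Must account for the additional index column that is added during csv writing. The file is written through pandas (`df.toPandas().to_csv(...)`), so passing `index=False` to `to_csv` when writing should suppress it; note that `index` is a pandas option and is not accepted by pyspark's csv reader.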

In [None]:
# Read the saved feature table back with the explicit schema. A csv written
# by pandas `to_csv` starts with a header line; header=True keeps that line
# from being parsed as a data row.
# NOTE(review): if the file was written with the pandas index column, it has
# one more leading column than `schema` — confirm before relying on the values.
df = spark.read.format("csv").load('gs://micky-practicum/'+'X_small_train.csv',schema=schema,header=True)

In [None]:
# Same read through the `csv` shortcut. `index` is a pandas `to_csv` option
# and not a parameter of the Spark csv reader (passing it raises TypeError),
# so it is dropped. header=True skips the header line that pandas writes.
# NOTE(review): if the file was written with the pandas index column, it has
# one more leading column than `schema` — confirm before relying on the values.
DF=spark.read.csv('gs://micky-practicum/'+'X_small_train.csv',schema=schema,header=True)

In [None]:
# Show the 'total_count' column of the first five rows.
df.select('total_count').head(n=5)

In [None]:
# Show the first two rows.
DF.head(n=2)