In [1]:
import pyspark as ps
from pyspark import SparkContext
from pyspark import SQLContext
from pyspark import SparkConf
from pyspark.ml.feature import *
from pyspark.ml import Pipeline
from pyspark.sql.session import SparkSession
from pyspark.ml.classification import NaiveBayes
import sys
import requests
import re
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql.types import *
from pyspark.sql import functions




In [None]:
def spark_session_setup():
    """
    Create the notebook's SparkContext, or reuse the one that is already running.

    The context is named 'word_count', logs its effective configuration at
    start-up (spark.logConf) and requests 12G of memory for both the driver
    and the executors. Its log level is set to ERROR.

    SparkContext.getOrCreate is used instead of the SparkContext constructor so
    that re-running this cell in a live kernel returns the existing context
    rather than raising because a context already exists. Note that when a
    context already exists, the configuration built here is not applied to it.

    Returns:
        The active pyspark SparkContext.

    >>> sc = spark_session_setup()
    """
    conf = ps.SparkConf()
    conf.setAppName('word_count')
    conf.set('spark.logConf', 'true')
    conf.set('spark.executor.memory', '12G')
    conf.set('spark.driver.memory', '12G')
    # Disabled experiment:
    # conf.set('spark.driver.maxResultSize', '10G')

    # Create the context (or pick up the running one).
    sc = ps.SparkContext.getOrCreate(conf=conf)
    # Only show errors in the driver output.
    sc.setLogLevel("ERROR")
    return sc
# Start the SparkContext that the cells below rely on.
sc = spark_session_setup()

# Called without a configuration, so this hands back the active context.
sc = SparkContext.getOrCreate()  # SparkConf().setMaster("local[*]"))
# SQL entry point; used further down to turn RDDs into DataFrames.
sql_context = SQLContext(sc)


In [None]:
# Base URL to which '<document name>.asm' is appended when a document is downloaded.
asm_data_path = 'https://storage.googleapis.com/uga-dsp/project1/data/asm/'
# Base URL of the matching bytes files (not referenced by the code visible in this notebook).
byte_data_path = 'https://storage.googleapis.com/uga-dsp/project1/data/bytes/'
# Index files: X_* are read as one document name per line, y_* as one label per line.
x_small_train_path ='https://storage.googleapis.com/uga-dsp/project1/files/X_small_train.txt'
y_small_train_path ='https://storage.googleapis.com/uga-dsp/project1/files/y_small_train.txt'
x_small_test_path ='https://storage.googleapis.com/uga-dsp/project1/files/X_small_test.txt'
y_small_test_path ='https://storage.googleapis.com/uga-dsp/project1/files/y_small_test.txt'

# Download the training index. raise_for_status() aborts here on an HTTP error
# instead of letting the error page be parallelized as if it were document names.
train_index_response = requests.get(x_small_train_path)
train_index_response.raise_for_status()
text = train_index_response.text
# One RDD element per line of the index, spread over 80 partitions.
data = sc.parallelize(text.splitlines(),numSlices=80)

In [None]:
# Build the {document name: label} lookup for the training set.
# splitlines() is used (rather than split('\n')) so the keys are produced the same
# way as the elements of the `data` RDD: no trailing empty entry, and no stray '\r'
# on CRLF input that would make the mapper's dictionary lookup fail.
filenames_response = requests.get(x_small_train_path)
filenames_response.raise_for_status()  # do not treat an HTTP error page as names
filenames = filenames_response.text.splitlines()

labels_response = requests.get(y_small_train_path)
labels_response.raise_for_status()  # do not treat an HTTP error page as labels
labels = labels_response.text.splitlines()

# Names and labels are paired by line position.
filename_label_dict = dict(zip(filenames, labels))

# Broadcast the lookup so the mapper can read it through `.value` inside Spark tasks.
broadcast_filename_label_dict = sc.broadcast(filename_label_dict)

def add_asm_texts_to_features(x):
    """
    Download one training document's .asm file and turn it into a feature row.

    Args:
        x: document name. '<x>.asm' is fetched from asm_data_path, and x is the
           key looked up in broadcast_filename_label_dict.

    Returns:
        A tuple (x, label, prefixes) where label is the document's label as an
        int and prefixes is a list holding, for every line of the file, the text
        before the first ':' (the whole line when it contains no ':').

    Raises:
        requests.HTTPError: the download returned an HTTP error status.
        KeyError: x has no entry in the broadcast label dictionary.
        ValueError: the stored label is not an integer string.
    """
    response = requests.get(asm_data_path + x + '.asm')
    # Fail fast: without this check an error body (e.g. a 404 page) would be
    # split into lines and silently used as this document's features.
    response.raise_for_status()
    line_prefixes = [line.partition(':')[0] for line in response.text.splitlines()]
    label = int(broadcast_filename_label_dict.value[x])
    return (x, label, line_prefixes)


# Map every training document name to its (name, label, line prefixes) tuple.
train_data_with_asm = data.map(add_asm_texts_to_features)


In [None]:
# Download the test index once and reuse it for both the RDD and the label lookup
# (it was previously fetched twice). raise_for_status() aborts on an HTTP error
# instead of letting the error page be treated as document names.
test_index_response = requests.get(x_small_test_path)
test_index_response.raise_for_status()
text_test = test_index_response.text
# splitlines() everywhere, so the dictionary keys are exactly the RDD elements:
# no trailing empty entry and no stray '\r' on CRLF input.
filenames_test = text_test.splitlines()
# One RDD element per line of the index, spread over 80 partitions.
test_data = sc.parallelize(filenames_test,numSlices=80)

test_labels_response = requests.get(y_small_test_path)
test_labels_response.raise_for_status()  # do not treat an HTTP error page as labels
labels_test = test_labels_response.text.splitlines()

# Names and labels are paired by line position.
filename_label_dict_test = dict(zip(filenames_test, labels_test))

# Broadcast the lookup so the mapper can read it through `.value` inside Spark tasks.
broadcast_filename_label_dict_test = sc.broadcast(filename_label_dict_test)

def add_asm_texts_to_features_test(x):
    """
    Download one test document's .asm file and turn it into a feature row.

    Args:
        x: document name. '<x>.asm' is fetched from asm_data_path, and x is the
           key looked up in broadcast_filename_label_dict_test.

    Returns:
        A tuple (x, label, prefixes) where label is the document's label as an
        int and prefixes is a list holding, for every line of the file, the text
        before the first ':' (the whole line when it contains no ':').

    Raises:
        requests.HTTPError: the download returned an HTTP error status.
        KeyError: x has no entry in the broadcast test label dictionary.
        ValueError: the stored label is not an integer string.
    """
    response = requests.get(asm_data_path + x + '.asm')
    # Fail fast: without this check an error body (e.g. a 404 page) would be
    # split into lines and silently used as this document's features.
    response.raise_for_status()
    line_prefixes = [line.partition(':')[0] for line in response.text.splitlines()]
    label = int(broadcast_filename_label_dict_test.value[x])
    return (x, label, line_prefixes)


# Map every test document name to its (name, label, line prefixes) tuple.
test_data_with_asm = test_data.map(add_asm_texts_to_features_test)

In [None]:
# Test DataFrame: the tuple fields become the columns doc, label and text.
test_data_df = sql_context.createDataFrame(
    test_data_with_asm,
    schema=['doc', 'label', 'text'],
)

In [None]:
# Training DataFrame: the tuple fields become the columns doc, label and text.
train_data_df = sql_context.createDataFrame(
    train_data_with_asm,
    schema=['doc', 'label', 'text'],
)

In [None]:
# Training pipeline: 3-grams of the 'text' column -> hashed term frequencies -> Naive Bayes.
# Disabled experiments, kept for reference:
# remover = StopWordsRemover(inputCol="text", outputCol='filtered', stopWords=['??'])#, '00'])
# idf = IDF(inputCol='freqs', outputCol='features')
ngram = NGram(n=3, inputCol='text', outputCol='ngrams')
hashingTF = HashingTF(inputCol="ngrams", outputCol="features") #, numFeatures=256)
nb = NaiveBayes(smoothing=1)

# ML Pipeline Model: fit on the training frame, then score the test frame.
pipeline = Pipeline(stages=[ngram, hashingTF, nb])
model = pipeline.fit(train_data_df)
#model.save('NB_Best_Model')
predictions = model.transform(test_data_df)

# Evaluate Model Accuracy

# The evaluator is given double-typed label and prediction columns.
predictions = predictions.withColumn('label', predictions['label'].cast(DoubleType()))
# Shift every prediction up by one before comparing it with the label.
# NOTE(review): presumably the labels start at 1 while the model emits 0-based
# class indices -- confirm against the label file and the Spark version in use.
# Native column arithmetic replaces the former Python UDF, which returned strings
# that then had to be cast back to double; the resulting values are the same.
predictions = predictions.withColumn(
    'addedprediction',
    (predictions['prediction'] + 1).cast(DoubleType()),
)
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="addedprediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test set accuracy = " + str(accuracy))

In [None]:
# This code gives 33.13% accuracy, which is pretty bad. Let's tweak a few things and see what happens. (Ran on GCP.)