In [12]:
import pyspark as ps
from pyspark import SparkContext
from pyspark import SQLContext
from pyspark import SparkConf
from pyspark.ml.feature import *
from pyspark.ml import Pipeline
from pyspark.sql.session import SparkSession
from pyspark.ml.classification import NaiveBayes
import sys
import requests
import re
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql.types import *

In [2]:
def spark_session_setup():
    """
    Return a configured SparkContext, creating it only if none is active.

    The context is obtained with ``SparkContext.getOrCreate`` so that
    re-running this cell does not fail with "Cannot run multiple
    SparkContexts at once".  When a context already exists it is returned
    as-is and the configuration built here is not applied to it.

    Returns:
        The active SparkContext, with its log level set to ERROR.

    >>> sc = spark_session_setup()
    """
    conf = ps.SparkConf()
    conf.setAppName('word_count')
    # logConf must be enabled in order to be able to change the log level
    conf.set('spark.logConf', 'true')
    conf.set('spark.executor.memory', '12G')
    conf.set('spark.driver.memory', '12G')
#     conf.set('spark.driver.maxResultSize', '10G')
    # reuse the running context if there is one, otherwise start a new one
    sc = ps.SparkContext.getOrCreate(conf=conf)
    # change log level to ERROR
    sc.setLogLevel("ERROR")
    return sc
# Start the Spark context through the helper defined above.
sc = spark_session_setup()

# Fetch the active context again (pass SparkConf().setMaster("local[*]")
# here to request local mode), then wrap it so DataFrames can be built from RDDs.
sc = SparkContext.getOrCreate()
sql_context = SQLContext(sc)

In [3]:
# Base URLs under which each sample's .asm and .bytes file is stored.
asm_data_path = 'https://storage.googleapis.com/uga-dsp/project1/data/asm/'
byte_data_path = 'https://storage.googleapis.com/uga-dsp/project1/data/bytes/'
# Text files listing the sample names (X_*) and their labels (y_*), one per line.
x_small_train_path = 'https://storage.googleapis.com/uga-dsp/project1/files/X_small_train.txt'
y_small_train_path = 'https://storage.googleapis.com/uga-dsp/project1/files/y_small_train.txt'
x_small_test_path = 'https://storage.googleapis.com/uga-dsp/project1/files/X_small_test.txt'
y_small_test_path = 'https://storage.googleapis.com/uga-dsp/project1/files/y_small_test.txt'

# Download the list of training sample names.  Fail fast on an HTTP error so
# that an error page is never parsed as a list of file names, and use a
# timeout so a stalled connection cannot hang the notebook.
train_list_response = requests.get(x_small_train_path, timeout=60)
train_list_response.raise_for_status()
text = train_list_response.text
data = sc.parallelize(text.splitlines(), numSlices=40)
data.take(1)

['DvdM5Zpx96qKuN3cAt1y']

In [4]:
# Build the training {file name: label} lookup.
# splitlines() (instead of split('\n')) keeps the keys identical to the names
# held in `data`, which is also produced with splitlines(), and avoids any
# spurious '' entry caused by a trailing newline.
filenames_response = requests.get(x_small_train_path, timeout=60)
filenames_response.raise_for_status()
labels_response = requests.get(y_small_train_path, timeout=60)
labels_response.raise_for_status()

filenames = filenames_response.text.splitlines()
labels = labels_response.text.splitlines()
filename_label_dict = dict(zip(filenames, labels))

# Broadcast the lookup so the mapper functions can read it on the executors.
broadcast_filename_label_dict = sc.broadcast(filename_label_dict)

def find_file(x, timeout=60):
    """
    Download the .bytes file of one sample.

    Args:
        x: Sample name; the file is fetched from ``byte_data_path + x + '.bytes'``.
        timeout: Seconds passed to ``requests.get`` as its timeout, so that a
            stalled connection cannot block the task forever.

    Returns:
        Tuple ``(x, text)`` where ``text`` is the decoded response body.

    Raises:
        requests.HTTPError: If the server answers with an error status.  The
            original code returned the error page as if it were file content,
            which would silently end up in the features.
    """
    path = byte_data_path + x + '.bytes'
    response = requests.get(path, timeout=timeout)
    response.raise_for_status()
    return (x, response.text)


# Pair every training sample name with the text of its .bytes file, then
# pull a single record to check the result.
train_data = data.map(find_file)
train_data.take(1)

[('DvdM5Zpx96qKuN3cAt1y',
  '00401000 2D BA 2C 37 D0 D3 5E D4 FA FF 4A 9D CD 73 6E C5\r\n00401010 D6 B6 09 AC 76 22 29 F4 43 0D 06 CB CE 69 25 DF\r\n00401020 31 5A 40 81 00 1B D9 FE C6 F9 7C 78 57 51 55 9F\r\n00401030 05 EB C3 E6 28 51 B0 57 E3 4F 0F 51 16 7B C4 0A\r\n00401040 56 55 41 64 3F 27 82 C3 E4 A1 D6 59 24 61 8F 27\r\n00401050 42 A0 0F 09 1B 19 44 A0 52 1F AF 91 16 8B 1B 02\r\n00401060 2E FF AC 43 12 F8 D7 16 14 B1 ED 56 34 4F B3 95\r\n00401070 BF 7F F6 5E 70 76 0C 15 7F 95 D7 92 DC 57 F8 14\r\n00401080 60 55 97 13 B5 A1 E6 CE C2 4A 34 AA F3 89 78 B8\r\n00401090 77 9E 73 2D 02 C0 06 AF 0F 19 F4 C4 CD 77 BA FF\r\n004010A0 39 84 4B 86 C7 67 46 FA EB 1F 9E 44 17 9E 32 A8\r\n004010B0 08 C4 07 72 85 47 2F 5D 2B 84 0C 91 D0 E0 1F 62\r\n004010C0 14 26 10 A2 13 25 95 4F FA 9D 32 AA A9 EE C9 47\r\n004010D0 A4 3D E9 18 84 66 55 9C D8 27 98 C1 7C FA 2B A8\r\n004010E0 7B 2F 0C 68 EC D7 7C 84 EE B4 5C 16 02 FD 94 1F\r\n004010F0 BC 94 DC 6D 37 45 09 84 86 D4 17 39 D4 F7 40 9D\r\n00401100 63

In [5]:
def pre_process(x):
    """
    Turn one downloaded training .bytes file into a labelled token list.

    Args:
        x: Tuple ``(fname, text)`` as produced by ``find_file``.

    Returns:
        Tuple ``(fname, label, word_list)`` where ``label`` is the integer
        label looked up in ``broadcast_filename_label_dict`` and ``word_list``
        holds every two-character token of ``text`` except ``'??'``.

    Raises:
        KeyError: If ``fname`` has no entry in the broadcast label lookup.
    """
    fname, raw_text = x[0], x[1]
    label = int(broadcast_filename_label_dict.value[fname])
    # Split on any whitespace rather than only '\r\n' or ' ': with bare '\n'
    # line endings the old pattern glued the last byte of each line to the
    # next line's address and the length filter then dropped it.
    word_list = [
        token for token in raw_text.split()
        if len(token) == 2 and token != '??'
    ]
    return (fname, label, word_list)


# Attach the label and the byte tokens to every training record.
train_data_with_labels = train_data.map(pre_process)

In [6]:
# Pull one record to check the (fname, label, word_list) tuples.
train_data_with_labels.take(1)

[('DvdM5Zpx96qKuN3cAt1y',
  6,
  ['2D',
   'BA',
   '2C',
   '37',
   'D0',
   'D3',
   '5E',
   'D4',
   'FA',
   'FF',
   '4A',
   '9D',
   'CD',
   '73',
   '6E',
   'C5',
   'D6',
   'B6',
   '09',
   'AC',
   '76',
   '22',
   '29',
   'F4',
   '43',
   '0D',
   '06',
   'CB',
   'CE',
   '69',
   '25',
   'DF',
   '31',
   '5A',
   '40',
   '81',
   '00',
   '1B',
   'D9',
   'FE',
   'C6',
   'F9',
   '7C',
   '78',
   '57',
   '51',
   '55',
   '9F',
   '05',
   'EB',
   'C3',
   'E6',
   '28',
   '51',
   'B0',
   '57',
   'E3',
   '4F',
   '0F',
   '51',
   '16',
   '7B',
   'C4',
   '0A',
   '56',
   '55',
   '41',
   '64',
   '3F',
   '27',
   '82',
   'C3',
   'E4',
   'A1',
   'D6',
   '59',
   '24',
   '61',
   '8F',
   '27',
   '42',
   'A0',
   '0F',
   '09',
   '1B',
   '19',
   '44',
   'A0',
   '52',
   '1F',
   'AF',
   '91',
   '16',
   '8B',
   '1B',
   '02',
   '2E',
   'FF',
   'AC',
   '43',
   '12',
   'F8',
   'D7',
   '16',
   '14',
   'B1',
   'ED',
   '56

In [7]:
# Download the list of test sample names once; fail fast on an HTTP error.
test_list_response = requests.get(x_small_test_path, timeout=60)
test_list_response.raise_for_status()
text_test = test_list_response.text

# The same splitlines() result feeds both the RDD and the label lookup, so
# the lookup keys always match the RDD elements (split('\n') could differ and
# could add a spurious '' entry from a trailing newline).
filenames_test = text_test.splitlines()
test_data = sc.parallelize(filenames_test, numSlices=40)

test_labels_response = requests.get(y_small_test_path, timeout=60)
test_labels_response.raise_for_status()
labels_test = test_labels_response.text.splitlines()
filename_label_dict_test = dict(zip(filenames_test, labels_test))

# Broadcast the lookup so pre_process_test can read it on the executors.
broadcast_filename_label_dict_test = sc.broadcast(filename_label_dict_test)

# Pair every test sample name with the text of its .bytes file.
test_data_new = test_data.map(find_file)
def pre_process_test(x):
    """
    Turn one downloaded test .bytes file into a labelled token list.

    Args:
        x: Tuple ``(fname, text)`` as produced by ``find_file``.

    Returns:
        Tuple ``(fname, label, word_list)`` where ``label`` is the integer
        label looked up in ``broadcast_filename_label_dict_test`` and
        ``word_list`` holds every two-character token of ``text`` except
        ``'??'``.

    Raises:
        KeyError: If ``fname`` has no entry in the broadcast label lookup.
    """
    fname, raw_text = x[0], x[1]
    label = int(broadcast_filename_label_dict_test.value[fname])
    # Split on any whitespace rather than only '\r\n' or ' ': with bare '\n'
    # line endings the old pattern glued the last byte of each line to the
    # next line's address and the length filter then dropped it.
    word_list = [
        token for token in raw_text.split()
        if len(token) == 2 and token != '??'
    ]
    return (fname, label, word_list)


# Attach the label and the byte tokens to every test record.
test_data_with_labels = test_data_new.map(pre_process_test)

In [8]:
# Pull one record to check the (fname, label, word_list) tuples.
test_data_with_labels.take(1)

[('jYWpNLrtc2ns0kMizPgX',
  6,
  ['C7',
   '45',
   'FC',
   '01',
   '00',
   '00',
   '00',
   'EB',
   '00',
   '85',
   'FF',
   '7E',
   '59',
   'F6',
   '46',
   '0A',
   '01',
   '75',
   '53',
   'FF',
   'B6',
   '30',
   '02',
   '00',
   '00',
   'FF',
   '15',
   '60',
   'C1',
   '48',
   '00',
   '59',
   '6A',
   '00',
   'FF',
   '15',
   '10',
   'C1',
   '48',
   '00',
   'F6',
   '46',
   '0A',
   '01',
   '59',
   '75',
   '37',
   'FF',
   'B6',
   '30',
   '02',
   '00',
   '00',
   'FF',
   '15',
   '2C',
   'C1',
   '48',
   '00',
   '59',
   '8B',
   '45',
   '10',
   '2B',
   'C7',
   '3D',
   '00',
   '40',
   '00',
   '00',
   '7C',
   '4B',
   'B8',
   'FD',
   '90',
   '45',
   '00',
   'FF',
   '75',
   '14',
   '50',
   '0F',
   '84',
   '09',
   '5D',
   '00',
   '00',
   '6A',
   '38',
   '8D',
   '45',
   '84',
   '50',
   '56',
   'E8',
   '26',
   '43',
   '00',
   '00',
   '83',
   'C4',
   '0C',
   '85',
   'C0',
   '75',
   '30',
   '66',
   '81

In [9]:
def add_asm_texts_to_features(x, timeout=60):
    """
    Prepend tokens taken from a sample's .asm file to its byte tokens.

    For every line of the .asm file the text before the first ':' is kept
    (the whole line when it contains no ':').

    Args:
        x: Tuple ``(fname, label, word_list)``.
        timeout: Seconds passed to ``requests.get`` as its timeout, so that a
            stalled connection cannot block the task forever.

    Returns:
        Tuple ``(fname, label, tokens)`` where ``tokens`` is the list of
        per-line prefixes followed by the items of ``word_list``.  The input
        ``word_list`` is not modified.

    Raises:
        requests.HTTPError: If the server answers with an error status, so an
            error page is never tokenized as if it were an .asm file.
    """
    path = asm_data_path + x[0] + '.asm'
    response = requests.get(path, timeout=timeout)
    response.raise_for_status()
    text2 = [line.partition(':')[0] for line in response.text.splitlines()]
    text2.extend(x[2])
    return (x[0], x[1], text2)


# Extend every test record with the tokens taken from its .asm file, then
# pull a single record to check the result.
test_data_with_asm = test_data_with_labels.map(add_asm_texts_to_features)
test_data_with_asm.take(1)

[('jYWpNLrtc2ns0kMizPgX',
  6,
  ['HEADER',
   'HEADER',
   'HEADER',
   'HEADER',
   'HEADER',
   'HEADER',
   'HEADER',
   'HEADER',
   'HEADER',
   'HEADER',
   'HEADER',
   'HEADER',
   'HEADER',
   'HEADER',
   'HEADER',
   'HEADER',
   'HEADER',
   '.text',
   '.text',
   '.text',
   '.text',
   '.text',
   '.text',
   '.text',
   '.text',
   '.text',
   '.text',
   '.text',
   '.text',
   '.text',
   '.text',
   '.text',
   '.text',
   '.text',
   '.text',
   '.text',
   '.text',
   '.text',
   '.text',
   '.text',
   '.text',
   '.text',
   '.text',
   '.text',
   '.text',
   '.text',
   '.text',
   '.text',
   '.text',
   '.text',
   '.text',
   '.text',
   '.text',
   '.text',
   '.text',
   '.text',
   '.text',
   '.text',
   '.text',
   '.text',
   '.text',
   '.text',
   '.text',
   '.text',
   '.text',
   '.text',
   '.text',
   '.text',
   '.text',
   '.text',
   '.text',
   '.text',
   '.text',
   '.text',
   '.text',
   '.text',
   '.text',
   '.text',
   '.text',
   '

In [10]:
# Convert the test RDD into a DataFrame with named columns and preview it.
test_columns = ['doc', 'label', 'text']
test_data_df_repar = sql_context.createDataFrame(test_data_with_asm, schema=test_columns)
test_data_df_repar.show(5)

+--------------------+-----+--------------------+
|                 doc|label|                text|
+--------------------+-----+--------------------+
|jYWpNLrtc2ns0kMizPgX|    6|[HEADER, HEADER, ...|
|exRaSnLFu2VyNGJhDWlg|    3|[HEADER, HEADER, ...|
|4GUJ7HBXO8I56WyRSMQd|    4|[HEADER, HEADER, ...|
|B2Vfj0zqxgmZb7E8QpGK|    3|[HEADER, HEADER, ...|
|3k1meX0gV2WMjAvGDrCq|    3|[HEADER, HEADER, ...|
+--------------------+-----+--------------------+
only showing top 5 rows



In [11]:
# Number of partitions of the RDD behind the test DataFrame.
test_data_df_repar.rdd.getNumPartitions()

40

In [None]:
# Generate raw features from the ASM files that can be added to the byte-token lists built above.

In [None]:
# Download one sample .asm file and split it into lines for inspection.
path = asm_data_path + '01IsoiSMh5gxyDYTl4CB.asm'
sample_asm_response = requests.get(path)
text1 = sample_asm_response.text.splitlines()

In [None]:
# Compare one raw .asm line with the prefix kept from it (text before the first ':').
print(text1[1200])
text2 = [asm_line.partition(':')[0] for asm_line in text1]
print(text2[1200])

In [None]:
# Show the first 50 extracted per-line prefixes.
print(text2[:50])

In [None]:
# Look at one labelled training record again before the .asm tokens are added.
train_data_with_labels.take(1)

In [None]:
def add_asm_texts_to_features(x, timeout=60):
    """
    Prepend tokens taken from a sample's .asm file to its byte tokens.

    For every line of the .asm file the text before the first ':' is kept
    (the whole line when it contains no ':').

    Args:
        x: Tuple ``(fname, label, word_list)``.
        timeout: Seconds passed to ``requests.get`` as its timeout, so that a
            stalled connection cannot block the task forever.

    Returns:
        Tuple ``(fname, label, tokens)`` where ``tokens`` is the list of
        per-line prefixes followed by the items of ``word_list``.  The input
        ``word_list`` is not modified.

    Raises:
        requests.HTTPError: If the server answers with an error status, so an
            error page is never tokenized as if it were an .asm file.
    """
    path = asm_data_path + x[0] + '.asm'
    response = requests.get(path, timeout=timeout)
    response.raise_for_status()
    text2 = [line.partition(':')[0] for line in response.text.splitlines()]
    text2.extend(x[2])
    return (x[0], x[1], text2)


# Extend every training record with the tokens taken from its .asm file,
# then pull a single record to check the result.
train_data_with_asm = train_data_with_labels.map(add_asm_texts_to_features)
train_data_with_asm.take(1)

In [None]:
# Convert the training RDD into a DataFrame with named columns and preview it.
train_columns = ['doc', 'label', 'text']
train_data_df = sql_context.createDataFrame(train_data_with_asm, schema=train_columns)
train_data_df.show(5)

In [None]:
# Number of partitions of the RDD behind the training DataFrame.
train_data_df.rdd.getNumPartitions()

In [None]:
# ngram = NGram(n=1, inputCol='text', outputCol='ngrams')
# ngramed_df= ngram.transform(train_data_df)
# hashingTF = HashingTF(inputCol="ngrams", outputCol="features")
# hashedTF=hashingTF.transform(ngramed_df)
# hashedTF.show(n=2)

In [None]:
# Training stages: 1-grams of the token list -> hashed term frequencies ->
# Naive Bayes.  No stop-word removal or IDF stage is part of the pipeline
# (both were tried and left disabled, as was numFeatures=256 for HashingTF).
ngram = NGram(inputCol='text', outputCol='ngrams', n=1)
hashingTF = HashingTF(outputCol="features", inputCol="ngrams")
nb = NaiveBayes(smoothing=1)

# ML Pipeline Model: fit on the training DataFrame.
pipeline = Pipeline(stages=[ngram, hashingTF, nb])
model = pipeline.fit(train_data_df)
# model.save('NB_Best_Model')  # enable to persist the fitted pipeline

# Evaluate Model Accuracy: score the test DataFrame, with the label column
# cast to double before it is handed to the evaluator.
predictions = model.transform(test_data_df_repar).withColumn(
    'label', test_data_df_repar['label'].cast(DoubleType())
)
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    predictionCol="prediction",
    labelCol="label",
)
accuracy = evaluator.evaluate(predictions)
print("Test set accuracy = " + str(accuracy))



# print('NB Model Accuracy ', (correct / len(test_predictions)))