In [1]:
import pyspark
import numpy as np
from pyspark.ml.feature import CountVectorizer, IDF, Tokenizer, HashingTF
import os
from pyspark.sql.session import SparkSession
from pyspark.ml.feature import Tokenizer, HashingTF, IDF, StopWordsRemover, NGram, Word2Vec
from pyspark.ml.classification import NaiveBayes, NaiveBayesModel, RandomForestClassifier
from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import IndexToString, StringIndexer, VectorIndexer

In [2]:
def spark_session_setup():
    """
    creates a spark context
    >>> sc = spark_session_setup()
    """

    # in order to be bale to change log level
    conf = pyspark.SparkConf()
    conf.set('spark.logConf', 'true')
    conf.set('spark.executor.memory', '4G')
    conf.set('spark.driver.memory', '4G')
#     conf.set('spark.driver.maxResultSize', '10G')

    # create a spark session
    sc = pyspark.SparkContext(appName='word_count', conf=conf)

    # change log level to ERROR
    sc.setLogLevel("ERROR")
    return sc

sc = spark_session_setup()

In [3]:
# path = "../dataset/data/bytes_original//"
# new_path = "../dataset/data/bytes/"
# files = os.listdir(path)
# for f in files:
#     with open(new_path + f, 'w') as outfile:
#         with open(path + f) as fp:
#             for line in fp:
#                 content = line.strip().split(' ')[1:]
#                 if '??' not in content or len(set(content)) != 1:
#                     outfile.write(' '.join(content) + ' ')

In [4]:
train_files = open('../dataset/files/X_small_train.txt').read().split('\n')
train_labels = open('../dataset/files/y_small_train.txt').read().split('\n')
train_dict = {}
for filename, label in zip(train_files, train_labels):
    train_dict[filename] = label

broadcast_train_dict = sc.broadcast(train_dict)

In [5]:
test_files = open('../dataset/files/X_small_test.txt').read().split('\n')
test_labels = open('../dataset/files/y_small_test.txt').read().split('\n')
test_dict = {}
for filename, label in zip(test_files, test_labels):
    test_dict[filename] = label

broadcast_test_dict = sc.broadcast(test_dict)

In [6]:
def pre_process(x):
    fname = x[0].split('/')[-1][:-6]
    if fname in broadcast_train_dict.value:
        label = int(broadcast_train_dict.value[fname])
        words = x[1]
        return (fname, words, label)
    elif fname in broadcast_test_dict.value:
        label = int(broadcast_test_dict.value[fname])
        words = x[1]
        return (fname, words, label)

In [7]:
spark = SparkSession(sc)

In [8]:
data = sc.wholeTextFiles('../dataset/data/bytes') #sys.argv[1]
fp = open('../dataset/files/X_small_train.txt')
train_names = fp.read().split()
file_path = 'file:' + os.path.realpath('../dataset/data/bytes') + '/' #sys.argv[1]
for i in range(len(train_names)):
    train_names[i] = file_path + train_names[i] + '.bytes'
train_names = sc.broadcast(train_names)

fp = open('../dataset/files/y_small_train.txt')
train_labels = sc.broadcast(fp.read().split())

In [9]:
fp = open('../dataset/files/X_small_test.txt')
test_names = fp.read().split()
file_path = 'file:' + os.path.realpath('../dataset/data/bytes') + '/' #sys.argv[1]
for i in range(len(test_names)):
    test_names[i] = file_path + test_names[i] + '.bytes'
test_names = sc.broadcast(test_names)

#Testing Labels
fp = open('../dataset/files/y_small_test.txt')
test_labels = sc.broadcast(fp.read().split())

In [10]:
train_data = data.filter(lambda x: x[0] in train_names.value).map(pre_process)
train_df = train_data.toDF(['id', 'text', 'label'])

In [11]:
test_data = data.filter(lambda x: x[0] in test_names.value).map(pre_process)
test_df = test_data.toDF(['id', 'text', 'label'])

In [12]:
tokenizer = Tokenizer(inputCol="text", outputCol="words")
ngram = NGram(n=2, inputCol='words', outputCol='ngrams')
# ngram2 = NGram(n=3, inputCol='ngrams', outputCol='ngrams2')
indexer = StringIndexer(inputCol="ngrams", outputCol="features")
# hashingTF = HashingTF(inputCol="ngrams2", outputCol="features")
# word2vec = Word2Vec(inputCol='ngrams', outputCol='features')
nb = NaiveBayes(smoothing=1)
rf = RandomForestClassifier(maxDepth=30, numTrees=70)

In [13]:
pipeline = Pipeline(stages=[tokenizer, ngram, indexer, rf])

In [14]:
model = pipeline.fit(train_df)

In [15]:
model.save('../saved_models/my_rf_2_3_gram')

In [16]:
# sameModel = PipelineModel.load("../saved_models/my_nb/")

In [17]:
prediction = model.transform(test_df)

In [18]:
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",
                                              metricName="accuracy")

In [19]:
accuracy = evaluator.evaluate(prediction)
print("Test set accuracy = " + str(accuracy))

Test set accuracy = 0.8402366863905325
