In [1]:
import pyspark
import numpy as np
from pyspark.ml.feature import CountVectorizer, IDF, Tokenizer, HashingTF

In [2]:
def spark_session_setup():
    """
    Create (or reuse) the SparkContext used by this notebook.

    The context is configured with the application name ``word_count``,
    4G of executor memory, 4G of driver memory and ``spark.logConf`` enabled;
    its log level is then lowered to ERROR.

    Returns:
        pyspark.SparkContext: the active Spark context.

    >>> sc = spark_session_setup()
    """

    conf = pyspark.SparkConf()
    conf.setAppName('word_count')
    # log the effective configuration when the context starts
    conf.set('spark.logConf', 'true')
    conf.set('spark.executor.memory', '4G')
    conf.set('spark.driver.memory', '4G')
    # conf.set('spark.driver.maxResultSize', '10G')

    # getOrCreate() instead of the SparkContext constructor: only one context
    # may be active per process, so calling the constructor again (e.g. when
    # this cell is re-run in a live kernel) raises an error. Note that if a
    # context already exists it is returned as-is and `conf` is not applied.
    sc = pyspark.SparkContext.getOrCreate(conf=conf)

    # change log level to ERROR
    sc.setLogLevel("ERROR")
    return sc

# NOTE: despite its name, this holds the SparkContext returned by
# spark_session_setup(), not a pyspark.sql.SparkSession.
spark_session = spark_session_setup()

In [3]:
# SQLContext built on top of the SparkContext; it is the entry point used
# later in the notebook to turn an RDD into a DataFrame.
sql_context = pyspark.sql.SQLContext(spark_session)

In [4]:
import re

In [5]:
# Load the training document names and their class labels. The two files are
# parallel: line i of the label file is the label of line i of the name file.
# `with` closes the handles deterministically, and splitlines() avoids the
# spurious empty last entry that read().split('\n') yields for a file that
# ends with a newline.
with open('../dataset/files/X_small_train.txt') as names_file:
    filenames = names_file.read().splitlines()
with open('../dataset/files/y_small_train.txt') as labels_file:
    labels = labels_file.read().splitlines()

# zip() would silently drop the unmatched tail; fail loudly instead.
if len(filenames) != len(labels):
    raise ValueError(
        'file list and label list differ in length: %d names vs %d labels'
        % (len(filenames), len(labels))
    )

# document name -> label (labels are kept as the strings read from the file)
filename_label_dict = dict(zip(filenames, labels))

# Broadcast the lookup table so functions running on the executors can read
# it through `.value`.
broadcast_filename_label_dict = spark_session.broadcast(filename_label_dict)

In [6]:
# Read every file under the training directory, one record per file.
# Each record is consumed by pre_process as x[0] (the file path) and
# x[1] (the file content).
rdd_files  = spark_session.wholeTextFiles("../dataset/data/train/")

In [7]:
# rdd_files.take(1)

In [8]:
# Scratch check of the "keep only 2-character strings" predicate.
# In Python 3 filter() returns a lazy iterator, which is why the cell output
# is a filter object rather than the kept items; wrap it in list() to see them.
filter(lambda x: len(x)==2, ['as', 'de', 'sadas'])

<filter at 0x7f2ad73b8a58>

In [9]:
def pre_process(x):
    """
    Turn one (path, content) record into a (doc name, label, tokens) tuple.

    Args:
        x: a pair whose first item is the file path and whose second item is
           the full text content of that file.

    Returns:
        tuple: ``(fname, label, word_list)`` where ``fname`` is the last path
        component without its final 6 characters, ``label`` is the integer
        label looked up in ``broadcast_filename_label_dict`` and ``word_list``
        is the list of 2-character tokens of the content, excluding ``'??'``.

    Raises:
        KeyError: if ``fname`` has no entry in the broadcast label dictionary.
        ValueError: if the stored label cannot be converted to ``int``.
    """
    path, content = x[0], x[1]

    # last path component minus its 6-character suffix
    # (presumably a '.bytes'-style extension — TODO confirm)
    fname = path.split('/')[-1][:-6]
    label = int(broadcast_filename_label_dict.value[fname])

    # Split on ANY whitespace run. Splitting only on '\r\n' or ' ' glued the
    # last token of a line to the first token of the next one whenever the
    # file used bare '\n' line endings, and the length filter then silently
    # dropped it.
    word_list = [
        token
        for token in content.split()
        if len(token) == 2 and token != '??'
    ]
    return (fname, label, word_list)

In [10]:
# One (doc name, label, token list) tuple per input file.
# map() is a lazy transformation: pre_process only runs when an action
# such as take() is called on the RDD.
bag_of_docs = rdd_files.map(pre_process)

In [11]:
# Peek at the first processed document.
# NOTE(review): this displays the document's entire token list; consider
# slicing the tokens (e.g. the first 20) to keep the output readable.
bag_of_docs.take(1)[0]

('lS0IVqXeJrN6Dzi9Pap1',
 4,
 ['EB',
  '2F',
  '60',
  '19',
  'DE',
  'EB',
  '42',
  'BF',
  '8C',
  'D5',
  'EA',
  'DB',
  '78',
  '51',
  'B6',
  'B7',
  '24',
  '8D',
  '42',
  '53',
  '90',
  '89',
  '8E',
  'AF',
  'BC',
  '45',
  '9A',
  'CB',
  'A8',
  'C1',
  '66',
  'A7',
  '54',
  'FD',
  'F2',
  '43',
  'C0',
  'F9',
  '3E',
  '9F',
  'EC',
  'B5',
  '4A',
  'BB',
  'D8',
  '31',
  '16',
  '97',
  '84',
  'EB',
  'D2',
  '6D',
  'A2',
  '33',
  'F0',
  'EB',
  '2B',
  '69',
  'EE',
  '8F',
  '1C',
  '25',
  'FA',
  'AB',
  '08',
  'A1',
  'C6',
  '87',
  'B4',
  'DD',
  '52',
  '23',
  '20',
  'EB',
  'EC',
  'D9',
  '9E',
  '7F',
  '4C',
  '95',
  'AA',
  '9B',
  '38',
  '11',
  '76',
  '77',
  'E4',
  '4D',
  '02',
  '13',
  '50',
  '49',
  '4E',
  '6F',
  '7C',
  '05',
  '5A',
  '8B',
  '68',
  '81',
  '50',
  'E9',
  '03',
  '00',
  '00',
  '00',
  '26',
  '67',
  '14',
  '50',
  '68',
  '03',
  'C9',
  'EB',
  '43',
  '8B',
  'C4',
  'E9',
  '08',
  '00',
  '00',
  '

In [12]:
# Convert the RDD of tuples into a DataFrame, naming the three tuple fields
# 'doc' (document name), 'label' (integer class) and 'text' (token list).
doc_label_df = sql_context.createDataFrame(bag_of_docs, ['doc', 'label', 'text'])

In [13]:
# Sanity check: print the first row of the DataFrame.
doc_label_df.show(1)

+--------------------+-----+--------------------+
|                 doc|label|                text|
+--------------------+-----+--------------------+
|lS0IVqXeJrN6Dzi9Pap1|    4|[EB, 2F, 60, 19, ...|
+--------------------+-----+--------------------+
only showing top 1 row



In [14]:
# tokenizer = Tokenizer(inputCol="text", outputCol="words")

In [15]:
# words_data = tokenizer.transform(doc_label_df)

In [16]:
# Term-frequency stage: hashes each token of the 'text' column into a
# fixed-size sparse count vector stored in 'counts'. numFeatures is left at
# its default (the vectors shown by counts.show() have size 262144).
tf = HashingTF(inputCol="text", outputCol="counts")

In [17]:
# Add the 'counts' term-frequency column to every document row.
counts = tf.transform(doc_label_df)

In [18]:
# Sanity check: first row, now including the sparse 'counts' vector.
counts.show(1)

+--------------------+-----+--------------------+--------------------+
|                 doc|label|                text|              counts|
+--------------------+-----+--------------------+--------------------+
|lS0IVqXeJrN6Dzi9Pap1|    4|[EB, 2F, 60, 19, ...|(262144,[992,1251...|
+--------------------+-----+--------------------+--------------------+
only showing top 1 row



In [19]:
# Inverse-document-frequency stage: rescales the raw 'counts' vectors into
# the 'features' column.
idf = IDF(inputCol="counts", outputCol="features")

In [None]:
# Learn the IDF weights from the term-frequency vectors.
# NOTE(review): the model is fitted on the full dataset, i.e. before the
# train/test split performed later, so test documents influence the weights.
idf_model = idf.fit(counts)

In [None]:
# Apply the fitted IDF weights, adding the 'features' column.
rescaled_data = idf_model.transform(counts)

In [None]:
# Sanity check: first row, now including the 'features' vector.
rescaled_data.show(1)

In [None]:
from pyspark.ml.classification import NaiveBayes
from pyspark.mllib.util import MLUtils
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [None]:
# 60/40 train/test split. A fixed seed makes the split, and therefore every
# metric computed from it, repeatable across runs.
training, test = rescaled_data.randomSplit([0.6, 0.4], seed=42)

In [None]:
# Multinomial naive Bayes classifier with additive smoothing of 1.0.
# No column names are passed, so the estimator's default feature/label
# columns are used.
nb = NaiveBayes(smoothing=1.0, modelType="multinomial")

In [None]:
# Train the classifier on the training split only.
model = nb.fit(training)

In [None]:
# Score the held-out split with the trained model, then print the first rows
# of the result for inspection.
predictions = model.transform(test)
predictions.show()

In [None]:
# Accuracy of the predictions on the held-out split: compares the 'label'
# column with the 'prediction' column.
evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="accuracy",
)
accuracy = evaluator.evaluate(predictions)

# report the score
print("Test set accuracy = " + str(accuracy))