In [None]:
# Bucket prefix under which the pre-computed count tables (counts/*.parquet) are read
bucket = "gs://micky-practicum/"
# Location of the hash lists and label file (X_train.txt, y_train.txt, X_test.txt)
indicator_path = 'gs://uga-dsp/project1/files/'

In [None]:
from math import log
import re
from pyspark.sql.types import StructType, StructField, IntegerType, LongType, FloatType, ArrayType, StringType
import numpy as np
from pyspark.sql import functions as F

In [None]:
# Training count table as an RDD of rows; hash_to_label later reads field 0 as the
# document hash and passes the remaining fields through unchanged.
_train = (
    sqlContext.read
    .load(f'{bucket}counts/X_train.parquet')
    .rdd
)

In [None]:
def hexGen(i):
    """Return the two-character column name for word index ``i``.

    @param i: int in [0,256]; 0..255 map to the upper-case hex strings
              '00'..'FF', and anything from 256 upwards maps to '??'.
    """
    if not i < 256:
        return '??'
    digits = hex(i).upper()[2:]  # strip the '0X' prefix
    # Left-pad with a zero and keep the last two characters.
    return ('0' + digits)[-2:]

In [None]:
# Pair the n-th line of X_train.txt (document hash) with the n-th line of
# y_train.txt (its label, kept as a string) and broadcast the lookup table.
train_hashes = sc.textFile(f'{indicator_path}X_train.txt').collect()
train_labels = sc.textFile(f'{indicator_path}y_train.txt').collect()
hash_labels = sc.broadcast(
    {doc_hash: train_labels[pos] for pos, doc_hash in enumerate(train_hashes)}
)
# The driver-side lists are no longer needed once the dict is broadcast.
del train_hashes, train_labels

In [None]:
def hash_to_label(x):
    """Replace the leading hash of a row with its integer class label.

    @param x: sequence whose first element is a document hash present in the
              broadcast ``hash_labels`` table; the remaining elements are
              copied through unchanged.
    @return: list ``[label, x[1], x[2], ...]`` with ``label`` converted to int.
    """
    return [int(hash_labels.value[x[0]]), *x[1:]]

In [None]:
def buildSchema(mode):
    """Build the schema shared by the training tables.

    The schema has a leading ``'Y'`` column (always ``LongType``) followed by
    257 word columns named ``hexGen(0)`` .. ``hexGen(256)``, i.e. '00'..'FF'
    and '??', all of the type selected by ``mode``.

    @param mode: 'int', 'long' or 'float' - element type of the word columns.
    @return: a ``StructType`` with 258 fields.
    @raises ValueError: if ``mode`` is not one of the supported names.
    """
    type_by_mode = {'int': IntegerType, 'long': LongType, 'float': FloatType}
    t = type_by_mode.get(mode)
    if t is None:
        # Fail early with a readable message instead of the opaque
        # "'NoneType' object is not callable" that calling t() would produce.
        raise ValueError(
            f"unknown schema mode {mode!r}; expected one of {sorted(type_by_mode)}"
        )
    fields = [StructField('Y', LongType())]
    fields += [StructField(hexGen(i), t()) for i in range(257)]
    return StructType(fields)

In [None]:
# Training dataframe: class label 'Y' followed by the 257 word-count columns
df_train = spark.createDataFrame(
    _train.map(hash_to_label),
    schema=buildSchema('long'),
)

# Collapse to one row per class: sum every word column within the class and
# add 1 to each total. 'sum(Y)' is a by-product of summing all numeric columns
# and is dropped before the rows are rebuilt.
df_train_reduced = spark.createDataFrame(
    df_train.groupBy('Y').sum().drop('sum(Y)').rdd.map(
        lambda row: (row[0], *(row[k] + 1 for k in range(1, 258)))
    ),
    buildSchema('long'),
)

In [None]:
# Number of training documents per class, sorted by class label
class_table = df_train.groupBy('Y').count().orderBy('Y')

# Total number of training documents: the global sum() yields a single row
# (sum(Y), sum(count)); field 1 is the summed 'count' column.
dataset_length = class_table.groupBy().sum().collect()[0][1]

# Prior distribution: (class label, share of the training documents)
pc = spark.createDataFrame(
    class_table.rdd.map(lambda row: (row[0], row[1] / dataset_length))
)

In [None]:
# ln(P(yk)) for every row of the prior table, in the order pc.collect() returns
# them, broadcast as a plain list that is later indexed by position.
class_probabilities = sc.broadcast([log(row[1]) for row in pc.collect()])

In [None]:
# Per-word totals: element-wise sum of the class rows of df_train_reduced
# (without the 'Y' column), broadcast as a sequence indexed by word position.
wordCounts = sc.broadcast(
    df_train_reduced.drop('Y').rdd.reduce(
        lambda left, right: [a + b for a, b in zip(left, right)]
    )
)

In [None]:
# Row-wise normalisation: ln(P(word | class)) for multinomial naive Bayes.
#
# Each row of df_train_reduced holds, for one class, the per-word counts with 1
# already added to every entry. Dividing by the row total therefore gives the
# Laplace-smoothed estimate (N_cw + 1) / (N_c + 257).
#
# Dividing by the per-word total across classes instead would estimate
# P(class | word); summed over a document that adds doc_length * ln(N_c) to
# every class score and biases predictions towards classes with more tokens.
def _log_word_given_class(row):
    """Map one per-class count row to (Y, ln P(w_0|Y), ..., ln P(w_256|Y))."""
    label, counts = row[0], row[1:]
    class_total = sum(counts)
    return tuple([label] + [log(count / class_total) for count in counts])


train_log_weighted = df_train_reduced.rdd.map(_log_word_given_class)

# Constructing dataframe off of the normalized RDD
df_train_log_weighted = spark.createDataFrame(
    train_log_weighted,
    schema=buildSchema('float'),
)

In [None]:
# Collect the log-weight rows sorted by class label and without the 'Y' column,
# so that list position k corresponds to the k-th smallest label.
_word_log_probabilities = (
    df_train_log_weighted
    .orderBy('Y')
    .drop('Y')
    .collect()
)

# Make the table available on every executor
word_log_probabilities = sc.broadcast(_word_log_probabilities)

In [None]:
# Test count table; pClassGivenDoc reads field 0 as the document hash and
# fields 1..257 as the word counts.
df_test_X = (
    sqlContext.read
    .load(f'{bucket}counts/X_test.parquet')
)

In [None]:
def pWordGivenClass(word_index,c):
    """Look up the broadcast log weight of a word for a class.

    @param word_index: int in [0,256], corresponding to the hex words
                       [00,...,FF,??]
    @param c: zero-based position of the class row in word_log_probabilities
    @return: the stored value word_log_probabilities.value[c][word_index]
    """
    class_row = word_log_probabilities.value[c]
    return class_row[word_index]

In [None]:
def pClassGivenDoc(doc):
    """Score one document against each of the 9 classes.

    For class position k the score is the broadcast log prior
    class_probabilities.value[k] plus, for each of the 257 words, the word's
    log weight for that class multiplied by the word's count in the document.

    @param doc: row whose field 0 is the document hash and whose fields
                1..257 are the word counts.
    @return: list ``[hash, score_0, ..., score_8]``.
    """
    priors = [class_probabilities.value[k] for k in range(9)]
    scores = []
    for cls, score in enumerate(priors):
        # Accumulate word by word, in index order, starting from the prior.
        for word in range(257):
            score += pWordGivenClass(word, cls) * doc[1 + word]
        scores.append(score)
    return [doc[0]] + scores

In [None]:
def buildLikelihoodSchema():
    """Schema of the per-document score table.

    @return: StructType with a string 'hash' column followed by nine float
             columns named '1' .. '9'.
    """
    fields = [StructField('hash', StringType())]
    for class_label in range(1, 10):
        fields.append(StructField(f'{class_label}', FloatType()))
    return StructType(fields)

In [None]:
def findMaxLikelihood(x):
    """Pick the class with the highest log-likelihood score for one document.

    @param x: sequence ``[hash, score_1, score_2, ...]`` where the score at
              position k (counting from 1) belongs to class label k, matching
              the '1'..'9' column names of the likelihood table.
    @return: ``[hash, predicted_label]`` with ``predicted_label`` a 1-based int.
             Ties go to the lowest label (first maximum wins).
    """
    scores = np.array(list(x[1:]))
    best_position = int(scores.argmax())
    # argmax is zero-based while class labels start at 1, so shift by one;
    # returning the raw position would report every prediction one class low.
    return [x[0], best_position + 1]

In [None]:
# Score every test document: one row per document holding its hash and the nine
# per-class scores produced by pClassGivenDoc. (Despite the variable name, this
# table holds class scores, not top-k words.)
df_test_top_k = spark.createDataFrame(
    df_test_X.rdd.map(pClassGivenDoc),
    schema=buildLikelihoodSchema(),
)

# Reduce each score row to (hash, class) using findMaxLikelihood
df_predictions = spark.createDataFrame(
    df_test_top_k.rdd.map(findMaxLikelihood),
    schema=StructType([
        StructField('hash', StringType()),
        StructField('class', IntegerType()),
    ]),
)

In [None]:
# Bring the (hash, class) pairs to the driver as a dict keyed by hash
predictions = (
    df_predictions
    .rdd
    .collectAsMap()
)

In [None]:
# Write one prediction per line, in the order the hashes appear in X_test.txt
test_hashes = sc.textFile(f'{indicator_path}X_test.txt').collect()
# Open with 'w', not 'a': the file must contain exactly one line per test hash,
# and append mode would stack a second copy of the output on every re-run.
with open('prediction.txt', 'w') as the_file:
    for h in test_hashes:
        the_file.write(f'{predictions[h]}\n')