In [None]:
# install Spark
# !apt-get install openjdk-8-jdk-headless -qq > /dev/null
# !wget -q https://www-us.apache.org/dist/spark/spark-3.0.1/spark-3.0.1-bin-hadoop2.7.tgz
# !tar xf spark-3.0.1-bin-hadoop2.7.tgz

In [None]:
# !pip install -q findspark


In [None]:
# import os
# os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
# os.environ["SPARK_HOME"] = "/content/spark-3.0.1-bin-hadoop2.7"
# import findspark
# findspark.init()

In [None]:
# Configuration for the Naive Bayes classifier: storage locations and Spark settings
bucket = "gs://micky-practicum/"

# Feature matrices (CSV) and label files for the training and test splits
TRAIN_PATH = bucket + "X_small_train.csv"
TEST_PATH = bucket + "X_small_test.csv"
LABELS = bucket + "y_small_train.txt"
LABELS_TEST = bucket + "y_small_test.txt"

# Spark application name and master URL
APP_NAME = "Naive Bayes Classifier"
SPARK_URL = "local[*]"

In [None]:
from pyspark import SparkContext, SparkConf
from pyspark.sql.functions import lit, row_number
#from operator import add
#from functools import reduce
from math import log
import re
from pyspark.sql.types import StructType, StructField, IntegerType, LongType, FloatType, ArrayType
import numpy as np
from pyspark.sql import functions as F
from pyspark.sql import SQLContext, Row, SparkSession, Window

In [None]:
# Start (or attach to) the Spark context, then obtain a SparkSession
conf = SparkConf().setAppName("FirstNotebook")
sc = SparkContext.getOrCreate(conf=conf)

# NOTE(review): a SparkContext already exists at this point, so the master/appName
# given to the builder presumably have no effect -- confirm, and consider wiring in
# the APP_NAME / SPARK_URL constants instead of the literals used here.
spark = (
    SparkSession.builder
    .master("local")
    .appName("Word Count")
    .getOrCreate()
)

In [None]:
# read csv features file into dataframe
df_train_X = spark.read.options(header = "true", inferschema = "true").csv(TRAIN_PATH)

# drop the relative-value ('rel') columns, the index column, the "hash" column
# and the 'total_count' column
columns = [name for name in df_train_X.schema.names if 'rel' in name]
df_train_X = df_train_X.drop(*columns,'_c0','hash','total_count')

# read txt labels file into dataframe
df_train_Y = spark.read.load(LABELS, format="csv", sep=" ", inferSchema="true", header="false").toDF('Y')


def _with_row_index(df, index_name="rn"):
    """Return `df` with an extra long column holding each row's 0-based position.

    The position comes from RDD.zipWithIndex, which numbers rows by partition
    index and then by their order inside each partition. A row_number() over a
    window ordered by a constant gives no ordering guarantee and forces every
    row into a single partition, so it can pair features with the wrong label.

    @param df: DataFrame to index
    @param index_name: name of the added index column
    @return: new DataFrame with the columns of `df` followed by `index_name`
    """
    indexed_schema = StructType(df.schema.fields + [StructField(index_name, LongType(), False)])
    indexed_rows = df.rdd.zipWithIndex().map(lambda pair: tuple(pair[0]) + (pair[1],))
    return spark.createDataFrame(indexed_rows, indexed_schema)


# combine features and labels into one dataframe:
# add matching row indexes to dataframes X and Y
df_X = _with_row_index(df_train_X)
df_Y = _with_row_index(df_train_Y)

# join X dataframe and Y dataframe on the row index, then drop the index
df_train = df_X.join(df_Y,["rn"]).drop("rn")

# preview the combined dataframe (comment out to skip the extra Spark job)
df_train.show()

In [None]:
def buildSchema(mode, n_features=257):
    """Build a schema made of the label column 'Y' followed by the feature columns.

    Feature names are taken from the first `n_features` fields of the global
    `df_train` schema; every feature column gets the element type chosen by `mode`.

    @param mode: element type of the feature columns: 'int', 'long' or 'float'
    @param n_features: number of feature columns to include (default 257)
    @return: StructType with a LongType 'Y' field followed by `n_features` fields
    @raise ValueError: if `mode` is not one of the supported names
    """
    type_by_mode={'int':IntegerType,'long':LongType,'float':FloatType}
    if mode not in type_by_mode:
        # Fail with a clear message instead of "'NoneType' object is not callable"
        raise ValueError(f"unknown mode {mode!r}; expected one of {sorted(type_by_mode)}")
    t=type_by_mode[mode]
    _schema=df_train.schema.fields
    feature_fields=[StructField(_schema[i].name,t()) for i in range(n_features)]
    return StructType([StructField('Y',LongType())]+feature_fields)

In [None]:
# Collapse the training set to one row per class by summing every column within
# each class; the summed label column is not needed
class_sums=df_train.groupBy('Y').sum().drop('sum(Y)')

# Add 1 to each of the 257 summed counts and rebuild every row as (Y, counts...)
smoothed_rows=class_sums.rdd.map(lambda row:(row[0],)+tuple(row[1+i]+1 for i in range(257)))
df_train_reduced=spark.createDataFrame(smoothed_rows,buildSchema('long'))

# Keep the reduced table cached, since later cells reuse it
#spark.catalog.clearCache()
df_train_reduced.cache()

In [None]:
# Total of a single column of df_train_reduced, summed over all of its rows
def netCount(word):
    """Return the sum of column `word` over every row of `df_train_reduced`.

    @param word: name of the column to total
    @return: the summed value, taken from the single aggregated row
    """
    column_total=df_train_reduced.select(word).groupBy().sum()
    only_row=column_total.collect()[0]
    return only_row[0]

In [None]:
# Example of using netCount: sum of the column named '01' over all rows of
# df_train_reduced (displayed as the cell output)
netCount('01')

In [None]:
# Number of training rows in each class, sorted by class label
class_table=df_train.groupBy('Y').count().orderBy('Y')

# Total number of training rows: the summed 'count' column (second value of the
# single aggregated row; the first is the summed label column)
class_totals=class_table.groupBy().sum().collect()[0]
dataset_length=class_totals[1]

# Prior distribution: (class label, fraction of training rows in that class)
class_fractions=class_table.rdd.map(lambda row:(row[0],row[1]/dataset_length))
pc=spark.createDataFrame(class_fractions)

In [None]:
# Broadcast ln(P(yk)) for every class as a list, in the row order of `pc`
class_probabilities=sc.broadcast([log(prior_row[1]) for prior_row in pc.collect()])
#print(class_probabilities.value)

In [None]:
# Generate the list of per-column totals (one entry per feature column, in the
# column order of df_train) to be broadcast and used to normalize columns.
# All columns are summed in a single aggregation rather than launching one Spark
# job per column through netCount.
feature_names=df_train.schema.names[:-1]
column_totals=df_train_reduced.groupBy().sum(*feature_names).collect()[0]
wordCounts=sc.broadcast(list(column_totals))

# Clear cache since the column totals needn't be computed anymore
spark.catalog.clearCache()

In [None]:
# For every class row, divide each of the 257 counts by that column's total
# (from wordCounts) and take the natural log; the class label stays first
train_log_weighted=df_train_reduced.rdd.map(
    lambda row:(row[0],)+tuple(log(row[i+1]/wordCounts.value[i]) for i in range(257))
)

# Construct a dataframe from the normalized RDD, with float feature columns
df_train_log_weighted=spark.createDataFrame(train_log_weighted,schema=buildSchema('float'))

In [None]:
# Collect the log-weight rows sorted by class label, with the label column removed,
# so that position in the list identifies the class
_word_log_probabilities=(
    df_train_log_weighted
    .orderBy('Y')
    .drop('Y')
    .collect()
)

# Broadcast the collected rows for use inside RDD functions
word_log_probabilities=sc.broadcast(_word_log_probabilities)

In [None]:
# Load the test feature matrix (CSV with a header row, column types inferred)
df_test_X = spark.read.option("header", "true").option("inferschema", "true").csv(TEST_PATH)

# Remove the relative-value ('rel') columns together with the index column,
# the "hash" column and the 'total_count' column
columns = list(filter(lambda column_name: 'rel' in column_name, df_test_X.schema.names))
df_test_X = df_test_X.drop(*columns,'_c0','hash','total_count')

In [None]:
# Look up the stored log-weight of a word for a given class
# Note that @param word_index refers to an int value between [0,256]
# Corresponding to the hex words [00,...,FF,??]
# @param c is the position of the class in the broadcast table
def pWordGivenClass(word_index,c):
    """Return entry `word_index` of row `c` in the broadcast `word_log_probabilities`."""
    class_row=word_log_probabilities.value[c]
    return class_row[word_index]

In [None]:
# Function factory that provides a scoring function for later in the pipeline
# The function produced takes a document's wordcount row, selects the top-k
# words by frequency, and returns one log-likelihood score per class
def pClassGivenDocLambda(k):
    """Build a function that scores one document row against every class.

    @param k: number of highest-count words of the document to use
    @return: function mapping a row of word counts to a list of floats, one per
             class, in the order of the broadcast `class_probabilities`
    """
    def _pClassGivenDoc(doc):
        counts=np.array(list(doc))
        # indices of the k largest counts, largest first
        top_k=counts.argsort()[::-1][:k].tolist()
        # start every class at its log prior (copy so the broadcast value is untouched)
        likelihoods=list(class_probabilities.value)
        for i in range(len(likelihoods)):
            for w in top_k:
                # add the stored log-weight of word w for class position i
                likelihoods[i]+=pWordGivenClass(w,i)
        return likelihoods
    return _pClassGivenDoc

In [None]:
# Translate a word_index into its two-character hex label
# @param i: int in [0,256] mapped to corresponding hex in [00,...,FF,??]
def hexGen(i):
    """Return the upper-case, zero-padded two-character hex string for `i`, or '??' when `i` is not below 256."""
    if i <256:
        digits=hex(i)[2:].upper()
        # left-pad with '0' and keep the last two characters
        return ('0'+digits)[-2:]
    return '??'

In [None]:
# Debug cell (ignore): the snippet below is wrapped in a string literal, so it is
# not executed. If run, it would score every test row with k=4 and fetch the
# second collected result.
'''
_test_top_k=df_test_X.rdd.map(pClassGivenDocLambda(4))
_test_top_k.collect()[1]
'''

In [None]:
# Schema of the log-likelihood dataframe: nine float columns named '1' to '9'
def buildLikelihoodSchema():
    """Return a StructType with nine FloatType fields named '1', '2', ..., '9'."""
    return StructType([StructField(str(class_number),FloatType()) for class_number in range(1,10)])

In [None]:
# Finds the position of the largest score in a row of class scores
def findMaxLikelihood(x):
    """Return a one-element list holding the 0-based index of the maximum of `x`.

    @param x: iterable of numeric scores (e.g. one row of class log-likelihoods)
    @return: [index] as a plain Python int; the first index wins on ties
    """
    scores=list(x)
    best_index=np.argmax(scores)
    return [int(best_index)]

In [None]:
# Create a dataframe holding, for every test document, one log-likelihood score
# per class computed from the document's top k-many words
k=50
df_test_top_k=spark.createDataFrame(df_test_X.rdd.map(pClassGivenDocLambda(k)),schema=buildLikelihoodSchema())

# Cast predictions.
# findMaxLikelihood yields a 0-based position into the classes sorted by label
# (the order used for class_probabilities and word_log_probabilities), so map that
# position back to the actual class label instead of emitting the raw position.
class_labels=[label_row[0] for label_row in class_table.collect()]
df_predictions=spark.createDataFrame(df_test_top_k.rdd.map(lambda x:[class_labels[findMaxLikelihood(x)[0]]])).selectExpr("_1 as Class")

In [None]:
# Preview the predicted 'Class' column
df_predictions.show()

In [None]:
# Write the predictions to the bucket as CSV, coalesced into a single partition
# and overwriting any previous output at that path
output_path=f"{bucket}NB_small_test.csv"
predictions_writer=df_predictions.coalesce(1).write.format("csv").mode('overwrite')
predictions_writer.save(output_path)