In [None]:
# install Spark
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://www-us.apache.org/dist/spark/spark-3.0.1/spark-3.0.1-bin-hadoop2.7.tgz
!tar xf spark-3.0.1-bin-hadoop2.7.tgz

In [None]:
# Install findspark, which the next cell uses to put the unpacked Spark
# distribution on the Python path.
!pip install -q findspark


In [None]:
import os

# Point the JVM and Spark locations at the packages installed above.
# Both variables are set before findspark.init() is called.
os.environ.update({
    "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
    "SPARK_HOME": "/content/spark-3.0.1-bin-hadoop2.7",
})

import findspark

findspark.init()

In [None]:
# Parameters for the Naive Bayes classifier notebook.
# TRAIN_PATH / TEST_PATH: CSV files with the feature columns.
# LABELS / LABELS_TEST: text files with the class labels.
# NOTE(review): LABELS_TEST is not referenced by the cells shown here.
# APP_NAME / SPARK_URL: application name and master URL for the Spark session.
TRAIN_PATH = "X_small_train.csv"
TEST_PATH = "X_small_test.csv"
LABELS = "y_small_train.txt"
LABELS_TEST = "y_small_test.txt"
APP_NAME = "Naive Bayes Classifier"
SPARK_URL = "local[*]"

In [274]:
import math
from operator import add
from time import *

from pyspark import SparkContext
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import RandomForest
from pyspark.sql import *
from pyspark.sql import functions as F
# from pyspark.sql.functions import *


# Create the Spark session for this notebook, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName(APP_NAME)
    .master(SPARK_URL)
    .getOrCreate()
)

In [None]:
# Read the CSV feature file into a DataFrame.
df_train_X = spark.read.options(header = "true", inferschema = "true").csv(TRAIN_PATH)

# Drop every column whose name contains 'rel' (the relative values), plus the
# index column '_c0', the 'hash' column and 'total_count'.
columns = [name for name in df_train_X.schema.names if 'rel' in name]
df_train_X = df_train_X.drop(*columns,'_c0','hash','total_count')

# Read the label text file into a single-column DataFrame named 'Y'.
df_train_Y = spark.read.load(LABELS, format="csv", sep=" ", inferSchema="true", header="false").toDF('Y')

# Features and labels live in separate files, so they are paired by row position.
# The window is ordered by monotonically_increasing_id(), which follows the order
# in which rows were read; ordering by a constant leaves the order of the tied
# rows unspecified.
# NOTE(review): this still assumes both files list the samples in the same order.
w = Window.orderBy(F.monotonically_increasing_id())

# Add a zero-based row index to both DataFrames.
df_X = df_train_X.withColumn("rn", F.row_number().over(w) - 1)
df_Y = df_train_Y.withColumn("rn", F.row_number().over(w) - 1)

# Join features and labels on the row index, then drop the index.
df_train = df_X.join(df_Y, ["rn"]).drop("rn")

### To inspect the joined DataFrame, uncomment the line below.
# df_train.show()

In [None]:
# Split the training DataFrame by label (Y = 1, 2, ..., 9): saveAsTable writes one
# 'Y=<label>' directory per class.
# mode("overwrite") lets this cell be re-run in the same session; the default
# mode raises an error once the table exists.
# NOTE(review): a 'spark-warehouse/dataframes' directory left over from an earlier
# session may still have to be removed by hand - confirm on a fresh kernel.
df_train.write.mode("overwrite").partitionBy("Y").saveAsTable("dataframes")

In [None]:
# Read the per-class DataFrames (labels 1 to 9) from the partition directories
# written by the previous cell. Reading the files inside 'Y=<label>' directly
# means the resulting frames contain only the feature columns.
df1, df2, df3, df4, df5, df6, df7, df8, df9 = [
    spark.read.parquet("spark-warehouse/dataframes/Y={}/*.parquet".format(label))
    for label in range(1, 10)
]

# Class priors P(y_k) = (rows in class k) / (rows in the training set).
# The training-set size is counted once instead of once per class, because every
# count() on df_train re-runs the whole load-and-join pipeline.
train_row_count = df_train.count()
pc1, pc2, pc3, pc4, pc5, pc6, pc7, pc8, pc9 = [
    class_frame.count() / train_row_count
    for class_frame in (df1, df2, df3, df4, df5, df6, df7, df8, df9)
]

In [None]:
# Build one list of per-word probabilities for each class (class1 for class 1, ...).
# Each list has one entry per column of df1 (the "vocabulary": 00, 01, 02, ..., ??),
# in column order.
vocabulary = df1.schema.names


def _word_totals(frame):
    """Return the sum of every vocabulary column of `frame`, in vocabulary order.

    All columns are aggregated in a single Spark job; F.sum is the Spark SQL
    aggregate (the Python builtin sum cannot take a column name).
    """
    return list(frame.agg(*[F.sum(word) for word in vocabulary]).collect()[0])


def _class_word_probabilities(class_frame):
    """Return count(word in class) / count(word in all documents) for every word."""
    return [count / total for count, total in zip(_word_totals(class_frame), overall_totals)]


# Total count of each word over all training documents.
overall_totals = _word_totals(df_train)

# NOTE(review): each word count is normalised by the word's total over all classes,
# as in the original notebook. Textbook multinomial Naive Bayes normalises by the
# class's total word count with Laplace smoothing - confirm which is intended.
class1 = _class_word_probabilities(df1)
class2 = _class_word_probabilities(df2)
class3 = _class_word_probabilities(df3)
class4 = _class_word_probabilities(df4)
class5 = _class_word_probabilities(df5)
class6 = _class_word_probabilities(df6)
class7 = _class_word_probabilities(df7)
class8 = _class_word_probabilities(df8)
class9 = _class_word_probabilities(df9)

# This runs 10 Spark jobs (one per DataFrame) instead of 10 jobs per word.

In [None]:
# Read the test feature CSV into a DataFrame.
df_test_X = spark.read.options(header = "true", inferschema = "true").csv(TEST_PATH)

# Drop every column whose name contains 'rel' (the relative values), plus the
# index column '_c0', the 'hash' column and 'total_count'.
columns = [name for name in df_test_X.schema.names if 'rel' in name]
df_test_X = df_test_X.drop(*columns,'_c0','hash','total_count')

# Per-class word probabilities and priors, indexed so that position k-1 is class k.
class_word_probs = [class1, class2, class3, class4, class5, class6, class7, class8, class9]
class_priors = [pc1, pc2, pc3, pc4, pc5, pc6, pc7, pc8, pc9]


def _log_posterior(word_counts, word_probs, prior):
    """Return log(prior) + sum(count * log(p_word)) for one document and one class.

    Working in log space avoids the underflow that multiplying hundreds of
    probabilities raised to large counts would cause. Words with a zero (or
    missing) count contribute nothing, matching p ** 0 == 1. A word that occurs
    in the document but has probability 0 for the class gives -inf.
    """
    if prior <= 0:
        return float("-inf")
    score = math.log(prior)
    for count, word_prob in zip(word_counts, word_probs):
        if not count:
            continue
        if word_prob <= 0:
            return float("-inf")
        score += count * math.log(word_prob)
    return score


# Collect the test rows once; calling collect() per cell re-reads the whole file each time.
# Row values are matched to the word probabilities by position, so the test columns
# are assumed to be in the same order as the training columns.
test_rows = df_test_X.collect()

# Predicted label (1..9) for every test row: the class with the highest score.
# Ties go to the lowest class label.
labels_predicted = []
for word_counts in test_rows:
    best_label, best_score = None, None
    for label, (word_probs, prior) in enumerate(zip(class_word_probs, class_priors), start=1):
        score = _log_posterior(word_counts, word_probs, prior)
        if best_score is None or score > best_score:
            best_label, best_score = label, score
    labels_predicted.append(best_label)

# Write one predicted label per line, with no trailing newline after the last one.
with open("files.txt", "w") as output:
    output.write("\n".join(str(int(label)) for label in labels_predicted))