In [1]:
# install Spark
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://www-us.apache.org/dist/spark/spark-3.0.1/spark-3.0.1-bin-hadoop2.7.tgz
!tar xf spark-3.0.1-bin-hadoop2.7.tgz

In [2]:
!pip install -q findspark


In [3]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.0.1-bin-hadoop2.7"
import findspark
findspark.init()

In [4]:
# parameters for random forest
TRAIN_PATH = "X_small_train.csv"
TEST_PATH = "X_small_test.csv"
LABELS = "y_small_train.txt"
LABELS_TEST = "y_small_test.txt"
APP_NAME = "Naive Bayes Classifier"
SPARK_URL = "local[*]"

In [5]:
from pyspark import SparkContext
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import RandomForest
from time import *
from pyspark.sql import *
from pyspark.sql.functions import *
from operator import add
from functools import reduce
from math import log10


spark = SparkSession.builder.appName(APP_NAME).master(SPARK_URL).getOrCreate()

In [7]:
# read csv features file into dataframe
df_train_X = spark.read.options(header = "true", inferschema = "true").csv(TRAIN_PATH)

# drop columns that are relitive values and the index column and "hash" column
columns = []
for name in df_train_X.schema.names:
  if 'rel' in name:
    columns.append(name)
df_train_X = df_train_X.drop(*columns,'_c0','hash','total_count')

# read txt labels file into dataframe
df_train_Y = spark.read.load(LABELS, format="csv", sep=" ", inferSchema="true", header="false").toDF('Y')


# combine features and labels into one dataframe
# first, create a row index list w
w = Window.orderBy(lit(1))

# add row indexs to dataframes X and Y
df_X=df_train_X.withColumn("rn",row_number().over(w)-1)
df_Y=df_train_Y.withColumn("rn",row_number().over(w)-1)

# join X dataframe and Y dataframe; and drop the row index
df_train = df_X.join(df_Y,["rn"]).drop("rn")

### if one would like to view the dataframe, please uncomment below line.
df_train.show()

+------+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+------+-----+-----+------+-----+-----+-----+-----+-----+-----+-----+-----+-----+----+-----+------+----+----+----+----+----+----+----+----+-----+-----+-----+-----+----+----+----+----+----+----+----+------+----+----+----+----+-----+----+----+----+-----+----+----+----+----+----+----+----+----+----+----+----+-----+----+----+----+----+----+-----+-----+----+----+----+----+----+-----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+-----+------+----+----+----+----+----+------+----+-----+----+----+----+----+----+-----+----+----+----+----+-----+----+----+-----+-----+-----+-----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+-----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+-

In [8]:
# split dataframes by the labels (Y=1, 2, 3, ..., 9)
df_train.write.partitionBy("Y").saveAsTable("dataframes")

In [68]:
# read splitted dataframes for each class (from 1 to 9)
# df1 = spark.read.parquet("spark-warehouse/dataframes/Y=1/*.parquet")
# df2 = spark.read.parquet("spark-warehouse/dataframes/Y=2/*.parquet")
# df3 = spark.read.parquet("spark-warehouse/dataframes/Y=3/*.parquet")
# df4 = spark.read.parquet("spark-warehouse/dataframes/Y=4/*.parquet")
# df5 = spark.read.parquet("spark-warehouse/dataframes/Y=5/*.parquet")
# df6 = spark.read.parquet("spark-warehouse/dataframes/Y=6/*.parquet")
# df7 = spark.read.parquet("spark-warehouse/dataframes/Y=7/*.parquet")
# df8 = spark.read.parquet("spark-warehouse/dataframes/Y=8/*.parquet")
# df9 = spark.read.parquet("spark-warehouse/dataframes/Y=9/*.parquet")

df = {}
for x in range(1, 10):
    df[str(x)] = spark.read.parquet("spark-warehouse/dataframes/Y="
                                + str(x) + "/*.parquet")
    
# calculate P(yk).
# pc1 = df1.count()/df_train.count()
# pc2 = df2.count()/df_train.count()
# pc3 = df3.count()/df_train.count()
# pc4 = df4.count()/df_train.count()
# pc5 = df5.count()/df_train.count()
# pc6 = df6.count()/df_train.count()
# pc7 = df7.count()/df_train.count()
# pc8 = df8.count()/df_train.count()
# pc9 = df9.count()/df_train.count()

pc = {}
for x in range(1, 10):
    pc[str(x)] = df[str(x)].count()/df_train.count()

In [71]:
# creates empty lists for storing conditional probabilities for each class. (e.g. p1 for class 1)
# each list has 257 elements, relating to 257 words in "vocabulary", in the order of: 00,01,02,...,??
# class1 = []
# class2 = []
# class3 = []
# class4 = []
# class5 = []
# class6 = []
# class7 = []
# class8 = []
# class9 = []

# iteration from "00" to "??", total 257 interations
# for word in df1.schema.names:
  # calculate total count of one word among all documents
  # total = df_train.agg(sum(word)).collect()[0][0]

  # append probabilities to each class. count of each word in the documents belong to the specific class, divided by total count of this word among all documents.
  # class1.append(df1.agg(sum(word)).collect()[0][0]/total)
  # class2.append(df2.agg(sum(word)).collect()[0][0]/total)
  # class3.append(df3.agg(sum(word)).collect()[0][0]/total)
  # class4.append(df4.agg(sum(word)).collect()[0][0]/total)
  # class5.append(df5.agg(sum(word)).collect()[0][0]/total)
  # class6.append(df6.agg(sum(word)).collect()[0][0]/total)
  # class7.append(df7.agg(sum(word)).collect()[0][0]/total)
  # class8.append(df8.agg(sum(word)).collect()[0][0]/total)
  # class9.append(df9.agg(sum(word)).collect()[0][0]/total)
  
  # print('iteration(',df1.schema.names.index(word)+1,'of 257 )')
# should run around 8 minutes on Google Colab.

classes = {}
for i in range(1, 10):
  classes[str(i)] = []

names = df["1"].schema.names
# iteration from "00" to "??", total 257 interations
for word in names:
  # calculate total count of one word among all documents
  total = df_train.agg(sum(word)).collect()[0][0]

  # append probabilities to each class. count of each word in the documents belong 
  # to the specific class, divided by total count of this word among all documents.
  for i in range(1, 10):
    classes[str(i)].append(df[str(i)].agg(sum(word)).collect()[0][0]/total)
  
  print('iteration(', names.index(word) + 1,'of 257 )')
# should run around 8 minutes on Google Colab.

iteration( 1 of 257 )
iteration( 2 of 257 )
iteration( 3 of 257 )
iteration( 4 of 257 )
iteration( 5 of 257 )
iteration( 6 of 257 )
iteration( 7 of 257 )
iteration( 8 of 257 )
iteration( 9 of 257 )
iteration( 10 of 257 )
iteration( 11 of 257 )
iteration( 12 of 257 )
iteration( 13 of 257 )
iteration( 14 of 257 )
iteration( 15 of 257 )
iteration( 16 of 257 )
iteration( 17 of 257 )
iteration( 18 of 257 )
iteration( 19 of 257 )
iteration( 20 of 257 )
iteration( 21 of 257 )
iteration( 22 of 257 )
iteration( 23 of 257 )
iteration( 24 of 257 )
iteration( 25 of 257 )
iteration( 26 of 257 )
iteration( 27 of 257 )
iteration( 28 of 257 )
iteration( 29 of 257 )
iteration( 30 of 257 )
iteration( 31 of 257 )
iteration( 32 of 257 )
iteration( 33 of 257 )
iteration( 34 of 257 )
iteration( 35 of 257 )
iteration( 36 of 257 )
iteration( 37 of 257 )
iteration( 38 of 257 )
iteration( 39 of 257 )
iteration( 40 of 257 )
iteration( 41 of 257 )
iteration( 42 of 257 )
iteration( 43 of 257 )
iteration( 44 of 257

In [121]:
# read test features.csv into dataframe.
df_test_X = spark.read.options(header = "true", inferschema = "true").csv(TEST_PATH)

# drop columns that are relitive values and the index column and "hash" column
columns = []
for name in df_test_X.schema.names:
  if 'rel' in name:
    columns.append(name)
df_test_X = df_test_X.drop(*columns,'_c0','hash','total_count')

# df_test_X.show()

In [130]:
# log_class1 = list(map(lambda x: log10(x), class1))
# log_class2 = list(map(lambda x: log10(x), class2))
# log_class3 = list(map(lambda x: log10(x), class3))
# log_class4 = list(map(lambda x: log10(x), class4))
# log_class5 = list(map(lambda x: log10(x), class5))
# log_class6 = list(map(lambda x: log10(x), class6))
# log_class7 = list(map(lambda x: log10(x), class7))
# log_class8 = list(map(lambda x: log10(x), class8))
log_class9 = list(map(lambda x: log10(x), classes))
# for i in classes:
#   print(classes[str(i)])

for k,v in classes.items():
  for j in range(1, 10):
      lc = list(map(lambda x: log10(x), list(v)))


TypeError: ignored

In [35]:
from builtins import max
# predicting test dataset
# empty list to store predicted labels for all virus
labels_predicted = []


# iterations for each coloumn(word) of testing dataset (# of testing data)
for r in range(df_test_X.count()):
# for r in range(5):

  print('iteration:',r,'of',range(df_test_X.count()))

  # tem_list to store a row(iteration) of (count of words) from test dataset
  tem_list = []
  # to store conditional probabilities for each class.
  # e.g. tem_list1 will contains 257 values, indicating 257 conditional probabilities for each word.
  tem_list1 = []
  tem_list2 = []
  tem_list3 = []
  tem_list4 = []
  tem_list5 = []
  tem_list6 = []
  tem_list7 = []
  tem_list8 = []
  tem_list9 = []

  # list of 9 probabilities predicted for every class.
  prob = [0,0,0,0,0,0,0,0,0]

  # iteration of each word(column) in the present row(virus).
  for c in range(len(df1.columns)):
    # stores a row(iteration) of (count of words) from test dataset
    tem_list.append(df_test_X.collect()[r][c])
  
  # to store conditional probabilities for each class.
  tem_list1 = list(map(lambda x,y: x*y, log_class1,tem_list))
  tem_list2 = list(map(lambda x,y: x*y, log_class2,tem_list))
  tem_list3 = list(map(lambda x,y: x*y, log_class3,tem_list))
  tem_list4 = list(map(lambda x,y: x*y, log_class4,tem_list))
  tem_list5 = list(map(lambda x,y: x*y, log_class5,tem_list))
  tem_list6 = list(map(lambda x,y: x*y, log_class6,tem_list))
  tem_list7 = list(map(lambda x,y: x*y, log_class7,tem_list))
  tem_list8 = list(map(lambda x,y: x*y, log_class8,tem_list))
  tem_list9 = list(map(lambda x,y: x*y, log_class9,tem_list))

  # multiply conditional probabilities together, then multiply by frequency of each class in training set. finally to get a probability for each class.
  prob[0] = reduce(lambda x, y: x+y, tem_list1)+log10(pc1)
  prob[1] = reduce(lambda x, y: x+y, tem_list2)+log10(pc2)
  prob[2] = reduce(lambda x, y: x+y, tem_list3)+log10(pc3)
  prob[3] = reduce(lambda x, y: x+y, tem_list4)+log10(pc4)
  prob[4] = reduce(lambda x, y: x+y, tem_list5)+log10(pc5)
  prob[5] = reduce(lambda x, y: x+y, tem_list6)+log10(pc6)
  prob[6] = reduce(lambda x, y: x+y, tem_list7)+log10(pc7)
  prob[7] = reduce(lambda x, y: x+y, tem_list8)+log10(pc8)
  prob[8] = reduce(lambda x, y: x+y, tem_list9)+log10(pc9)

  # get the class with max probability
  maximum = max(prob)

  # append predicted class to a list
  labels_predicted.append(prob.index(maximum)+1)


iteration: 0 of range(0, 169)
iteration: 1 of range(0, 169)
iteration: 2 of range(0, 169)
iteration: 3 of range(0, 169)
iteration: 4 of range(0, 169)
iteration: 5 of range(0, 169)
iteration: 6 of range(0, 169)
iteration: 7 of range(0, 169)
iteration: 8 of range(0, 169)
iteration: 9 of range(0, 169)
iteration: 10 of range(0, 169)
iteration: 11 of range(0, 169)
iteration: 12 of range(0, 169)
iteration: 13 of range(0, 169)
iteration: 14 of range(0, 169)
iteration: 15 of range(0, 169)
iteration: 16 of range(0, 169)
iteration: 17 of range(0, 169)
iteration: 18 of range(0, 169)
iteration: 19 of range(0, 169)
iteration: 20 of range(0, 169)
iteration: 21 of range(0, 169)
iteration: 22 of range(0, 169)
iteration: 23 of range(0, 169)
iteration: 24 of range(0, 169)
iteration: 25 of range(0, 169)
iteration: 26 of range(0, 169)
iteration: 27 of range(0, 169)
iteration: 28 of range(0, 169)
iteration: 29 of range(0, 169)
iteration: 30 of range(0, 169)
iteration: 31 of range(0, 169)
iteration: 32 of r

In [36]:
# write predicted labels into txt file from the list we finished from above line
with open("files.txt", "w") as output:
  for row in range(df_test_X.count()-1):
    output.write(str(int(labels_predicted[row])) + '\n')
  output.write(str(int(labels_predicted[row])))