In [1]:
!pip install numpy
!pip install matplotlib
!pip install scikit-learn
!wget http://setup.johnsnowlabs.com/colab.sh -O - | bash

--2021-04-11 21:23:24--  http://setup.johnsnowlabs.com/colab.sh
Resolving setup.johnsnowlabs.com (setup.johnsnowlabs.com)... 51.158.130.26
Connecting to setup.johnsnowlabs.com (setup.johnsnowlabs.com)|51.158.130.26|:80... connected.
HTTP request sent, awaiting response... 302 Moved Temporarily
Location: https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh [following]
--2021-04-11 21:23:24--  https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1593 (1.6K) [text/plain]
Saving to: ‘STDOUT’


2021-04-11 21:23:24 (30.2 MB/s) - written to stdout [1593/1593]

setup Colab for PySpark 3.1.1 and Spark NLP 3.0.1
[K     |████████████████████████████████|

In [2]:
from pyspark.rdd import RDD
from pyspark.sql import Row
from pyspark.sql import DataFrame
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit
from pyspark.sql.functions import desc
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark import SparkContext as sc
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import udf,col
import os
# tools
import re
import math
import json
import requests
import itertools
import numpy as np
import pandas as pd
import time
from datetime import datetime, timedelta
import string

In [3]:
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import StopWordsRemover
from pyspark.ml.evaluation import ClusteringEvaluator

In [4]:
def init_spark():
    """Return a SparkSession for this notebook.

    Uses ``getOrCreate`` on a builder configured with the application name
    "Python Spark Naive Bayes".
    """
    session_builder = SparkSession.builder.appName("Python Spark Naive Bayes")
    return session_builder.getOrCreate()


# Session handle used by the later cells (e.g. for reading the CSV).
spark = init_spark()

In [6]:
'''
Read Lemma data
'''
from pyspark.sql.types import ArrayType
from pyspark.sql.types import StringType
from pyspark.sql.types import IntegerType

# Raw CSV; the first line is used as the header. No schema is supplied, so the
# values are converted explicitly below.
data = spark.read.csv("lemma-100days-wsbdata.csv", header=True)

# UDFs that turn the pipe-delimited lemma string into a string array and the
# label into an integer.
function_array = udf(lambda r: r.split("|"), ArrayType(StringType()))
function_toNumerical = udf(lambda r: int(r), IntegerType())

text_lemmas = (
    data
    # Rows with a missing text or label would make the UDFs above raise
    # (None has no .split, int(None) is a TypeError) once the column is
    # materialised, so they are dropped before conversion.
    .dropna(subset=['text', 'label'])
    .withColumn('finished_lemmas', function_array('text'))
    .drop('text')
    .withColumn('label', function_toNumerical('label'))
)
print(text_lemmas.count())
text_lemmas.show()

1144
+------+-----+--------------------+
|    id|label|     finished_lemmas|
+------+-----+--------------------+
|ks1tzw|    0|[all, right, all,...|
|ksjhuu|    0|[bldr, $, pt, pos...|
|kt79b2|    0|[we, might, be, h...|
|kt9enf|    0|[bear, feast, on,...|
|ktbu75|    0|[so, today, i, go...|
|ktfori|    0|[what, a, week, i...|
|kup2e2|    1|[start, to, feel,...|
|kurzyz|    0|[i, honestly, don...|
|kusng4|    0|[soi, recently, o...|
|kuku2g|    1|[nio, be, disrupt...|
|kutslm|    1|[listen, up, you,...|
|kv7k8k|    0|[have, anyone, no...|
|kvarzs|    0|[now, that, the, ...|
|kvcard|    0|[we, all, know, t...|
|kvcmhk|    1|[it, do, not, dil...|
|kva2kt|    0|[we, know, you, l...|
|kvacad|    1|[so, i, hear, you...|
|kvcimd|    1|[look, at, hour, ...|
|kvtk4b|    1|[giga, berain, co...|
|kw2vwm|    1|[see, the, end, o...|
+------+-----+--------------------+
only showing top 20 rows



In [8]:
'''
Get the Corpus.
Removing stop words from the text lemmas. 
'''
# Filter stop words out of the lemma arrays; the cleaned tokens land in "text".
# No stop-word list is passed, so the transformer's default list is used.
remover = StopWordsRemover(
    inputCol="finished_lemmas",
    outputCol="text",
)
filtered_df = remover.transform(text_lemmas)
filtered_df.show()

+------+-----+--------------------+--------------------+
|    id|label|     finished_lemmas|                text|
+------+-----+--------------------+--------------------+
|ks1tzw|    0|[all, right, all,...|[right, artist, f...|
|ksjhuu|    0|[bldr, $, pt, pos...|[bldr, $, pt, pos...|
|kt79b2|    0|[we, might, be, h...|[might, hear, fir...|
|kt9enf|    0|[bear, feast, on,...|[bear, feast, low...|
|ktbu75|    0|[so, today, i, go...|[today, go, games...|
|ktfori|    0|[what, a, week, i...|[week, steel, big...|
|kup2e2|    1|[start, to, feel,...|[start, feel, rea...|
|kurzyz|    0|[i, honestly, don...|[honestly, dont, ...|
|kusng4|    0|[soi, recently, o...|[soi, recently, o...|
|kuku2g|    1|[nio, be, disrupt...|[nio, disrupt, te...|
|kutslm|    1|[listen, up, you,...|[listen, degenera...|
|kv7k8k|    0|[have, anyone, no...|[anyone, notice, ...|
|kvarzs|    0|[now, that, the, ...|[boy, rc, get, po...|
|kvcard|    0|[we, all, know, t...|[know, 🌈🐻s, cal...|
|kvcmhk|    1|[it, do, not, dil..

In [29]:
'''
HashingTF: Create Document-Term Matrix
'''
from pyspark.ml.feature import HashingTF, IDF, Tokenizer

# Hash every token list in "text" into a term-frequency vector with 50 buckets.
hashingTF = HashingTF(
    numFeatures=50,
    inputCol="text",
    outputCol="rawFeatures",
)
featurizedData = hashingTF.transform(filtered_df)

# truncate=False prints the complete column values instead of shortened ones.
featurizedData.show(truncate=False)

+------+-----+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [31]:
'''
TF-IDF, followed from HashingTF.
'''
# Rescale the hashed term frequencies by inverse document frequency.
# NOTE(review): the IDF model is fit on every row of featurizedData, i.e. before
# the train/test split that happens later in NAIVEBAYES_HASH — confirm this is intended.
idf = IDF(
    inputCol="rawFeatures",
    outputCol="features",
)
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)

In [32]:
# Keep the identifier, the label, the TF-IDF vector and the cleaned tokens.
selectedData = rescaledData.select(["id", "label", "features", "text"])
selectedData.show(truncate=False)

+------+-----+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [7]:
'''
Separate data into training/test used for Naive-Bayes
'''
# NOTE: every statement below is commented out, so this cell only evaluates the
# string above (which is why the string is echoed as the cell output). The same
# per-label 70/30 split is performed inside NAIVEBAYES_HASH.
# training_zero, test_zero = selectedData.where(selectedData.label == 0).randomSplit([0.7, 0.3])
# training_one, test_one = selectedData.where(selectedData.label == 1).randomSplit([0.7, 0.3])

# training = training_zero.union(training_one)
# test = test_zero.union(test_one)
# training.show()
# # should be 70% of total in training, 30% in test
# NOTE(review): the last two prints are labelled ">6%" / "<6%" but print the
# training and test row counts.
# print("Total data count:", selectedData.count())
# print("Total count of >6%", training.count())
# print("Total count of <6%", test.count())

'\nSeparate data into training/test used for Naive-Bayes\n'

In [25]:
'''
Naive-Bayes following from TF-IDF
'''
def NAIVEBAYES_HASH(smooth=0, model_type="multinomial", seed=None):
  """Train Naive Bayes on a fresh per-label 70/30 split and return test accuracy.

  Rows of the notebook-level ``selectedData`` frame are split separately for
  ``label == 0`` and ``label == 1`` with weights 0.7/0.3, and the per-label
  pieces are unioned into one training frame and one test frame.

  Args:
    smooth: Value passed to ``NaiveBayes(smoothing=...)``.
    model_type: Value passed to ``NaiveBayes(modelType=...)``; must be a
      string such as "multinomial" or "complement". The previous default of
      ``0`` is not a valid model type, so a call without arguments failed.
    seed: Optional seed forwarded to both ``randomSplit`` calls. ``None``
      (the default) keeps the original behaviour of an unseeded split that
      differs from call to call.

  Returns:
    The "accuracy" metric computed by ``MulticlassClassificationEvaluator``
    on the test frame's ``label`` and ``prediction`` columns.
  """
  split_weights = [0.7, 0.3]

  # Split each label separately so both labels appear in training and test.
  class_zero = selectedData.where(selectedData.label == 0)
  class_one = selectedData.where(selectedData.label == 1)
  training_zero, test_zero = class_zero.randomSplit(split_weights, seed=seed)
  training_one, test_one = class_one.randomSplit(split_weights, seed=seed)

  training = training_zero.union(training_one)
  test = test_zero.union(test_one)

  # Fit the classifier with the requested smoothing and model type.
  nb = NaiveBayes(smoothing=smooth, modelType=model_type)
  model_NB = nb.fit(training)

  # transform() appends a "prediction" column to the test rows.
  predictions = model_NB.transform(test)

  # Accuracy compares the "label" column with the "prediction" column.
  evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
  accuracy = evaluator.evaluate(predictions)
  return accuracy

In [26]:
# `random` is needed for random.uniform below; it is not imported anywhere else
# in the notebook, so without this line the cell fails on a fresh kernel.
import random
import statistics

extract_method = "HashingTF"
iter_each = 10    # model fits averaged per smoothing value
iter_total = 50   # smoothing values drawn per model type
m_types = ["complement", "multinomial"]
means = []        # (mean accuracy, smoothing, model type, extract method) tuples

for model_type in m_types:
  for k in range(iter_total):
    accuracies = []
    # Draw one smoothing value uniformly from [0.01, 0.8] for this round.
    smoothing = random.uniform(0.01, 0.8)
    # Each call re-splits the data, so the mean averages over iter_each splits.
    for i in range(iter_each):
      accuracies.append(NAIVEBAYES_HASH(smoothing, model_type))
    mean = statistics.mean(accuracies)
    print("=> Mean:", mean, "- Smoothing:", smoothing, "- Model:", model_type)
    means.append((mean, smoothing, model_type, extract_method))

=> Mean: 0.5458099883919728 - Smoothing: 0.030337076913773492 - Model: complement
=> Mean: 0.5662177929033682 - Smoothing: 0.06671180286798847 - Model: complement
=> Mean: 0.5584997403837564 - Smoothing: 0.3397107393650008 - Model: complement
=> Mean: 0.5590198938650541 - Smoothing: 0.43960123970917936 - Model: complement
=> Mean: 0.5565576576142158 - Smoothing: 0.024039861116237585 - Model: complement
=> Mean: 0.5651610230949394 - Smoothing: 0.6918848556842957 - Model: complement
=> Mean: 0.5518353991318374 - Smoothing: 0.47353362858000636 - Model: complement
=> Mean: 0.5537485327861409 - Smoothing: 0.6178817700649325 - Model: complement
=> Mean: 0.5384017174763671 - Smoothing: 0.7297371103938518 - Model: complement
=> Mean: 0.5659402808389002 - Smoothing: 0.7853476693630169 - Model: complement
=> Mean: 0.5516093308240206 - Smoothing: 0.1927701589585705 - Model: complement
=> Mean: 0.5565603727967727 - Smoothing: 0.16218444378588626 - Model: complement
=> Mean: 0.545044329728806 - Smo

In [27]:
from pyspark.sql.types import FloatType
# Tabulate the collected (mean, smoothing, model type, extract method) tuples
# and write them to a CSV file in the working directory.
acc_df = pd.DataFrame(
    data=means,
    columns=['mean', 'smoothing', 'model_type', 'extract_method'],
)
acc_df.to_csv("means_hash.csv")

In [28]:
from google.colab import files

# Hand the results CSV written by the previous cell to Colab's download helper.
files.download('means_hash.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>