#Prepare for Model Creation
##Download packages
There are two versions of SparkNLP that we could utilize (both work) however we vyed for the newer 3.1.2 version of Apache Spark which works with the newest version of Spark NLP (version 3.3.1).

In [1]:
import os
# > Old Package Versions
# # Install java
# ! apt-get update -qq
# ! apt-get install -y openjdk-8-jdk-headless -qq > /dev/null
# os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
# os.environ["PATH"] = os.environ["JAVA_HOME"] + "/bin:" + os.environ["PATH"]
# ! java -version
# # Install pyspark
# ! pip install --ignore-installed pyspark==2.4.4
# # Install Spark NLP
# ! pip install --ignore-installed spark-nlp==2.5.1

# > New Package Versions
! pip install -q pyspark==3.1.2 spark-nlp


[K     |████████████████████████████████| 212.4 MB 62 kB/s 
[K     |████████████████████████████████| 122 kB 47.0 MB/s 
[K     |████████████████████████████████| 198 kB 40.9 MB/s 
[?25h  Building wheel for pyspark (setup.py) ... [?25l[?25hdone


## Import the packages
Now that we've downloaded the necessary packages we import them and instantiate a spark session. We set the `gpu` parameter to `True` even though this CoLab session doesn't have GPU equipped as we would prefer to use GPU when possible. We then print out the package versions to ensure we have the versions we believe we have installed.

In [2]:
import sparknlp
spark = sparknlp.start(gpu = True) # for GPU training
from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.ml import Pipeline
import pandas as pd
import numpy as np
import subprocess
import json

print("Spark NLP version", sparknlp.version())
print("Apache Spark version:", spark.version)

spark

Spark NLP version 3.3.1
Apache Spark version: 3.1.2


##Get the dataset
###Data cleaning
This dataset is downloaded from Kaggle, specifically from [this site](www.google.com). We then import it as a json file (some preprocessing has been done via Java) and clean it up a little more to the Spark NLP format. We'll also split the dataset up here by doing a 25% test and 75% train ratio.

In [3]:
np.random.seed(0)  # for consistency
# open the training and the testing sets
with open('trainTone.clean.txt', 'wt') as writer_train:
  with open('testTone.clean.txt', 'wt') as writer_test:
    # write the headers for both
    writer_train.writelines('category,description\n')
    writer_test.writelines('category,description\n')
    # loop through the lines of the full unnormalized trainTone dataset
    lines = json.load(open('trainTone.txt'))
    for line in lines:
      # get the relevant data and construct the line
      tone,sentence = line['tone'].title(),line['sentence']
      output_line = f'{tone},"{sentence}"\n'
      write_to_train = np.random.uniform(0,1) > 0.25
      if(write_to_train):
        writer_train.writelines(output_line)
      else:
        writer_test.writelines(output_line)

###Read in the datasets
We write a function to get datasets upon a certain number of total objects (training + testing datasets).

In [10]:
def get_datasets(n_total):
  # get the number of rows based on the 25% and 75% split
  train_target_nrows, test_target_nrows = n_total * 0.75, n_total * 0.25
  # rewrite the datasets to accomodate the new total
  # > define a helper function
  def write_tmp_dataset(input_fname, output_fname, target_nrows):
    np.random.seed(0)  # for consistency
    total_nrows = int(subprocess.check_output(f'cat {input_fname} | wc -l', shell=True).decode().strip())
    idxs = np.linspace(start=0, stop=total_nrows-1, num=total_nrows, dtype=int)
    if(target_nrows > len(idxs)):
      target_nrows = len(idxs)
      print(f'switching from {target_nrows} to {len(idxs)}')
    idxs = np.random.choice(idxs, size=int(target_nrows), replace=False)
    with open(output_fname, 'wt') as writer:
      writer.writelines('category,description\n')
      with open(input_fname, 'rt') as reader:
        for idx, line in enumerate(reader.readlines()):
          if(idx in idxs):
            writer.writelines(line)
  # > convert the clean files to subsets
  write_tmp_dataset('trainTone.clean.txt', 'trainTone.clean.subset.txt', train_target_nrows)
  write_tmp_dataset('testTone.clean.txt', 'testTone.clean.subset.txt', test_target_nrows)
  # load the datasets to the session
  trainDataset = spark.read.option('header', True).csv('trainTone.clean.subset.txt')
  testDataset = spark.read.option('header', True).csv('testTone.clean.subset.txt')
  return trainDataset, testDataset


##Run the model
###Build the pipeline
We create a function to make the pipeline.

In [8]:
def build_pipeline(max_epochs, batch_size, use=None):
  # actual content is inside description column
  document = DocumentAssembler()\
        .setInputCol("description")\
        .setOutputCol("document")

  if(use is None):
    # we can also use sentece detector here if we want to train on and get predictions for each sentence
    use = UniversalSentenceEncoder.pretrained("tfhub_use_lg", "en") \
          .setInputCols("document") \
          .setOutputCol("sentence_embeddings")

  # the classes/labels/categories are in category column
  classifierdl = ClassifierDLApproach()\
        .setInputCols(["sentence_embeddings"])\
        .setOutputCol("class")\
        .setLabelColumn("category")\
        .setMaxEpochs(max_epochs)\
        .setBatchSize(batch_size)\
        .setEnableOutputLogs(True)\
        .setRandomSeed(0)  # for consistency

  use_clf_pipeline = Pipeline(
      stages = [
          document,
          use,
          classifierdl
      ])
  
  return use_clf_pipeline

###Train the model
We set different spaces and loop through each one to see which one performs the best.

In [11]:
import time
from tqdm import tqdm
from sklearn.metrics import accuracy_score, balanced_accuracy_score
cols = ['n_total','max_epochs','batch_size','train_time','inference_time','acc','bacc']
model_df = pd.DataFrame(columns=cols)
# we can also use sentece detector here if we want to train on and get predictions for each sentence
use = UniversalSentenceEncoder.pretrained("tfhub_use_lg", "en") \
      .setInputCols("document") \
      .setOutputCol("sentence_embeddings")
max_epochs = 3
for n_total in tqdm([1000, 2000, 4000, 8000, 16000]):
  for batch_size in [8, 16, 32, 64, 128, 256]:
    valid = n_total / batch_size < 250
    if(valid):
      # load datasets
      trainDataset, testDataset = get_datasets(n_total)
      use_clf_pipeline = build_pipeline(max_epochs, batch_size, use=use)
      # train model
      start_time = time.time()
      clf_pipelineModel = use_clf_pipeline.fit(trainDataset)
      train_time = time.time() - start_time
      # test model
      start_time = time.time()
      preds = clf_pipelineModel.transform(testDataset)
      inference_time = (time.time() - start_time) / (n_total * 0.25)
      preds = preds.select('category','class.result').toPandas()
      preds['result'] = preds['result'].apply(lambda x : x[0])
      # evaluate model
      acc = accuracy_score(preds['category'], preds['result'])
      bacc = balanced_accuracy_score(preds['category'], preds['result'])
      model_df.loc[model_df.shape[0]] = n_total, max_epochs, batch_size, train_time, inference_time, acc, bacc

tfhub_use_lg download started this may take some time.
Approximate size to download 753.3 MB
[OK!]


  0%|          | 0/1 [00:00<?, ?it/s]

switching from 11900 to 11900
switching from 11900 to 11900


100%|██████████| 1/1 [18:21<00:00, 1101.70s/it]


##Evaluate model
###Raw table scan
We just look through the table and order it to see if there are any interesting patterns that pop up.

In [12]:
model_df.sort_values('train_time', ascending=True)  # fast trainers

Unnamed: 0,n_total,max_epochs,batch_size,train_time,inference_time,acc,bacc
5,1000.0,3.0,256.0,31.117473,0.000898,0.416,0.213992
3,1000.0,3.0,64.0,33.363429,0.000694,0.5,0.257202
2,1000.0,3.0,32.0,33.735987,0.000969,0.496,0.255144
4,1000.0,3.0,128.0,33.816514,0.000587,0.512,0.263374
1,1000.0,3.0,16.0,35.216418,0.000808,0.5,0.257202
0,1000.0,3.0,8.0,43.424903,0.001433,0.324,0.166667
10,2000.0,3.0,256.0,55.546848,0.000401,0.492,0.262973
7,2000.0,3.0,32.0,57.444373,0.00034,0.5,0.267001
8,2000.0,3.0,64.0,58.067806,0.000389,0.492,0.262617
6,2000.0,3.0,16.0,59.132634,0.000372,0.49,0.261507


In [13]:
model_df.sort_values('inference_time', ascending=True)  # fast predicters

Unnamed: 0,n_total,max_epochs,batch_size,train_time,inference_time,acc,bacc
18,16000.0,3.0,128.0,393.490487,1.9e-05,0.52725,0.238969
19,16000.0,3.0,256.0,401.093894,3.2e-05,0.5265,0.238748
15,8000.0,3.0,64.0,209.245658,4.6e-05,0.5225,0.286819
16,8000.0,3.0,128.0,211.568278,5.8e-05,0.5165,0.276887
17,8000.0,3.0,256.0,200.113195,6.7e-05,0.3235,0.166667
14,4000.0,3.0,256.0,109.706501,0.000121,0.487,0.268592
13,4000.0,3.0,128.0,108.874894,0.000164,0.312,0.166667
12,4000.0,3.0,64.0,108.329389,0.000179,0.489,0.270508
11,4000.0,3.0,32.0,108.09432,0.000211,0.498,0.274969
9,2000.0,3.0,128.0,60.579353,0.000289,0.496,0.265083


In [14]:
model_df.sort_values('acc', ascending=False)  # accuracte predicters

Unnamed: 0,n_total,max_epochs,batch_size,train_time,inference_time,acc,bacc
18,16000.0,3.0,128.0,393.490487,1.9e-05,0.52725,0.238969
19,16000.0,3.0,256.0,401.093894,3.2e-05,0.5265,0.238748
15,8000.0,3.0,64.0,209.245658,4.6e-05,0.5225,0.286819
16,8000.0,3.0,128.0,211.568278,5.8e-05,0.5165,0.276887
4,1000.0,3.0,128.0,33.816514,0.000587,0.512,0.263374
3,1000.0,3.0,64.0,33.363429,0.000694,0.5,0.257202
7,2000.0,3.0,32.0,57.444373,0.00034,0.5,0.267001
1,1000.0,3.0,16.0,35.216418,0.000808,0.5,0.257202
11,4000.0,3.0,32.0,108.09432,0.000211,0.498,0.274969
2,1000.0,3.0,32.0,33.735987,0.000969,0.496,0.255144


In [15]:
model_df.sort_values('bacc', ascending=False)  # accuracte predicters

Unnamed: 0,n_total,max_epochs,batch_size,train_time,inference_time,acc,bacc
15,8000.0,3.0,64.0,209.245658,4.6e-05,0.5225,0.286819
16,8000.0,3.0,128.0,211.568278,5.8e-05,0.5165,0.276887
11,4000.0,3.0,32.0,108.09432,0.000211,0.498,0.274969
12,4000.0,3.0,64.0,108.329389,0.000179,0.489,0.270508
14,4000.0,3.0,256.0,109.706501,0.000121,0.487,0.268592
7,2000.0,3.0,32.0,57.444373,0.00034,0.5,0.267001
9,2000.0,3.0,128.0,60.579353,0.000289,0.496,0.265083
4,1000.0,3.0,128.0,33.816514,0.000587,0.512,0.263374
10,2000.0,3.0,256.0,55.546848,0.000401,0.492,0.262973
8,2000.0,3.0,64.0,58.067806,0.000389,0.492,0.262617
