# Download the necessary packages

In [1]:
import os
# > Old Package Versions
# # Install java
# ! apt-get update -qq
# ! apt-get install -y openjdk-8-jdk-headless -qq > /dev/null
# os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
# os.environ["PATH"] = os.environ["JAVA_HOME"] + "/bin:" + os.environ["PATH"]
# ! java -version
# # Install pyspark
# ! pip install --ignore-installed pyspark==2.4.4
# # Install Spark NLP
# ! pip install --ignore-installed spark-nlp==2.5.1

# > New Package Versions
! pip install -q pyspark==3.1.2 spark-nlp


[K     |████████████████████████████████| 212.4 MB 57 kB/s 
[K     |████████████████████████████████| 122 kB 53.7 MB/s 
[K     |████████████████████████████████| 198 kB 56.6 MB/s 
[?25h  Building wheel for pyspark (setup.py) ... [?25l[?25hdone


# Train the model
## Import the packages

In [2]:
import sparknlp
# Start the Spark session with GPU support enabled for training.
# Alternative for Spark 2.3: sparknlp.start(spark23=True)
spark = sparknlp.start(gpu=True)
from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.ml import Pipeline
import pandas as pd

# Report the library versions in use so the run can be reproduced.
print(f"Spark NLP version {sparknlp.version()}")
print(f"Apache Spark version: {spark.version}")

# Display the session object as the cell output.
spark

Spark NLP version 3.3.1
Apache Spark version: 3.1.2


## Get the training dataset
### Clean the dataset
Convert the raw Kaggle dataset from JSON into a CSV file with `category` and `description` columns.

In [3]:
import csv
import json

# Convert the raw JSON training set (a list of {"tone": ..., "sentence": ...}
# records) into a two-column CSV file that Spark can load.

# Load the input first, inside a context manager, so the file handle is closed
# and no half-written output file is left behind if the input is missing/invalid.
with open('trainTone.txt', encoding='utf-8') as reader:
    records = json.load(reader)

with open('trainTone.clean.txt', 'wt', encoding='utf-8', newline='') as writer:
    # Let the csv module do the quoting instead of hand-building each line, so a
    # sentence containing a double quote can no longer break the row.
    # Quotes are escaped with a backslash (not doubled) because backslash is the
    # default escape character of Spark's CSV reader, which consumes this file.
    csv_writer = csv.writer(
        writer,
        quoting=csv.QUOTE_ALL,
        doublequote=False,
        escapechar='\\',
        lineterminator='\n',
    )
    csv_writer.writerow(['category', 'description'])
    for record in records:
        csv_writer.writerow([record['tone'], record['sentence']])

### Load the training dataset

In [4]:
# Read the cleaned CSV, using its first line as the column names.
trainDataset = spark.read.csv("trainTone.clean.txt", header=True)

# Preview the first five rows, cutting long text off at 50 characters.
trainDataset.show(5, truncate=50)

+--------+--------------------------------------------------+
|category|                                       description|
+--------+--------------------------------------------------+
| sadness|                           i didnt feel humiliated|
| sadness|i can go from feeling so hopeless to so damned ...|
|   anger|  im grabbing a minute to post i feel greedy wrong|
|    love|i am ever feeling nostalgic about the fireplace...|
|   anger|                              i am feeling grouchy|
+--------+--------------------------------------------------+
only showing top 5 rows



### Check the category distribution

In [5]:
from pyspark.sql.functions import col

# Number of training rows per category, most frequent category first.
(
    trainDataset
    .groupBy("category")
    .count()
    .orderBy(col("count").desc())
    .show()
)

+--------+-----+
|category|count|
+--------+-----+
|     joy| 5362|
| sadness| 4666|
|   anger| 2159|
|    fear| 1937|
|    love| 1304|
|surprise|  572|
+--------+-----+



## Assemble the model pipeline

In [6]:
# actual content is inside description column
# Stage 1: wrap the raw text from the "description" column as a document annotation.
document = (
    DocumentAssembler()
    .setInputCol("description")
    .setOutputCol("document")
)

# Stage 2: embed each whole document with the pretrained Universal Sentence Encoder.
# (A sentence detector could be added before this stage to train on, and predict
# for, individual sentences instead.)
use = (
    UniversalSentenceEncoder.pretrained("tfhub_use_lg", "en")
    .setInputCols("document")
    .setOutputCol("sentence_embeddings")
)

# Stage 3: trainable classifier; the target labels are read from the "category" column.
classifierdl = (
    ClassifierDLApproach()
    .setInputCols(["sentence_embeddings"])
    .setOutputCol("class")
    .setLabelColumn("category")
    .setMaxEpochs(30)
    .setBatchSize(8)
    .setEnableOutputLogs(True)
)

# Chain the three stages into one Spark ML pipeline.
use_clf_pipeline = Pipeline(stages=[document, use, classifierdl])

tfhub_use_lg download started this may take some time.
Approximate size to download 753.3 MB
[OK!]


## Run the model

In [7]:
%%time
# Fit every stage of the pipeline on the training set; the %%time cell magic
# reports how long the fit takes.
clf_pipelineModel = use_clf_pipeline.fit(trainDataset)

CPU times: user 5.04 s, sys: 492 ms, total: 5.53 s
Wall time: 16min 41s


## Check the logs

In [8]:
import os

# Directory the training logs are read from.
log_dir = "/root/annotator_logs"

# os.listdir() returns entries in arbitrary order, so taking element [0] could
# print the log of an older training run. Pick the most recently modified file.
log_paths = [os.path.join(log_dir, name) for name in os.listdir(log_dir)]
log_paths = [path for path in log_paths if os.path.isfile(path)]
if not log_paths:
    raise FileNotFoundError(f"No training log files found in {log_dir}")

latest_log_path = max(log_paths, key=os.path.getmtime)
log_file_name = os.path.basename(latest_log_path)

with open(latest_log_path, "r") as log_file:
    print(log_file.read())

Training started - epochs: 30 - learning_rate: 0.005 - batch_size: 8 - training_examples: 16000 - classes: 6
Epoch 0/30 - 16.63s - loss: 3014.3157 - acc: 0.524875 - batches: 2000
Epoch 1/30 - 15.17s - loss: 2906.148 - acc: 0.5685625 - batches: 2000
Epoch 2/30 - 15.16s - loss: 2883.1816 - acc: 0.5834375 - batches: 2000
Epoch 3/30 - 14.90s - loss: 2825.0278 - acc: 0.62 - batches: 2000
Epoch 4/30 - 15.07s - loss: 2776.0994 - acc: 0.64575 - batches: 2000
Epoch 5/30 - 15.09s - loss: 2750.424 - acc: 0.661 - batches: 2000
Epoch 6/30 - 15.03s - loss: 2731.1006 - acc: 0.6709375 - batches: 2000
Epoch 7/30 - 16.46s - loss: 2716.7332 - acc: 0.6780625 - batches: 2000
Epoch 8/30 - 15.09s - loss: 2705.637 - acc: 0.68425 - batches: 2000
Epoch 9/30 - 15.40s - loss: 2693.9539 - acc: 0.689375 - batches: 2000
Epoch 10/30 - 15.09s - loss: 2682.9326 - acc: 0.6938125 - batches: 2000
Epoch 11/30 - 15.30s - loss: 2672.5469 - acc: 0.697375 - batches: 2000
Epoch 12/30 - 15.48s - loss: 2663.7043 - acc: 0.7013125 

## Examine the predictions of the training

In [9]:
# Apply the fitted pipeline to the training rows it was fitted on.
preds = clf_pipelineModel.transform(trainDataset)

# Show the true category next to the predicted class for the first five rows.
(
    preds
    .select("category", "description", "class.result")
    .show(5, truncate=50)
)


+--------+--------------------------------------------------+---------+
|category|                                       description|   result|
+--------+--------------------------------------------------+---------+
| sadness|                           i didnt feel humiliated|[sadness]|
| sadness|i can go from feeling so hopeless to so damned ...|[sadness]|
|   anger|  im grabbing a minute to post i feel greedy wrong|  [anger]|
|    love|i am ever feeling nostalgic about the fireplace...|    [joy]|
|   anger|                              i am feeling grouchy|  [anger]|
+--------+--------------------------------------------------+---------+
only showing top 5 rows



## Get model training metrics

In [None]:
from sklearn.metrics import classification_report

# NOTE: `preds` was produced from the training data, so these numbers describe
# fit on the training set, not performance on unseen data.
preds_df = preds.select('category','description',"class.result").toPandas()

# "class.result" holds a list per row; keep its first element as the predicted label.
preds_df['result'] = preds_df['result'].apply(lambda x : x[0])

# classification_report expects the ground truth first and the predictions second.
# Passing them the other way round swaps precision and recall for every class,
# so the arguments are given by keyword here.
print(classification_report(y_true=preds_df['category'], y_pred=preds_df['result']))

## Predict an example

In [11]:
from sparknlp.base import LightPipeline

# Wrap the fitted pipeline in a LightPipeline so a plain Python string can be annotated.
light_model = LightPipeline(clf_pipelineModel)

# Classify one example sentence; the annotation dict is the cell output.
text = "i am not happy"
light_model.annotate(text)

{'class': ['sadness'],
 'document': ['i am not happy'],
 'sentence_embeddings': ['i am not happy']}

## Save the pipeline
### Do the actual saving

In [None]:
clf_pipelineModel.write().save('ToneItPipeline')
!tar czvf ToneItPipeline.tar.gz ToneItPipeline

### Check on the size

In [16]:
# Disk usage of the saved pipeline directory
# (-s: one summary line, -c: grand total, -h: human-readable units).
!du -sch ToneItPipeline

872M	ToneItPipeline
872M	total


In [44]:
# Disk usage of the compressed archive, for comparison with the uncompressed directory.
!du -sch ToneItPipeline.tar.gz

775M	ToneItPipeline.tar.gz
775M	total


## Load the pipeline

In [38]:
import sparknlp
from pyspark.ml import PipelineModel
from sparknlp.base import LightPipeline

# Start a Spark NLP session before loading the saved pipeline.
sparknlp.start()

# Load the pipeline saved under 'ToneItPipeline' and wrap it for string input.
ToneItPipeline = LightPipeline(PipelineModel.load("ToneItPipeline"))

# Classify one example sentence; the annotation dict is the cell output.
ToneItPipeline.annotate("we fell to the floor our faces pale")

{'class': ['sadness'],
 'document': ['we fell to the floor our faces pale'],
 'sentence_embeddings': ['we fell to the floor our faces pale']}

universalsentenceencoder --> 0.69<br>
tokenizer+bertsmall+sentence --> 0.29!!!ALLSAD<br>
bertsmallsent --> 0.35!!!ALLJOY<br>
bertusecmlmenbase --> 0.35!!!ALLJOY