In [None]:
# Colab-only setup: downloads the John Snow Labs bootstrap script and pipes it
# straight into bash to install the dependencies this notebook needs.
# NOTE(review): the flags presumably pin PySpark (-p 3.4.1) and Spark NLP
# (-s 5.1.2) and select the GPU setup (-g) — confirm against the script's docs.
# NOTE(review): this executes a remotely hosted script without inspection.

!wget https://setup.johnsnowlabs.com/colab.sh -O - | bash /dev/stdin -p 3.4.1 -s 5.1.2 -g

# Preprocessing


In [None]:
import pandas as pd

# Directory that holds every CSV this notebook reads and writes.
path = "./data"

# The three Amazon files are read with their first CSV column as the row index.
amazon_read_options = {"index_col": 0}
df_amazon_train = pd.read_csv(f"{path}/train.csv", **amazon_read_options)
df_amazon_test = pd.read_csv(f"{path}/test.csv", **amazon_read_options)
df_amazon_validation = pd.read_csv(f"{path}/validation.csv", **amazon_read_options)

# hb.csv is read with pandas' default integer index.
df_turkish = pd.read_csv(f"{path}/hb.csv")

In [None]:
# Draw a fixed-size random subset of the Turkish reviews; the seed keeps the
# selection identical between runs.
turkish_sample_size = 210000
df_turkish_sample = df_turkish.sample(n=turkish_sample_size, random_state=42)

In [None]:
# Keep only the columns shared by both sources.
df_amazon = df_amazon_train[["review_body", "language", "stars"]]

# Bring the Turkish sample to the same schema (stars, review_body, language).
# Renaming first and then selecting by the *new* names makes this cell safe to
# re-run: on a second execution the rename is a no-op instead of a KeyError on
# the already-renamed columns. Building a new frame through a chain (no
# inplace=True, no column assignment on a slice) also avoids pandas'
# SettingWithCopyWarning.
turkish_column_names = {"Rating (Star)": "stars", "Review": "review_body"}
df_turkish_sample = (
    df_turkish_sample.rename(columns=turkish_column_names)[["stars", "review_body"]]
    .assign(language="tr")
)

In [None]:
# Stack both review frames into one and renumber the rows from zero.
frames_to_merge = [df_amazon, df_turkish_sample]
consolidated_df = pd.concat(frames_to_merge, ignore_index=True)

In [None]:
# Normalise the review text to lower case.
lowercased_reviews = consolidated_df["review_body"].str.lower()
consolidated_df["review_body"] = lowercased_reviews

In [None]:
# Remove, in place, every row whose language code is in the exclusion list.
# NOTE(review): "tr" is in this list, so the Turkish rows merged in earlier are
# removed here as well — confirm that is intended.
excluded_languages = ["ja", "zh", "de", "fr", "es", "tr"]
in_excluded_language = consolidated_df["language"].isin(excluded_languages)
consolidated_df.drop(index=consolidated_df.index[in_excluded_language], inplace=True)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for testing; the fixed seed makes the split repeatable.
split_frames = train_test_split(consolidated_df, test_size=0.1, random_state=42)
train_df, test_df = split_frames

In [None]:
# Persist both splits as CSV without writing the pandas index column.
for split_frame, split_file in (
    (train_df, f"{path}/final_training.csv"),
    (test_df, f"{path}/final_test.csv"),
):
    split_frame.to_csv(split_file, index=False)

# Train Pipeline


In [None]:
import sparknlp
import pandas as pd
import re
import numpy as np

from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.ml import Pipeline
from sklearn.metrics import classification_report

In [None]:
# Start the Spark NLP session, requesting the GPU variant.
use_gpu = True
spark = sparknlp.start(gpu=use_gpu)

In [None]:
# Train on the split written by the preprocessing section. The previous target,
# "./consolidated.csv", is not produced anywhere in this notebook, so a fresh
# run could not find it, and a stale copy could contain rows that were held out
# for the test split. escape='"' makes Spark treat a doubled quote inside a
# quoted field as a literal quote.
trainDataset = spark.read.option("header", True).csv(
    f"{path}/final_training.csv", escape='"'
)

In [None]:
# Text-preparation stages for training: each stage consumes the output column
# of the previous one (review_body -> document -> token -> normalized ->
# cleanTokens -> lemma).

# Turn the raw review text into Spark NLP documents.
document_assembler = DocumentAssembler()
document_assembler = document_assembler.setInputCol("review_body").setOutputCol(
    "document"
)

# Split documents into tokens.
tokenizer = Tokenizer()
tokenizer = tokenizer.setInputCols(["document"]).setOutputCol("token")

# Clean up the tokens with the annotator's default normalisation settings.
normalizer = Normalizer()
normalizer = normalizer.setInputCols(["token"]).setOutputCol("normalized")

# Remove stop words, matching them case-insensitively.
stopwords_cleaner = StopWordsCleaner()
stopwords_cleaner = (
    stopwords_cleaner.setInputCols("normalized")
    .setOutputCol("cleanTokens")
    .setCaseSensitive(False)
)

# Lemmatise the remaining tokens with the pretrained "lemma_antbnc" model.
lemma = LemmatizerModel.pretrained("lemma_antbnc")
lemma = lemma.setInputCols(["cleanTokens"]).setOutputCol("lemma")

lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[OK!]


In [None]:
# Word vectors for every lemma. The model is named explicitly (it is the one
# the original default resolved to, per the download message) so the notebook
# does not silently change if the library default ever does, and pretrained()
# is called on the class rather than on a throwaway instance.
glove_embeddings = (
    WordEmbeddingsModel.pretrained("glove_100d", "en")
    .setInputCols(["document", "lemma"])
    .setOutputCol("embeddings")
    .setCaseSensitive(False)
)

# Collapse the word vectors of a document into one vector by averaging.
embeddingsSentence = (
    SentenceEmbeddings()
    .setInputCols(["document", "embeddings"])
    .setOutputCol("sentence_embeddings")
    .setPoolingStrategy("AVERAGE")
)

# Classifier trained on the "stars" label. 10% of the training rows are held
# out for validation, and extended per-label metrics plus training logs are
# enabled. The random seed matches the seed used for sampling and splitting
# elsewhere in the notebook so training runs are repeatable.
classsifierdl = (
    ClassifierDLApproach()
    .setInputCols(["sentence_embeddings"])
    .setOutputCol("class")
    .setLabelColumn("stars")
    .setMaxEpochs(10)
    .setLr(1e-3)
    .setValidationSplit(1e-1)
    .setEvaluationLogExtended(True)
    .setEnableOutputLogs(True)
    .setRandomSeed(42)
)

# Full training pipeline: text preparation, embeddings, then the classifier.
train_pipeline = Pipeline(
    stages=[
        document_assembler,
        tokenizer,
        normalizer,
        stopwords_cleaner,
        lemma,
        glove_embeddings,
        embeddingsSentence,
        classsifierdl,
    ]
)

glove_100d download started this may take some time.
Approximate size to download 145.3 MB
[OK!]


In [None]:
%%time

# Fit every stage of the training pipeline on the training data.
train_pipeline_model = train_pipeline.fit(dataset=trainDataset)

CPU times: user 3.18 s, sys: 316 ms, total: 3.49 s
Wall time: 9min 36s


In [None]:
import os

# Resolve the log directory from the current user's home instead of
# hard-coding /root, matching the "~/annotator_logs" used by the shell cells.
annotator_log_dir = os.path.expanduser("~/annotator_logs")

# os.listdir returns entries in arbitrary order, so taking element [0] could
# print any log. Select the most recently modified metrics log explicitly;
# the file name breaks ties so the choice is deterministic.
metrics_log_paths = [
    os.path.join(annotator_log_dir, entry)
    for entry in os.listdir(annotator_log_dir)
    if entry.startswith("ClassifierMetrics_")
]
if not metrics_log_paths:
    raise FileNotFoundError(
        f"No ClassifierMetrics_* log found in {annotator_log_dir}"
    )
latest_metrics_log = max(
    metrics_log_paths, key=lambda log_path: (os.path.getmtime(log_path), log_path)
)
log_file_name = os.path.basename(latest_metrics_log)

with open(latest_metrics_log, "r") as log_file:
    print(log_file.read())

label        tp	 fp	 fn	 prec	 rec	 f1
4            1366	 2416	 9264	 0.36118457	 0.12850423	 0.18956424
5            14240	 10502	 2981	 0.5755396	 0.8268974	 0.6786932
1            4102	 5271	 2967	 0.43764004	 0.5802801	 0.4989661
2            2657	 6504	 4515	 0.29003385	 0.3704685	 0.32535362
3            1163	 2179	 7145	 0.34799522	 0.13998556	 0.19965667
tp: 23528 fp: 26872 fn: 26872 labels: 5
Macro-average	 prec: 0.40247864, rec: 0.4092272, f1: 0.40582487
Micro-average	 prec: 0.4668254, recall: 0.4668254, f1: 0.4668254



In [None]:
!cd ~/annotator_logs && ls -l

total 44
-rw-r--r-- 1 root root 1496 Apr  2 10:58 ClassifierDLApproach_73b794d1720d.log
-rw-r--r-- 1 root root  535 Apr  2 10:58 ClassifierMetrics_109aa4693c3f.log
-rw-r--r-- 1 root root  533 Apr  2 10:55 ClassifierMetrics_125dbfc9bbcd.log
-rw-r--r-- 1 root root  537 Apr  2 10:56 ClassifierMetrics_18ecfb27ed22.log
-rw-r--r-- 1 root root  539 Apr  2 10:54 ClassifierMetrics_225cef80dc6c.log
-rw-r--r-- 1 root root  536 Apr  2 10:55 ClassifierMetrics_23b8273bb4d4.log
-rw-r--r-- 1 root root  531 Apr  2 10:54 ClassifierMetrics_4214345dcedf.log
-rw-r--r-- 1 root root  534 Apr  2 10:58 ClassifierMetrics_7745baae85b9.log
-rw-r--r-- 1 root root  534 Apr  2 10:53 ClassifierMetrics_9a42e07926b4.log
-rw-r--r-- 1 root root  539 Apr  2 10:57 ClassifierMetrics_a54d07433560.log
-rw-r--r-- 1 root root  534 Apr  2 10:57 ClassifierMetrics_b6155e43f57a.log


In [None]:
!cat ~/annotator_logs/ClassifierDLApproach_73b794d1720d.log

Training started - epochs: 10 - learning_rate: 0.001 - batch_size: 64 - training_examples: 453600 - classes: 5
Epoch 0/10 - 37.60s - loss: 10175.209 - acc: 0.45098862 - batches: 7088
Quality on validation dataset (10.0%), validation examples = 50400
Epoch 1/10 - 35.92s - loss: 10079.269 - acc: 0.46794528 - batches: 7088
Quality on validation dataset (10.0%), validation examples = 50400
Epoch 2/10 - 35.64s - loss: 10053.937 - acc: 0.472573 - batches: 7088
Quality on validation dataset (10.0%), validation examples = 50400
Epoch 3/10 - 35.65s - loss: 10039.82 - acc: 0.47564644 - batches: 7088
Quality on validation dataset (10.0%), validation examples = 50400
Epoch 4/10 - 35.43s - loss: 10029.037 - acc: 0.47776958 - batches: 7088
Quality on validation dataset (10.0%), validation examples = 50400
Epoch 5/10 - 35.13s - loss: 10020.801 - acc: 0.47936362 - batches: 7088
Quality on validation dataset (10.0%), validation examples = 50400
Epoch 6/10 - 35.44s - loss: 10014.198 - acc: 0.4808761 - b

In [None]:
# The final stage of the fitted pipeline is the trained classifier; save only
# that stage, replacing any earlier export at the same location.
trained_classifier = train_pipeline_model.stages[-1]
trained_classifier.write().overwrite().save(f"{path}/model_weights")

# Inference Pipeline


In [None]:
# Read the held-out split; the first CSV line is treated as the header and
# escape='"' is passed through to Spark's CSV reader.
header_aware_reader = spark.read.option("header", True)
testDataset = header_aware_reader.csv(f"{path}/final_test.csv", escape='"')

In [None]:
# Text-preparation stages for inference, configured exactly like the training
# ones (review_body -> document -> token -> normalized -> cleanTokens -> lemma).

# Turn the raw review text into Spark NLP documents.
document_assembler = DocumentAssembler()
document_assembler = document_assembler.setInputCol("review_body").setOutputCol(
    "document"
)

# Split documents into tokens.
tokenizer = Tokenizer()
tokenizer = tokenizer.setInputCols(["document"]).setOutputCol("token")

# Clean up the tokens with the annotator's default normalisation settings.
normalizer = Normalizer()
normalizer = normalizer.setInputCols(["token"]).setOutputCol("normalized")

# Remove stop words, matching them case-insensitively.
stopwords_cleaner = StopWordsCleaner()
stopwords_cleaner = (
    stopwords_cleaner.setInputCols("normalized")
    .setOutputCol("cleanTokens")
    .setCaseSensitive(False)
)

# Lemmatise the remaining tokens with the pretrained "lemma_antbnc" model.
lemma = LemmatizerModel.pretrained("lemma_antbnc")
lemma = lemma.setInputCols(["cleanTokens"]).setOutputCol("lemma")

lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[OK!]


In [None]:
# Word vectors for every lemma. The model is named explicitly (it is the one
# the original default resolved to, per the download message) so inference
# keeps using the embeddings the classifier was trained with.
glove_embeddings = (
    WordEmbeddingsModel.pretrained("glove_100d", "en")
    .setInputCols(["document", "lemma"])
    .setOutputCol("embeddings")
    .setCaseSensitive(False)
)

# Collapse the word vectors of a document into one vector by averaging.
embeddingsSentence = (
    SentenceEmbeddings()
    .setInputCols(["document", "embeddings"])
    .setOutputCol("sentence_embeddings")
    .setPoolingStrategy("AVERAGE")
)

# Load the trained classifier from the location the training section saved it
# to. The previous "./model_weights" did not match the save path
# f"{path}/model_weights", so the load could not find the exported model.
classsifierdl = (
    ClassifierDLModel.load(f"{path}/model_weights")
    .setInputCols(["sentence_embeddings"])
    .setOutputCol("class")
)

# Inference pipeline: same preparation stages, ending in the loaded classifier.
test_pipeline = Pipeline(
    stages=[
        document_assembler,
        tokenizer,
        normalizer,
        stopwords_cleaner,
        lemma,
        glove_embeddings,
        embeddingsSentence,
        classsifierdl,
    ]
)

glove_100d download started this may take some time.
Approximate size to download 145.3 MB
[OK!]


In [None]:
# Materialise the inference pipeline on the test data, then annotate that
# same data with it.
fitted_test_pipeline = test_pipeline.fit(testDataset)
preds = fitted_test_pipeline.transform(testDataset)

In [None]:
# Bring the label, the text and the predicted classes into pandas.
prediction_columns = ["stars", "review_body", "class.result"]
preds_df = preds.select(*prediction_columns).toPandas()

# "result" holds a sequence per row; keep its first entry as the prediction.
preds_df["result"] = [row_labels[0] for row_labels in preds_df["result"]]

In [None]:
# Per-class precision/recall/F1 of the predictions against the true star labels.
report_text = classification_report(preds_df["stars"], preds_df["result"])
print(report_text)

              precision    recall  f1-score   support

           1       0.48      0.57      0.52     17699
           2       0.32      0.30      0.31     17690
           3       0.34      0.22      0.27     21240
           4       0.37      0.18      0.24     26434
           5       0.58      0.83      0.68     42937

    accuracy                           0.48    126000
   macro avg       0.42      0.42      0.40    126000
weighted avg       0.44      0.48      0.44    126000

