In [None]:
!wget http://setup.johnsnowlabs.com/kaggle.sh -O - | bash

In [None]:
import csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import sparknlp
from sparknlp.annotator import *
from sparknlp.common import *
from sparknlp.base import *

from pyspark.ml import Pipeline

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score

In [None]:
# Seed shared by NumPy's global RNG and the Spark train/test split.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Decision threshold handed to the multi-label classifier via setThreshold().
CLASSIFIER_THRESHOLD = 0.4

In [None]:
# Label names listed in class-id order: a name's position in this list is the
# integer id that class_map / inv_class_map translate to and from.
_CLASS_NAMES = [
    "optimistic",
    "thankful",
    "empathetic",
    "pessimistic",
    "anxious",
    "sad",
    "annoyed",
    "denial",
    "surprise",
    "official_report",
    "joking",
]

# id -> name
class_map = dict(enumerate(_CLASS_NAMES))

# name -> id (exact inverse of class_map)
inv_class_map = {name: idx for idx, name in class_map.items()}

In [None]:
# Start a CPU-only Spark NLP session and report which library version is in use.
spark = sparknlp.start(gpu=False)
sparknlp_version = sparknlp.version()
print("Spark NLP version: ", sparknlp_version)

In [None]:
# Parse the training CSV into (id, text, [label names]) records.
# Column 0 is used as the id, column 1 as the text, and column 2 as a
# whitespace-separated list of integer class ids.
records = []
line_count = 0

# newline='' is what the csv module expects so that it can handle line breaks
# inside quoted fields itself; the explicit encoding removes any dependence on
# the kernel's locale.
with open(
    '/kaggle/input/sentiment-analysis-of-covid-19-related-tweets/training.csv',
    newline='',
    encoding='utf-8',
) as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')

    # The first row is the header; an empty file simply yields no records.
    header = next(csv_reader, None)
    if header is not None:
        print(f'Column names are {", ".join(header)}')
        line_count += 1

    for row in csv_reader:
        if not row:
            # csv.reader yields [] for blank lines; there is nothing to parse.
            continue

        # split() with no argument tolerates repeated or trailing whitespace,
        # which would otherwise produce '' tokens and make int() raise.
        label_ids = {int(token) for token in row[2].split()}

        # Walk class_map (ascending id order) so label names stay sorted by id
        # and ids outside the known classes are ignored.
        current_classes = [name for idx, name in class_map.items() if idx in label_ids]

        records.append((row[0], row[1], current_classes))
        line_count += 1

# Count includes the header row.
print(f'Processed {line_count} lines.')

df = pd.DataFrame.from_records(records, columns=["id", "text", "labels"])

# 90/10 train/test split, reproducible through RANDOM_SEED.
# Alternative (disabled): read a previously cached parquet copy instead.
#trainDataset, testDataset = spark.read.parquet("training_sparknlp.parquet").randomSplit([0.9, 0.1], seed=RANDOM_SEED)
trainDataset, testDataset = spark.createDataFrame(df).randomSplit([0.9, 0.1], seed=RANDOM_SEED)

# Disabled: write the parsed frame out as the parquet cache used above.
#df.to_parquet("training_sparknlp.parquet", compression="gzip")
df.info()

In [None]:
# Peek at the first two rows of the training split to sanity-check the columns.
trainDataset.show(2)

In [None]:
# cache() marks each split for reuse; count() then reports its number of rows.
train_rows = trainDataset.cache().count()
print("Train dataset: ", train_rows)

test_rows = testDataset.cache().count()
print("Test dataset: ", test_rows)

In [None]:
%%time
document = DocumentAssembler()\
    .setInputCol("text")\
    .setOutputCol("document")

In [None]:
%%time
sentence_detector = SentenceDetectorDLModel\
  .pretrained("sentence_detector_dl", "en") \
  .setInputCols(["document"]) \
  .setOutputCol("sentences")

In [None]:
%%time
tokenizer = Tokenizer() \
    .setInputCols(["sentences"]) \
    .setOutputCol("token")

In [None]:
%%time
#sentence_embeddings = BertSentenceEmbeddings.pretrained("sent_covidbert_large_uncased", "en") \
#    .setInputCols("document") \
#    .setOutputCol("sentence_embeddings")
word_embeddings = BertEmbeddings.pretrained("covidbert_large_uncased", "en") \
        .setInputCols("sentences", "token") \
        .setOutputCol("embeddings")

In [None]:
%%time
sentence_embeddings = SentenceEmbeddings() \
        .setInputCols(["sentences", "embeddings"]) \
        .setOutputCol("sentence_embeddings") \
        .setPoolingStrategy("AVERAGE")

In [None]:
%%time
multiClassifier = MultiClassifierDLApproach()\
    .setInputCols("sentence_embeddings")\
    .setOutputCol("category")\
    .setLabelColumn("labels")\
    .setBatchSize(128)\
    .setMaxEpochs(10)\
    .setLr(1e-3)\
    .setThreshold(CLASSIFIER_THRESHOLD)\
    .setShufflePerEpoch(False)\
    .setEnableOutputLogs(True)\
    .setValidationSplit(0.1)

In [None]:
# Training pipeline. Stage order matters: each stage reads columns that the
# stages before it produce.
training_stages = [
    document,
    sentence_detector,
    tokenizer,
    word_embeddings,
    sentence_embeddings,
    multiClassifier,
]

pipeline = Pipeline(stages=training_stages)

In [None]:
%%time
# Fit every stage of the training pipeline on the training split; the resulting
# PipelineModel's last stage is the trained classifier saved further below.
pipelineModel = pipeline.fit(trainDataset)

In [None]:
# List the log files in ~/annotator_logs/ (the classifier has output logs enabled).
!ls -l ~/annotator_logs/

In [None]:
!cat ~/annotator_logs/MultiClassifierDLApproach_da4ee550bf50.log

In [None]:
# Persist only the last pipeline stage (the trained classifier), replacing any
# previous save at the same path.
trained_classifier = pipelineModel.stages[-1]
trained_classifier.write().overwrite().save('tmp_multi_classifierDL_model')

In [None]:
# Bundle the saved classifier directory into model2.zip.
# NOTE(review): the absolute path assumes the notebook's working directory is
# /kaggle/working, since the model was saved with a relative path — confirm.
!zip -r model2.zip /kaggle/working/tmp_multi_classifierDL_model

In [None]:
# Reload the previously trained classifier and wire it to the same columns
# used during training. The threshold comes from the shared config constant so
# training and inference cannot drift apart.
multiClassifier = (
    MultiClassifierDLModel.load("../input/ieee-gsc-challenge-2-covid19-bert-model/kaggle/working/tmp_multi_classifierDL_model")
    .setInputCols(["sentence_embeddings"])
    .setOutputCol("category")
    .setThreshold(CLASSIFIER_THRESHOLD)
)

# Inference pipeline. sentence_embeddings reads the "sentences" and
# "embeddings" columns, so the sentence detector, tokenizer and word-embedding
# stages must run before it; without them the pipeline has no such columns.
pipeline = Pipeline(
    stages=[
        document,
        sentence_detector,
        tokenizer,
        word_embeddings,
        sentence_embeddings,
        multiClassifier,
    ])

In [None]:
# fit() turns the Pipeline into a PipelineModel, which is then applied to the
# held-out split.
test_pipeline_model = pipeline.fit(testDataset)
preds = test_pipeline_model.transform(testDataset)

# Bring the gold labels and the predicted label names into pandas for scoring.
preds_df = preds.select('labels', 'category.result').toPandas()

In [None]:
# Fix the label space to the known classes, ordered by class id, instead of
# learning it from the test split. Otherwise a class that is predicted but
# never occurs in the gold labels is dropped by transform() (with a warning)
# and its false positives are not scored. As a side effect, column i of the
# indicator matrices corresponds to class id i.
mlb = MultiLabelBinarizer(classes=[class_map[idx] for idx in sorted(class_map)])

# Multi-hot indicator matrices for the gold and predicted label sets.
y_true = mlb.fit_transform(preds_df['labels'])
y_pred = mlb.transform(preds_df['result'])

In [None]:
# Per-class precision / recall / F1.
report_text = classification_report(y_true, y_pred)
print("Classification report: \n", report_text)

# Aggregate F1, pooled over all label decisions (micro) and averaged per class (macro).
micro_f1 = f1_score(y_true, y_pred, average='micro')
print("F1 micro averaging:", micro_f1)

macro_f1 = f1_score(y_true, y_pred, average='macro')
print("F1 macro averaging:", macro_f1)

# NOTE(review): y_pred holds binarised labels rather than scores, so this AUC
# describes a single operating point — confirm that is what is intended.
micro_auc = roc_auc_score(y_true, y_pred, average="micro")
print("ROC: ", micro_auc)

In [None]:
# Load the validation file and rename its columns to the names the pipeline
# expects ("text" is the DocumentAssembler's input column).
validation_csv_path = "/kaggle/input/sentiment-analysis-of-covid-19-related-tweets/validation.csv"

submission_df = pd.read_csv(validation_csv_path)
submission_df.columns = ["id", "text"]

# Spark copy of the frame for running the pipeline.
submissionDataset = spark.createDataFrame(submission_df)

submission_df.head(n=10)

In [None]:
# Run the inference pipeline over the validation data.
submission_pipeline_model = pipeline.fit(submissionDataset)
preds = submission_pipeline_model.transform(submissionDataset)

# Keep the id and predicted label names, and store a gzip-compressed parquet copy.
preds_df = preds.select('id', 'category.result').toPandas()
preds_df.to_parquet("sparknlp-bert-covid.parquet", compression="gzip")

In [None]:
# Preview the first ten predictions (id plus the list of predicted label names).
preds_df.head(10)

In [None]:
def _encode_labels(label_names):
    """Turn predicted label names into a space-separated string of class ids.

    Ids are sorted numerically before being converted to text, so that 2
    comes before 10; sorting the id strings would order them lexicographically
    ("10" before "2"). An empty prediction yields an empty string.
    """
    class_ids = sorted(inv_class_map[name] for name in label_names)
    return " ".join(str(class_id) for class_id in class_ids)


# Encode each prediction, rename the columns to the submission headers and
# write only the ID / Labels columns, without the pandas index.
preds_df["Labels"] = preds_df["result"].apply(_encode_labels)
preds_df.columns = ["ID", "result", "Labels"]
preds_df[["ID", "Labels"]].to_csv("challenge2-sparkml-bert_covid19_large_uncased-3.csv", index=False)