<a href="https://colab.research.google.com/github/JayaBigDataIsCool/Community_Contribution/blob/main/bert_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os

# Install java
! apt-get install -y openjdk-8-jdk-headless -qq > /dev/null
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["PATH"] = os.environ["JAVA_HOME"] + "/bin:" + os.environ["PATH"]
! java -version

# Install pyspark
! pip install --ignore-installed -q pyspark==2.4.4

# Install Spark NLP
! pip install --ignore-installed -q spark-nlp==2.4.5

openjdk version "1.8.0_275"
OpenJDK Runtime Environment (build 1.8.0_275-8u275-b01-0ubuntu1~18.04-b01)
OpenJDK 64-Bit Server VM (build 25.275-b01, mixed mode)
[K     |████████████████████████████████| 215.7MB 68kB/s 
[K     |████████████████████████████████| 204kB 40.7MB/s 
[?25h  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
[K     |████████████████████████████████| 112kB 4.3MB/s 
[?25h

In [2]:
import sparknlp

# Start the Spark session through Spark NLP. gpu=True is the setting used for
# GPU training.
spark = sparknlp.start(gpu=True)

# NOTE(review): the annotator/assembler classes used in later cells are
# presumably supplied by these star imports; they are left as-is because
# cells outside this view may depend on other names they bring in.
from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.ml import Pipeline
import pandas as pd

# Report the library versions actually loaded in this session.
print("Spark NLP version", sparknlp.version())

print("Apache Spark version:", spark.version)

# Bare expression: the session object is shown as the cell's output.
spark

Spark NLP version 2.4.5
Apache Spark version: 2.4.4


In [8]:

# Load the training CSV; the "header" option makes the first row the column names.
trainDataset = (
    spark.read
    .option("header", True)
    .csv("/content/dataset_for_cls_mdl.csv")
)

# Preview the rows, cutting cell values off at 500 characters.
trainDataset.show(truncate=500)

+--------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------+
|                                                                                                                                                          text|                         class|
+--------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------+
|             Are you seeing a dramatic deterioration in export conditions in your conversations with manufacturing clients since the end of the third quarter?|     Impact of market slowdown|
|                                                            Is the decline evenly split between sectors, or is it one of them that pulls everything else down?| Industry-specific environment|
|                                       

In [9]:
from pyspark.sql.functions import col

In [10]:
# First stage: reads the "text" column and writes its result to "document".
document_assembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

In [11]:
# Tokenizer stage: reads "document", writes "token".
tokenizer = (
    Tokenizer()
    .setInputCols(["document"])
    .setOutputCol("token")
)

In [12]:
# Normalizer stage: reads "token", writes "normalized".
normalizer = (
    Normalizer()
    .setInputCols(["token"])
    .setOutputCol("normalized")
)

In [13]:
# Stop-word removal stage: reads "normalized", writes "cleanTokens".
# Matching is case-insensitive (setCaseSensitive(False)).
# The input column is passed as a list, the same form every other stage in
# this notebook uses for setInputCols.
stopwords_cleaner = (
    StopWordsCleaner()
    .setInputCols(["normalized"])
    .setOutputCol("cleanTokens")
    .setCaseSensitive(False)
)


In [14]:
# Lemmatizer stage backed by the pretrained "lemma_antbnc" model:
# reads "cleanTokens", writes "lemma".
lemma = (
    LemmatizerModel.pretrained("lemma_antbnc")
    .setInputCols(["cleanTokens"])
    .setOutputCol("lemma")
)

lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[OK!]


In [15]:
# Word-embedding stage: reads "document" and "lemma", writes "embeddings".
# pretrained() is called on the class, as the lemmatizer cell does, so no
# throwaway WordEmbeddingsModel instance is built first. The model name is
# given explicitly ("glove_100d" is what the unnamed call downloaded) so the
# notebook does not depend on the library's default.
glove_embeddings = (
    WordEmbeddingsModel.pretrained("glove_100d")
    .setInputCols(["document", "lemma"])
    .setOutputCol("embeddings")
    .setCaseSensitive(False)
)

glove_100d download started this may take some time.
Approximate size to download 145.3 MB
[OK!]


In [16]:
# Sentence-embedding stage: reads "document" and "embeddings", writes
# "sentence_embeddings", using the "AVERAGE" pooling strategy.
embeddingsSentence = (
    SentenceEmbeddings()
    .setInputCols(["document", "embeddings"])
    .setOutputCol("sentence_embeddings")
    .setPoolingStrategy("AVERAGE")
)

In [17]:
# Trainable classifier stage: reads "sentence_embeddings", learns from the
# "class" label column, and writes predictions to "category".
# Training is capped at 100 epochs with output logs enabled.
classsifierdl = (
    ClassifierDLApproach()
    .setInputCols(["sentence_embeddings"])
    .setOutputCol("category")
    .setLabelColumn("class")
    .setMaxEpochs(100)
    .setEnableOutputLogs(True)
)

In [18]:
# Assemble the stages in the order in which each one's input columns
# become available.
clf_pipeline = Pipeline(stages=[
    document_assembler,   # "text" -> "document"
    tokenizer,            # "document" -> "token"
    normalizer,           # "token" -> "normalized"
    stopwords_cleaner,    # "normalized" -> "cleanTokens"
    lemma,                # "cleanTokens" -> "lemma"
    glove_embeddings,     # "document" + "lemma" -> "embeddings"
    embeddingsSentence,   # "document" + "embeddings" -> "sentence_embeddings"
    classsifierdl,        # "sentence_embeddings" -> "category"
])

In [19]:
%%time

# Fit the full pipeline on trainDataset; the %%time cell magic above reports
# how long this cell takes to run.
clf_pipelineModel = clf_pipeline.fit(trainDataset)

CPU times: user 88.4 ms, sys: 8.79 ms, total: 97.2 ms
Wall time: 10.8 s


In [25]:

# Run the fitted pipeline over the same frame it was trained on, so these are
# in-sample predictions rather than a held-out evaluation.
preds = clf_pipelineModel.transform(trainDataset)

# Show the input text, the given label and the predicted label for 10 rows,
# cutting cell values off at 100 characters.
(
    preds
    .select("text", "class", "category.result")
    .show(10, truncate=100)
)

+----------------------------------------------------------------------------------------------------+------------------------------+--------------------------------+
|                                                                                                text|                         class|                          result|
+----------------------------------------------------------------------------------------------------+------------------------------+--------------------------------+
|Are you seeing a dramatic deterioration in export conditions in your conversations with manufactu...|     Impact of market slowdown|     [Impact of market slowdown]|
|  Is the decline evenly split between sectors, or is it one of them that pulls everything else down?| Industry-specific environment| [Industry-specific environment]|
|In terms of the magnitude of these investments, will you be maintaining this level throughout the...|       Short-term growth plans|       [Short-term growth plans]