### Clones Spark NLP repo

In [None]:
!git clone https://github.com/JohnSnowLabs/spark-nlp.git

### Creates Spark session

In [None]:
from pyspark.sql import SparkSession
# Reuse the active session if one exists, otherwise start a new one, going
# through the documented `SparkSession.builder` entry point rather than
# instantiating the nested Builder class by hand.
spark = SparkSession.builder.getOrCreate()

In [None]:
import sys

from pyspark.sql import SparkSession
from pyspark.ml import Pipeline

from sparknlp.annotator import *
from sparknlp.common import RegexRule
from sparknlp.base import DocumentAssembler

### As of version 1.5.3, the Finisher transformer has a bug, so we redefine it here as a workaround

In [None]:
from sparknlp.base import AnnotatorTransformer

class Finisher(AnnotatorTransformer):
    """Notebook-local replacement for the library's Finisher transformer.

    The instance is created with classname "com.johnsnowlabs.nlp.Finisher",
    which is handed to the AnnotatorTransformer constructor. Compared with a
    full definition, the `includeMetadata` Param is deliberately left out
    (neither declared nor given a default).

    Defaults set in the constructor: valueSplitSymbol="#",
    annotationSplitSymbol="@", cleanAnnotations=True, outputAsArray=False.

    NOTE(review): Param, Params, TypeConverters and keyword_only are not
    imported explicitly in this notebook; presumably they arrive through the
    star import from sparknlp.annotator -- confirm.
    """

    inputCols = Param(Params._dummy(), "inputCols", "input annotations", typeConverter=TypeConverters.toListString)
    outputCols = Param(Params._dummy(), "outputCols", "output finished annotation cols", typeConverter=TypeConverters.toListString)
    valueSplitSymbol = Param(Params._dummy(), "valueSplitSymbol", "character separating annotations", typeConverter=TypeConverters.toString)
    annotationSplitSymbol = Param(Params._dummy(), "annotationSplitSymbol", "character separating annotations", typeConverter=TypeConverters.toString)
    cleanAnnotations = Param(Params._dummy(), "cleanAnnotations", "whether to remove annotation columns", typeConverter=TypeConverters.toBoolean)
    # `includeMetadata` is intentionally NOT declared in this workaround class;
    # see setIncludeKeys below.
    outputAsArray = Param(Params._dummy(), "outputAsArray", "finisher generates an Array with the results instead of string", typeConverter=TypeConverters.toBoolean)
    name = "Finisher"

    @keyword_only
    def __init__(self):
        """Create the transformer and register the default Param values."""
        super(Finisher, self).__init__(classname="com.johnsnowlabs.nlp.Finisher")
        # No default for `includeMetadata`: that Param does not exist here.
        self._setDefault(
            valueSplitSymbol="#",
            annotationSplitSymbol="@",
            cleanAnnotations=True,
            outputAsArray=False
        )

    @keyword_only
    def setParams(self):
        """Apply the keyword arguments captured by @keyword_only; returns self._set(...)."""
        kwargs = self._input_kwargs
        return self._set(**kwargs)

    def setInputCols(self, value):
        """Set the `inputCols` Param (list of strings); returns the result of _set."""
        return self._set(inputCols=value)

    def setOutputCols(self, value):
        """Set the `outputCols` Param (list of strings); returns the result of _set."""
        return self._set(outputCols=value)

    def setValueSplitSymbol(self, value):
        """Set the `valueSplitSymbol` Param (string); returns the result of _set."""
        return self._set(valueSplitSymbol=value)

    def setAnnotationSplitSymbol(self, value):
        """Set the `annotationSplitSymbol` Param (string); returns the result of _set."""
        return self._set(annotationSplitSymbol=value)

    def setCleanAnnotations(self, value):
        """Set the `cleanAnnotations` Param (boolean); returns the result of _set."""
        return self._set(cleanAnnotations=value)

    def setIncludeKeys(self, value):
        """Unsupported in this patched Finisher.

        This setter used to forward to the `includeMetadata` Param, which is
        not declared on this class, so the call could never succeed. It now
        fails with the same exception type but an explicit message.

        Raises:
            AttributeError: always.
        """
        raise AttributeError(
            "Finisher.setIncludeKeys is unavailable: the 'includeMetadata' "
            "Param is disabled in this patched Finisher"
        )

    def setOutputAsArray(self, value):
        """Set the `outputAsArray` Param (boolean); returns the result of _set."""
        return self._set(outputAsArray=value)

### Loads Dataset

In [None]:
# Read the sentiment sample shipped with the cloned repository and keep at
# most 10000 rows of it.
data = (
    spark.read
    .parquet("./spark-nlp/src/test/resources/sentiment.parquet")
    .limit(10000)
)

# Mark the frame for caching, then run count() so the cell displays the
# number of rows.
data.cache()
data.count()

Create the annotators: we detect sentences, tokenize them, and find the lemma of each token; a sentiment detector then scores the lemmatized sentences.
The Finisher outputs only the sentiment.

In [None]:
# Entry point of the pipeline: reads the raw "text" column.
# NOTE(review): no output column is set here, while the next stage reads
# "document" -- presumably that is the assembler's default; confirm.
document_assembler = DocumentAssembler().setInputCol("text")

# document -> sentence
sentence_detector = (
    SentenceDetector()
    .setInputCols(["document"])
    .setOutputCol("sentence")
)

# sentence -> token
tokenizer = (
    Tokenizer()
    .setInputCols(["sentence"])
    .setOutputCol("token")
)

# token -> lemma, using the small lemma dictionary from the cloned repository
# ("->" separates key from values, a tab separates the values).
lemmatizer = (
    Lemmatizer()
    .setInputCols(["token"])
    .setOutputCol("lemma")
    .setDictionary(
        "./spark-nlp/src/test/resources/lemma-corpus-small/lemmas_small.txt",
        key_delimiter="->",
        value_delimiter="\t"
    )
)

# lemma + sentence -> sentiment_score, using the comma-delimited sentiment
# dictionary from the cloned repository.
sentiment_detector = (
    SentimentDetector()
    .setInputCols(["lemma", "sentence"])
    .setOutputCol("sentiment_score")
    .setDictionary(
        "./spark-nlp/src/test/resources/sentiment-corpus/default-sentiment-dict.txt",
        ","
    )
)

# sentiment_score -> sentiment (the finished output column)
finisher = (
    Finisher()
    .setInputCols(["sentiment_score"])
    .setOutputCols(["sentiment"])
)

Fit the pipeline. The annotators are trained only from the external resources (the dictionaries), not from the dataset passed to `fit`.
The prediction then runs on the target dataset.

In [None]:
# Stages are listed in the order their columns are produced and consumed.
pipeline = Pipeline(stages=[
    document_assembler,
    sentence_detector,
    tokenizer,
    lemmatizer,
    sentiment_detector,
    finisher
])

# Fit on the loaded data, then apply the fitted model to the same data.
model = pipeline.fit(data)
result = model.transform(data)

In [None]:
# List the columns of the transformed frame.
result.columns

In [None]:
# Show up to 50 rows whose finished sentiment is anything other than
# 'positive', converted to a pandas frame for display.
(
    result
    .filter("sentiment != 'positive'")
    .limit(50)
    .toPandas()
)

We filter the Finisher output to find the lines with positive sentiment.

In [None]:
# The Finisher writes the sentiment to the plain `sentiment` column, so filter
# on that column directly and show the text of the positive rows.
# NOTE(review): the previous, commented-out version read `sentiment_score`;
# presumably that column is dropped because cleanAnnotations defaults to True.
result.filter("sentiment == 'positive'").select("text").show()