In [None]:
! wget -N https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/resources/en/sarcasm/train-balanced-sarcasm.csv -P /tmp

In [None]:
import sys
import time
import sparknlp

from pyspark.sql import SparkSession

# Build (or reuse) a local Spark session configured for Spark NLP.
spark = (
    SparkSession.builder
    .appName("Spark NLP")
    .master("local[8]")  # local mode, 8 worker threads
    .config("spark.driver.memory", "6G")
    .config("spark.driver.maxResultSize", "1G")
    # Kryo serialization with an enlarged buffer ceiling.
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .config("spark.kryoserializer.buffer.max", "800M")
    # Spark NLP 2.6.0 artifact for Scala 2.11, resolved at session start-up.
    .config("spark.jars.packages", 'com.johnsnowlabs.nlp:spark-nlp_2.11:2.6.0')
    .getOrCreate()
)

In [None]:
# Report the library versions this run is using.
spark_nlp_version = sparknlp.version()
print("Spark NLP version: ", spark_nlp_version)
apache_spark_version = spark.version
print("Apache Spark version: ", apache_spark_version)

In [None]:
from pyspark.sql import SQLContext

# SQLContext's first positional argument is a SparkContext, not a SparkSession.
# Pass the session's context first and the session itself second so that `sql`
# is bound to the same session that owns the temp view registered below.
sql = SQLContext(spark.sparkContext, spark)

# Read the CSV downloaded to /tmp: first row is the header, column types are inferred.
trainBalancedSarcasmDF = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("/tmp/train-balanced-sarcasm.csv")
)
trainBalancedSarcasmDF.printSchema()

# Let's create a temp view (table) for our SQL queries
trainBalancedSarcasmDF.createOrReplaceTempView('data')

# Total row count; kept as the cell's last expression so the notebook displays it.
sql.sql('SELECT COUNT(*) FROM data').collect()

In [None]:
# Join each comment to its parent comment with a newline, skipping rows where
# either side is null, and cap the result at 100000 rows.
comment_query = 'select label,concat(parent_comment,"\n",comment) as comment from data where comment is not null and parent_comment is not null limit 100000'
labelled_comments = sql.sql(comment_query)
print(type(labelled_comments))
print("Amount of rows:", labelled_comments.count())
df = labelled_comments.limit(2000) #minimize dataset if you are not running on a cluster
df.printSchema()
df.show()

In [None]:
from sparknlp.annotator import *
from sparknlp.common import *
from sparknlp.base import *

from pyspark.ml import Pipeline


# Wrap the raw "comment" text column as a "document" annotation.
document_assembler = (
    DocumentAssembler()
    .setInputCol("comment")
    .setOutputCol("document")
)

# document -> sentence
sentence_detector = (
    SentenceDetector()
    .setInputCols(["document"])
    .setOutputCol("sentence")
    .setUseAbbreviations(True)
)

# sentence -> token
tokenizer = (
    Tokenizer()
    .setInputCols(["sentence"])
    .setOutputCol("token")
)

# token -> stem
stemmer = (
    Stemmer()
    .setInputCols(["token"])
    .setOutputCol("stem")
)

# stem -> normalized
normalizer = (
    Normalizer()
    .setInputCols(["stem"])
    .setOutputCol("normalized")
)

# Emit the normalized annotations as an array column named "ntokens" and
# clean up the intermediate annotation columns.
finisher = (
    Finisher()
    .setInputCols(["normalized"])
    .setOutputCols(["ntokens"])
    .setOutputAsArray(True)
    .setCleanAnnotations(True)
)

# Stage order follows the column dependencies declared above.
annotation_stages = [
    document_assembler,
    sentence_detector,
    tokenizer,
    stemmer,
    normalizer,
    finisher,
]
nlp_pipeline = Pipeline(stages=annotation_stages)
nlp_model = nlp_pipeline.fit(df)
processed = nlp_model.transform(df).persist()
# count() is an action, so the persisted result is computed here, before show().
processed.count()
processed.show()

In [None]:
# Seeded 70/30 random split of the processed rows; print the size of each part.
train, test = processed.randomSplit(weights=[0.7, 0.3], seed=123)
for split_df in (train, test):
    print(split_df.count())

In [None]:
from pyspark.ml import feature as spark_ft

# Default English stop-word list shipped with Spark ML.
stopWords = spark_ft.StopWordsRemover.loadDefaultStopWords('english')
sw_remover = spark_ft.StopWordsRemover(inputCol='ntokens', outputCol='clean_tokens', stopWords=stopWords)

# Train the embedding on the stop-word-filtered tokens. Reading 'ntokens' here
# would leave sw_remover's 'clean_tokens' output unused, so the stop-word stage
# would have no effect on the features.
text2vec = spark_ft.Word2Vec(
    vectorSize=50, minCount=5, seed=123,
    inputCol='clean_tokens', outputCol='text_vec',
    windowSize=5, maxSentenceLength=30
)

# Expose the text vector under the 'features' column.
assembler = spark_ft.VectorAssembler(inputCols=['text_vec'], outputCol='features')
feature_pipeline = Pipeline(stages=[sw_remover, text2vec, assembler])

# Fit on the training split only; the same fitted model is reused for other splits.
feature_model = feature_pipeline.fit(train)

train_featurized = feature_model.transform(train).persist()
# count() is an action, so the persisted result is computed here, before show().
train_featurized.count()
train_featurized.show()

In [None]:
from pyspark.ml import classification as spark_cls


# Layer sizes from input to output: a 50-wide input, hidden layers of 25 and 10
# units, and 2 output classes.
layer_sizes = [50, 25, 10, 2]
mlpc = spark_cls.MultilayerPerceptronClassifier(
    maxIter=100,
    seed=123,
    layers=layer_sizes,
)

# Train on the featurized training split.
model = mlpc.fit(train_featurized)


In [None]:
# Featurize the held-out split with the already-fitted feature model, then
# score it with the trained classifier and preview the predictions.
test_featurized = feature_model.transform(test)
preds = model.transform(test_featurized)
preds.show()

In [None]:
# Collect only the columns needed for evaluation into a pandas DataFrame.
pred_df = preds.select('comment', 'label', 'prediction').toPandas()

In [None]:
# Preview the first rows of the collected predictions.
pred_df.head()

In [None]:
import pandas as pd
from sklearn import metrics as skmetrics
# Confusion matrix as a labelled table: rows are true classes, columns are
# predicted classes.
class_names = ['0', '1']
confusion_counts = skmetrics.confusion_matrix(pred_df['label'], pred_df['prediction'])
pd.DataFrame(
    data=confusion_counts,
    columns=['pred ' + name for name in class_names],
    index=['true ' + name for name in class_names],
)

In [None]:
# Text summary of the per-class metrics for the held-out predictions.
report_text = skmetrics.classification_report(
    pred_df['label'],
    pred_df['prediction'],
    target_names=['0', '1'],
)
print(report_text)