In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [2]:
import os
# Install java
! apt-get install -y openjdk-8-jdk-headless -qq > /dev/null
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["PATH"] = os.environ["JAVA_HOME"] + "/bin:" + os.environ["PATH"]
! java -version
# Install pyspark
! pip install --ignore-installed -q pyspark==2.4.4
# Install Spark NLP
! pip install --ignore-installed -q spark-nlp==2.4.5 

openjdk version "1.8.0_272"
OpenJDK Runtime Environment (build 1.8.0_272-8u272-b10-0ubuntu1~18.04-b10)
OpenJDK 64-Bit Server VM (build 25.272-b10, mixed mode)


In [3]:
data_dir = os.path.join(os.getcwd(),'gdrive','My Drive','Colab Notebooks','Text_Data')
file_path = os.path.join(data_dir,'sarcasm_headline_dataset.csv')

In [4]:
import sparknlp
spark = sparknlp.start() 

df = spark.read.csv(file_path,inferSchema=True,header=True)
df.printSchema()

root
 |-- HEADLINE: string (nullable = true)
 |-- IS_SARCASTIC: string (nullable = true)



In [5]:
df = df.filter((df["IS_SARCASTIC"] == '0') | (df["IS_SARCASTIC"] == '1'))

In [6]:
from pyspark.sql.functions import * 
from pyspark.sql.types import * 
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import *
from pyspark.ml.feature import Word2Vec

In [7]:
from pyspark.ml import Pipeline
from sparknlp.base import DocumentAssembler,Finisher
from sparknlp.annotator import SentenceDetector, Tokenizer, Normalizer, StopWordsCleaner, Lemmatizer

In [8]:
# StringIndexer
from pyspark.ml.feature import StringIndexer
indexer = StringIndexer(inputCol="IS_SARCASTIC", outputCol="label")

# Get lemmatizer dictionary
!wget -q https://raw.githubusercontent.com/mahavivo/vocabulary/master/lemmas/AntBNC_lemmas_ver_001.txt

In [9]:
#vector_assembler = VectorAssembler().setInputCols(['embeddings']).setOutputCol('features')

In [10]:
docAssembler = DocumentAssembler().setInputCol("HEADLINE") \
                                  .setOutputCol("document") \
                                  .setCleanupMode("shrink_full")

sentence_detector = SentenceDetector().setInputCols(["document"]).setOutputCol("sentence")

tokenizer = Tokenizer().setInputCols(['sentence']).setOutputCol("tokens")

normalizer = Normalizer().setInputCols(["tokens"]) \
                         .setOutputCol("normalized_tokens") \
                         .setLowercase(True) \
                         .setCleanupPatterns(["[^\w\d\s]"])

remove_stopwords = StopWordsCleaner().setInputCols(["normalized_tokens"]) \
                                    .setOutputCol("clean_tokens") \
                                    .setCaseSensitive(False)

lemmatizer = Lemmatizer().setInputCols(["clean_tokens"]) \
                         .setOutputCol("lemmatized_token") \
                         .setDictionary("./AntBNC_lemmas_ver_001.txt", value_delimiter ="\t", key_delimiter = "->")

finisher = Finisher().setInputCols(["lemmatized_token"]) \
                     .setOutputCols("lemma_bow") \
                     .setIncludeMetadata(False)

pipeline_stages =[docAssembler, sentence_detector, 
                  tokenizer, normalizer, remove_stopwords, 
                  lemmatizer, finisher]
            



In [11]:
nlpPipeline = Pipeline(stages=pipeline_stages)
pipelineModel = nlpPipeline.fit(df)

df2 = pipelineModel.transform(df)
df2.printSchema()

root
 |-- HEADLINE: string (nullable = true)
 |-- IS_SARCASTIC: string (nullable = true)
 |-- lemma_bow: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [12]:
df2.show(5,False)

+------------------------------------------------------------------------------------+------------+----------------------------------------------------------------------------+
|HEADLINE                                                                            |IS_SARCASTIC|lemma_bow                                                                   |
+------------------------------------------------------------------------------------+------------+----------------------------------------------------------------------------+
|former versace store clerk sues over secret 'black code' for minority shoppers      |0           |[former, versace, store, clerk, sue, secret, black, code, minority, shopper]|
|the 'roseanne' revival catches up to our thorny political mood, for better and worse|0           |[roseanne, revival, catch, thorny, political, mood, well, bad]              |
|mom starting to fear son's web series closest thing she will have to grandchild     |1           |[mom, start, fea

In [13]:
df2 = df2.withColumn("lemma_bow_size",size("lemma_bow"))
vector_dimension = df2.agg({"lemma_bow_size":"max"}).collect()[0][0]
print(vector_dimension)

27


In [14]:
word2Vec = Word2Vec(vectorSize= vector_dimension, minCount=0, inputCol="lemma_bow", outputCol="features")

In [15]:
classifier = LogisticRegression()

In [16]:
ml_stages =[indexer,word2Vec,classifier]

ml_pipeline = Pipeline(stages=ml_stages)

train,test = df2.randomSplit([0.7, 0.3],seed = 1984)

In [17]:
model = ml_pipeline.fit(train)

In [18]:
predictionAndLabels = model.transform(test)

In [19]:
predictionAndLabels.printSchema()

root
 |-- HEADLINE: string (nullable = true)
 |-- IS_SARCASTIC: string (nullable = true)
 |-- lemma_bow: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- lemma_bow_size: integer (nullable = false)
 |-- label: double (nullable = false)
 |-- features: vector (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [20]:
auc_roc_evaluator = BinaryClassificationEvaluator(rawPredictionCol='prediction',metricName='areaUnderROC')
pr_roc_evaluator = BinaryClassificationEvaluator(rawPredictionCol='prediction',metricName='areaUnderPR')
precision_evaluator = MulticlassClassificationEvaluator(metricName="weightedPrecision")
recall_evaluator = MulticlassClassificationEvaluator(metricName="weightedRecall")
f1_evaluator = MulticlassClassificationEvaluator(metricName="f1")

In [21]:
predictionAndLabels.groupBy("prediction").count().show()

+----------+-----+
|prediction|count|
+----------+-----+
|       0.0| 5682|
|       1.0| 2365|
+----------+-----+



In [22]:
print("AUC ROC: {0:.2f}".format(auc_roc_evaluator.evaluate(predictionAndLabels)))
print("PR ROC: {0:.2f}".format(pr_roc_evaluator.evaluate(predictionAndLabels)))
print("Precision: {0:.2f}".format(precision_evaluator.evaluate(predictionAndLabels)))
print("Recall ROC: {0:.2f}".format(recall_evaluator.evaluate(predictionAndLabels)))
print("F1: {0:.2f}".format(f1_evaluator.evaluate(predictionAndLabels)))


AUC ROC: 0.59
PR ROC: 0.55
Precision: 0.61
Recall ROC: 0.62
F1: 0.60
