In [1]:
from google.colab import files
import pandas as pd
import io

# Ask Colab for a file upload; the result is later indexed by filename
# ('sarcasm_dataset.jsonl') to obtain the raw bytes of the dataset.
uploaded = files.upload()

Saving sarcasm_dataset.jsonl to sarcasm_dataset.jsonl


In [3]:
# Pull the uploaded payload out by filename and parse it as JSON Lines
# (lines=True: one JSON record per line).
raw_bytes = uploaded['sarcasm_dataset.jsonl']
df = pd.read_json(io.BytesIO(raw_bytes), lines=True)
# Independent copy of the freshly loaded frame.
df_copy = df.copy()
# Quick look at the first rows.
print(df.head())

     label                                           response  \
0  SARCASM  @USER @USER @USER I don't get this .. obviousl...   
1  SARCASM  @USER @USER trying to protest about . Talking ...   
2  SARCASM  @USER @USER @USER He makes an insane about of ...   
3  SARCASM  @USER @USER Meanwhile Trump won't even release...   
4  SARCASM  @USER @USER Pretty Sure the Anti-Lincoln Crowd...   

                                             context  
0  [A minor child deserves privacy and should be ...  
1  [@USER @USER Why is he a loser ? He's just a P...  
2  [Donald J . Trump is guilty as charged . The e...  
3  [Jamie Raskin tanked Doug Collins . Collins lo...  
4  [Man ... y ’ all gone “ both sides ” the apoca...  


In [4]:
# Row count of the full dataset
print(f"# data points: {len(df)}")

# data points: 5000


In [5]:
# Class balance check.
# Suffix convention used by the per-class cells of this notebook:
#   *_1 -> label 'SARCASM', *_0 -> label 'NOT_SARCASM'.

# Number of rows labeled SARCASM (the "_1" class)
is_sarcasm = df['label'] == 'SARCASM'
print("# sarcasm data points: " + str(len(df[is_sarcasm])))

# Number of rows labeled NOT_SARCASM (the "_0" class)
is_not_sarcasm = df['label'] == 'NOT_SARCASM'
print("# non-sarcasm data points: " + str(len(df[is_not_sarcasm])))

# sarcasm data points: 2500
# non-sarcasm data points: 2500


In [6]:
import nltk
from nltk.tokenize import word_tokenize

# Download the Punkt tokenizer data used by word_tokenize.
nltk.download('punkt')

# Corpus: every non-null response text, in dataset order
corpus = df["response"].dropna().tolist()
# One long string with ". " inserted between consecutive responses
corpus_str = '. '.join(corpus)
tokens = word_tokenize(corpus_str)
# Vocabulary = distinct tokens exactly as produced (no case folding is applied)
vocabulary = set(tokens)
print(f"Vocabulary size: {len(vocabulary)}")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Vocabulary size: 15796


In [7]:
!pip install textstat

Collecting textstat
  Downloading textstat-0.7.3-py3-none-any.whl (105 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m105.1/105.1 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pyphen (from textstat)
  Downloading pyphen-0.14.0-py3-none-any.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m13.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyphen, textstat
Successfully installed pyphen-0.14.0 textstat-0.7.3


In [8]:
import textstat

# For each response, the number of sentences reported by textstat
# (the printed "Sentence Length" is therefore a sentence count per entry).
sentence_lens = [textstat.sentence_count(tweet) for tweet in corpus]
print(f"Min Sentence Length per Entry: {min(sentence_lens)}")
print(f"Max Sentence Length per Entry: {max(sentence_lens)}")
print(f"Avg Sentence Length per Entry: {sum(sentence_lens) / len(sentence_lens)}")

Min Sentence Length per Entry: 1
Max Sentence Length per Entry: 10
Avg Sentence Length per Entry: 1.9556


In [9]:
# textstat's words-per-sentence value, computed separately for each response
word_per_sentence = [textstat.words_per_sentence(tweet) for tweet in corpus]
shortest = min(word_per_sentence)
longest = max(word_per_sentence)
print("Min Word per Sentence: " + str(shortest))
print("Max Word per Sentence: " + str(longest))
# Note: the average is computed on the joined corpus string as a whole,
# not as the mean of the per-response values above.
corpus_level_avg = textstat.words_per_sentence(corpus_str)
print("Avg Word per Sentence: " + str(corpus_level_avg))

Min Word per Sentence: 3.0
Max Word per Sentence: 57.0
Avg Word per Sentence: 12.017999590918388


In [10]:
# Split the responses by label: suffix _0 = NOT_SARCASM, suffix _1 = SARCASM
corpus_0 = df.loc[df['label'] == 'NOT_SARCASM', "response"].dropna().tolist()
corpus_1 = df.loc[df['label'] == 'SARCASM', "response"].dropna().tolist()

# Joined per-class strings (". " between responses), used for corpus-level metrics
corpus_str_0 = '. '.join(corpus_0)
corpus_str_1 = '. '.join(corpus_1)

# Sentence count of every response, per class
sentence_lens_0 = [textstat.sentence_count(tweet) for tweet in corpus_0]
sentence_lens_1 = [textstat.sentence_count(tweet) for tweet in corpus_1]

# Report min / max / mean sentence count, non-sarcasm first, then sarcasm
for class_name, counts in (("Non-Sarcasm", sentence_lens_0), ("Sarcasm", sentence_lens_1)):
    print(f"Min Sentence Length per {class_name} Entry: {min(counts)}")
    print(f"Max Sentence Length per {class_name} Entry: {max(counts)}")
    print(f"Avg Sentence Length per {class_name} Entry: {sum(counts) / len(counts)}")

Min Sentence Length per Non-Sarcasm Entry: 1
Max Sentence Length per Non-Sarcasm Entry: 10
Avg Sentence Length per Non-Sarcasm Entry: 2.1804
Min Sentence Length per Sarcasm Entry: 1
Max Sentence Length per Sarcasm Entry: 8
Avg Sentence Length per Sarcasm Entry: 1.7308


In [11]:
# Per-response words-per-sentence values for each class
word_per_sentence_0 = [textstat.words_per_sentence(tweet) for tweet in corpus_0]
word_per_sentence_1 = [textstat.words_per_sentence(tweet) for tweet in corpus_1]

# Min and max come from the per-response lists; the average is computed on the
# joined per-class string as a whole.
for class_name, ratios, joined_text in (
    ("Non-Sarcasm", word_per_sentence_0, corpus_str_0),
    ("Sarcasm", word_per_sentence_1, corpus_str_1),
):
    print(f"Min Word per Sentence of {class_name}: {min(ratios)}")
    print(f"Max Word per Sentence of {class_name}: {max(ratios)}")
    print(f"Avg Word per Sentence of {class_name}: {textstat.words_per_sentence(joined_text)}")

Min Word per Sentence of Non-Sarcasm: 3.0
Max Word per Sentence of Non-Sarcasm: 57.0
Avg Word per Sentence of Non-Sarcasm: 11.835809943129702
Min Word per Sentence of Sarcasm: 3.0
Max Word per Sentence of Sarcasm: 52.0
Avg Word per Sentence of Sarcasm: 12.247515599722671


In [12]:
# Flesch Reading Ease score bands (partial table):
#   70-79  Fairly Easy
#   60-69  Standard
#   50-59  Fairly Difficult
# Scores are computed on the joined strings: whole corpus, non-sarcasm, sarcasm.
reading_ease, reading_ease_0, reading_ease_1 = (
    textstat.flesch_reading_ease(text)
    for text in (corpus_str, corpus_str_0, corpus_str_1)
)
print(f"Reading Ease: {reading_ease}")
print(f"Reading Ease of Non-Sarcasm: {reading_ease_0}")
print(f"Reading Ease of Sarcasm: {reading_ease_1}")

Reading Ease: 76.22
Reading Ease of Non-Sarcasm: 76.42
Reading Ease of Sarcasm: 76.01


In [13]:
# ARI (Automated Readability Index): a number approximating the grade level
# needed to comprehend the text. Example: an ARI of 6.5 corresponds to a
# 6th-to-7th grade reading level.
readability, readability_0, readability_1 = map(
    textstat.automated_readability_index,
    (corpus_str, corpus_str_0, corpus_str_1),
)
print(f"Readability: {readability}")
print(f"Readability of Non-Sarcasm: {readability_0}")
print(f"Readability of Sarcasm: {readability_1}")

Readability: 6.9
Readability of Non-Sarcasm: 6.6
Readability of Sarcasm: 7.2


In [14]:
# SMOG index of the text. It is a grade formula: a score of 9.3 means that a
# ninth grader would be able to read the document.
smog = textstat.smog_index(corpus_str)
smog_0 = textstat.smog_index(corpus_str_0)
smog_1 = textstat.smog_index(corpus_str_1)

# Whole corpus first, then each class.
for heading, value in (
    ("Smog", smog),
    ("Smog of Non-Sarcasm", smog_0),
    ("Smog of Sarcasm", smog_1),
):
    print(f"{heading}: {value}")

Smog: 8.6
Smog of Non-Sarcasm: 8.5
Smog of Sarcasm: 8.7


In [15]:
# Consensus estimate of the school grade level required to understand the
# text, derived by textstat from its readability tests.
# float_output=True requests a numeric grade instead of a descriptive string.
txt_std = textstat.text_standard(corpus_str, float_output=True)
txt_std_0 = textstat.text_standard(corpus_str_0, float_output=True)
txt_std_1 = textstat.text_standard(corpus_str_1, float_output=True)
print(f"Overall Readability Consensus: {txt_std}")
print(f"Overall Readability Consensus of Non-Sarcasm: {txt_std_0}")
print(f"Overall Readability Consensus of Sarcasm: {txt_std_1}")

Overall Readability Consensus: 6.0
Overall Readability Consensus of Non-Sarcasm: 6.0
Overall Readability Consensus of Sarcasm: 6.0


In [16]:
# Download the VADER lexicon before the analyzer is constructed.
nltk.download('vader_lexicon')
from nltk.sentiment import SentimentIntensityAnalyzer
# Single analyzer instance; the sentiment cells call sia.polarity_scores(...) on each response.
sia = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


In [17]:
# Average polarity scores (neg / neu / pos / compound) over every response.
sent_scores = dict.fromkeys(('neg', 'neu', 'pos', 'compound'), 0)

# Accumulate running totals, one response at a time.
for sentence in corpus:
    score = sia.polarity_scores(sentence)
    for key in sent_scores:
        sent_scores[key] += score[key]

# Convert the totals into per-response means.
for key in sent_scores:
    sent_scores[key] /= len(corpus)
print(sent_scores)

{'neg': 0.08242480000000015, 'neu': 0.7768941999999979, 'pos': 0.14068079999999972, 'compound': 0.15483894000000004}


In [18]:
# Average polarity scores over the NOT_SARCASM responses (corpus_0).
score_keys = ('neg', 'neu', 'pos', 'compound')
sent_scores_0 = {name: 0 for name in score_keys}

# Running totals, added response by response in list order.
for sentence in corpus_0:
    score = sia.polarity_scores(sentence)
    for name in score_keys:
        sent_scores_0[name] += score[name]

# Totals -> means
for name in score_keys:
    sent_scores_0[name] /= len(corpus_0)
print(sent_scores_0)

{'neg': 0.06083839999999998, 'neu': 0.7773748000000006, 'pos': 0.16178839999999997, 'compound': 0.2691060799999999}


In [19]:
# Average polarity scores over the SARCASM responses (corpus_1).
sent_scores_1 = {'neg': 0, 'neu': 0, 'pos': 0, 'compound': 0}

# Add each response's scores onto the running totals.
for sentence in corpus_1:
    score = sia.polarity_scores(sentence)
    for metric in ('neg', 'neu', 'pos', 'compound'):
        sent_scores_1[metric] += score[metric]

# Divide every total by the number of responses to get the mean.
for metric in list(sent_scores_1):
    sent_scores_1[metric] /= len(corpus_1)
print(sent_scores_1)

{'neg': 0.10401119999999996, 'neu': 0.7764136000000006, 'pos': 0.11957319999999999, 'compound': 0.04057179999999999}
