In [1]:
import re
import string

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns

from collections import Counter

import spacy
from spacy.lang.en.stop_words import STOP_WORDS

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.svm import LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier

from sklearn.metrics import confusion_matrix, classification_report, RocCurveDisplay

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, MaxAbsScaler, Normalizer

import tensorflow.keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense, LSTM, Embedding, Bidirectional

from sentence_transformers import SentenceTransformer

from transformers import (AutoModelForSequenceClassification,
                          AutoTokenizer, AutoModelForSeq2SeqLM,
                          DataCollatorForSeq2Seq, Seq2SeqTrainer,
                          Seq2SeqTrainingArguments)

from datasets import Dataset, load_dataset, concatenate_datasets

from huggingface_hub import HfFolder

import evaluate
import nltk
from nltk.tokenize import sent_tokenize

nltk.download('punkt')

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to /Users/doski/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
data = pd.read_json('../Datasets/sarcasm_data.json')

In [3]:
data.head()

Unnamed: 0,160,170,180,190,1105,1162,1175,1182,1213,1276,...,2623,2131,2236,2546,2242,2169,2235,234,2608,2524
utterance,It's just a privilege to watch your mind at work.,I don't think I'll be able to stop thinking ab...,"Since it's not bee season, you can have my epi...","Lois Lane is falling, accelerating at an initi...",I'm just inferring this is a couch because the...,"Sheldon, if you were a robot, and I knew and y...",If you're compiling a mix CD for a double suic...,No. The dark crescent-shaped patterns under yo...,"How's this? ""Pleased to meet you, Dr. Gablehau...","Yeah, my parents felt that naming me Leonard a...",...,What do you know? Its a treat for the eyes and...,I really don't wanna sit with Allen Iverson ov...,"Gee, if only she were one and had no idea what...","Oh sure, she was probably up all night excited...",Are you still enjoying your nap?,"Hes not right for the part, and if I suggest h...","Oh yeah he has a caretaker his older brother, ...",Is it me or the greetings gone downhill around...,"You are right, by saying nice, I am virtually ...","Yes and we are ""very"" excited about it."
speaker,SHELDON,PENNY,SHELDON,SHELDON,SHELDON,HOWARD,SHELDON,SHELDON,SHELDON,LEONARD,...,CHANDLER,CHANDLER,CHANDLER,CHANDLER,CHANDLER,CHANDLER,CHANDLER,CHANDLER,CHANDLER,CHANDLER
context,[I never would have identified the fingerprint...,[This is one of my favorite places to kick bac...,"[Here we go. Pad thai, no peanuts., But does i...",[A marathon? How many Superman movies are ther...,"[Great Caesar's ghost, look at this place., So...","[On the other hand, no arousal., None?, None.,...","[Oh, good Lord., God, that's a good song.]","[How do I look?, Could you be more specific?, ...","[You know, I am not going to enjoy this party....",[He switched over to high-energy radiation res...,...,"[Chandler?, Hey! Ah!, What do you think?, I th...",[Wish I could switch with someone.],"[You can't go away this weekend, its Emma's bi...","[Hey, Hey, where's the birthday girl?, Oh she ...","[Okay?, Hi Emma, it's the year 2020.]","[What am I gonna do now?, Just pass the tape a...","[Helo! Anybody in there order a celebrity?, Wh...","[Hey, You son of a bitch!]","[Did I go to this school?, Hey, there's Missy ...","[Anyway, if you don't feel like being alone to..."
context_speakers,"[LEONARD, SHELDON]","[HOWARD, PENNY, HOWARD, HOWARD, HOWARD, PENNY,...","[LEONARD, HOWARD, LEONARD]","[PENNY, SHELDON, PENNY, SHELDON, SHELDON, PENN...","[SHELDON, LEONARD, SHELDON, SHELDON, SHELDON, ...","[PERSON, LEONARD, PERSON, LEONARD, PERSON, LEO...","[SHELDON, LEONARD]","[LEONARD, SHELDON, LEONARD]","[SHELDON, LEONARD, SHELDON, LEONARD, SHELDON, ...","[PERSON, PERSON, LEONARD, PERSON, LEONARD, PER...",...,"[PHOEBE, CHANDLER, MONICA, CHANDLER]",[CHANDLER],"[RACHEL, MONICA, RACHEL]","[MONICA, CHANDLER, RACHEL]","[ROSS, CHANDLER]","[CHANDLER, RACHEL]","[JOEY, PERSON, CHANDLER, PERSON]","[CHANDLER, JOEY]","[CHANDLER, ROSS, CHANDLER, ROSS]",[ROSS]
show,BBT,BBT,BBT,BBT,BBT,BBT,BBT,BBT,BBT,BBT,...,FRIENDS,FRIENDS,FRIENDS,FRIENDS,FRIENDS,FRIENDS,FRIENDS,FRIENDS,FRIENDS,FRIENDS


In [4]:
data = data.T

In [5]:
data.head()

Unnamed: 0,utterance,speaker,context,context_speakers,show,sarcasm
160,It's just a privilege to watch your mind at work.,SHELDON,[I never would have identified the fingerprint...,"[LEONARD, SHELDON]",BBT,True
170,I don't think I'll be able to stop thinking ab...,PENNY,[This is one of my favorite places to kick bac...,"[HOWARD, PENNY, HOWARD, HOWARD, HOWARD, PENNY,...",BBT,True
180,"Since it's not bee season, you can have my epi...",SHELDON,"[Here we go. Pad thai, no peanuts., But does i...","[LEONARD, HOWARD, LEONARD]",BBT,False
190,"Lois Lane is falling, accelerating at an initi...",SHELDON,[A marathon? How many Superman movies are ther...,"[PENNY, SHELDON, PENNY, SHELDON, SHELDON, PENN...",BBT,False
1105,I'm just inferring this is a couch because the...,SHELDON,"[Great Caesar's ghost, look at this place., So...","[SHELDON, LEONARD, SHELDON, SHELDON, SHELDON, ...",BBT,True


In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 690 entries, 160 to 2524
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   utterance         690 non-null    object
 1   speaker           690 non-null    object
 2   context           690 non-null    object
 3   context_speakers  690 non-null    object
 4   show              690 non-null    object
 5   sarcasm           690 non-null    object
dtypes: object(6)
memory usage: 53.9+ KB


In [7]:
data.show.value_counts(dropna=False)

FRIENDS           356
BBT               280
GOLDENGIRLS        40
SARCASMOHOLICS     14
Name: show, dtype: int64

In [8]:
data.sarcasm.value_counts(dropna=False)

True     345
False    345
Name: sarcasm, dtype: int64

In [9]:
data.rename(columns={'utterance': 'comment', 'sarcasm': 'label'}, inplace=True)

In [10]:
df = data[['comment', 'label']].reset_index()

In [11]:
df['comment'] = df['comment'].str.lower()

In [12]:
df.head()

Unnamed: 0,index,comment,label
0,160,it's just a privilege to watch your mind at work.,True
1,170,i don't think i'll be able to stop thinking ab...,True
2,180,"since it's not bee season, you can have my epi...",False
3,190,"lois lane is falling, accelerating at an initi...",False
4,1105,i'm just inferring this is a couch because the...,True


In [13]:
df.replace({'label': {True: 1, False: 0}}, inplace=True)

In [14]:
df.head()

Unnamed: 0,index,comment,label
0,160,it's just a privilege to watch your mind at work.,1
1,170,i don't think i'll be able to stop thinking ab...,1
2,180,"since it's not bee season, you can have my epi...",0
3,190,"lois lane is falling, accelerating at an initi...",0
4,1105,i'm just inferring this is a couch because the...,1


In [15]:
df['comment_words'] = df['comment'].apply(lambda x: x.split(' '))

In [16]:
df['message_len'] = df['comment_words'].apply(len)

In [17]:
df.head()

Unnamed: 0,index,comment,label,comment_words,message_len
0,160,it's just a privilege to watch your mind at work.,1,"[it's, just, a, privilege, to, watch, your, mi...",10
1,170,i don't think i'll be able to stop thinking ab...,1,"[i, don't, think, i'll, be, able, to, stop, th...",11
2,180,"since it's not bee season, you can have my epi...",0,"[since, it's, not, bee, season,, you, can, hav...",10
3,190,"lois lane is falling, accelerating at an initi...",0,"[lois, lane, is, falling,, accelerating, at, a...",51
4,1105,i'm just inferring this is a couch because the...,1,"[i'm, just, inferring, this, is, a, couch, bec...",20


In [18]:
Counter(df.comment_words.sum()).most_common(100)

[('i', 267),
 ('a', 257),
 ('the', 244),
 ('you', 228),
 ('to', 212),
 ('and', 146),
 ('of', 123),
 ('in', 99),
 ('that', 90),
 ('it', 77),
 ('my', 75),
 ('is', 75),
 ('just', 71),
 ("i'm", 65),
 ('was', 63),
 ('have', 60),
 ("don't", 55),
 ('for', 55),
 ('with', 55),
 ('on', 54),
 ('all', 51),
 ('your', 50),
 ('not', 49),
 ('if', 49),
 ('we', 49),
 ('what', 48),
 ("it's", 46),
 ('me', 45),
 ('but', 44),
 ('like', 43),
 ('no,', 42),
 ('no', 41),
 ('so', 41),
 ('do', 41),
 ('this', 40),
 ('about', 39),
 ('oh,', 37),
 ('are', 37),
 ('he', 35),
 ('be', 34),
 ('am', 33),
 ('an', 32),
 ('out', 32),
 ("you're", 32),
 ('oh', 32),
 ('can', 31),
 ('she', 31),
 ('at', 29),
 ('think', 28),
 ('go', 27),
 ('yeah,', 26),
 ('well,', 26),
 ('when', 26),
 ('her', 25),
 ('because', 25),
 ('up', 24),
 ('how', 24),
 ('see', 24),
 ('it.', 23),
 ('got', 23),
 ('know', 23),
 ("that's", 23),
 ('would', 22),
 ('going', 22),
 ('really', 22),
 ('you.', 21),
 ('good', 21),
 ('get', 21),
 ('gonna', 21),
 ('as', 20

In [19]:
df['comment_words'] = df['comment_words'].apply(
    lambda x: [''.join(re.findall('[\s\w\d]', word)) for word in x if (
        (word not in string.punctuation) and (word not in STOP_WORDS)
    )]
)

In [20]:
df.head()

Unnamed: 0,index,comment,label,comment_words,message_len
0,160,it's just a privilege to watch your mind at work.,1,"[its, privilege, watch, mind, work]",10
1,170,i don't think i'll be able to stop thinking ab...,1,"[dont, think, ill, able, stop, thinking, it]",11
2,180,"since it's not bee season, you can have my epi...",0,"[its, bee, season, epinephrine]",10
3,190,"lois lane is falling, accelerating at an initi...",0,"[lois, lane, falling, accelerating, initial, r...",51
4,1105,i'm just inferring this is a couch because the...,1,"[im, inferring, couch, evidence, suggests, cof...",20


In [21]:
df.head()

Unnamed: 0,index,comment,label,comment_words,message_len
0,160,it's just a privilege to watch your mind at work.,1,"[its, privilege, watch, mind, work]",10
1,170,i don't think i'll be able to stop thinking ab...,1,"[dont, think, ill, able, stop, thinking, it]",11
2,180,"since it's not bee season, you can have my epi...",0,"[its, bee, season, epinephrine]",10
3,190,"lois lane is falling, accelerating at an initi...",0,"[lois, lane, falling, accelerating, initial, r...",51
4,1105,i'm just inferring this is a couch because the...,1,"[im, inferring, couch, evidence, suggests, cof...",20


In [22]:
reddit_data = pd.read_csv('../Datasets/clean_reddit_data.csv')

In [23]:
train_reddit, test_reddit = train_test_split(
    reddit_data,
    train_size=10000-int(345*0.6),
    test_size=2000-int(345*0.4),
    random_state=9
)

In [24]:
train_movies, test_movies = train_test_split(
    df,
    train_size=0.6,
    random_state=9
)

In [25]:
train = pd.concat([train_reddit, train_movies], ignore_index=True)
test = pd.concat([test_reddit, test_movies], ignore_index=True)

In [26]:
train['label'] = train['label'].astype(str)
test['label'] = test['label'].astype(str)

In [27]:
MODEL_PATH = "google/flan-t5-base"

tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

In [28]:
dataset = load_dataset('imdb')

In [29]:
dataset.clear()

In [30]:
dataset['train'] = Dataset.from_pandas(train[['comment', 'label']])
dataset['test'] = Dataset.from_pandas(test[['comment', 'label']])

In [31]:
dataset

DatasetDict({
    train: Dataset({
        features: ['comment', 'label'],
        num_rows: 10207
    })
    test: Dataset({
        features: ['comment', 'label'],
        num_rows: 2138
    })
})

In [32]:
# The maximum total input sequence length after tokenization.
# Sequences longer than this will be truncated, sequences shorter will be padded.
tokenized_inputs = concatenate_datasets(
    [dataset['train'], dataset['test']]
)\
.map(
    lambda x: tokenizer(x['comment'], truncation=True),
    batched=True,
    remove_columns=['comment', 'label']
)
max_source_length = max([len(x) for x in tokenized_inputs['input_ids']])
print(f'Max source length: {max_source_length}')

# The maximum total sequence length for target text after tokenization.
# Sequences longer than this will be truncated, sequences shorter will be padded."
tokenized_targets = concatenate_datasets(
    [dataset['train'], dataset['test']]
)\
.map(
    lambda x: tokenizer(x['label'], truncation=True),
    batched=True,
    remove_columns=['comment', 'label']
)
max_target_length = max([len(x) for x in tokenized_targets['input_ids']])
print(f'Max target length: {max_target_length}')

Map: 100%|██████████| 12345/12345 [00:00<00:00, 17680.29 examples/s]


Max source length: 172


Map: 100%|██████████| 12345/12345 [00:00<00:00, 20655.46 examples/s]

Max target length: 3





In [33]:
def preprocess_function(sample, padding='max_length'):
    # add prefix to the input for t5
    inputs = [item for item in sample['comment']]

    # tokenize inputs
    model_inputs = tokenizer(inputs, max_length=max_source_length, padding=padding, truncation=True)

    # Tokenize targets with the `text_target` keyword argument
    labels = tokenizer(text_target=sample['label'], max_length=max_target_length, padding=padding, truncation=True)

    # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore
    # padding in the loss.
    if padding == 'max_length':
        labels['input_ids'] = [
            [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels['input_ids']
        ]

    model_inputs['labels'] = labels['input_ids']
    return model_inputs

In [34]:
tokenized_dataset = dataset.map(preprocess_function, batched=True, remove_columns=['comment', 'label'])
print(f"Keys of tokenized dataset: {list(tokenized_dataset['train'].features)}")

Map: 100%|██████████| 10207/10207 [00:01<00:00, 8080.53 examples/s]
Map: 100%|██████████| 2138/2138 [00:00<00:00, 7612.61 examples/s]

Keys of tokenized dataset: ['input_ids', 'attention_mask', 'labels']





In [35]:
# load model from the hub
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_PATH)

In [36]:
# Metric
metric = evaluate.load('accuracy')

In [37]:
# helper function to postprocess text
def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [label.strip() for label in labels]

    # rougeLSum expects newline after each sentence
    preds = ['\n'.join(sent_tokenize(pred)) for pred in preds]
    labels = ['\n'.join(sent_tokenize(label)) for label in labels]

    return preds, labels

In [38]:
def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels, average='macro')
    result = {k: round(v * 100, 4) for k, v in result.items()}
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result['gen_len'] = np.mean(prediction_lens)
    return result

In [39]:
# we want to ignore tokenizer pad token in the loss
label_pad_token_id = -100

# Data collator
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=label_pad_token_id,
    pad_to_multiple_of=8
)

In [40]:
# Hugging Face repository id
# repository_id = f"{model_id.split('/')[1]}-imdb-text-classification"

In [41]:
# Define training args
training_args = Seq2SeqTrainingArguments(
    output_dir='.',
#     output_dir=repository_id,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    predict_with_generate=True,
    fp16=False, # Overflows with fp16
    learning_rate=3e-4,

    num_train_epochs=2,
    # logging & evaluation strategies
#     logging_dir=f"{repository_id}/logs",
    logging_strategy="epoch",
    # logging_steps=1000,
    evaluation_strategy="no",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=False,
    # metric_for_best_model="overall_f1",
    # push to hub parameters
#     report_to="tensorboard",
#     push_to_hub=True,
#     hub_strategy="every_save",
#     hub_model_id=repository_id,
#     hub_token=HfFolder.get_token(),
)

In [42]:
# Create Trainer instance
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    compute_metrics=compute_metrics,
)

In [43]:
# Start training
trainer.train()

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
1276,0.2734
2552,0.1909


TrainOutput(global_step=2552, training_loss=0.2321525800938143, metrics={'train_runtime': 69642.3948, 'train_samples_per_second': 0.293, 'train_steps_per_second': 0.037, 'total_flos': 4805156328800256.0, 'train_loss': 0.2321525800938143, 'epoch': 2.0})

In [54]:
# trainer.evaluate()

In [45]:
trainer.save_model('../Models/Tuned-Flan-T5-Base')

In [46]:
from tqdm.auto import tqdm

In [47]:
samples_number = len(dataset['test'])
progress_bar = tqdm(range(samples_number))
predictions_list = []
labels_list = []

  0%|          | 0/2138 [00:00<?, ?it/s]

In [50]:
for i in range(samples_number):
    text = dataset['test']['comment'][i]
    inputs = tokenizer.encode_plus(text, padding='max_length', max_length=512, return_tensors='pt')
    outputs = model.generate(
        inputs['input_ids'], 
        attention_mask=inputs['attention_mask'], 
        max_length=150, 
        num_beams=4, 
        early_stopping=True
    )
    prediction = tokenizer.decode(outputs[0], skip_special_tokens=True)
    predictions_list.append(prediction)
    labels_list.append(dataset['test']['label'][i])

    progress_bar.update(1)

100%|██████████| 2138/2138 [1:15:44<00:00,  1.70s/it]  

In [51]:
str_labels_list = []

for i in range(len(labels_list)): 
    str_labels_list.append(str(labels_list[i]))

In [53]:
print(classification_report(str_labels_list, predictions_list, zero_division=0))

              precision    recall  f1-score   support

           0       0.67      0.63      0.65      1044
           1       0.67      0.71      0.69      1094

    accuracy                           0.67      2138
   macro avg       0.67      0.67      0.67      2138
weighted avg       0.67      0.67      0.67      2138

