# **Sarcasm Detection using Hierarchical BERT**

## Data cleaning and transformation

In [7]:
import zipfile
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [8]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/

In [9]:
!kaggle datasets download -d danofer/sarcasm

Dataset URL: https://www.kaggle.com/datasets/danofer/sarcasm
License(s): copyright-authors
Downloading sarcasm.zip to /content
 53% 115M/216M [00:00<00:00, 1.20GB/s]
100% 216M/216M [00:00<00:00, 712MB/s] 


In [10]:
# extracting the xip contents
zip_ref = zipfile.ZipFile('/content/sarcasm.zip', 'r')
zip_ref.extractall('/content')
zip_ref.close()

In [11]:
df = pd.read_csv('/content/train-balanced-sarcasm.csv')
df.head(5)

Unnamed: 0,label,comment,author,subreddit,score,ups,downs,date,created_utc,parent_comment
0,0,NC and NH.,Trumpbart,politics,2,-1,-1,2016-10,2016-10-16 23:55:23,"Yeah, I get that argument. At this point, I'd ..."
1,0,You do know west teams play against west teams...,Shbshb906,nba,-4,-1,-1,2016-11,2016-11-01 00:24:10,The blazers and Mavericks (The wests 5 and 6 s...
2,0,"They were underdogs earlier today, but since G...",Creepeth,nfl,3,3,0,2016-09,2016-09-22 21:45:37,They're favored to win.
3,0,"This meme isn't funny none of the ""new york ni...",icebrotha,BlackPeopleTwitter,-8,-1,-1,2016-10,2016-10-18 21:03:47,deadass don't kill my buzz
4,0,I could use one of those tools.,cush2push,MaddenUltimateTeam,6,-1,-1,2016-12,2016-12-30 17:00:13,Yep can confirm I saw the tool they use for th...


In [12]:
df.shape

(1010826, 10)

In [13]:
df = df[:10000]
df = df[['label', 'comment']]
df.head(5)

Unnamed: 0,label,comment
0,0,NC and NH.
1,0,You do know west teams play against west teams...
2,0,"They were underdogs earlier today, but since G..."
3,0,"This meme isn't funny none of the ""new york ni..."
4,0,I could use one of those tools.


In [14]:
df.isna().sum()

Unnamed: 0,0
label,0
comment,1


In [15]:
df.dropna(inplace=True)
df.isna().sum()

Unnamed: 0,0
label,0
comment,0


In [16]:
# remove unwanted numbers and symbols
df['comment'] = df['comment'].str.replace(r'[^a-zA-Z\s]', "", regex=True)
df.head(5)

Unnamed: 0,label,comment
0,0,NC and NH
1,0,You do know west teams play against west teams...
2,0,They were underdogs earlier today but since Gr...
3,0,This meme isnt funny none of the new york nigg...
4,0,I could use one of those tools


In [17]:
# converting data into lowercase
df['comment'] = df['comment'].str.lower()
df.head(5)

Unnamed: 0,label,comment
0,0,nc and nh
1,0,you do know west teams play against west teams...
2,0,they were underdogs earlier today but since gr...
3,0,this meme isnt funny none of the new york nigg...
4,0,i could use one of those tools


## Tokenization

In [18]:
from transformers import BertTokenizer, TFBertModel

In [19]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [20]:
# function for tokenizer
def tokenize_data(text, max_length=100):
  return tokenizer(text.tolist(), padding='max_length', truncation=True, max_length=max_length, return_tensors='np')

tokenized_data = tokenize_data(df['comment'])

In [21]:
tokenized_data

{'input_ids': array([[  101, 13316,  1998, ...,     0,     0,     0],
       [  101,  2017,  2079, ...,     0,     0,     0],
       [  101,  2027,  2020, ...,     0,     0,     0],
       ...,
       [  101,  5095,  2305, ...,     0,     0,     0],
       [  101, 29420,  2015, ...,     0,     0,     0],
       [  101,  2016, 28719, ...,     0,     0,     0]]), 'token_type_ids': array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]]), 'attention_mask': array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]])}

## **Train Test Split**

In [22]:
from sklearn.model_selection import train_test_split

In [23]:
X = tokenized_data['input_ids']
y = df['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((7999, 100), (2000, 100), (7999,), (2000,))

## **Building the Model**

In [24]:
import tensorflow as tf
import warnings
warnings.filterwarnings('ignore')

In [25]:
class HierarchicalBert(tf.keras.Model):
  def __init__(self, bert_model, lstm_units, cnn_filters, dense_units):
    super(HierarchicalBert, self).__init__()
    self.bert = bert_model

    # sentence encoding layer
    self.dense_sentence = tf.keras.layers.Dense(768, activation='relu')

    # context summarisation layer
    self.mean_pooling = tf.keras.layers.GlobalAveragePooling1D()

    # context encoder layer
    self.bilstm = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(lstm_units, return_sequences=True))

    # CNN layer
    self.cnn = tf.keras.layers.Conv1D(cnn_filters, 2, activation='relu')
    self.pool = tf.keras.layers.GlobalMaxPooling1D()

    # fully connected layer
    self.dense_df = tf.keras.layers.Dense(dense_units, activation='relu')

    # output layers
    self.output_layer = tf.keras.layers.Dense(1, activation='sigmoid')

  def call(self, inputs):

    # BERT embeddings
    bert_output = self.bert(inputs)[0]

    # sentence encoding layer
    sentence_encoding = self.dense_sentence(bert_output)

    # context summarization
    context_summary = self.mean_pooling(sentence_encoding)
    # expand dimension
    context_summary = tf.expand_dims(context_summary, 1)

    # context encoder
    context_encoding = self.bilstm(context_summary)

    # sqeezing the dimension
    context_encoding = tf.squeeze(context_encoding, 1)

    # adding channel dimension to match the required input shape by conv layer
    context_encoding = tf.expand_dims(context_encoding, -1)

    # CNN layer
    cnn_output = self.cnn(context_encoding)

    # pooling layer
    pooled_output = self.pool(cnn_output)

    # fully connected layer
    dense_output = self.dense_df(pooled_output)

    # final output layer
    output = self.output_layer(dense_output)

    return output

In [26]:
# allow TensorFlow model to load from PyTorch weights
bert_model = TFBertModel.from_pretrained("bert-base-uncased", from_pt=True)

pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

TensorFlow and JAX classes are deprecated and will be removed in Transformers v5. We recommend migrating to PyTorch classes or pinning your version of Transformers.
Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Al

In [27]:
# defining the hierarchical bert model

model = HierarchicalBert(bert_model, lstm_units=128, cnn_filters=64, dense_units=32)

In [28]:
# compiling the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [29]:
import tensorflow as tf
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))
print("Built with CUDA:", tf.test.is_built_with_cuda())
print("Is GPU available:", tf.test.is_gpu_available(cuda_only=False, min_cuda_compute_capability=None))

Num GPUs Available:  1
Built with CUDA: True
Is GPU available: True


In [30]:
# fit the model
model.fit(X_train, y_train, epochs=5, batch_size=32)

Epoch 1/5




Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tf_keras.src.callbacks.History at 0x7b330e902f00>

In [31]:
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Loss: {loss:.4f}, Test Accuracy: {accuracy:.4f}")

Test Loss: 0.6549, Test Accuracy: 0.6375
