In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
import torch
import transformers as ppb
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np
import re

from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk import pos_tag
from collections import defaultdict
from nltk.corpus import wordnet as wn

In [3]:
# Read the sarcasm-headlines dataset; lines=True parses the file as one JSON record per line.
dataset_path = r"./../../Dataset/Sarcasm_Headlines_Dataset_v2/Sarcasm_Headlines_Dataset_v2.json"
df_raw = pd.read_json(dataset_path, lines=True)

In [4]:
# Take a leading slice of the raw frame (presumably a small subset for quick experiments).
# NOTE(review): .loc slices by label and includes the end label, so with a default
# integer index this keeps rows 0..1500, i.e. 1501 rows — confirm that is intended.
# The next cell rebinds df to the full frame, which supersedes this subset.
df=df_raw.loc[:1500]

In [5]:
# Work on the complete dataset from here on. This binds df to the same object
# as df_raw; no copy is made.
df=df_raw

In [6]:
# Row and column counts of the working frame.
df.shape

(28619, 3)

In [7]:
# Column labels of the working frame.
df.columns

Index(['is_sarcastic', 'headline', 'article_link'], dtype='object')

In [8]:
# Preview the first rows of the working frame.
df.head()

Unnamed: 0,is_sarcastic,headline,article_link
0,1,thirtysomething scientists unveil doomsday clo...,https://www.theonion.com/thirtysomething-scien...
1,0,dem rep. totally nails why congress is falling...,https://www.huffingtonpost.com/entry/donna-edw...
2,0,eat your veggies: 9 deliciously different recipes,https://www.huffingtonpost.com/entry/eat-your-...
3,1,inclement weather prevents liar from getting t...,https://local.theonion.com/inclement-weather-p...
4,1,mother comes pretty close to using word 'strea...,https://www.theonion.com/mother-comes-pretty-c...


## BERT

### Loading Distil-BERT

In [9]:
# Choose the checkpoint and the matching model/tokenizer classes (DistilBERT here).
pretrained_weights = 'distilbert-base-uncased'
model_class = ppb.DistilBertModel
tokenizer_class = ppb.DistilBertTokenizer

# To use BERT Base instead, replace the three assignments above with:
# pretrained_weights = 'bert-base-uncased'
# model_class = ppb.BertModel
# tokenizer_class = ppb.BertTokenizer

# Instantiate the tokenizer and the model from the chosen pretrained checkpoint.
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

### Tokenization

In [10]:
def _encode_headline(text):
    """Return the token ids for one headline, with special tokens added."""
    return tokenizer.encode(text, add_special_tokens=True)


# One list of token ids per headline.
tokenized = df['headline'].apply(_encode_headline)

### Padding

In [11]:
# Length of every tokenized headline, and the longest one (0 when there are none).
sequence_lengths = [len(token_ids) for token_ids in tokenized.values]
max_len = max(sequence_lengths, default=0)

# Right-pad each id list with zeros up to max_len so the rows form a rectangular array.
padded = np.array([
    token_ids + [0] * (max_len - length)
    for token_ids, length in zip(tokenized.values, sequence_lengths)
])

In [12]:
# (number of headlines, padded sequence length)
padded.shape

(28619, 193)

### Masking

In [13]:
# 1 wherever the id is non-zero, 0 wherever it is zero (the value used for padding).
attention_mask = (padded != 0).astype(int)
attention_mask.shape

(28619, 193)

In [14]:
# Display the mask to eyeball it: ones for non-zero ids, zeros for padded positions.
attention_mask

array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]])

## Converting to Tensors

In [15]:
# Build the token-id tensor directly as 64-bit integers.
input_ids = torch.tensor(padded, dtype=torch.long)
input_ids.shape

torch.Size([28619, 193])

In [16]:
# Convert the NumPy mask into a torch tensor; the name attention_mask is rebound
# from the array to the tensor.
attention_mask = torch.tensor(attention_mask)
attention_mask.shape

torch.Size([28619, 193])

#### With CPU

In [17]:
# %%time
# with torch.no_grad():
#     last_hidden_states = model(input_ids, attention_mask=attention_mask)
# Disabled alternative: a single forward pass over the whole dataset on the CPU.
# The batched loop further down is what actually produces the embeddings.

#### With GPU

In [18]:
# Use the GPU when one is available; otherwise stay on the CPU so the notebook
# still runs on machines without CUDA (a hard-coded 'cuda' target raises there).
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
input_ids = input_ids.to(device)
attention_mask = attention_mask.to(device)
# Module.to moves the parameters in place and returns the module, so leaving it
# as the last expression still displays the model summary.
model.to(device)

DistilBertModel(
  (embeddings): Embeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer): Transformer(
    (layer): ModuleList(
      (0): TransformerBlock(
        (attention): MultiHeadSelfAttention(
          (dropout): Dropout(p=0.1, inplace=False)
          (q_lin): Linear(in_features=768, out_features=768, bias=True)
          (k_lin): Linear(in_features=768, out_features=768, bias=True)
          (v_lin): Linear(in_features=768, out_features=768, bias=True)
          (out_lin): Linear(in_features=768, out_features=768, bias=True)
        )
        (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (ffn): FFN(
          (dropout): Dropout(p=0.1, inplace=False)
          (lin1): Linear(in_features=768, out_features=3072, bias=True)
          (lin2): Linear(i

In [19]:
# Show both shapes, one per line; they are expected to match.
print(input_ids.shape, attention_mask.shape, sep='\n')

torch.Size([28619, 193])
torch.Size([28619, 193])


In [30]:
# Batch bookkeeping for the embedding-extraction loop.
batch_size = 250
total_rows = df.shape[0]
# Ceiling division: exactly enough batches to cover every row. The previous
# `total_rows // batch_size + 1` added an extra, empty batch whenever the row
# count was an exact multiple of batch_size.
no_of_batches = -(-total_rows // batch_size)
# Rows left over after the full batches; 0 means the final batch is a full one.
reminder = total_rows % batch_size
print("Batch Size =",batch_size)
print("Number of Batches =",no_of_batches)
print("Reminder in last batch =",reminder)

Batch Size = 250
Number of Batches = 115
Reminder in last batch = 119


In [31]:
%%time
embeddings=np.empty((0, 768), int)
for i in range(0,no_of_batches):
    a=batch_size*i
    input_ids_batch=input_ids[a:a+batch_size]
    attention_mask_batch=attention_mask[a:a+batch_size]
    print(i,input_ids_batch.shape)

    with torch.no_grad():
        last_hidden_states = model(input_ids_batch, attention_mask=attention_mask_batch)
        features = last_hidden_states[0][:,0,:].cpu().numpy()
        embeddings=np.append(embeddings,features,axis=0)
    


0 torch.Size([250, 193])
1 torch.Size([250, 193])
2 torch.Size([250, 193])
3 torch.Size([250, 193])
4 torch.Size([250, 193])
5 torch.Size([250, 193])
6 torch.Size([250, 193])
7 torch.Size([250, 193])
8 torch.Size([250, 193])
9 torch.Size([250, 193])
10 torch.Size([250, 193])
11 torch.Size([250, 193])
12 torch.Size([250, 193])
13 torch.Size([250, 193])
14 torch.Size([250, 193])
15 torch.Size([250, 193])
16 torch.Size([250, 193])
17 torch.Size([250, 193])
18 torch.Size([250, 193])
19 torch.Size([250, 193])
20 torch.Size([250, 193])
21 torch.Size([250, 193])
22 torch.Size([250, 193])
23 torch.Size([250, 193])
24 torch.Size([250, 193])
25 torch.Size([250, 193])
26 torch.Size([250, 193])
27 torch.Size([250, 193])
28 torch.Size([250, 193])
29 torch.Size([250, 193])
30 torch.Size([250, 193])
31 torch.Size([250, 193])
32 torch.Size([250, 193])
33 torch.Size([250, 193])
34 torch.Size([250, 193])
35 torch.Size([250, 193])
36 torch.Size([250, 193])
37 torch.Size([250, 193])
38 torch.Size([250, 19

In [32]:
from numpy import asarray
from numpy import save
# Persist the embedding matrix to disk in NumPy's .npy format.
np.save('embeddings.npy', embeddings)

In [33]:
# (number of headlines, embedding width) of the extracted feature matrix.
embeddings.shape

(28619, 768)

In [34]:
# load numpy array from npy file
from numpy import load
# Read the saved matrix back from disk and print it as a round-trip check.
embeddings_npy = np.load('embeddings.npy')
print(embeddings_npy)

[[-0.08782839  0.06258331 -0.12000587 ... -0.33156553  0.5002954
   0.1805836 ]
 [-0.43426287 -0.50481713 -0.36135322 ... -0.02944355  0.34979802
   0.26927984]
 [-0.13752083 -0.09303751  0.14121507 ... -0.0079867   0.26747227
   0.29764438]
 ...
 [-0.28889391 -0.20626098  0.15944281 ... -0.02881556  0.36084583
   0.32674789]
 [-0.66664398 -0.10732649 -0.48685697 ... -0.26801217  0.5380348
   0.31238717]
 [-0.05215537  0.08621484  0.14538945 ... -0.13530289  0.40199804
   0.26933154]]
