In [3]:
import torch   
from torchtext import data 
torch.manual_seed(2021)
from torchtext.legacy.data import Field, TabularDataset, LabelField, BucketIterator, NestedField

In [131]:
import pandas as pd
# Load the raw training set (one JSON object per line).
twitter = pd.read_json("sarcasm_detection_shared_task_twitter_training.jsonl", lines=True)
# Each `context` entry is a list of turns; keep only the last one.
# Assigning the whole column at once avoids chained assignment
# (`twitter.context[i] = ...`), which triggers SettingWithCopyWarning and is
# not guaranteed to write back into the DataFrame.
twitter["context"] = twitter["context"].map(lambda turns: turns[-1])
# Persist the flattened version; this is the file the TabularDataset reads.
twitter.to_json("sarcasm_detection_shared_task_twitter_training_context.jsonl", orient = 'records', lines=True)

"Watch this clip - Shows a ' Free Kashmir ' paid billboard in New Jersey , USA . I came across many such billboards . Who is paying for them ? Why ? An entire anti-India nexus is out there to destabilize India . Countless ignorant students falling in the trap . Must be made aware of facts ! <URL>"

In [132]:
# Separate fields (and therefore separate vocabularies) for the context turn
# and the response. include_lengths=True is why batch.text / batch.context
# are unpacked as (tensor, lengths) pairs in train/evaluate.
CONTEXT = Field(tokenize='spacy',batch_first=True,include_lengths=True)
TEXT = Field(tokenize='spacy',batch_first=True,include_lengths=True)
# Float labels, as required by the BCELoss criterion used for training.
LABEL = LabelField(dtype = torch.float,batch_first=True)




In [133]:
# JSON key -> (attribute name on each example/batch, Field that processes it).
# Note that the JSON key 'response' is exposed as `.text`.
fields = {'label':('label', LABEL), 'response':('text', TEXT), 'context':('context', CONTEXT)}

In [134]:
# Read the flattened-context JSONL file written by the preprocessing cell above.
training_data=TabularDataset(path = 'sarcasm_detection_shared_task_twitter_training_context.jsonl', format = 'json', fields = fields,)

In [135]:
# Sanity check: show the tokenized context of one example.
training_data.examples[1].context

['@USER',
 '@USER',
 'having',
 'to',
 'make',
 'up',
 'excuses',
 'of',
 'why',
 'your',
 'crowd',
 'was',
 'small',
 '.']

In [136]:
import random
# `random.seed()` returns None, so the original `random_state=random.seed(2021)`
# only worked through its side effect on the global RNG. Seed first, then pass
# the resulting RNG state explicitly so the 90/10 split is visibly reproducible.
random.seed(2021)
train_data, valid_data = training_data.split(split_ratio=0.9, random_state=random.getstate())

In [137]:
# Build the response vocabulary from the training split only and attach
# 100-dimensional GloVe Twitter vectors to it.
TEXT.build_vocab(train_data, min_freq=1,vectors = "glove.twitter.27B.100d")
# Same embeddings, but a separate vocabulary, for the context field.
CONTEXT.build_vocab(train_data, min_freq=1,vectors = "glove.twitter.27B.100d")
LABEL.build_vocab(train_data,)

# Unique tokens in response (left disabled; the line below is truncated in the original)
# print("Size of TEXT vocabulary:", TEXT.vocab.freqs.

# NOTE(review): the message says LABEL, but the value printed is the size of
# the CONTEXT vocabulary (len(CONTEXT.vocab)).
print("Size of LABEL vocabulary:",len(CONTEXT.vocab))
# print(TEXT.vocab.vectors[50])
# Ten most frequent context tokens
print(CONTEXT.vocab.freqs.most_common(10))
  

Size of LABEL vocabulary: 14935
[('@USER', 7333), ('.', 5785), ('the', 3267), (',', 3068), ('to', 2606), ('I', 2092), ('a', 2063), ('and', 1832), ('you', 1791), ('of', 1641)]


In [138]:
# Show the label -> index mapping. The model's sigmoid output is trained
# against these indices, so an output near 1 means the label mapped to 1.
print(LABEL.vocab.stoi)

defaultdict(None, {'SARCASM': 0, 'NOT_SARCASM': 1})


In [139]:
# Auto-detect CUDA ...
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')  
BATCH_SIZE = 16
# ... NOTE: but the detected device is overridden here, so everything below
# runs on the CPU regardless of CUDA availability.
device = 'cpu'

# Iterators over the train/validation splits. Batches are keyed on the number
# of response tokens (`text`) and sorted by that key within each batch.
train_iterator, valid_iterator = BucketIterator.splits(
    (train_data, valid_data), 
    batch_size = BATCH_SIZE,
    sort_key = lambda x: len(x.text),
    sort_within_batch=True,
    device = device)


In [140]:
# Number of examples in the training split (90% of the data per split_ratio).
len(train_data)

4500

In [141]:
import torch.nn as nn

class classifier(nn.Module):
    """LSTM classifier over a (response, context) pair.

    The response and the context are embedded with their own frozen
    pretrained embedding tables, encoded by the *same* LSTM, and the two
    final hidden states are concatenated and fed to a linear layer followed
    by a sigmoid.

    Args:
        vocab_size: kept for interface compatibility; unused because both
            embedding tables take their size from the pretrained weights.
        embedding_dim: width of the embedding vectors (LSTM input size); must
            match the second dimension of `weights` / `weights_context`.
        hidden_dim: LSTM hidden size per direction.
        output_dim: number of output units.
        n_layers: number of stacked LSTM layers.
        weights: pretrained embedding matrix for the response vocabulary.
        weights_context: pretrained embedding matrix for the context vocabulary.
        bidirectional: whether the LSTM runs in both directions.
        dropout: dropout between LSTM layers (only meaningful if n_layers > 1).
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, weights, weights_context,
                 bidirectional, dropout):
        super().__init__()

        # The original code hard-coded "two directions" (hidden_dim * 4 and
        # hidden[-2]/hidden[-1]); derive it from the flag instead so the
        # unidirectional configuration is handled correctly.
        self.num_directions = 2 if bidirectional else 1

        # Embedding tables initialised from pretrained vectors.
        # from_pretrained() freezes the weights by default, so they are not trained.
        self.embedding = nn.Embedding.from_pretrained(weights)
        self.embedding_context = nn.Embedding.from_pretrained(weights_context)

        # One LSTM shared by both inputs. Inter-layer dropout is undefined for a
        # single layer (PyTorch warns), so it is disabled in that case.
        self.lstm = nn.LSTM(embedding_dim,
                            hidden_dim,
                            num_layers=n_layers,
                            bidirectional=bidirectional,
                            dropout=dropout if n_layers > 1 else 0.0,
                            batch_first=True)

        # Input = final state of the response + final state of the context,
        # each of width hidden_dim * num_directions.
        self.fc = nn.Linear(hidden_dim * self.num_directions * 2, output_dim)

        # Squash logits to probabilities (paired with BCELoss).
        self.act = nn.Sigmoid()

    def _final_state(self, embedded, lengths):
        """Run the shared LSTM over a padded batch and return the top layer's final hidden state."""
        # enforce_sorted=False lets the batch arrive in any length order;
        # lengths must live on the CPU for packing.
        packed = nn.utils.rnn.pack_padded_sequence(
            embedded, lengths.cpu(), batch_first=True, enforce_sorted=False)
        _, (hidden, _) = self.lstm(packed)
        # hidden = [num layers * num directions, batch size, hid dim]
        # (batch_first does not apply to the returned states)
        if self.num_directions == 2:
            # last two entries = forward and backward states of the top layer
            return torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        return hidden[-1, :, :]

    def forward(self, text, text_lengths, context, context_lengths):
        """Return sigmoid outputs of shape [batch size, output_dim].

        Args:
            text: response token ids, [batch size, sent_len].
            text_lengths: unpadded length of each response.
            context: context token ids, [batch size, context_len].
            context_lengths: unpadded length of each context.
        """
        # embedded = [batch size, sent_len, emb dim]
        embedded = self.embedding(text)
        embedded_context = self.embedding_context(context)

        hidden = self._final_state(embedded, text_lengths)
        hidden_context = self._final_state(embedded_context, context_lengths)

        # hidden = [batch size, hid dim * num directions * 2]
        hidden = torch.cat((hidden, hidden_context), dim=1)
        dense_outputs = self.fc(hidden)

        # Final activation function
        outputs = self.act(dense_outputs)

        return outputs

In [142]:
#define hyperparameters
size_of_vocab = len(TEXT.vocab)
embedding_dim = 100      # must match the width of the pretrained vectors (100d GloVe)
num_hidden_nodes = 300
num_output_nodes = 1
num_layers = 3
bidirection = True
dropout = 0.2

# Pretrained vectors for the response vocabulary; the classifier loads them
# itself via nn.Embedding.from_pretrained, so no manual weight copy is needed.
pretrained_embeddings = TEXT.vocab.vectors

#instantiate the model
# `bidirection` was previously defined but ignored (the call hard-coded True);
# it is now actually passed through.
model = classifier(size_of_vocab, embedding_dim, num_hidden_nodes, num_output_nodes, num_layers,
                   pretrained_embeddings, CONTEXT.vocab.vectors,
                   bidirectional = bidirection, dropout = dropout)

In [143]:
# Print the module tree (layers and their sizes).
print(model)

# Number of trainable parameters
def count_parameters(model):
    """Return the total element count of all parameters that require gradients."""
    trainable = (param for param in model.parameters() if param.requires_grad)
    return sum(map(lambda param: param.numel(), trainable))
    
print(f'The model has {count_parameters(model):,} trainable parameters')

# Shape of the pretrained response-embedding matrix (this only prints; nothing is initialised here).

print(pretrained_embeddings.shape)

classifier(
  (embedding): Embedding(14667, 100)
  (embedding_context): Embedding(14935, 100)
  (lstm): LSTM(100, 300, num_layers=3, batch_first=True, dropout=0.2, bidirectional=True)
  (fc): Linear(in_features=1200, out_features=1, bias=True)
  (act): Sigmoid()
)
The model has 5,295,601 trainable parameters
torch.Size([14667, 100])


In [144]:
import torch.optim as optim

#define optimizer and loss
optimizer = optim.Adam(model.parameters(), lr=0.0001)
# BCELoss expects probabilities; the classifier's forward() ends in a Sigmoid.
criterion = nn.BCELoss()

#define metric
def binary_accuracy(preds, y):
    """Return the fraction of rounded predictions equal to the targets.

    Args:
        preds: probabilities in [0, 1], any shape (e.g. [batch] or [batch, 1]).
        y: 0/1 targets with the same number of elements as `preds`.

    Returns:
        A 0-dim tensor holding the accuracy.
    """
    # Flatten both sides first: comparing [batch, 1] against [batch] would
    # otherwise broadcast to [batch, batch] and yield a meaningless ratio.
    rounded_preds = torch.round(preds).reshape(-1)
    targets = y.reshape(-1)

    correct = (rounded_preds == targets).float()
    return correct.mean()
    
# Move the model and the loss to the configured `device`.
model = model.to(device)
criterion = criterion.to(device)

In [145]:
from sklearn.metrics import accuracy_score


def train(model, iterator, optimizer, criterion):
    """Run one training epoch.

    Args:
        model: module called as model(text, text_lengths, context, context_lengths).
        iterator: sized iterable of batches exposing `.text` and `.context`
            as (tensor, lengths) pairs and `.label` as a tensor.
        optimizer: optimizer over the model's parameters.
        criterion: loss taking (predictions, labels).

    Returns:
        (mean loss per batch, mean accuracy per batch) — both are averaged
        over batches, not over individual samples.
    """
    #initialize every epoch
    epoch_loss = 0
    epoch_acc = 0

    #set the model in training phase
    model.train()

    for batch in iterator:

        #resets the gradients after every batch
        optimizer.zero_grad()

        #retrieve text and no. of words
        text, text_lengths = batch.text
        context, context_lengths = batch.context

        # Flatten to 1-D explicitly. A bare .squeeze() would also drop the batch
        # dimension of a single-example batch and break the metric below.
        predictions = model(text, text_lengths, context, context_lengths).reshape(-1)
        labels = batch.label.reshape(-1)

        #compute the loss
        loss = criterion(predictions, labels)

        # Batch accuracy computed on the tensors themselves, so it works on any
        # device (no .numpy() round trip).
        hits = (predictions.detach().round() == labels).sum().item()
        acc = hits / labels.numel()

        #backpropagate the loss and compute the gradients
        loss.backward()

        #update the weights
        optimizer.step()

        #loss and accuracy
        epoch_loss += loss.item()
        epoch_acc += acc

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [146]:


def evaluate(model, iterator, criterion):
    """Compute loss and accuracy over `iterator` without updating the model.

    Args:
        model: module called as model(text, text_lengths, context, context_lengths).
        iterator: sized iterable of batches exposing `.text` and `.context`
            as (tensor, lengths) pairs and `.label` as a tensor.
        criterion: loss taking (predictions, labels).

    Returns:
        (mean loss per batch, mean accuracy per batch) — both are averaged
        over batches, not over individual samples.
    """
    #initialize every epoch
    epoch_loss = 0
    epoch_acc = 0

    #deactivating dropout layers
    model.eval()

    #deactivates autograd
    with torch.no_grad():

        for batch in iterator:

            #retrieve text and no. of words
            text, text_lengths = batch.text
            context, context_lengths = batch.context

            # Flatten to 1-D explicitly. A bare .squeeze() would also drop the
            # batch dimension of a single-example batch and break the metric.
            predictions = model(text, text_lengths, context, context_lengths).reshape(-1)
            labels = batch.label.reshape(-1)

            #compute loss and accuracy
            loss = criterion(predictions, labels)
            # Accuracy on the tensors themselves: device-independent, no .numpy().
            hits = (predictions.round() == labels).sum().item()
            acc = hits / labels.numel()

            #keep track of loss and accuracy
            epoch_loss += loss.item()
            epoch_acc += acc

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [147]:
N_EPOCHS = 10
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    #train the model
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)

    #evaluate the model
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)

    # Checkpoint whenever the validation loss improves. The "Epoch N" header is
    # printed only for those improving epochs. (The former `if (epoch+1)<=5`
    # split had two identical branches and has been collapsed.)
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        print(f"\nEpoch {epoch+1}")
        torch.save(model.state_dict(), 'saved_weights.pt')

    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')


Epoch 1
	Train Loss: 0.595 | Train Acc: 67.29%
	 Val. Loss: 0.536 |  Val. Acc: 73.44%
	Train Loss: 0.519 | Train Acc: 74.29%
	 Val. Loss: 0.543 |  Val. Acc: 72.27%

Epoch 3
	Train Loss: 0.501 | Train Acc: 74.89%
	 Val. Loss: 0.512 |  Val. Acc: 73.44%
	Train Loss: 0.488 | Train Acc: 76.11%
	 Val. Loss: 0.528 |  Val. Acc: 72.46%
	Train Loss: 0.478 | Train Acc: 76.57%
	 Val. Loss: 0.528 |  Val. Acc: 71.88%
	Train Loss: 0.462 | Train Acc: 77.02%
	 Val. Loss: 0.527 |  Val. Acc: 73.63%

Epoch 7
	Train Loss: 0.456 | Train Acc: 77.66%
	 Val. Loss: 0.511 |  Val. Acc: 72.66%
	Train Loss: 0.447 | Train Acc: 77.75%
	 Val. Loss: 0.517 |  Val. Acc: 72.85%

Epoch 9
	Train Loss: 0.434 | Train Acc: 78.66%
	 Val. Loss: 0.509 |  Val. Acc: 73.63%

Epoch 10
	Train Loss: 0.421 | Train Acc: 78.99%
	 Val. Loss: 0.503 |  Val. Acc: 72.85%


In [148]:
# Restore the checkpoint written by the training loop and switch the model to
# evaluation mode (predict() below relies on this; it does not call eval() itself).
path='saved_weights.pt'
model.load_state_dict(torch.load(path))
model.eval()

# spaCy pipeline whose tokenizer is used by predict() to split raw strings.
import spacy
nlp = spacy.load('en_core_web_sm')

def predict(model, sentence, cont):
    """Return the model's sigmoid output for a single (response, context) pair.

    Args:
        model: trained classifier; expected to already be in eval mode.
        sentence: raw response string.
        cont: raw context string (a single turn).

    Returns:
        The model output as a Python float; values near 1 correspond to the
        label that LABEL.vocab maps to index 1.
    """
    def encode(field, raw_text):
        # Tokenize, map tokens to ids of `field`'s vocabulary and shape the
        # result as a batch of one: ids [1, n_tokens], lengths [1].
        tokens = [tok.text for tok in nlp.tokenizer(raw_text)]
        indices = [field.vocab.stoi[t] for t in tokens]
        if not indices:
            # An empty string has no tokens and a zero-length sequence cannot be
            # packed; fall back to a single unknown token instead of crashing.
            indices = [field.vocab.stoi[field.unk_token]]
        ids = torch.LongTensor(indices).to(device).unsqueeze(0)
        lengths = torch.LongTensor([len(indices)])
        return ids, lengths

    tensor, length_tensor = encode(TEXT, sentence)
    tensor_context, length_tensor_context = encode(CONTEXT, cont)

    # Inference only: no need to record the autograd graph.
    with torch.no_grad():
        prediction = model(tensor, length_tensor, tensor_context, length_tensor_context)
    return prediction.item()

In [149]:
import pandas as pd
# Load the held-out test set (note: this rebinds `twitter`, previously the training frame).
twitter = pd.read_json("sarcasm_detection_shared_task_twitter_testing.jsonl", lines=True)

# Encode the gold labels with the same label -> index mapping the model was
# trained against, instead of a hand-written 0/1 rule that has to be flipped
# whenever the vocabulary order changes (e.g. with a different training set).
input_labels = [LABEL.vocab.stoi[x] for x in twitter.label]
print(len(input_labels))

1800


In [151]:
# Hard 0/1 prediction for every test pair, using only the last context turn.
outs = [
    round(predict(model, response, turns[-1]))
    for response, turns in zip(twitter.response, twitter.context)
]

In [152]:
# Spot-check the raw (unrounded) model output for one test example.
# NOTE(review): this passes the FIRST context turn (context[2][0]), whereas the
# training preprocessing and the bulk prediction loop use the last turn
# (c[-1]) — confirm which one is intended.
predict(model,twitter.response[2],twitter.context[2][0])

0.7289590239524841

In [153]:
# Count the test examples whose rounded prediction equals the gold label.
c = sum(1 for x, y in zip(outs, input_labels) if x == y)
print(c)
# Accuracy: divide by the actual number of labels rather than a hard-coded 1800.
print(c/len(input_labels))

1215
0.675


In [155]:
from sklearn.metrics import f1_score
# Macro-averaged F1 over both classes (gold labels first, predictions second).
f1 = f1_score(input_labels, outs, average='macro')
f1

0.6744347826086956

In [156]:
from sklearn import metrics
# NOTE: precision/recall are reported for the class encoded as 1 (sklearn's
# default positive label), i.e. the non-'SARCASM' class under the encoding of
# input_labels. Pass pos_label=0 to report them for the sarcasm class instead.

# Precision: what fraction of predicted positives are truly positive?
precision = metrics.precision_score(input_labels, outs)
print("Precision:", precision)

# Recall: what fraction of true positives are labelled as such?
recall = metrics.recall_score(input_labels, outs)
print("Recall:", recall)

print(f1_score(input_labels, outs, average='macro'))

Precision: 0.6615384615384615
Recall: 0.7166666666666667
0.6744347826086956
