# Colab: Connect Google Drive

In [1]:
from google.colab import drive

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
%cd /content/drive/MyDrive/Colab\ Notebooks/pytorch-multimodal_sarcasm_detection

/content/drive/MyDrive/Colab Notebooks/pytorch-multimodal_sarcasm_detection


# Bertweet

In [5]:
!pip install transformers
!pip3 -q install emoji



In [6]:
import torch 
from transformers import AutoModel, AutoTokenizer
from transformers import (get_linear_schedule_with_warmup,AdamW,AutoModel, AutoTokenizer, AutoModelForSequenceClassification)
from torch.utils.data import (TensorDataset,DataLoader, RandomSampler, SequentialSampler, Dataset)
import ast
import pandas as pd

from sklearn.utils import shuffle

import os
import random
import time
import datetime
import torch
import argparse
import numpy as np
import pandas as pd
from torch.nn import functional as F

In [7]:
model_1 = torch.load('/content/drive/MyDrive/Colab Notebooks/berttweet_2epoch.pt')

In [8]:
def load_data_lists(path):
    data_points_lists = []
    with open(path, encoding='utf-8') as f:
        lines = f.readlines()

        for line in lines:
            try:
                data_points_lists.append(ast.literal_eval(line))
            except:
                # Ignore lines with errors
                pass

    print('Found {} lines in "{}".'.format(len(lines), path))
    print('Successfully loaded {} data points from "{}".'.format(len(data_points_lists), path))
    
    return data_points_lists

COLUMN_NAMES = ['ID', 'Text', 'Sarcastic']

In [9]:
def construct_df(data_points_lists, column_names=COLUMN_NAMES):
    df = pd.DataFrame(data_points_lists, columns=column_names)
    df['ID'] = pd.to_numeric(df['ID'])
    df['Sarcastic'] = df['Sarcastic'].astype('bool')
    
    return df

In [10]:
train_df = construct_df(load_data_lists('text_data/train.txt'))

train_df.info()
train_df.head()

Found 29040 lines in "text_data/train.txt".
Successfully loaded 29040 data points from "text_data/train.txt".
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29040 entries, 0 to 29039
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   ID         29040 non-null  int64 
 1   Text       29040 non-null  object
 2   Sarcastic  29040 non-null  bool  
dtypes: bool(1), int64(1), object(1)
memory usage: 482.2+ KB


Unnamed: 0,ID,Text,Sarcastic
0,910308516510011393,most # funny quotes : 21 snarky and # funny qu...,True
1,725333760762363905,spurs # creativethinking ! <url>,True
2,840006160660983809,<user> thanks for showing up for our appointme...,True
3,854334602516733952,only a hardcore fan of sir jonny sins will get...,True
4,908913372199915520,haha . # lol,True


In [11]:
valid_df = construct_df(load_data_lists('text_data/valid.txt'), column_names=COLUMN_NAMES + ['Sarc_2'])

valid_df.info()
valid_df.head()

Found 2410 lines in "text_data/valid.txt".
Successfully loaded 2410 data points from "text_data/valid.txt".
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2410 entries, 0 to 2409
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   ID         2410 non-null   int64 
 1   Text       2410 non-null   object
 2   Sarcastic  2410 non-null   bool  
 3   Sarc_2     2410 non-null   int64 
dtypes: bool(1), int64(2), object(1)
memory usage: 59.0+ KB


Unnamed: 0,ID,Text,Sarcastic,Sarc_2
0,915657464401580032,whew ... that extra <num> miles today to the g...,True,1
1,854678856724340736,""" oh , good . now no one will know we 're here...",True,1
2,904892917277274112,how much of it you think is true ? has this be...,True,1
3,855466461296504832,<user> finally found proof that the earth is f...,True,1
4,927373534652805120,many ways to overcome tension & fear but nothi...,True,1


In [12]:
test_df = construct_df(load_data_lists('text_data/test.txt'), column_names=COLUMN_NAMES + ['Sarc_2'])

test_df.info()
test_df.head()

Found 2409 lines in "text_data/test.txt".
Successfully loaded 2409 data points from "text_data/test.txt".
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2409 entries, 0 to 2408
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   ID         2409 non-null   int64 
 1   Text       2409 non-null   object
 2   Sarcastic  2409 non-null   bool  
 3   Sarc_2     2409 non-null   int64 
dtypes: bool(1), int64(2), object(1)
memory usage: 58.9+ KB


Unnamed: 0,ID,Text,Sarcastic,Sarc_2
0,862902619928506372,i am guessing # netflix no longer lets you gra...,True,1
1,892551658487631873,it 's the insensitive strikeouts at suntrust p...,True,1
2,853143461360480256,"following the path of the river calder , so .....",True,1
3,918423568823840768,# westernsahara # authority has no lessons 2ge...,True,1
4,731617467718610944,hey <user> great sale !,True,1


In [13]:
train_df = shuffle(train_df, random_state=42)
# valid_df = shuffle(valid_df, random_state=42)
# test_df = shuffle(test_df, random_state=42)

In [14]:
def bert_encode(df, tokenizer):
    input_ids = []
    attention_masks = []
    # print(df)
    for sent in df[['Text']].values:
        sent = sent.item()
        encoded_dict = tokenizer.encode_plus(
                            sent,                      
                            add_special_tokens = True, 
                            max_length = 128,           
                            pad_to_max_length = True,
                            truncation = True,
                            return_attention_mask = True,   
                            return_tensors = 'pt',    
                    )
           
        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])
    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)

    inputs = {
    'input_word_ids': input_ids,
    'input_mask': attention_masks}

    return inputs

In [15]:
def prepare_dataloaders(train_df,test_df,batch_size=64):
    # Load the AutoTokenizer with a normalization mode if the input Tweet is raw
    
    tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base", use_fast=False, normalization=True)
    
    tweet_train = bert_encode(train_df, tokenizer)
    tweet_train_labels = train_df['Sarcastic'].astype(int)
    
    tweet_test = bert_encode(test_df, tokenizer)

    input_ids, attention_masks = tweet_train.values()
    labels = torch.tensor(tweet_train_labels.values)
    train_dataset = TensorDataset(input_ids, attention_masks, labels)

    
    input_ids, attention_masks = tweet_test.values()
    test_dataset = TensorDataset(input_ids, attention_masks)

    
    train_dataloader = DataLoader(
                train_dataset,
                sampler = RandomSampler(train_dataset), 
                batch_size = batch_size 
            )


    test_dataloader = DataLoader(
                test_dataset, 
                sampler = SequentialSampler(test_dataset), 
                batch_size = batch_size
            )
    return train_dataloader, test_dataloader

In [16]:
def predict(model,test_dataloader):
    model.eval()
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model.to(device)
    preds = []

    for batch in test_dataloader:
        
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        with torch.no_grad():        
            outputs = model(b_input_ids, 
                                   token_type_ids=None, 
                                   attention_mask=b_input_mask)
            logits = outputs.logits

        logits = logits.detach().cpu().numpy()
        for logit in logits:
            preds.append(logit)
    # preds
    return preds

In [17]:
train_dataloader,test_dataloader = prepare_dataloaders(train_df, test_df)
_,val_dataloader = prepare_dataloaders(train_df, valid_df)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [18]:
pred_bertweet = predict(model_1,test_dataloader)

In [19]:
pred_bertweet_label = np.argmax(pred_bertweet,axis=1)

In [20]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, accuracy_score, recall_score

In [21]:
print(accuracy_score(test_df['Sarcastic'],pred_bertweet_label))
print(f1_score(test_df['Sarcastic'],pred_bertweet_label))
print(precision_score(test_df['Sarcastic'],pred_bertweet_label))
print(recall_score(test_df['Sarcastic'],pred_bertweet_label))

0.9028642590286425
0.8767123287671234
0.9316909294512878
0.8278606965174129


# Fusion model setup

In [22]:
import shutil

In [None]:
os.mkdir('image_data/test')
for id in test_df['ID']:
  img_path = 'image_data/{}.jpg'.format(id)
  if os.path.isfile(img_path):
    shutil.move(img_path, 'image_data/test/{}.jpg'.format(id))

In [None]:
os.mkdir('image_data/train')
for id in train_df['ID']:
  img_path = 'image_data/{}.jpg'.format(id)
  if os.path.isfile(img_path):
    shutil.move(img_path, 'image_data/train/{}.jpg'.format(id))

In [None]:
os.mkdir('image_data/valid')
for id in valid_df['ID']:
  img_path = 'image_data/{}.jpg'.format(id)
  if os.path.isfile(img_path):
    shutil.move(img_path, 'image_data/valid/{}.jpg'.format(id))

In [None]:
from PIL import Image
from torch.utils.data import Dataset, DataLoader,random_split
import torchvision
from torchvision import datasets, models, transforms
import numpy as np
import matplotlib.pyplot as plt
import time
import os
import PIL
import pickle

# Test .py

In [36]:
!python LoadData1.py

image feature torch.Size([32, 196, 2048]) torch.FloatTensor
attribute index torch.Size([32, 5]) torch.LongTensor
group tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1]) torch.LongTensor
image id tensor([862902619928506372, 892551658487631873, 853143461360480256,
        918423568823840768, 731617467718610944, 684633048483106816,
        722492930016026624, 702345480924041217, 859056671179636737,
        923929354610053120, 919651889511313408, 877816574240972802,
        806888630031622148, 712810737811505153, 894741372527411200,
        732682873166159872, 722871462915235842, 839086335096942592,
        802576734117691392, 726980104090456065, 939714200456318981,
        884901385141190657, 824024594755518464, 895860997054799872,
        924449074568400896, 840225661117706240, 821997332346437632,
        863886502698156032, 736082451424890881, 821870486372155393,
        914153415105753092, 712023362784989184]) torch.LongTen

In [37]:
!python ImageFeature.py

torch.Size([32, 1024])
torch.Size([196, 32, 1024])


In [38]:
!python AttributeFeature.py

torch.Size([32, 200])
torch.Size([32, 5, 200])


In [39]:
!python FuseAllFeature1.py

torch.Size([32, 512])


In [40]:
!python FinalClassifier1.py

tensor([[0.4953, 0.5059],
        [0.4964, 0.5057],
        [0.4975, 0.5054],
        [0.4957, 0.5066],
        [0.4963, 0.5060],
        [0.4959, 0.5031],
        [0.4943, 0.5043],
        [0.4958, 0.5047],
        [0.4945, 0.5055],
        [0.4966, 0.5068],
        [0.4966, 0.5053],
        [0.4968, 0.5059],
        [0.4964, 0.5070],
        [0.4946, 0.5055],
        [0.4954, 0.5056],
        [0.4956, 0.5047],
        [0.4957, 0.5059],
        [0.4946, 0.5060],
        [0.4947, 0.5058],
        [0.4945, 0.5048],
        [0.4961, 0.5046],
        [0.4948, 0.5063],
        [0.4958, 0.5042],
        [0.4951, 0.5053],
        [0.4957, 0.5067],
        [0.4949, 0.5047],
        [0.4959, 0.5057],
        [0.4946, 0.5070],
        [0.4962, 0.5053],
        [0.4945, 0.5058],
        [0.4944, 0.5049],
        [0.4961, 0.5045]], grad_fn=<SigmoidBackward0>)


# Fusion model

In [23]:
import torch
import ImageFeature
import AttributeFeature
import FinalClassifier1
import FuseAllFeature1
from LoadData1 import *
from torch.utils.data import Dataset, DataLoader,random_split
import numpy as np

In [24]:
class Multimodel(torch.nn.Module):
    def __init__(self,fc_dropout_rate):
        super(Multimodel, self).__init__()
        self.image = ImageFeature.ExtractImageFeature()
        self.attribute = AttributeFeature.ExtractAttributeFeature()
        self.fuse = FuseAllFeature1.ModalityFusion()
        self.final_classifier = FinalClassifier1.ClassificationLayer(fc_dropout_rate)
    def forward(self,image_feature, attribute_index):
        image_result,image_seq = self.image(image_feature)
        attribute_result,attribute_seq = self.attribute(attribute_index)
        fusion = self.fuse(image_result,image_seq,attribute_result,attribute_seq.permute(1,0,2))
        output = self.final_classifier(fusion)
        return output

In [45]:
def train(model,train_loader,valid_loader,loss_fn,optimizer,number_of_epoch):

  for epoch in range(number_of_epoch):
    print('begin training: epoch %d' %epoch)
    train_loss=0
    correct_train=0
    model.train()
    count = 0
    for image_feature, attribute_index, group, id in train_loader:
      group = group.to(device)
      pred = model(image_feature.to(device), attribute_index.to(device))
      # pred = torch.argmax(pred,axis=1)
      # pred = pred.view(-1,1).to(torch.float32)
      loss = loss_fn(pred, group)
      train_loss+=loss
      correct_train+=(torch.argmax(pred,axis=1)==group).sum().item()
      optimizer.zero_grad()
      loss.backward()
      optimizer.step()

      print('processing %dth epoch, %d' %(epoch, count))
      count += 1
    
    # calculate valid loss

    valid_loss=0
    correct_valid=0
    model.eval()
    print('begin testing: epoch %d' %epoch)
    with torch.no_grad():
      for val_image_feature, val_attribute_index, val_group, val_id in valid_loader:
          val_group = val_group.to(device)
          val_pred = model(val_image_feature.to(device), val_attribute_index.to(device))
          # val_pred = torch.argmax(val_pred,axis=1)
          # val_pred = val_pred.view(-1,1).to(torch.float32)
          val_loss = loss_fn(val_pred, val_group)
          valid_loss+=val_loss
          correct_valid+=(torch.argmax(pred,axis=1)==group).sum().item()

    print("epoch: %d train_loss=%.5f train_acc=%.3f valid_loss=%.5f valid_acc=%.3f"%(epoch,
                                                                                      train_loss/len(train_loader),
                                                                                  correct_train/len(train_loader)/batch_size,
                                                                                      valid_loss/len(valid_loader),
                                                                                      correct_valid/len(valid_loader)/batch_size))

In [46]:
learning_rate_list = [0.001]
fc_dropout_rate_list=[0,0.3,0.9,0.99]
weight_decay_list=[0,1e-6,1e-5,1e-4]
# weight_decay_list=[1e-7]
batchsz=32
data_shuffle=False

In [52]:
train_loader = DataLoader(train_set, batch_size=batchsz, shuffle=True, num_workers=4)
test_loader = DataLoader(test_set, batch_size=batchsz, shuffle=data_shuffle, num_workers=4)
valid_loader = DataLoader(valid_set, batch_size=batchsz, shuffle=data_shuffle, num_workers=4)

In [51]:
import itertools
comb = itertools.product(learning_rate_list,fc_dropout_rate_list,weight_decay_list)

In [None]:
for learning_rate,fc_dropout_rate,weight_decay in list(comb):
    print(f"learning rate={learning_rate} | fc dropout={fc_dropout_rate} | weight decay={weight_decay}")
    # loss function
    loss_fn=torch.nn.CrossEntropyLoss()
    # initilize the model
    model = Multimodel(fc_dropout_rate).to(device)
    # optimizer
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate,weight_decay=weight_decay)
    # train
    number_of_epoch=2
    train(model,train_loader,valid_loader,loss_fn,optimizer,number_of_epoch)

learning rate=0.001 | fc dropout=0 | weight decay=0
begin training: epoch 0
processing 0th epoch, 0
processing 0th epoch, 1
processing 0th epoch, 2
processing 0th epoch, 3
processing 0th epoch, 4
processing 0th epoch, 5
processing 0th epoch, 6
processing 0th epoch, 7
processing 0th epoch, 8
processing 0th epoch, 9
processing 0th epoch, 10
processing 0th epoch, 11
processing 0th epoch, 12
processing 0th epoch, 13
processing 0th epoch, 14
processing 0th epoch, 15
processing 0th epoch, 16
processing 0th epoch, 17
processing 0th epoch, 18
processing 0th epoch, 19
processing 0th epoch, 20
processing 0th epoch, 21
processing 0th epoch, 22
processing 0th epoch, 23
processing 0th epoch, 24
processing 0th epoch, 25
processing 0th epoch, 26
processing 0th epoch, 27
processing 0th epoch, 28
processing 0th epoch, 29
processing 0th epoch, 30
processing 0th epoch, 31
processing 0th epoch, 32
processing 0th epoch, 33
processing 0th epoch, 34
processing 0th epoch, 35
processing 0th epoch, 36
processin

In [30]:
import sklearn.metrics as metrics
import seaborn as sns
from scipy.special import softmax
from sklearn.metrics import accuracy_score, f1_score, precision_score, accuracy_score, recall_score

In [31]:
def validation_metrics(model, dataset):
    model.eval()
    with torch.no_grad():
        correct=0
        confusion_matrix_sum=None
        loss_sum=0
        count=0
        preds = []
        for image_feature, attribute_index, group, id in dataset:
            group = group.to(device)
            pred = model(image_feature.to(device), attribute_index.to(device))
            loss = loss_fn(pred, group)
            loss_sum+=loss
            logits = torch.argmax(pred,axis=1)
            correct+=(logits==group).sum().item()
            count += 1
            pred = pred.detach().cpu().numpy()

            for i in pred:
                preds.append(i)
            print(count)

        acc=correct/len(dataset)/batch_size
        loss_avg=loss_sum/len(dataset)
    return preds, loss_avg.item(), acc

In [33]:
pred_fusion, loss, acc = validation_metrics(model, test_loader)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76


In [34]:
pred_fusion = np.array(pred_fusion)
pred_bertweet = np.array(pred_bertweet)
pred_fusion_final = (pred_fusion + pred_bertweet) / 2
pred_fusion_final_score = softmax(pred_fusion_final)
pred_fusion_final_label = np.argmax(pred_fusion_final_score,axis=1)

In [35]:
print(accuracy_score(test_df['Sarcastic'],pred_fusion_final_label))
print(f1_score(test_df['Sarcastic'],pred_fusion_final_label))
print(precision_score(test_df['Sarcastic'],pred_fusion_final_label))
print(recall_score(test_df['Sarcastic'],pred_fusion_final_label))

0.9061851390618514
0.8830227743271221
0.9201725997842503
0.8487562189054726
