# GPT-2 Transformer Model for Sarcasm Detection

In [1]:
import torch
import numpy as np
import pandas as pd
import pandas as pd
import numpy as np
import nltk
import copy
import shap
import gc
from torch.utils.data import DataLoader, Dataset
from transformers import GPT2Tokenizer, GPT2Model, GPT2ForSequenceClassification
from sklearn.metrics import recall_score, accuracy_score, f1_score, precision_score
from tqdm import tqdm

# Download NLTK resources
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

torch.cuda.is_available()

# https://colab.research.google.com/drive/1dMTdO5vxdVX0NA2Qe7AV9WGEy8ZH67Xn?usp=sharing#scrollTo=afcc233b

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/lfrostbyte/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/lfrostbyte/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/lfrostbyte/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
def random_oversampling(df: pd.DataFrame) -> pd.DataFrame:
    '''
    Given credit card dataset with 0 as the majority for 'Class' column. Returns credit card data with two classes
    having the same shape, given raw data read from csv.

    Parameters
    ----------
    df: pd.DataFrame
        The potentially imbalanced dataset.

    Returns
    -------
    Oversampled dataset with equal number of fraudulent and legitimate data.
    '''

    majority = df[df['label'] == 1]
    minority = df[df['label'] == 0]
    n = majority.shape[0]
    print(n)
    minority = pd.DataFrame.sample(minority, n, replace=True, random_state=42)
    return pd.concat([majority, minority], axis=0)

train_df = pd.read_csv("./train.csv")
print(train_df[train_df['label'] == 0].shape)
print(train_df[train_df['label'] == 1].shape)
print(train_df[train_df['label'] == 2].shape)
print(train_df[train_df['label'] == 3].shape)
print(train_df[train_df['label'] == 4].shape)
print(train_df[train_df['label'] == 5].shape)
print("Preprocess labels")
train_df['label'] = train_df['label'].apply(lambda x: 1 if x <= 3 else 0) # 0 (Real) for 3-5 and 1 (Fake) for 0-2
print(train_df[train_df['label'] == 0].shape)
print(train_df[train_df['label'] == 1].shape)
train_df = random_oversampling(train_df)
print("Oversampling")
print(train_df[train_df['label'] == 0].shape)
print(train_df[train_df['label'] == 1].shape)
train_df = train_df.sample(frac=1, random_state=42).reset_index(drop=True)

val_df = pd.read_csv("val.csv")
val_df['label'] = val_df['label'].apply(lambda x: 1 if x <= 3 else 0) # 0 (Real) for 3-5 and 1 (Fake) for 0-2
val_df = random_oversampling(val_df) # We have to balance this too otherwise training yields really poor results
val_df = val_df.sample(frac=1, random_state=42).reset_index(drop=True)

test_df = pd.read_csv("test.csv")
test_df['label'] = test_df['label'].apply(lambda x: 1 if x <= 3 else 0) # 0 (Real) for 3-5 and 1 (Fake) for 0-2

train_df

(2425, 16)
(5284, 16)
(2882, 16)
(2967, 16)
(2743, 16)
(2068, 16)
Preprocess labels
(4811, 16)
(13558, 16)
13558
Oversampling
(13558, 16)
(13558, 16)
1696


Unnamed: 0,id,label,statement,date,subject,speaker,speaker_description,state_info,true_counts,mostly_true_counts,half_true_counts,mostly_false_counts,false_counts,pants_on_fire_counts,context,justification
0,4425,0,The people in Massachusetts like (the state he...,"October 18, 2011",health care;polls and public opinion;states,mitt romney,Mitt Romney is a U.S. senator from Utah. He ra...,national,31,33,58,35,32,19,"Hanover, N.H",Romney has strong support for this . A recent ...
1,3378,1,Said Planned Parenthood's early objective was ...,"March 15, 2011",abortion,herman cain,"Herman Cain is an author, columnist and talk r...",georgia,0,3,5,4,11,3,a talk at a conservative think tank,Why would Sanger try to destroy a race of peop...
2,6016,0,"There are close to 900,000 unemployed veterans...","July 23, 2012",economy;military;veterans,sanford bishop,Democrat Sanford Bishop represents Georgia's 2...,georgia,1,1,0,0,1,0,an online video of an earlier speech,Veterans are faring better than U.S. workers a...
3,95,1,Hillary stood up for universal health care whe...,"October 4, 2007",health care,hillary clinton,Hillary Clinton was the 2016 Democratic nomine...,national,72,76,70,43,31,9,Iowa and New Hampshire,But the Clinton campaign says she used her inf...
4,10596,0,The United States has the highest incarceratio...,"August 5, 2015",criminal justice;crime,jim webb,Jim Webb is running for president of the Unite...,virginia,3,4,2,1,0,0,a web post,Webb said the U.S. has the highest incarcerati...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27111,10653,0,"Says Ron Johnson ""opposes entirely a federal m...","August 18, 2015",agriculture;economy;jobs;labor;government regu...,russ feingold,Russ Feingold served as a Democratic U.S. Sena...,wisconsin,3,5,7,2,4,1,a speech,"Johnson opposes a raise to $15, the statement ..."
27112,10010,1,President Barack Obama's veto of the Keystone ...,"February 24, 2015",energy;jobs,tom graves,Tom Graves represents Georgia's 9th Congressio...,georgia,5,0,5,1,0,1,a press release,Both Democrats and Republicans have spun the n...
27113,5740,1,"Says ""Oregon is one of only three states that ...","May 30, 2012",environment;jobs;message machine 2012;recreation,stop gillnetting now,The Stop Gillnetting Now campaign is working t...,oregon,0,0,0,0,1,0,campaign press statement,There are some states that don’t depend on com...
27114,18968,0,"Republican elected officials, a network of pro...","November 17, 2020",fake news;coronavirus,jb pritzker,J.B. Pritzker is Illinois' 43rd governor and a...,illinois,1,5,3,7,3,0,remarks at a news conference,"Pritzker said ""Republican elected officials, a..."


In [3]:
def preprocess_row(row):
    combined_input = (
        "Subject: " + str(row['subject'] or "") + 
        "; Speaker: " + str(row['speaker'] or "") + 
        "; Speaker Description: " + str(row['speaker_description'] or "") + 
        "; State: " + str(row['state_info'] or "") + 
        "; Context: " + str(row['context'] or "") + 
        "; Statement: " + str(row['statement'] or "")
    )
    return combined_input

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.padding_side = "left"
tokenizer.pad_token = tokenizer.eos_token

In [4]:
# Apply the function row-wise and store tokenized output
train_df["input"] = train_df.apply(preprocess_row, axis=1)
val_df["input"] = val_df.apply(preprocess_row, axis=1)
test_df["input"] = test_df.apply(preprocess_row, axis=1)

train_df["input"]

0        Subject: health care;polls and public opinion;...
1        Subject: abortion; Speaker: herman cain; Speak...
2        Subject: economy;military;veterans; Speaker: s...
3        Subject: health care; Speaker: hillary clinton...
4        Subject: criminal justice;crime; Speaker: jim ...
                               ...                        
27111    Subject: agriculture;economy;jobs;labor;govern...
27112    Subject: energy;jobs; Speaker: tom graves; Spe...
27113    Subject: environment;jobs;message machine 2012...
27114    Subject: fake news;coronavirus; Speaker: jb pr...
27115    Subject: education;pensions;unions;workers; Sp...
Name: input, Length: 27116, dtype: object

In [5]:
# For convenience
# train_df = train_df.head(100)
# val_df = val_df.head(100)
# test_df = test_df.head(100)

train_X, train_Y = train_df['input'], train_df['label']
val_X, val_Y = val_df['input'], val_df['label']
test_X, test_Y = test_df['input'], test_df['label']

print(len(train_X))
print(len(train_Y))

27116
27116


In [6]:
val_Y.value_counts()

label
1    1696
0    1696
Name: count, dtype: int64

In [7]:
class GPT2Classifier(torch.nn.Module):
    def __init__(self, seq_len):
        super(GPT2Classifier,self).__init__()
        self.trfLayer = GPT2Model.from_pretrained("gpt2")
        self.seq_len = seq_len
        self.fc = torch.nn.Linear(in_features=seq_len * 768, out_features=2)

    def forward(self, X: torch.Tensor):
        input_ids = X[:, :self.seq_len]
        att_masks = X[:, self.seq_len:]
        outputs, _ = self.trfLayer(input_ids=input_ids, attention_mask=att_masks, return_dict=False)
        batch_size = outputs.shape[0]
        outputs = outputs.view(batch_size, -1)
        outputs2 = self.fc(outputs)
        return outputs2

class NewsDataset(Dataset):
    """
    Custom Dataset class
    Inputs: Strings of statements, context and a vector corresponding to the 6 counts from the original dataset
    Outputs: Encoded tokenized vectors for statements, context and a vector corresponding to the 6 counts from the original dataset
    """
    def __init__(self, features, labels, tokenizer, max_length=1024):
        self.inputs = features
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length # Maximum sequence length

    def classes(self):
        return self.labels

    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        x = self.inputs[idx]
        encoded_news = self.tokenizer(x, padding='max_length', max_length=self.max_length, truncation=True, return_tensors="pt")
        # decoded_string = self.tokenizer.decode(encoded_news["input_ids"][0], skip_special_tokens=True)
        # print(decoded_string)
        return encoded_news, self.labels[idx]
    
def load_model(path, s):
    """
    Loads model from directory.
    """
    model = GPT2Classifier(seq_len=s)
    model.load_state_dict(torch.load(path))
    model.eval()
    return model

def train(m: GPT2Classifier, trainData, valData, lr, early_stop_tol):
    max_epochs = 500
    trainLoader = DataLoader(trainData, batch_size=16, shuffle=True)
    valLoader = DataLoader(valData, batch_size=16, shuffle=True)
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(m.parameters(), lr=lr)
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    if use_cuda:
        m = m.cuda()
        criterion = criterion.cuda()

    did_not_improve_count = 0
    best_val_score = 0
    best_epoch = 0
    best_state_dict = None
    m.train()
    
    for epoch_num in range(max_epochs):
        total_loss_train = 0
        train_predictions = []
        train_labels = []
        for encoded_news, label in tqdm(trainLoader):
            label = label.to(device)
            news_mask = encoded_news['attention_mask'].squeeze(1).to(device)
            news_input_id = encoded_news["input_ids"].squeeze(1).to(device)  

            X = torch.concat((news_input_id, news_mask), axis=-1)

            output = m(X)

            batch_loss = criterion(output, label)
            total_loss_train += batch_loss.item()

            # add original labels
            train_labels += label.cpu().numpy().flatten().tolist()
            train_predictions += output.argmax(dim=1).cpu().numpy().flatten().tolist()

            batch_loss.backward()
            optimizer.step()
        
        total_loss_val = 0
        val_predictions = []
        val_labels = []
        
        with torch.no_grad():
            for encoded_news, label in valLoader:
                label = label.to(device)
                news_mask = encoded_news['attention_mask'].squeeze(1).to(device)
                news_input_id = encoded_news["input_ids"].squeeze(1).to(device)

                X = torch.concat((news_input_id, news_mask), axis=-1)            

                output = m(X)
                    
                batch_loss = criterion(output, label)
                total_loss_val += batch_loss.item()
                
                val_labels += label.cpu().numpy().flatten().tolist()
                val_predictions += output.argmax(dim=1).cpu().numpy().flatten().tolist()
        
        train_labels = np.array(train_labels)
        train_predictions = np.array(train_predictions)
        val_labels = np.array(val_labels)
        val_predictions = np.array(val_predictions)
        
        train_r = recall_score(train_labels, train_predictions)
        train_p = precision_score(train_labels, train_predictions)
        train_f1 = f1_score(train_labels, train_predictions)
        train_acc = accuracy_score(train_labels, train_predictions)

        val_r = recall_score(val_labels, val_predictions)
        val_p = precision_score(val_labels, val_predictions)
        val_f1 = f1_score(val_labels, val_predictions)
        val_acc = accuracy_score(val_labels, val_predictions)

        if val_acc > (best_val_score + early_stop_tol): # We shouldn't be early stopping based on recall cos then the model will just overfit to positives only
            best_val_score = val_acc
            did_not_improve_count = 0
            best_epoch = epoch_num
            best_state_dict = copy.deepcopy(m.state_dict())
            print(f"Saving new best val acc {best_val_score}")
            torch.save(best_state_dict, f"gpt2-lr{lr}-iter{best_epoch+1}-tol{early_stop_tol}.pt")
        else:
            did_not_improve_count += 1

        print(
            f"Epochs: {epoch_num + 1} | Train Loss {total_loss_train/len(trainData): .3f} \
    | Train R,P,Acc,F1 = {train_r: .3f}, {train_p: .3f}, {train_acc: .3f}, {train_f1: .3f} \
            | Val Loss: {total_loss_val / len(valData): .3f} \
    | Val R,P,Acc,F1 = {val_r: .3f}, {val_p: .3f}, {val_acc: .3f}, {val_f1: .3f}") 
        
        print(f"Val True Labels 1={np.sum(val_labels == 1)}, 0={np.sum(val_labels == 0)}")
        print(f"Val Predictions 1={np.sum(val_predictions == 1)}, 0={np.sum(val_predictions == 0)}")
             
        if did_not_improve_count >= 5:
            break

In [8]:
# Hyperparameters
s = 128
hidden_size = 768
val_tol = 0.01
lr = 5e-6
batch_size = 16

In [9]:
model = GPT2Classifier(seq_len=s)
trainData = NewsDataset(features=train_X, labels=train_Y, tokenizer=tokenizer, max_length=s)
valData = NewsDataset(features=val_X, labels=val_Y, tokenizer=tokenizer, max_length=s)

In [10]:
train(m=model, trainData=trainData, valData=valData, lr=lr, early_stop_tol=val_tol)

100%|██████████| 1695/1695 [07:43<00:00,  3.66it/s]


Saving new best val acc 0.5262382075471698
Epochs: 1 | Train Loss  0.207     | Train R,P,Acc,F1 =  0.533,  0.509,  0.509,  0.520             | Val Loss:  0.071     | Val R,P,Acc,F1 =  0.780,  0.517,  0.526,  0.622
Val True Labels 1=1696, 0=1696
Val Predictions 1=2557, 0=835


100%|██████████| 1695/1695 [07:43<00:00,  3.66it/s]


Saving new best val acc 0.5498231132075472
Epochs: 2 | Train Loss  0.232     | Train R,P,Acc,F1 =  0.467,  0.526,  0.523,  0.494             | Val Loss:  0.131     | Val R,P,Acc,F1 =  0.750,  0.536,  0.550,  0.625
Val True Labels 1=1696, 0=1696
Val Predictions 1=2375, 0=1017


100%|██████████| 1695/1695 [07:43<00:00,  3.66it/s]


Saving new best val acc 0.5943396226415094
Epochs: 3 | Train Loss  0.188     | Train R,P,Acc,F1 =  0.608,  0.533,  0.538,  0.568             | Val Loss:  0.082     | Val R,P,Acc,F1 =  0.383,  0.664,  0.594,  0.485
Val True Labels 1=1696, 0=1696
Val Predictions 1=978, 0=2414


100%|██████████| 1695/1695 [07:43<00:00,  3.66it/s]


Epochs: 4 | Train Loss  0.135     | Train R,P,Acc,F1 =  0.528,  0.596,  0.585,  0.560             | Val Loss:  0.185     | Val R,P,Acc,F1 =  0.965,  0.508,  0.516,  0.666
Val True Labels 1=1696, 0=1696
Val Predictions 1=3220, 0=172


100%|██████████| 1695/1695 [07:43<00:00,  3.66it/s]


Saving new best val acc 0.6238207547169812
Epochs: 5 | Train Loss  0.115     | Train R,P,Acc,F1 =  0.687,  0.623,  0.635,  0.653             | Val Loss:  0.095     | Val R,P,Acc,F1 =  0.647,  0.618,  0.624,  0.632
Val True Labels 1=1696, 0=1696
Val Predictions 1=1776, 0=1616


100%|██████████| 1695/1695 [07:42<00:00,  3.66it/s]


Saving new best val acc 0.6394457547169812
Epochs: 6 | Train Loss  0.090     | Train R,P,Acc,F1 =  0.689,  0.712,  0.705,  0.700             | Val Loss:  0.133     | Val R,P,Acc,F1 =  0.439,  0.733,  0.639,  0.549
Val True Labels 1=1696, 0=1696
Val Predictions 1=1017, 0=2375


100%|██████████| 1695/1695 [07:42<00:00,  3.66it/s]


Epochs: 7 | Train Loss  0.075     | Train R,P,Acc,F1 =  0.746,  0.752,  0.750,  0.749             | Val Loss:  0.131     | Val R,P,Acc,F1 =  0.445,  0.725,  0.638,  0.552
Val True Labels 1=1696, 0=1696
Val Predictions 1=1041, 0=2351


100%|██████████| 1695/1695 [07:42<00:00,  3.66it/s]


Epochs: 8 | Train Loss  0.066     | Train R,P,Acc,F1 =  0.767,  0.780,  0.775,  0.773             | Val Loss:  0.162     | Val R,P,Acc,F1 =  0.491,  0.688,  0.634,  0.573
Val True Labels 1=1696, 0=1696
Val Predictions 1=1210, 0=2182


100%|██████████| 1695/1695 [07:42<00:00,  3.66it/s]


Epochs: 9 | Train Loss  0.054     | Train R,P,Acc,F1 =  0.816,  0.815,  0.815,  0.816             | Val Loss:  0.163     | Val R,P,Acc,F1 =  0.577,  0.644,  0.629,  0.608
Val True Labels 1=1696, 0=1696
Val Predictions 1=1519, 0=1873


100%|██████████| 1695/1695 [07:45<00:00,  3.64it/s]


Epochs: 10 | Train Loss  0.044     | Train R,P,Acc,F1 =  0.852,  0.850,  0.851,  0.851             | Val Loss:  0.209     | Val R,P,Acc,F1 =  0.687,  0.615,  0.628,  0.649
Val True Labels 1=1696, 0=1696
Val Predictions 1=1895, 0=1497


100%|██████████| 1695/1695 [07:58<00:00,  3.54it/s]


Epochs: 11 | Train Loss  0.042     | Train R,P,Acc,F1 =  0.863,  0.866,  0.864,  0.864             | Val Loss:  0.309     | Val R,P,Acc,F1 =  0.825,  0.565,  0.595,  0.671
Val True Labels 1=1696, 0=1696
Val Predictions 1=2477, 0=915


In [17]:
def evaluate(model, testData):
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    testLoader = DataLoader(testData, batch_size=16, shuffle=True)

    if use_cuda:
        model = model.cuda()

    predictions = []
    true_labels = []
    model.eval()
    
    with torch.no_grad():
        for encoded_news, label in testLoader:
            label = label.to(device)
            news_mask = encoded_news['attention_mask'].squeeze(1).to(device)
            news_input_id = encoded_news['input_ids'].squeeze(1).to(device)

            X = torch.cat((news_input_id, news_mask), dim=-1)
            output = model(X)
                    
            # add original labels
            true_labels += label.cpu().numpy().flatten().tolist()
            predictions += output.argmax(dim=1).cpu().numpy().flatten().tolist()

    r_score = recall_score(true_labels, predictions)
    p_score = precision_score(true_labels, predictions)
    f1 = f1_score(true_labels, predictions)
    test_acc = accuracy_score(true_labels, predictions)
    print(f'Test Accuracy: {test_acc: .3f}, Recall: {r_score: .3f}, Precision {p_score: .3f}, Acc: {test_acc: .3f} F1: {f1: .3f}')

testData = NewsDataset(features=test_X, labels=test_Y, tokenizer=tokenizer, max_length=s)
print(len(testData))
gpt2Trf = load_model("./gpt2-lr5e-06-iter6-tol0.01.pt", s=s)

evaluate(gpt2Trf, testData)

2296
Test Accuracy:  0.586, Recall:  0.506, Precision  0.884, Acc:  0.586 F1:  0.643
