# Reddit Dataset

In [1]:
#!pip install opendatasets
#!pip install transformers

In [1]:
import numpy as np
import pandas as pd
import string
import opendatasets as od
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

import torch
from torch import nn, optim
import torchtext.data
import torchtext.datasets
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

from sklearn.preprocessing import LabelBinarizer
import transformers
from transformers import BertTokenizer, BertModel
from transformers import RobertaTokenizer, RobertaModel, ElectraModel, ElectraTokenizer
from transformers import logging
logging.set_verbosity_error()

from torch import cuda
from tqdm.notebook import tqdm
device = 'cuda' if cuda.is_available() else 'cpu'


from collections import Counter as ctr

## Get Data

In [2]:
#od.download("https://www.kaggle.com/datasets/danofer/sarcasm?resource=download")

In [16]:
data = pd.read_csv('sarcasm/train-balanced-sarcasm.csv')
data

Unnamed: 0,label,comment,author,subreddit,score,ups,downs,date,created_utc,parent_comment
0,0,NC and NH.,Trumpbart,politics,2,-1,-1,2016-10,2016-10-16 23:55:23,"Yeah, I get that argument. At this point, I'd ..."
1,0,You do know west teams play against west teams...,Shbshb906,nba,-4,-1,-1,2016-11,2016-11-01 00:24:10,The blazers and Mavericks (The wests 5 and 6 s...
2,0,"They were underdogs earlier today, but since G...",Creepeth,nfl,3,3,0,2016-09,2016-09-22 21:45:37,They're favored to win.
3,0,"This meme isn't funny none of the ""new york ni...",icebrotha,BlackPeopleTwitter,-8,-1,-1,2016-10,2016-10-18 21:03:47,deadass don't kill my buzz
4,0,I could use one of those tools.,cush2push,MaddenUltimateTeam,6,-1,-1,2016-12,2016-12-30 17:00:13,Yep can confirm I saw the tool they use for th...
...,...,...,...,...,...,...,...,...,...,...
1010821,1,I'm sure that Iran and N. Korea have the techn...,TwarkMain,reddit.com,2,2,0,2009-04,2009-04-25 00:47:52,"No one is calling this an engineered pathogen,..."
1010822,1,"whatever you do, don't vote green!",BCHarvey,climate,1,1,0,2009-05,2009-05-14 22:27:40,In a move typical of their recent do-nothing a...
1010823,1,Perhaps this is an atheist conspiracy to make ...,rebelcommander,atheism,1,1,0,2009-01,2009-01-11 00:22:57,Screw the Disabled--I've got to get to Church ...
1010824,1,The Slavs got their own country - it is called...,catsi,worldnews,1,1,0,2009-01,2009-01-23 21:12:49,I've always been unsettled by that. I hear a l...


In [17]:
ctr(data['label'])

Counter({0: 505413, 1: 505413})

##### drop data with low scores

In [18]:
data = data[data['score'] > 10]
data = data.drop("score", axis=1)

data = data[30000:60000]

##### analyze lengths

In [19]:
max(data['comment'].str.len()), min(data['comment'].str.len()), data['comment'].str.len().mean()

(1224, 1, 60.53053333333333)

##### normalize data

In [20]:
data['comment'] = data['comment'].astype(str)
data['comment'] = data['comment'].apply(lambda x: x.lower())

# def remove_punctuation(s):
#     s = ''.join([i for i in s if i not in frozenset(string.punctuation)])
#     return s
# data['comment'] = data['comment'].apply(remove_punctuation)

In [21]:
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import WhitespaceTokenizer

lemma = WordNetLemmatizer()
w_tokenizer = WhitespaceTokenizer()

def lemmatize_text(text):
    return ' '.join([lemma.lemmatize(w) for w in w_tokenizer.tokenize(text)])

data['comment'] = data['comment'].apply(lemmatize_text)
# data

## Train/Test Split for MNB

In [22]:
with open('glasgow_stop_words.txt') as f:
    stops = f.readlines()
f.close()
vec = CountVectorizer(stop_words=stops)
bag_o_words = vec.fit_transform(data['comment'])
bag_o_words = np.array(bag_o_words.todense())



In [23]:
X = bag_o_words
y = data['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

In [24]:
model = MultinomialNB().fit(X_train, y_train)
y_pred = model.predict(X_test)

In [25]:
print('Accuracy:', accuracy_score(y_test, y_pred))
print('F1 score:', f1_score(y_test, y_pred, average="macro"))
print('ROC AUC:', roc_auc_score(y_test, y_pred))

Accuracy: 0.6335353535353535
F1 score: 0.6276945530535158
ROC AUC: 0.6286198807584101


In [26]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.63      0.54      0.58      4684
           1       0.63      0.72      0.67      5216

    accuracy                           0.63      9900
   macro avg       0.63      0.63      0.63      9900
weighted avg       0.63      0.63      0.63      9900



In [27]:
results = pd.DataFrame(data={'predicted': y_pred, 'actual': y_test})
predictions = results.join(data)

In [28]:
def is_correct(predicted, actual):
    if predicted == actual:
        return True
    else:
        return False

predictions['correct'] = predictions.apply(lambda x: is_correct(x.predicted, x.actual), axis=1)
predictions = predictions[['comment','predicted','actual','correct']]

In [29]:
pd.set_option('display.max_colwidth', None)

##### looking at where the model made correct/incorrect decisions

In [30]:
predictions[predictions['correct']==False].sample(10)

Unnamed: 0,comment,predicted,actual,correct
491152,those penalty shootouts...,1,0,False
426279,mossad obviously used it mind control to make ken livingstone put his foot in his mouth.,0,1,False
352881,also the hangar wa getting pretty filled up and they didn't want to have to deal with yet another core taking up space.,0,1,False
275010,"ah - thanks for the laugh, anonymous teenager.",1,0,False
270178,it got the stats of a dry rot so it can be good,1,0,False
355119,"kai should be a professional dancer, not a bodybuilder.",1,0,False
322020,still great at crushing highschool pus,0,1,False
285332,"for all the complaint that saudi arabia is exporting it radical form on islam (wahhabism), american church are the one funding the rise of fundamental christianity in most of africa.",1,0,False
316065,"reverse psychology not working, still going to non cannon bloodvelds in nieves cave",0,1,False
461111,rip 2016,0,1,False


In [31]:
predictions[predictions['correct']==True].sample(10)

Unnamed: 0,comment,predicted,actual,correct
323164,"also: utility bills, dental visits, car maintenance...",0,0,True
501932,you forgot the,1,1,True
309385,"your comment is curious, because it seems you either don't believe that *your* child's behavior is the result of how you've raised him or you're assuming the zoo child doesn't have a ""disorder"".",0,0,True
363936,"create au azeroth for 1-60, problem solved.",1,1,True
348978,did you just assume that person gender?,1,1,True
330485,"well, good thing you aren't an island and there isn't a single gay person in the uk.",1,1,True
286374,we had an aunt that would kiss u and then grab our packages.,0,0,True
385884,but don't you realise that pot use eventually lead to heroin use?,1,1,True
294095,"fuck, i wish they would.",0,0,True
256861,because it's all decoration and ha absolutely no other negative affect on the people who live/work here,1,1,True


## Train/Test Split for ELECTRA

In [201]:
training_data, testing_data = train_test_split(data, test_size = 0.08)

In [202]:
training_data = training_data[['comment', 'label']]
testing_data = testing_data[['comment', 'label']]

# training_data = training_data[['parent_comment', 'label']]
# testing_data = testing_data[['parent_comment', 'label']]
# training_data = training_data[['text', 'label']]
# testing_data = testing_data[['text', 'label']]

In [203]:
train_y = pd.get_dummies(training_data.label)
test_y = pd.get_dummies(testing_data.label)

In [204]:
train_data = np.array(training_data['comment'])
test_data = np.array(testing_data['comment'])

# train_data = np.array(training_data['parent_comment'])
# test_data = np.array(testing_data['parent_comment'])
# train_data = np.array(training_data['text'])
# test_data = np.array(testing_data['text'])

train_labels = LabelBinarizer().fit_transform(train_y)
test_labels = LabelBinarizer().fit_transform(test_y)

train_data.shape, train_labels.shape, test_data.shape, test_labels.shape

((27600,), (27600, 2), (2400,), (2400, 2))

In [205]:
train_data = train_data[:500]
train_labels = train_labels[:500]

test_data = test_data[:100]
test_labels = test_labels[:100]

## Tokenize Data & Define Model

In [206]:
tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-discriminator')

In [207]:
class MultiLabelDataset(torch.utils.data.Dataset):

    def __init__(self, text, labels, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.text = text
        self.targets = labels
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        text = self.text[index]
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]


        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': self.targets[index].clone().detach()
        }

In [208]:
class ELECTRAClass(torch.nn.Module):
    def __init__(self, NUM_OUT):
        super(ELECTRAClass, self).__init__()
                   
        self.l1 = ElectraModel.from_pretrained("google/electra-small-discriminator")
        self.classifier = torch.nn.Linear(256, NUM_OUT)
        self.dropout = torch.nn.Dropout(0.5)
        self.softmax = torch.nn.Softmax(dim=1)
        

    def forward(self, input_ids, attention_mask, token_type_ids):
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = output_1[0]
        pooler = hidden_state[:, 0]
        classifier = self.classifier(pooler)
        dropout = self.dropout(classifier)
        output = self.softmax(dropout)
        return output

In [209]:
def loss_fn(outputs, targets):
    return torch.nn.BCELoss()(outputs, targets)

def train(model, training_loader, optimizer):
    model.train()
    for data in tqdm(training_loader):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.float)

        outputs = model(ids, mask, token_type_ids)

        optimizer.zero_grad()
        loss = loss_fn(outputs, targets)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    return loss
    
def validation(model, testing_loader):
    model.eval()
    fin_targets=[]
    fin_outputs=[]
    with torch.no_grad():
        for data in tqdm(testing_loader):
            targets = data['targets']
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            outputs = model(ids, mask, token_type_ids)
            outputs = torch.sigmoid(outputs).cpu().detach()
            fin_outputs.extend(outputs)
            fin_targets.extend(targets)
    return torch.stack(fin_outputs), torch.stack(fin_targets)

## Train Model

In [210]:
MAX_LEN = 512
BATCH_SIZE = 8
EPOCHS = 4
NUM_OUT = 2
LEARNING_RATE = 2e-05

training_data = MultiLabelDataset(train_data, torch.from_numpy(train_labels), tokenizer, MAX_LEN)
testing_data = MultiLabelDataset(test_data, torch.from_numpy(test_labels), tokenizer, MAX_LEN)

train_params = {'batch_size': BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }    

training_loader = torch.utils.data.DataLoader(training_data, **train_params)
testing_loader = torch.utils.data.DataLoader(testing_data, **test_params)

In [211]:
model = ELECTRAClass(NUM_OUT)
model.to(device)

optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

for epoch in range(EPOCHS):
    loss = train(model, training_loader, optimizer)
    print(f'Epoch: {epoch}, Loss:  {loss.item()}')  
    guess, targs = validation(model, testing_loader)
    guesses = torch.max(guess, dim=1)
    targets = torch.max(targs, dim=1)
    print('accuracy on test set {}'.format(accuracy_score(guesses.indices, targets.indices)))

  0%|          | 0/63 [00:00<?, ?it/s]

Epoch: 0, Loss:  0.7720845937728882


  0%|          | 0/13 [00:00<?, ?it/s]

accuracy on test set 0.45


  0%|          | 0/63 [00:00<?, ?it/s]

Epoch: 1, Loss:  0.621605396270752


  0%|          | 0/13 [00:00<?, ?it/s]

accuracy on test set 0.54


  0%|          | 0/63 [00:00<?, ?it/s]

Epoch: 2, Loss:  0.8914043307304382


  0%|          | 0/13 [00:00<?, ?it/s]

accuracy on test set 0.5


  0%|          | 0/63 [00:00<?, ?it/s]

Epoch: 3, Loss:  0.6579529047012329


  0%|          | 0/13 [00:00<?, ?it/s]

accuracy on test set 0.52


In [67]:
guesses.indices

tensor([1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1,
        1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1,
        0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1,
        1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1,
        1, 0, 1, 1])

In [68]:
targets.indices

tensor([1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1,
        1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0,
        1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1,
        1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1,
        1, 0, 1, 1])