In [1]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    classification_report,
    f1_score,
    roc_auc_score,
)

import torch
from torch import nn
from torch.optim import Adam

from tqdm import tqdm

from transformers import RobertaTokenizer, RobertaModel
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torch.distributed as dist

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# os.environ["CUDA_VISIBLE_DEVICES"] = "1"

os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3,4,5"

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

Using device: cuda


In [3]:
def set_random_seed(random_seed=None):
    """
    Using random seed for numpy and torch
    """
    if random_seed is None:
        random_seed = 13
    os.environ["PYTHONHASHSEED"] = str(random_seed)
    np.random.seed(random_seed)
    torch.manual_seed(random_seed)
    torch.cuda.manual_seed_all(random_seed)


set_random_seed(42)

In [4]:
sarcasm_df = pd.read_csv("data/train-balanced-sarcasm.csv")

In [5]:
sarcasm_df.head()

Unnamed: 0,label,comment,author,subreddit,score,ups,downs,date,created_utc,parent_comment
0,0,NC and NH.,Trumpbart,politics,2,-1,-1,2016-10,2016-10-16 23:55:23,"Yeah, I get that argument. At this point, I'd ..."
1,0,You do know west teams play against west teams...,Shbshb906,nba,-4,-1,-1,2016-11,2016-11-01 00:24:10,The blazers and Mavericks (The wests 5 and 6 s...
2,0,"They were underdogs earlier today, but since G...",Creepeth,nfl,3,3,0,2016-09,2016-09-22 21:45:37,They're favored to win.
3,0,"This meme isn't funny none of the ""new york ni...",icebrotha,BlackPeopleTwitter,-8,-1,-1,2016-10,2016-10-18 21:03:47,deadass don't kill my buzz
4,0,I could use one of those tools.,cush2push,MaddenUltimateTeam,6,-1,-1,2016-12,2016-12-30 17:00:13,Yep can confirm I saw the tool they use for th...


In [6]:
# We just need comment & label columns
# So, let's remove others.
sarcasm_df.drop(
    [
        "author",
        "subreddit",
        "score",
        "ups",
        "downs",
        "date",
        "created_utc",
        "parent_comment",
    ],
    axis=1,
    inplace=True,
)
# remove empty rows
sarcasm_df.dropna(inplace=True)

# Some comments are missing, so we drop the corresponding rows.
sarcasm_df.dropna(subset=["comment"], inplace=True)

In [7]:
sarcasm_df["label"].value_counts()

label
0    505403
1    505368
Name: count, dtype: int64

In [8]:
# Calculate the lengths of comments
comment_lengths = [len(comment.split()) for comment in sarcasm_df["comment"]]

# Calculate the mean, maximum, and minimum lengths
mean_length = sum(comment_lengths) / len(comment_lengths)
max_length = max(comment_lengths)
min_length = min(comment_lengths)

# Print the results
print("Mean length:", mean_length)
print("Maximum length:", max_length)
print("Minimum length:", min_length)

Mean length: 10.461467533199905
Maximum length: 2222
Minimum length: 1


In [9]:
# Filter the dataframe to keep only comments with length <= 50
mask = [length <= 50 for length in comment_lengths]
sarcasm_df = sarcasm_df[mask]

# Reset the index of the dataframe
sarcasm_df.reset_index(drop=True, inplace=True)

In [10]:
# Calculate the lengths of comments
comment_lengths = [len(comment.split()) for comment in sarcasm_df["comment"]]

# Calculate the mean, maximum, and minimum lengths
mean_length = sum(comment_lengths) / len(comment_lengths)
max_length = max(comment_lengths)
min_length = min(comment_lengths)

# Print the results
print("Mean length:", mean_length)
print("Maximum length:", max_length)
print("Minimum length:", min_length)

Mean length: 10.265359705412772
Maximum length: 50
Minimum length: 1


In [11]:
sarcasm_df.head()

Unnamed: 0,label,comment
0,0,NC and NH.
1,0,You do know west teams play against west teams...
2,0,"They were underdogs earlier today, but since G..."
3,0,"This meme isn't funny none of the ""new york ni..."
4,0,I could use one of those tools.


In [12]:
sarcasm_df["label"].value_counts()

label
1    504617
0    503166
Name: count, dtype: int64

In [13]:
X_train, X_test, y_train, y_test = train_test_split(
    sarcasm_df["comment"], sarcasm_df["label"], test_size=0.25, random_state=42
)

In [14]:
X_train, X_test, y_train, y_test = (
    list(X_train),
    list(X_test),
    list(y_train),
    list(y_test),
)

In [16]:
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

In [17]:
from preprocessing import preprocess

In [18]:
class SARCDataset(Dataset):
    def __init__(self, X, y, tokenizer):
        texts = X

        texts = [preprocess(text) for text in tqdm(texts, desc="Preprocessing")]

        self._print_random_samples(texts)

        self.texts = [
            tokenizer(
                text,
                padding="max_length",
                max_length=150,
                truncation=True,
                return_tensors="pt",
            )
            for text in tqdm(texts, desc="Tokenizing")
        ]

        self.labels = y

    def _print_random_samples(self, texts):
        random_entries = np.random.randint(0, len(texts), 5)

        for i in random_entries:
            print(f"Entry {i}: {texts[i]}")

        print()

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]

        label = -1
        if hasattr(self, "labels"):
            label = self.labels[idx]

        return text, label

In [19]:
dataset = SARCDataset(sarcasm_df["comment"], sarcasm_df["label"], tokenizer)

Preprocessing:   0%|          | 0/1007783 [00:00<?, ?it/s]

Preprocessing: 100%|██████████| 1007783/1007783 [00:13<00:00, 72319.61it/s]


Entry 121958: [CLS] I read this as man or no man ? [SEP]
Entry 671155: [CLS] Wait but I thought Islamists were responsible for basically every war right ? [SEP]
Entry 131932: [CLS] Serious question is there any other kind of meat for Christmas ? [SEP]
Entry 365838: [CLS] Watch your tone you sexist ! [SEP]
Entry 259178: [CLS] My parents figured I was going to do it anyway so they would rather just know about it and make sure I was being responsible [SEP]



Tokenizing: 100%|██████████| 1007783/1007783 [03:21<00:00, 4990.12it/s]


In [20]:
# save dataset using pickle
import pickle

with open("preprocessed_dataset.pkl", "wb") as f:
    pickle.dump(dataset, f)

In [None]:
# load dataset using pickle
with open("preprocessed_dataset.pkl", "rb") as f:
    dataset = pickle.load(f)

In [21]:
train_sarc = SARCDataset(X_train, y_train, tokenizer)

Preprocessing: 100%|██████████| 755838/755838 [00:10<00:00, 72482.12it/s]


Entry 521430: [CLS] overrated futbol underrated Hockey [SEP]
Entry 87498: [CLS] Certainly the last thing Labor wants the press gallery talking about are The NBN National Disability Care Gonkski reforms [SEP]
Entry 175203: [CLS] Definitely bonking [SEP]
Entry 191335: [CLS] where did you get that little Majoras mask figure ? [SEP]
Entry 278167: [CLS] And the fact that Putin was right and Obama was wrong on Syria is secondary of course [SEP]



Tokenizing: 100%|██████████| 755838/755838 [02:26<00:00, 5148.18it/s]


In [19]:
test_sarc = SARCDataset(X_test, y_test, tokenizer)

Preprocessing: 100%|██████████| 251947/251947 [00:03<00:00, 72583.52it/s]


Entry 119879: [CLS] The idea that players get paid based solely on production is nonsense [SEP]
Entry 110268: [CLS] Than [SEP]
Entry 207892: [CLS] To be fair everyone knows that type of people that would buy a MacBook Pro are the kind of people that would only use it for Facebook and to show off that Apple logo [SEP]
Entry 54886: [CLS] we have solved the mystery of who Im Uzi is boys ! [SEP]
Entry 137337: [CLS] I guess we are all taking it a bit hard during the shutdown [SEP]



Tokenizing: 100%|██████████| 251947/251947 [00:49<00:00, 5079.96it/s]


In [22]:
train_dataloader = DataLoader(train_sarc, batch_size=128, shuffle=True, num_workers=16)
val_dataloader = DataLoader(test_sarc, batch_size=128, num_workers=16)

In [23]:
model = RobertaModel.from_pretrained("roberta-base")

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [24]:
print(model)

RobertaModel(
  (embeddings): RobertaEmbeddings(
    (word_embeddings): Embedding(50265, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): RobertaEncoder(
    (layer): ModuleList(
      (0-11): 12 x RobertaLayer(
        (attention): RobertaAttention(
          (self): RobertaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): RobertaSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (dropou

In [25]:
class SARCClassifier(nn.Module):
    def __init__(self, base_model):
        super(SARCClassifier, self).__init__()

        self.bert = base_model
        self.fc1 = nn.Linear(768, 32)
        self.fc2 = nn.Linear(32, 1)

        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_ids, attention_mask):
        bert_out = self.bert(input_ids=input_ids, attention_mask=attention_mask)[0][
            :, 0
        ]
        x = self.fc1(bert_out)
        x = self.relu(x)

        x = self.fc2(x)
        x = self.sigmoid(x)

        return x

In [35]:
def train(model, train_dataloader, val_dataloader, learning_rate, epochs):
    best_val_loss = float("inf")
    early_stopping_threshold_count = 0
    EARLY_STOPPING = 3

    criterion = nn.BCELoss()
    optimizer = Adam(model.parameters(), lr=learning_rate)

    model = model.to(device)
    criterion = criterion.to(device)

    model_metrics = {}
    model_metrics["train_accuracy"] = []
    model_metrics["val_accuracy"] = []
    model_metrics["train_loss"] = []
    model_metrics["val_loss"] = []
    model_metrics["f1"] = []
    model_metrics["val_f1"] = []
    model_metrics["auc"] = []
    model_metrics["val_auc"] = []

    for epoch in range(epochs):
        total_acc_train = 0
        total_loss_train = 0
        total_f1_train = 0
        total_auc_train = 0

        model.train()

        for train_input, train_label in tqdm(train_dataloader):
            attention_mask = train_input["attention_mask"].to(device)
            input_ids = train_input["input_ids"].squeeze(1).to(device)

            train_label = train_label.to(device)

            output = model(input_ids, attention_mask)

            loss = criterion(output, train_label.float().unsqueeze(1))

            total_loss_train += loss.item()

            acc = ((output >= 0.5).int() == train_label.unsqueeze(1)).sum().item()
            total_acc_train += acc

            out_preds = output.cpu().detach().numpy().flatten()
            targets = train_label.cpu().detach().numpy().flatten()
            try:
                auc_score = roc_auc_score(targets, out_preds)
            except:
                auc_score = 1
            total_auc_train += auc_score

            out_preds[out_preds < 0.5] = 0
            out_preds[out_preds >= 0.5] = 1
            f1_score_ = f1_score(targets, out_preds)
            total_f1_train += f1_score_

            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

        with torch.no_grad():
            total_acc_val = 0
            total_loss_val = 0
            total_f1_val = 0
            total_auc_val = 0

            model.eval()

            for val_input, val_label in tqdm(val_dataloader):
                attention_mask = val_input["attention_mask"].to(device)
                input_ids = val_input["input_ids"].squeeze(1).to(device)

                val_label = val_label.to(device)

                output = model(input_ids, attention_mask)

                loss = criterion(output, val_label.float().unsqueeze(1))

                total_loss_val += loss.item()

                acc = ((output >= 0.5).int() == val_label.unsqueeze(1)).sum().item()
                total_acc_val += acc

                out_preds = output.cpu().detach().numpy().flatten()
                targets = val_label.cpu().detach().numpy().flatten()
                try:
                    auc_score = roc_auc_score(targets, out_preds)
                except:
                    auc_score = 1.0
                total_auc_val += auc_score

                out_preds[out_preds < 0.5] = 0
                out_preds[out_preds >= 0.5] = 1
                f1_score_ = f1_score(targets, out_preds)
                total_f1_val += f1_score_

            print(
                f"Epochs: {epoch + 1} "
                f"| Train Loss: {total_loss_train / len(train_dataloader): .3f} "
                f"| Train Accuracy: {total_acc_train / (len(train_dataloader.dataset)): .3f} "
                f"| Val Loss: {total_loss_val / len(val_dataloader): .3f} "
                f"| Val Accuracy: {total_acc_val / len(val_dataloader.dataset): .3f}"
            )
            model_metrics["train_accuracy"].append(
                total_acc_train / (len(train_dataloader.dataset))
            )
            model_metrics["val_accuracy"].append(
                total_acc_val / len(val_dataloader.dataset)
            )
            model_metrics["train_loss"].append(total_loss_train / len(train_dataloader))
            model_metrics["val_loss"].append(total_loss_val / len(val_dataloader))
            model_metrics["f1"].append(total_f1_train / len(train_dataloader))
            model_metrics["val_f1"].append(total_f1_val / len(val_dataloader))
            model_metrics["auc"].append(total_auc_train / len(train_dataloader))
            model_metrics["val_auc"].append(total_auc_val / len(val_dataloader))

            print(model_metrics)

            if best_val_loss > total_loss_val:
                best_val_loss = total_loss_val
                torch.save(model, f"best_model.pt")
                print("Saved model")
                early_stopping_threshold_count = 0
            else:
                early_stopping_threshold_count += 1

            if early_stopping_threshold_count >= EARLY_STOPPING:
                print("Early stopping")
                break
    return model_metrics

In [37]:
model = SARCClassifier(model)

dist.init_process_group(backend="nccl")
model = nn.parallel.DistributedDataParallel(model)

model.to(device)

learning_rate = 1e-5
epochs = 10

ValueError: Error initializing torch.distributed using env:// rendezvous: environment variable RANK expected, but not set

In [28]:
metrics = train(model, train_dataloader, val_dataloader, learning_rate, epochs)

NameError: name 'learning_rate' is not defined