In [1]:
# Colab-only setup: mount Google Drive so the project folder is reachable.
from google.colab import drive
drive.mount('/content/drive')

# Work from the project folder on Drive; later cells use relative paths such as
# ./datasets/ and ./shared_model/.
%cd /content/drive/My\ Drive/Text-Mining-Code
# Disabled commands (presumably the one-time dataset checkout - confirm before re-enabling):
# !git clone https://github.com/Smolky/hahackathon-2021
# %cd hahackathon-2021/datasets/
# NOTE(review): the install is unpinned; consider `%pip install -q transformers==<version>`.
!pip install transformers

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/My Drive/Text-Mining-Code
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import torch
import pandas as pd
from torch.utils.data import Dataset
from transformers import AutoTokenizer
from torch.utils.data import DataLoader

class HumourDataset(Dataset):
  """CSV-backed dataset that serves selected columns of a single row.

  Args:
    csv_path: Path of the CSV file, read once with ``pd.read_csv``.
    columns: Names of the columns returned by ``__getitem__``. When empty or
      ``None`` every lookup yields ``None``.
    fill_nan: Replace missing values with ``0``.
    drop_nan: Discard every row holding at least one missing value. Dropping
      runs before filling.
  """

  def __init__(self, csv_path,
               columns=None, fill_nan=False, drop_nan=False):
    frame = pd.read_csv(csv_path)
    if drop_nan:
      frame = frame.dropna()
    if fill_nan:
      frame = frame.fillna(0)
    self.df = frame
    self.columns = columns

  def __len__(self):
    """Number of rows left after the optional NaN handling."""
    return self.df.shape[0]

  def __getitem__(self, idx):
    """Return the configured columns of the row at position ``idx``, or ``None``."""
    if not self.columns:
      return None
    positions = [self.df.columns.get_loc(name) for name in self.columns]
    return self.df.iloc[idx, positions]


class HumourTransformerDataset(HumourDataset):
  """Tokenised view of a humour CSV for Hugging Face transformer models.

  All texts are tokenised once at construction time (truncated and padded to
  the tokenizer's maximum length). Each item is either the dict of encoding
  tensors (unlabelled mode) or an ``(encodings, labels)`` pair in which
  ``labels`` is a float tensor with one 0/1 entry per label column.

  Args:
    transformer_name: Name or path handed to ``AutoTokenizer.from_pretrained``.
    csv_path: Path of the CSV file to load.
    label_columns: Name(s) of the label columns. ``None`` or a sequence
      containing ``None`` makes the dataset unlabelled (the default).
    text_column: Name of the column holding the raw text.
    fill_nan: Replace missing values with ``0`` (see ``HumourDataset``).
    drop_nan: Drop rows with missing values (see ``HumourDataset``).
    device: Device the label tensor is moved to. ``None`` selects CUDA when
      available and CPU otherwise, the same rule the notebook cells use for
      their ``device`` variable.
  """

  def __init__(self, transformer_name, csv_path,
               label_columns=None, text_column="text", fill_nan=True, drop_nan=False,
               device=None):
    # Build a fresh list per instance instead of sharing a mutable default.
    if label_columns is None:
      label_columns = [None]
    elif isinstance(label_columns, str):
      label_columns = [label_columns]
    else:
      label_columns = list(label_columns)
    super().__init__(csv_path, label_columns, fill_nan=fill_nan, drop_nan=drop_nan)

    self.label_columns = label_columns
    # Resolve the device here so __getitem__ does not depend on a notebook
    # global that is only defined in a later cell.
    if device is None:
      device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    self.device = device if isinstance(device, torch.device) else torch.device(device)

    text_data = self.df[text_column].tolist()
    self.tokenizer = AutoTokenizer.from_pretrained(transformer_name)
    self.encodings = self.tokenizer(text_data, truncation=True, padding="max_length")

  def __getitem__(self, idx):
    """Return the encodings of row ``idx`` and, in labelled mode, its targets."""
    item = {key: torch.tensor(values[idx]) for key, values in self.encodings.items()}
    if None in self.label_columns:
      return item

    # The parent returns a pandas Series (one value per label column); any value
    # other than 1 is mapped to 0.
    row = super().__getitem__(idx)
    labels = torch.tensor([float(int(value) == 1) for value in row], dtype=torch.float32)
    return item, labels.to(self.device)


## Shared Model

In [49]:
# Checkpoint name used for both the tokenizer and the encoder configuration.
# The commented alternative is the local directory the shared model is saved to.
transformer_name = "bert-base-cased" # "./shared_model/"

In [62]:
# Training data with both binary targets per text (multi-label setup).
shared_train_dataset = HumourTransformerDataset(
    transformer_name=transformer_name,
    csv_path="./datasets/hahackathon_train.csv",
    label_columns=["is_humor", "humor_controversy"],
)

shared_train_dataloader = DataLoader(
    dataset=shared_train_dataset,
    batch_size=16,
    shuffle=True,
)

In [54]:
import torch
from torch.optim import AdamW
from transformers import get_scheduler
from transformers import AutoConfig, AutoModelForSequenceClassification

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

num_epochs = 3
# The scheduler is stepped once per batch, so the horizon is epochs * batches.
num_training_steps = num_epochs * len(shared_train_dataloader)

# Two logits, one per label column (is_humor, humor_controversy). Hidden states
# are exposed because later cells pool the last hidden layer of this encoder.
shared_config = AutoConfig.from_pretrained(transformer_name, output_hidden_states=True, num_labels=2)
# from_pretrained loads the checkpoint weights; from_config only builds the
# architecture and leaves every weight randomly initialised.
shared_model = AutoModelForSequenceClassification.from_pretrained(transformer_name, config=shared_config)
shared_model.to(device)

# Sigmoid + binary cross-entropy applied independently to each raw logit.
shared_loss = torch.nn.BCEWithLogitsLoss()
optimizer = AdamW(shared_model.parameters(), lr=1e-5)
lr_scheduler = get_scheduler(name="linear", optimizer=optimizer, 
        num_warmup_steps=0, num_training_steps=num_training_steps
)

In [63]:
from tqdm.auto import tqdm

progress_bar = tqdm(range(num_training_steps))

shared_model.train()
for epoch in range(num_epochs):
  for batch, labels in shared_train_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    # Align the targets with the logits explicitly instead of relying on the
    # dataset to have placed them on the right device.
    labels = labels.to(device=device, dtype=torch.float)

    outputs = shared_model(**batch)
    loss = shared_loss(outputs.logits, labels)

    # Clear gradients before backward so that re-running an interrupted cell
    # never accumulates on top of stale gradients.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    lr_scheduler.step()

    progress_bar.set_postfix({'loss': loss.item()})
    progress_bar.update(1)

progress_bar.close()

  0%|          | 0/1500 [00:00<?, ?it/s]

In [65]:
import os

# Target directory for the fine-tuned shared encoder. Named so that it does not
# shadow the builtin `dir`.
shared_model_dir = "./shared_model/"

os.makedirs(shared_model_dir, exist_ok=True)
shared_model.save_pretrained(shared_model_dir)

# Drop the notebook-level references to the training objects of this section.
del shared_train_dataloader
del optimizer
del lr_scheduler
del shared_train_dataset
del shared_model

## Humor Detection

In [5]:
transformer_name = "bert-base-cased" # "./humor_model/"

# Training data with the single binary target `is_humor`.
humor_train_dataset = HumourTransformerDataset(
    transformer_name=transformer_name,
    csv_path="./datasets/hahackathon_train.csv",
    label_columns=["is_humor"],
)

humor_train_dataloader = DataLoader(
    dataset=humor_train_dataset,
    batch_size=8,
    shuffle=True,
)

In [11]:
import torch
import torch.nn as nn
from torch.optim import AdamW
from transformers import get_scheduler
from transformers import AutoConfig, AutoModelForSequenceClassification

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

num_epochs = 3
# The scheduler is stepped once per batch, so the horizon is epochs * batches.
num_training_steps = num_epochs * len(humor_train_dataloader)

# Hidden states are exposed because the training loop pools the last hidden layer.
humor_config = AutoConfig.from_pretrained(transformer_name, output_hidden_states=True, num_labels=1)
# from_pretrained loads the checkpoint weights; from_config only builds the
# architecture and leaves every weight randomly initialised.
humor_model = AutoModelForSequenceClassification.from_pretrained(transformer_name, config=humor_config)
humor_model.to(device)

# Encoder fine-tuned in the "Shared Model" section, reloaded from disk.
shared_model = AutoModelForSequenceClassification.from_pretrained("./shared_model/")
shared_model.to(device)

class HumorClassifier(nn.Module):
  """Four stacked linear layers mapping a 1536-wide input to ``n_classes`` logits.

  No activation is applied between the layers.
  """

  def __init__(self, n_classes):
    super().__init__()
    self.lin1 = nn.Linear(2 * 768, 768)
    self.lin2 = nn.Linear(768, 356)
    self.lin3 = nn.Linear(356, 100)
    self.classfi = nn.Linear(100, n_classes)

  def forward(self, x):
    """Pass ``x`` through the four layers in order and return the raw logits."""
    for layer in (self.lin1, self.lin2, self.lin3, self.classfi):
      x = layer(x)
    return x

# Single-logit head applied to the concatenated encoder features.
classifier = HumorClassifier(n_classes=1)
classifier.to(device)

humor_loss = nn.BCEWithLogitsLoss()
# Only the task encoder and the head are handed to the optimiser; the shared
# encoder is not part of the parameter list.
optimizer = AdamW(
    [*humor_model.parameters(), *classifier.parameters()],
    lr=1e-5,
)
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

In [13]:
from tqdm.auto import tqdm

progress_bar = tqdm(range(num_training_steps))

# The shared encoder only provides features (no gradients); the task encoder
# and the head are trained.
shared_model.eval()
humor_model.train()
classifier.train()

for epoch in range(num_epochs):
  for batch, labels in humor_train_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    # Align the targets with the logits explicitly instead of relying on the
    # dataset to have placed them on the right device.
    labels = labels.to(device=device, dtype=torch.float)

    # Mean over the token axis of the last hidden layer -> (batch, hidden).
    # No bare squeeze(): it would also drop the batch axis of a size-1 batch
    # and make the concatenation along dim 1 fail.
    # NOTE(review): the mean also covers padding positions; confirm this is intended.
    humor_out = humor_model(**batch).hidden_states[-1].mean(dim=1)

    with torch.no_grad():
      shared_out = shared_model(**batch).hidden_states[-1].mean(dim=1)

    humor_out = torch.cat((humor_out, shared_out), 1)
    del shared_out
    humor_out = classifier(humor_out)

    loss = humor_loss(humor_out, labels)

    # Clear gradients before backward so that re-running an interrupted cell
    # never accumulates on top of stale gradients.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    lr_scheduler.step()

    progress_bar.set_postfix({'loss': loss.item()})
    progress_bar.update(1)

progress_bar.close()

  0%|          | 0/3000 [00:00<?, ?it/s]

In [14]:
import os

# Target directory for the fine-tuned humor encoder. Named so that it does not
# shadow the builtin `dir`.
humor_model_dir = "./humor_model/"

os.makedirs(humor_model_dir, exist_ok=True)
humor_model.save_pretrained(humor_model_dir)

# Pickles the whole module object, which is what the evaluation section's
# torch.load call reads back; the class has to be defined again before loading.
torch.save(classifier, "./humor_classi.pt")

# Drop the notebook-level references to the training objects of this section.
del humor_train_dataloader
del optimizer
del lr_scheduler
del humor_train_dataset
del humor_model
del shared_model

## Controversy Detection

In [3]:
transformer_name = "bert-base-cased" # "./humor_model/"

# Training data with the single binary target `humor_controversy`; rows with
# any missing value are dropped instead of being filled.
cont_train_dataset = HumourTransformerDataset(
    transformer_name=transformer_name,
    csv_path="./datasets/hahackathon_train.csv",
    label_columns=["humor_controversy"],
    drop_nan=True,
)

cont_train_dataloader = DataLoader(
    dataset=cont_train_dataset,
    batch_size=16,
    shuffle=True,
)

In [4]:
import torch
import torch.nn as nn
from torch.optim import AdamW
from transformers import get_scheduler
from transformers import AutoConfig, AutoModelForSequenceClassification

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

num_epochs = 4
# The scheduler is stepped once per batch, so the horizon is epochs * batches.
num_training_steps = num_epochs * len(cont_train_dataloader)

# Hidden states are exposed because the training loop pools the last hidden layer.
cont_config = AutoConfig.from_pretrained(transformer_name, output_hidden_states=True, num_labels=1)
# from_pretrained loads the checkpoint weights; from_config only builds the
# architecture and leaves every weight randomly initialised.
cont_model = AutoModelForSequenceClassification.from_pretrained(transformer_name, config=cont_config)
cont_model.to(device)

# Encoder fine-tuned in the "Shared Model" section, reloaded from disk.
shared_model = AutoModelForSequenceClassification.from_pretrained("./shared_model/")
shared_model.to(device)

class ContClassifier(nn.Module):
  """Four stacked linear layers mapping a 1536-wide input to ``n_classes`` logits.

  No activation is applied between the layers.
  """

  def __init__(self, n_classes):
    super().__init__()
    self.lin1 = nn.Linear(2 * 768, 768)
    self.lin2 = nn.Linear(768, 356)
    self.lin3 = nn.Linear(356, 100)
    self.classfi = nn.Linear(100, n_classes)

  def forward(self, x):
    """Pass ``x`` through the four layers in order and return the raw logits."""
    for layer in (self.lin1, self.lin2, self.lin3, self.classfi):
      x = layer(x)
    return x

# Single-logit head applied to the concatenated encoder features.
cont_classifier = ContClassifier(n_classes=1)
cont_classifier.to(device)

cont_loss = nn.BCEWithLogitsLoss()
# Only the task encoder and the head are handed to the optimiser; the shared
# encoder is not part of the parameter list.
optimizer = AdamW(
    [*cont_model.parameters(), *cont_classifier.parameters()],
    lr=1e-4,
)
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

In [None]:
from tqdm.auto import tqdm

progress_bar = tqdm(range(num_training_steps))

# The shared encoder only provides features (no gradients); the task encoder
# and the head are trained.
shared_model.eval()
cont_model.train()
cont_classifier.train()

for epoch in range(num_epochs):
  for batch, labels in cont_train_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    # Align the targets with the logits explicitly instead of relying on the
    # dataset to have placed them on the right device.
    labels = labels.to(device=device, dtype=torch.float)

    # Mean over the token axis of the last hidden layer -> (batch, hidden).
    # No bare squeeze(): it would also drop the batch axis of a size-1 batch
    # and make the concatenation along dim 1 fail.
    # NOTE(review): the mean also covers padding positions; confirm this is intended.
    cont_out = cont_model(**batch).hidden_states[-1].mean(dim=1)

    with torch.no_grad():
      shared_out = shared_model(**batch).hidden_states[-1].mean(dim=1)

    cont_out = torch.cat((cont_out, shared_out), 1)
    del shared_out
    cont_out = cont_classifier(cont_out)

    loss = cont_loss(cont_out, labels)

    # Clear gradients before backward so that re-running an interrupted cell
    # never accumulates on top of stale gradients.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    lr_scheduler.step()

    progress_bar.set_postfix({'loss': loss.item()})
    progress_bar.update(1)

progress_bar.close()

In [None]:
import os

# Target directory for the fine-tuned controversy encoder. Named so that it
# does not shadow the builtin `dir`.
cont_model_dir = "./cont_model/"

os.makedirs(cont_model_dir, exist_ok=True)
cont_model.save_pretrained(cont_model_dir)

# Save the controversy head trained in this section. The previous code saved
# `classifier`, which is the humor head (or undefined on a fresh kernel).
# The whole module object is pickled, which is what the evaluation section's
# torch.load call reads back.
torch.save(cont_classifier, "./cont_classi.pt")

# Drop the notebook-level references to the training objects of this section.
del cont_train_dataloader
del optimizer
del lr_scheduler
del cont_train_dataset
del cont_model
del shared_model

## Evaluation

In [14]:
import torch
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Raw test frame; the prediction columns are attached to it further down.
test_df = pd.read_csv("./datasets/gold-test-27446.csv")
# No label columns are given, so every item is just the dict of encodings.
test_dataset = HumourTransformerDataset(
    transformer_name="bert-base-cased",
    csv_path="./datasets/gold-test-27446.csv",
)

# Order is kept so predictions line up with the rows of test_df.
test_dataloader = DataLoader(dataset=test_dataset, batch_size=8, shuffle=False)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

shared_model = AutoModelForSequenceClassification.from_pretrained("./shared_model/")
shared_model.to(device)
shared_model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [15]:
# Reload the fine-tuned humor encoder, place it on the device and switch it to
# inference mode.
humor_model = AutoModelForSequenceClassification.from_pretrained("./humor_model").to(device)
humor_model.eval()

class HumorClassifier(torch.nn.Module):
  """Four stacked linear layers mapping a 1536-wide input to ``n_classes`` logits.

  Redefined here so the pickled head can be loaded in the evaluation section.
  Layers are referenced through ``torch.nn`` because this section only imports
  ``torch``; the ``nn`` alias is not guaranteed to exist.

  Args:
    n_classes: Width of the output layer.
  """

  def __init__(self, n_classes):
    super().__init__()
    self.lin1 = torch.nn.Linear(768 * 2, 768)
    self.lin2 = torch.nn.Linear(768, 356)
    self.lin3 = torch.nn.Linear(356, 100)
    self.classfi = torch.nn.Linear(100, n_classes)

  def forward(self, x):
    """Pass ``x`` through the four layers in order and return the raw logits."""
    x = self.lin1(x)
    x = self.lin2(x)
    x = self.lin3(x)
    return self.classfi(x)

# The file holds the complete pickled HumorClassifier object, so the class must
# already be defined when this runs.
# map_location lets a file saved on a GPU load on a CPU-only runtime.
# weights_only=False is needed for full-module pickles on PyTorch versions
# whose torch.load defaults to weights-only loading.
# NOTE(review): unpickling can execute arbitrary code; only load files created
# by this notebook.
classifier = torch.load("./humor_classi.pt", map_location=device, weights_only=False)
classifier.to(device)
classifier.eval()

HumorClassifier(
  (lin1): Linear(in_features=1536, out_features=768, bias=True)
  (lin2): Linear(in_features=768, out_features=356, bias=True)
  (lin3): Linear(in_features=356, out_features=100, bias=True)
  (classfi): Linear(in_features=100, out_features=1, bias=True)
)

In [16]:
preds = []

for batch in test_dataloader:
  batch = {k: v.to(device) for k, v in batch.items()}
  with torch.no_grad():
    # Mean over the token axis of the last hidden layer -> (batch, hidden).
    # No bare squeeze(): it would also drop the batch axis of a size-1 batch
    # and make the concatenation along dim 1 fail.
    shared_out = shared_model(**batch).hidden_states[-1].mean(dim=1)
    humor_out = humor_model(**batch).hidden_states[-1].mean(dim=1)

    # Same feature order as in training: task encoder first, shared encoder second.
    humor_out = torch.cat((humor_out, shared_out), 1)
    humor_out = classifier(humor_out)
    # torch.sigmoid replaces the deprecated torch.nn.functional.sigmoid.
    humor_out = torch.sigmoid(humor_out)

  # Threshold the probabilities at 0.5 and flatten (batch, 1) -> (batch,).
  humor_out = (humor_out > 0.5).int()
  preds += humor_out.squeeze(1).tolist()

test_df["humor_preds"] = preds
test_df

Unnamed: 0,id,text,is_humor,humor_rating,humor_controversy,offense_rating,humor_preds
0,9001,Finding out your ex got fat is like finding 20...,1,2.20,0.0,0.90,1
1,9002,"For Brockmann, stereotypes imperil national se...",0,,,0.35,0
2,9003,A girl runs up to her mother with a pile of cr...,1,2.80,1.0,0.10,1
3,9004,gotta wonder if baseball still would've been c...,1,2.15,0.0,0.00,0
4,9005,When you're dreading getting in the shower cuz...,1,2.25,0.0,0.35,1
...,...,...,...,...,...,...,...
995,9996,What do you call a black man on the moon? An a...,1,1.88,1.0,1.05,1
996,9997,when im picking someone up and they ask how lo...,1,1.88,0.0,0.00,1
997,9998,"A black lesbian, an obese white neck-beard, an...",1,1.80,1.0,1.65,1
998,9999,and I recognize the need to use ALL of my plat...,0,,,0.00,0


In [17]:
# Reload the fine-tuned controversy encoder, place it on the device and switch
# it to inference mode.
cont_model = AutoModelForSequenceClassification.from_pretrained("./cont_model").to(device)
cont_model.eval()

class ContClassifier(torch.nn.Module):
  """Four stacked linear layers mapping a 1536-wide input to ``n_classes`` logits.

  Redefined here so the pickled head can be loaded in the evaluation section.
  Layers are referenced through ``torch.nn`` because this section only imports
  ``torch``; the ``nn`` alias is not guaranteed to exist.

  Args:
    n_classes: Width of the output layer.
  """

  def __init__(self, n_classes):
    super().__init__()
    self.lin1 = torch.nn.Linear(768 * 2, 768)
    self.lin2 = torch.nn.Linear(768, 356)
    self.lin3 = torch.nn.Linear(356, 100)
    self.classfi = torch.nn.Linear(100, n_classes)

  def forward(self, x):
    """Pass ``x`` through the four layers in order and return the raw logits."""
    x = self.lin1(x)
    x = self.lin2(x)
    x = self.lin3(x)
    return self.classfi(x)

# The file holds a complete pickled module object, so its class must already be
# defined when this runs.
# map_location lets a file saved on a GPU load on a CPU-only runtime.
# weights_only=False is needed for full-module pickles on PyTorch versions
# whose torch.load defaults to weights-only loading.
# NOTE(review): unpickling can execute arbitrary code; only load files created
# by this notebook.
cont_classifier = torch.load("./cont_classi.pt", map_location=device, weights_only=False)
cont_classifier.to(device)
cont_classifier.eval()

ContClassifier(
  (lin1): Linear(in_features=1536, out_features=768, bias=True)
  (lin2): Linear(in_features=768, out_features=356, bias=True)
  (lin3): Linear(in_features=356, out_features=100, bias=True)
  (classfi): Linear(in_features=100, out_features=1, bias=True)
)

In [18]:
preds = []

for batch in test_dataloader:
  batch = {k: v.to(device) for k, v in batch.items()}
  with torch.no_grad():
    # Mean over the token axis of the last hidden layer -> (batch, hidden).
    # No bare squeeze(): it would also drop the batch axis of a size-1 batch
    # and make the concatenation along dim 1 fail.
    shared_out = shared_model(**batch).hidden_states[-1].mean(dim=1)
    cont_out = cont_model(**batch).hidden_states[-1].mean(dim=1)

    # Same feature order as in training: task encoder first, shared encoder second.
    cont_out = torch.cat((cont_out, shared_out), 1)
    cont_out = cont_classifier(cont_out)
    # torch.sigmoid replaces the deprecated torch.nn.functional.sigmoid.
    cont_out = torch.sigmoid(cont_out)

  # Threshold the probabilities at 0.5 and flatten (batch, 1) -> (batch,).
  cont_out = (cont_out > 0.5).int()
  preds += cont_out.squeeze(1).tolist()

test_df["cont_preds"] = preds

In [19]:
# Write the test frame, including the prediction columns added above, to disk.
# NOTE(review): the row index is written as an extra unnamed column; pass
# index=False if that column is not wanted.
test_df.to_csv("./test_df.csv")