In [1]:
from google.colab import drive
drive.mount('/content/drive')

%cd /content/drive/My\ Drive/Text-Mining-Code
# !git clone https://github.com/Smolky/hahackathon-2021
# %cd hahackathon-2021/datasets/

Mounted at /content/drive
/content/drive/My Drive/Text-Mining-Code


In [2]:
import pandas as pd
import torch
import nltk
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from nltk.tokenize import word_tokenize

nltk.download('punkt')

class HumorDataset(Dataset):
  def __init__(self, df, label=None):
    self.label = label
    
    self.vocab_size = None
    self.word_to_ix = None

    self.df = df
    self.df = self.preprocess_data(self.df)

  def preprocess_data(self, df):
    df['tokens'] = df['text'].apply(word_tokenize)

    vocab = set()
    for tokens in df['tokens']:
      vocab.update(tokens)
    self.word_to_ix = {word: i + 2 for i, word in enumerate(vocab)}
    self.word_to_ix['<UNK>'] = 0
    self.word_to_ix['<PAD>'] = 1

    def tokens_to_indices(tokens):
      return [self.word_to_ix.get(word, 0) for word in tokens]

    df['indices'] = df['tokens'].apply(tokens_to_indices)
    self.vocab_size = len(vocab) + 2

    return df

  def __len__(self):
    return len(self.df.index)

  def __getitem__(self, idx):
    row = self.df.iloc[idx]
    indices = row['indices']
    label = row[self.label]
    return torch.tensor(indices), label

  def collate(self, batch):
    inputs = [item[0] for item in batch]
    labels = [item[1] for item in batch]
    inputs = pad_sequence(inputs, batch_first=True, padding_value=self.word_to_ix['<PAD>'])
    return inputs, torch.unsqueeze(torch.FloatTensor(labels), 1)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [3]:
class LSTMC(torch.nn.Module):
  def __init__(self, embedding_dim, hidden_dim, vocab_size, num_classes):
    super().__init__()
    self.embedding = torch.nn.Embedding(vocab_size, embedding_dim)
    self.lstm = torch.nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
    self.fc = torch.nn.Linear(hidden_dim, num_classes)

  def forward(self, inputs):
    embedded = self.embedding(inputs)
    lstm_output, _ = self.lstm(embedded)
    last_hidden_state = lstm_output[:, -1, :]
    output = self.fc(last_hidden_state)
    return output


EMBEDDING_DIM = 100
HIDDEN_DIM = 256
NUM_CLASSES = 1
LEARNING_RATE = 1e-3
NUM_EPOCHS = 10

## Humor Classification

In [34]:
train_df = pd.read_csv("./datasets/hahackathon_train.csv")
train_dataset = HumorDataset(train_df, "is_humor")
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=train_dataset.collate)

VOCAB_SIZE = train_dataset.vocab_size 

model = LSTMC(EMBEDDING_DIM, HIDDEN_DIM, VOCAB_SIZE, NUM_CLASSES)
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

criterion = torch.nn.BCEWithLogitsLoss()

for epoch in range(NUM_EPOCHS):
    model.train()
    train_loss = 0
    train_acc = 0
    for inputs, labels in train_loader:
      optimizer.zero_grad()
      outputs = model(inputs)
      loss = criterion(outputs, labels)
      loss.backward()
      optimizer.step()
      train_loss += loss.item() * inputs.size(0)
      preds = torch.argmax(outputs, dim=1)
      train_acc += torch.sum(preds == labels)

    train_loss /= len(train_dataset)
    train_acc = train_acc.float() / len(train_dataset)
    print("Loss : ", train_loss, " - Train Acc : ", train_acc)

Loss :  0.665247287273407  - Train Acc :  tensor(12.2720)
Loss :  0.6478652830123901  - Train Acc :  tensor(12.2720)
Loss :  0.654735198020935  - Train Acc :  tensor(12.2720)
Loss :  0.46051715332269666  - Train Acc :  tensor(12.2720)
Loss :  0.30527178722620013  - Train Acc :  tensor(12.2720)
Loss :  0.22275484851002694  - Train Acc :  tensor(12.2720)
Loss :  0.1536042091920972  - Train Acc :  tensor(12.2720)
Loss :  0.09703765147738158  - Train Acc :  tensor(12.2720)
Loss :  0.06837640217598527  - Train Acc :  tensor(12.2720)
Loss :  0.040981808956246826  - Train Acc :  tensor(12.2720)


TypeError: ignored

In [36]:
torch.save(model, "./humor_lstm.pt")

In [6]:
model = torch.load("./humor_lstm.pt")

test_df = pd.read_csv("./datasets/gold-test-27446.csv")
test_dataset = HumorDataset(test_df, "is_humor")
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, collate_fn=test_dataset.collate)

preds = []

model.eval()
with torch.no_grad():
  for inputs, labels in test_loader:
    outputs = model(inputs)
    pred = torch.sigmoid(outputs)
    pred = (pred > 0.5).float()
    
    preds += pred.squeeze(1).tolist()

test_df["humor_pred"] = preds
test_df

Unnamed: 0,id,text,is_humor,humor_rating,humor_controversy,offense_rating,tokens,indices,humor_pred
0,9001,Finding out your ex got fat is like finding 20...,1,2.20,0.0,0.90,"[Finding, out, your, ex, got, fat, is, like, f...","[964, 76, 1271, 5178, 1068, 1057, 3981, 3515, ...",1.0
1,9002,"For Brockmann, stereotypes imperil national se...",0,,,0.35,"[For, Brockmann, ,, stereotypes, imperil, nati...","[1896, 1316, 4011, 3493, 493, 4966, 4963, 3035...",0.0
2,9003,A girl runs up to her mother with a pile of cr...,1,2.80,1.0,0.10,"[A, girl, runs, up, to, her, mother, with, a, ...","[3424, 2244, 4338, 1378, 1459, 4772, 608, 4902...",1.0
3,9004,gotta wonder if baseball still would've been c...,1,2.15,0.0,0.00,"[got, ta, wonder, if, baseball, still, would, ...","[1068, 3143, 649, 3035, 65, 1246, 4517, 1477, ...",0.0
4,9005,When you're dreading getting in the shower cuz...,1,2.25,0.0,0.35,"[When, you, 're, dreading, getting, in, the, s...","[4909, 999, 4822, 2869, 4636, 1212, 2579, 5123...",1.0
...,...,...,...,...,...,...,...,...,...
995,9996,What do you call a black man on the moon? An a...,1,1.88,1.0,1.05,"[What, do, you, call, a, black, man, on, the, ...","[2021, 3265, 999, 2465, 2626, 2988, 1372, 1972...",1.0
996,9997,when im picking someone up and they ask how lo...,1,1.88,0.0,0.00,"[when, im, picking, someone, up, and, they, as...","[2167, 3245, 2769, 427, 1378, 416, 2140, 2469,...",1.0
997,9998,"A black lesbian, an obese white neck-beard, an...",1,1.80,1.0,1.65,"[A, black, lesbian, ,, an, obese, white, neck-...","[3424, 2988, 4022, 4011, 709, 1440, 1536, 5076...",1.0
998,9999,and I recognize the need to use ALL of my plat...,0,,,0.00,"[and, I, recognize, the, need, to, use, ALL, o...","[416, 4506, 805, 2579, 2660, 1459, 2877, 1314,...",1.0


## Controversy Classification

In [47]:
train_df = pd.read_csv("./datasets/hahackathon_train.csv")
train_df = train_df.fillna(0)
train_dataset = HumorDataset(train_df, "humor_controversy")
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=train_dataset.collate)

VOCAB_SIZE = train_dataset.vocab_size 

model = LSTMC(EMBEDDING_DIM, HIDDEN_DIM, VOCAB_SIZE, NUM_CLASSES)
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

criterion = torch.nn.BCEWithLogitsLoss()

for epoch in range(NUM_EPOCHS):
    model.train()
    train_loss = 0
    train_acc = 0
    for inputs, labels in train_loader:
      optimizer.zero_grad()
      outputs = model(inputs)
      loss = criterion(outputs, labels)
      loss.backward()
      optimizer.step()
      train_loss += loss.item() * inputs.size(0)
      preds = torch.round(torch.sigmoid(outputs))
      train_acc += torch.sum(preds == labels)

    train_loss /= len(train_dataset)
    train_acc = train_acc.float() / len(train_dataset)
    print("Loss : ", train_loss, " - Train Acc : ", train_acc)

Loss :  0.619321519613266  - Train Acc :  tensor(0.6892)
Loss :  0.6182294446229935  - Train Acc :  tensor(0.6914)
Loss :  0.6034796781539917  - Train Acc :  tensor(0.6921)
Loss :  0.5845528795719147  - Train Acc :  tensor(0.6924)
Loss :  0.5589480514526367  - Train Acc :  tensor(0.6891)
Loss :  0.5262210704088212  - Train Acc :  tensor(0.6975)
Loss :  0.46930172514915464  - Train Acc :  tensor(0.7351)
Loss :  0.40476221668720247  - Train Acc :  tensor(0.7870)
Loss :  0.32924495244026186  - Train Acc :  tensor(0.8347)
Loss :  0.2467093511968851  - Train Acc :  tensor(0.8894)


In [48]:
torch.save(model, "./controversy_lstm.pt")

In [8]:
model = torch.load("./controversy_lstm.pt")

# test_df = pd.read_csv("./datasets/gold-test-27446.csv")
# train_df = test_df.fillna(0)
test_dataset = HumorDataset(test_df, "humor_controversy")
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, collate_fn=test_dataset.collate)

preds = []

model.eval()
with torch.no_grad():
  for inputs, labels in test_loader:
    outputs = model(inputs)
    pred = torch.round(torch.sigmoid(outputs))
    pred = (pred > 0.5).float()
    
    preds += pred.squeeze(1).tolist()

test_df["cont_preds"] = preds
# print(f'Epoch {epoch + 1}/{NUM_EPOCHS}: Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}, Test Loss: {test_loss:.4f}, Test Acc: {test_acc:.4f}')

In [9]:
test_df.to_csv("./lstm_test.csv")