In [3]:
import torch
from torch import nn, optim
from transformers import *
from torch.utils.data import Dataset, DataLoader
from time import time
import numpy as np
import torch
from torch import nn
import math
from pprint import pprint
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

In [4]:
def position_embeddings(max_pos, size):
    embeddings = np.zeros((max_pos, size))
    w = 1 / (10000 ** (2*np.arange(size // 2 )/size))
    for pos in range(max_pos):
        embeddings[pos,0::2] = np.sin(w*pos)
        embeddings[pos,1::2] = np.cos(w*pos)
    return torch.Tensor(embeddings)
    
pos_embed = position_embeddings(5000, 256)
pos_embed.shape

torch.Size([5000, 256])

In [5]:
bert = BertModel.from_pretrained('bert-base-cased')

bert_hidden_size = 768

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=435779157.0, style=ProgressStyle(descri…




In [8]:
bert.encoder.layer[11]

BertLayer(
  (attention): BertAttention(
    (self): BertSelfAttention(
      (query): Linear(in_features=768, out_features=768, bias=True)
      (key): Linear(in_features=768, out_features=768, bias=True)
      (value): Linear(in_features=768, out_features=768, bias=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (output): BertSelfOutput(
      (dense): Linear(in_features=768, out_features=768, bias=True)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
  )
  (intermediate): BertIntermediate(
    (dense): Linear(in_features=768, out_features=3072, bias=True)
  )
  (output): BertOutput(
    (dense): Linear(in_features=3072, out_features=768, bias=True)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
)

In [10]:
class Classifier(nn.Module):
  def __init__(self, bert_size, position_size):
    super().__init__()

    self.attention = nn.Linear(bert_size + position_size, 1)
    self.softmax = nn.Softmax(1)

    self.prediction = nn.Sequential(
        nn.Linear(bert_size, 1024),
        nn.LeakyReLU(),
        nn.Linear(1024, 1024),
        nn.LeakyReLU(),
        nn.Linear(1024, 6),
        nn.Sigmoid()
    )

  '''
  embeddings: shape (segment count, 512, bert_hidden_size)
  position_encodings:  shape (segment count, 512, position_encoding_size)
  comment_bounds: Array of tuples of the form [(start, end)]. comment_bounds[i] = (a, b) indicates that comment i's embeddings can be extracted as embeddings[a:b]
  '''
  def forward(self, embeddings, position_encodings, comment_bounds = None):
    attention_input = torch.cat([embeddings, position_encodings], dim=2) # (batch, 512, position_size + bert_hidden_size)
    
    # (batch, 512, 1)
    attentions = self.attention(attention_input)
    if comment_bounds is None:
      attentions = self.softmax(attentions) # (batch, 512, 1)
      vecs = torch.sum(attentions * embeddings, dim=1) # (batch, bert_hidden_size)
      return self.prediction(vecs) # (batch, 1)

    vecs = []
    for (a,b) in comment_bounds:
      comment_embeddings = embeddings[a:b] # (segment_count, 512, bert_hidden_size)
      comment_attentions = attentions[a:b] # (segment_count, 512, 1)
      attention_weights = self.softmax(comment_attentions) # (segment_count, 512, 1)
      weighted_embeddings = attention_weights * embeddings[a:b] # (segment_count, 512, bert_hidden_size)
      vec = torch.sum(weighted_embeddings.view(-1, weighted_embeddings.shape[-1]), dim=0, keepdim=True) # (segment_count, bert_hidden_size)
      vecs.append(vec)
    return self.prediction(torch.cat(vecs))

In [41]:
class MyDataset(Dataset):
  def __init__(self, file_format):
    super().__init__()
    
    # Load the data from files
    input_ids = torch.load(file_format.format("input_ids"))
    positions = torch.load(file_format.format("positions"))
    comment_ids = torch.load(file_format.format("ids"))
    targets = torch.load(file_format.format("targets"))
    
    # Treat the targets as binary to separate the possible outputs
    target_ids = torch.sum(torch.Tensor([32, 16, 8, 4, 2, 1]) * targets, axis=1)
    
    # Store the data according to the target. Useful for normalization
    self.data = [[] for i in range(64)]
    
    # Load the data into the array
    curr_id = 0
    start_index = 0
    for i in range(comment_ids.shape[0]):
        if comment_ids[i] != curr_id:
            target_id = int(target_ids[curr_id].item())
            data = (input_ids[start_index:i], positions[start_index:i], targets[curr_id])
            self.data[target_id].append(data)
            
            curr_id = comment_ids[i]
            start_index = i
            
    target_id = int(target_ids[curr_id].item())
    data = (input_ids[start_index:i], positions[start_index:i], targets[curr_id])
    self.data[target_id].append(data)
    
    n_nontoxic = len(self.data[0])
    
    n_of_each = n_nontoxic // (len(self.data)-1)
    
    # Remove the empty arrays from the data
    self.data = [data for data in self.data if data]
    
    n_copies = np.array([1]+[n_of_each // len(self.data[i]) for i in range(1,len(self.data))])
    self.data_length = np.array([len(data) for data in self.data])
    
    segment_lengths = n_copies*self.data_length
    
    self.length = np.sum(segment_lengths)
    
    self.boundaries = np.zeros_like(segment_lengths+1)
    self.boundaries[1:] = np.cumsum(segment_lengths)[:-1]


  def __len__(self):
    return self.length

  def __getitem__(self, index):
    for i in range(self.boundaries.shape[0] - 1):
        if index > self.boundaries[i] and index < self.boundaries[i+1]:
            inner_index = (index - self.boundaries[i]) % self.data_length[i]
            
            return self.data[i][inner_index]

def collate_samples(batch):
  split_comments, positions, targets = zip(*batch)
  input_ids = []
  comment_bounds = []
  start = 0
  for comment in split_comments:
      input_ids += split
      comment_bounds.append((start, start+len(split)))
      start += len(split)
  input_ids = torch.stack(input_ids, dim=0)
  encoded_positions = torch.cat([
                          # Use the position array as indices into the position embedding
                          pos_embed[position_arr]
                          # For each comment in the batch
                          for position_arr in positions                     
                      ])
  
  targets = torch.Tensor(targets)
  return input_ids, encoded_positions, comment_bounds, targets

In [42]:
train_dataset = MyDataset("train_{}.pt")
test_dataset = MyDataset("test_{}.pt")
len(train_dataset), len(test_dataset)

(226493, 214512)

In [43]:
bert.eval()
device = torch.device("cuda:0")
bert = bert.to(device)

AssertionError: Torch not compiled with CUDA enabled