In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import shutil
import sys
from transformers import BertTokenizer, BertModel
from sklearn.model_selection import train_test_split

In [4]:
df_train = pd.read_csv('data/train.csv')
df_test = pd.read_csv('data/test.csv')

In [5]:
df_train.head()

Unnamed: 0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,explanation why the edits made under my userna...,0,0,0,0,0,0
1,d aww he matches this background colour i am s...,0,0,0,0,0,0
2,hey man i am really not trying to edit war it ...,0,0,0,0,0,0
3,more i can not make any real suggestions on im...,0,0,0,0,0,0
4,you sir are my hero any chance you remember wh...,0,0,0,0,0,0


In [5]:
labels_list = ['toxic',	'severe_toxic',	'obscene',	'threat',	'insult', 'identity_hate']

In [6]:
MAX_TOKEN_LEN = 512
TRAIN_BATCH_SIZE = 16
VALID_BATCH_SIZE = 16
EPOCHS = 2
LEARNING_RATE = 1e-05

In [7]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')



In [9]:
class CommentsDataset(torch.utils.data.Dataset):
  def __init__(self, df, tokenizer, max_token_len):
    self.df = df
    self.tokenizer = tokenizer
    self.max_token_len = max_token_len
    self.comment = self.df['comment_text']
    self.labels = self.df[labels_list].values
  
  def __len__(self):
    return len(self.comment)
  
  def __getitem__(self, index):
    comment = str(self.comment)
    
    inputs = self.tokenizer.encode_plus(
        comment,
        None,
        add_special_tokens=True,
        max_length=self.max_token_len,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )

    return {
        'input_ids': inputs['input_ids'].flatten(),
        'attention_mask': inputs['attention_mask'].flatten(),
        'targets': torch.FloatTensor(self.labels[index])
    }

In [10]:
train_df, val_df = train_test_split(df_train, test_size=0.05)

In [11]:
train_toxic = train_df[train_df[labels_list].sum(axis=1) > 0]
train_clean = train_df[train_df[labels_list].sum(axis=1) == 0]

In [12]:
train_df = pd.concat([
  train_toxic,
  train_clean.sample(20_000)
])

In [13]:
train_dataset = CommentsDataset(train_df, tokenizer, MAX_TOKEN_LEN)
val_dataset = CommentsDataset(val_df, tokenizer, MAX_TOKEN_LEN)

In [14]:
train_data_loader = torch.utils.data.DataLoader(
    train_dataset,
    shuffle=True,
    batch_size=TRAIN_BATCH_SIZE,
    num_workers=0
)

val_data_loader = torch.utils.data.DataLoader(
    val_dataset,
    shuffle=False,
    batch_size=VALID_BATCH_SIZE,
    num_workers=0
)

In [15]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [16]:
def load_checkpoint(checkpoint_path, model, optimizer):
  checkpoint = torch.load(checkpoint_path)
  model.load_state_dict(checkpoint['state_dict'])
  optimizer.load_state_dict(checkpoint['optimizer'])
  valid_loss_min = checkpoint['valid_loss_min']
  return model, optimizer, checkpoint['epoch'], valid_loss_min.item()

def save_checkpoint(state, is_best, checkpoint_path, best_model_path):
  f_path = checkpoint_path
  torch.save(state, f_path)
  if is_best:
    best_path = best_model_path
    shutil.copyfile(f_path, best_path)

In [25]:
class BERTClassifier(nn.Module):
  def __init__(self):
    super(BERTClassifier, self).__init__()
    self.bert = BertModel.from_pretrained('bert-base-uncased', return_dict=True)
    self.dropout = nn.Dropout(0.3)
    self.linear = nn.Linear(768, 6)


  def forward(self, input_ids, attention_mask):
    output = self.bert(input_ids, attention_mask=attention_mask)
    output_dropout = self.dropout(output.pooler_output)
    output = self.linear(output_dropout)
    return output

In [None]:
model = BERTClassifier()
model.to(device)

In [30]:
def loss_fn(outputs, targets):
  return nn.BCEWithLogitsLoss()(outputs, targets)

optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

In [35]:
def train_model(n_epochs, training_loader, validation_loader, model, optimizer, checkpoint_path, best_model_path):
  valid_loss_min=np.Inf

  for epoch in range(1, n_epochs+1):
    train_loss=0
    valid_loss=0
    model.train()

    #training loop
    print('############# Epoch {}: Training Start   #############'.format(epoch))
    for index, batch in enumerate(training_loader):
      print(' epoch', index)
      input_ids = batch['input_ids'].to(device, dtype=torch.long)
      attention_mask = batch['attention_mask'].to(device, dtype=torch.long)
      targets = batch['targets'].to(device, dtype=torch.float)

      # Model outputs
      output = model(input_ids, attention_mask)
      
      optimizer.zero_grad()
      loss = loss_fn(output, targets)

      if index%5000==0:
         print(f'Epoch: {epoch}, Training Loss:  {loss.item()}')

      loss.backward()
      optimizer.step()
      print('Before loss data in training', loss.item(), train_loss)
      train_loss = train_loss + ((1/(index+1))*(loss.item()-train_loss))
      print('After loss data in training', loss.item(), train_loss)

    print('############# Epoch {}: Training End     #############'.format(epoch))
    
    print('############# Epoch {}: Validation Start   #############'.format(epoch))
    
    #validation loop
    model.eval()
    with torch.no_grad():
      for index, batch in enumerate(validation_loader):
        input_ids = batch['input_ids'].to(device, dtype=torch.long)
        attention_mask = batch['attention_mask'].to(device, dtype=torch.long)
        targets = batch['targets'].to(device, dtype=torch.float)
        output = model(input_ids, attention_mask)
        loss = loss_fn(output, targets)
        valid_loss = valid_loss + ((1/(index+1))*(loss.item()-valid_loss))
    
      checkpoint = {
          'epoch': epoch+1,
          'valid_loss_min': valid_loss,
          'state_dict': model.state_dict(),
          'optimizer': optimizer.state_dict()
      }

      save_checkpoint(checkpoint, False, checkpoint_path, best_model_path)

      if valid_loss <= valid_loss_min:
        save_checkpoint(checkpoint, True, checkpoint_path, best_model_path)
        valid_loss_min = valid_loss
      
    print('Epoch: {}, Training Loss: {}, Validation Loss: {}'.format(epoch, train_loss, valid_loss))

  return model

In [36]:
trained_model = train_model(EPOCHS, train_data_loader, val_data_loader, model, optimizer, '/curr_chckpt', '/best.pt')

############# Epoch 1: Training Start   #############
 epoch 0
Epoch: 1, Training Loss:  0.7565498352050781
Before loss data in training 0.7565498352050781 0
After loss data in training 0.7565498352050781 0.7565498352050781
 epoch 1
Before loss data in training 0.7017446160316467 0.7565498352050781
After loss data in training 0.7017446160316467 0.7291472256183624
 epoch 2
Before loss data in training 0.6635946035385132 0.7291472256183624
After loss data in training 0.6635946035385132 0.707296351591746
 epoch 3
Before loss data in training 0.6585714817047119 0.707296351591746
After loss data in training 0.6585714817047119 0.6951151341199875
 epoch 4
Before loss data in training 0.6828012466430664 0.6951151341199875
After loss data in training 0.6828012466430664 0.6926523566246032
 epoch 5
Before loss data in training 0.5852746367454529 0.6926523566246032
After loss data in training 0.5852746367454529 0.6747560699780781
 epoch 6
Before loss data in training 0.5975963473320007 0.674756069

KeyboardInterrupt: 