# Setup

In [1]:
! pip install datasets transformers[sentencepiece] accelerate

Collecting datasets
[?25l  Downloading https://files.pythonhosted.org/packages/86/27/9c91ddee87b06d2de12f134c5171a49890427e398389f07f6463485723c3/datasets-1.9.0-py3-none-any.whl (262kB)
[K     |████████████████████████████████| 266kB 7.5MB/s 
[?25hCollecting transformers[sentencepiece]
[?25l  Downloading https://files.pythonhosted.org/packages/fd/1a/41c644c963249fd7f3836d926afa1e3f1cc234a1c40d80c5f03ad8f6f1b2/transformers-4.8.2-py3-none-any.whl (2.5MB)
[K     |████████████████████████████████| 2.5MB 47.3MB/s 
[?25hCollecting accelerate
[?25l  Downloading https://files.pythonhosted.org/packages/f7/fa/d173d923c953d930702066894abf128a7e5258c6f64cf088d2c5a83f46a3/accelerate-0.3.0-py3-none-any.whl (49kB)
[K     |████████████████████████████████| 51kB 9.4MB/s 
Collecting xxhash
[?25l  Downloading https://files.pythonhosted.org/packages/7d/4f/0a862cad26aa2ed7a7cd87178cbbfa824fc1383e472d63596a0d018374e7/xxhash-2.0.2-cp37-cp37m-manylinux2010_x86_64.whl (243kB)
[K     |████████████████

# Introduction

# Importing libraries and preparing the environment

In [2]:
import argparse
import numpy as np
import pandas as pd

from tqdm.auto import tqdm

from sklearn import metrics

from accelerate import Accelerator

import torch
from torch import cuda
from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler

import datasets
from datasets import load_dataset, load_metric

import transformers
from transformers import BertTokenizer, BertModel, BertConfig

# TODO: investigate TPU support via torch_xla; the lines below stay disabled until then.
# import torch_xla
# import torch_xla.core.xla_model as xm
# device = xm.xla_device()

In [3]:
# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Importing and preprocessing domain data

In [4]:
# Load the cleaned complaints table. The path is relative to the notebook's
# working directory; later cells read its `Product` and `Narrative` columns.
df = pd.read_csv("../data/complaints_cleaned.csv")

In [5]:
# One-hot encode the Product label of *every* row. The previous `.head(5)` kept
# only the first five rows, so after the column-wise concat in the next cell all
# remaining rows received NaN targets. `dtype=float` keeps the 0.0/1.0 encoding
# regardless of the pandas version's default dummy dtype.
one_hot_df = pd.get_dummies(df.Product, prefix='Product', dtype=float)

In [6]:
# Append the one-hot label columns to the raw frame, then pack every column
# from position 2 onwards into a single per-row list stored in `List`.
# NOTE(review): this cell reassigns `df`, so re-running it without re-running
# the load cell appends the label columns a second time.
df = pd.concat((df, one_hot_df), axis="columns")
target_columns = df.columns[2:]
df["List"] = df.loc[:, target_columns].to_numpy().tolist()

# Keep only what the model consumes: the complaint text and its target vector.
new_df = df.loc[:, ["Narrative", "List"]].copy()
new_df.head()

Unnamed: 0,Narrative,List
0,i have tried to send my disputes to the major ...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ..."
1,this company optimal management group disclose...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ..."
2,my account was cleared out by fraud and key ba...,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,to whom it may concern the credit bureau is re...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ..."
4,two accounts are still on my credit history af...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ..."


# Preparing the Dataset and the Dataloader

In [7]:
class ComplaintDataset(Dataset):
    """Map-style dataset turning complaint narratives into fixed-length BERT inputs.

    Args:
        dataframe: frame exposing a ``Narrative`` column (text) and a ``List``
            column (one target vector per row).
        tokenizer: object exposing ``encode_plus`` that returns ``input_ids``,
            ``attention_mask`` and ``token_type_ids`` (e.g. a ``BertTokenizer``).
        max_length: every encoded sample is padded/truncated to this many tokens.
    """

    def __init__(self, dataframe, tokenizer, max_length):
        self.tokenizer = tokenizer
        self.dataframe = dataframe
        self.narrative = dataframe.Narrative
        self.targets = dataframe.List
        self.max_length = max_length

    def __len__(self):
        """Number of samples (rows of the source frame)."""
        return len(self.narrative)

    def __getitem__(self, idx):
        """Return the ``idx``-th sample as a dict of tensors.

        Keys: ``ids``, ``mask``, ``token_type_ids`` (long tensors of length
        ``max_length``) and ``targets`` (float tensor built from the row's
        ``List`` value).
        """
        # Positional lookup: the DataLoader hands out 0..len-1, which only
        # coincides with index *labels* when the frame's index was reset.
        narrative = str(self.narrative.iloc[idx])
        # Collapse any run of whitespace (newlines, tabs, ...) to single spaces.
        narrative = ' '.join(narrative.split())

        inputs = self.tokenizer.encode_plus(
            narrative,
            None,
            add_special_tokens=True,
            max_length=self.max_length,
            # `pad_to_max_length=True` is deprecated; this is its replacement.
            padding='max_length',
            # Explicit truncation: without it the tokenizer warns and falls
            # back to an implicit strategy whenever a text exceeds max_length.
            truncation=True,
            return_token_type_ids=True,
        )

        return {
            'ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
            'mask': torch.tensor(inputs['attention_mask'], dtype=torch.long),
            'token_type_ids': torch.tensor(inputs['token_type_ids'], dtype=torch.long),
            'targets': torch.tensor(self.targets.iloc[idx], dtype=torch.float),
        }

In [8]:
# Tunable settings for tokenization, batching and optimization.
MAX_LENGTH: int = 400          # tokens per encoded narrative (pad/truncate target)
TRAIN_BATCH_SIZE: int = 8      # samples per training step
VALID_BATCH_SIZE: int = 4      # samples per validation step
EPOCHS: int = 3                # full passes over the training split
LEARNING_RATE: float = 1e-5    # step size handed to the optimizer
TRAIN_SIZE: float = 0.8        # fraction of rows sampled into the training split

In [9]:
# Load the pretrained tokenizer for the 'bert-base-uncased' checkpoint — the
# same checkpoint name the ComplaintBERT encoder is loaded from.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [10]:
# Reproducible random split: sample TRAIN_SIZE of the rows for training
# (fixed seed), and keep every row that was not sampled as the held-out set.
train_dataset = new_df.sample(frac=TRAIN_SIZE, random_state=42)
test_dataset = new_df.drop(index=train_dataset.index).reset_index(drop=True)
# Reset only after the drop above, which needs the original index labels.
train_dataset = train_dataset.reset_index(drop=True)

print("FULL Dataset: {}".format(new_df.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

# Wrap both splits so they yield tokenized tensors on demand.
training_set = ComplaintDataset(
    dataframe=train_dataset, tokenizer=tokenizer, max_length=MAX_LENGTH
)
validation_set = ComplaintDataset(
    dataframe=test_dataset, tokenizer=tokenizer, max_length=MAX_LENGTH
)

FULL Dataset: (743718, 2)
TRAIN Dataset: (594974, 2)
TEST Dataset: (148744, 2)


In [11]:
# Loader settings: both loaders shuffle and load in the main process
# (num_workers=0); only the batch size differs between them.
train_params = dict(batch_size=TRAIN_BATCH_SIZE, shuffle=True, num_workers=0)
validation_params = dict(batch_size=VALID_BATCH_SIZE, shuffle=True, num_workers=0)

# `trainin_loader` (sic) is the name the training function iterates over.
trainin_loader = DataLoader(training_set, **train_params)
validation_loader = DataLoader(validation_set, **validation_params)

# Creating model for fine-tuning

In [12]:
class ComplaintBERT(torch.nn.Module):
    """Pretrained BERT encoder followed by dropout and a linear head.

    The head emits one raw logit per label (no sigmoid applied), which is the
    input expected by ``BCEWithLogitsLoss``.

    Args:
        num_labels: size of the output layer. Defaults to 18, the value that
            was previously hard-coded (presumably the number of Product
            categories — verify against the one-hot columns).
        pretrained_name: checkpoint passed to ``BertModel.from_pretrained``.
        dropout: dropout probability applied to the pooled encoder output.
    """

    def __init__(self, num_labels=18, pretrained_name='bert-base-uncased', dropout=0.2):
        super().__init__()
        # Attribute names l1/l2/l3 are kept so existing state dicts still load.
        self.l1 = BertModel.from_pretrained(pretrained_name)
        self.l2 = torch.nn.Dropout(dropout)
        # Read the width from the encoder config instead of hard-coding 768.
        self.l3 = torch.nn.Linear(self.l1.config.hidden_size, num_labels)

    def forward(self, ids, mask, token_type_ids):
        """Return the logits produced for a batch of encoded narratives."""
        encoder_outputs = self.l1(ids, attention_mask=mask, token_type_ids=token_type_ids)
        # Index instead of tuple-unpacking: with transformers >= 4 the encoder
        # returns a dict-like ModelOutput, and unpacking that yields its *keys*
        # (strings), not tensors. Integer indexing works for both the tuple and
        # the ModelOutput return styles; position 1 is the pooled output.
        pooled_output = encoder_outputs[1]
        return self.l3(self.l2(pooled_output))

In [13]:
# Build the classifier and move its parameters to the selected device.
# As the cell's last expression, `model.to(device)` also displays the full
# module repr in the cell output.
model = ComplaintBERT()
model.to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


ComplaintBERT(
  (l1): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)


In [14]:
def loss_funciton(outputs, targets):
    """Binary cross-entropy on raw logits, using the criterion's default reduction.

    Args:
        outputs: unnormalized model logits.
        targets: float tensor of the same shape holding the target values.

    Returns:
        The loss tensor produced by ``torch.nn.BCEWithLogitsLoss``.
    """
    criterion = torch.nn.BCEWithLogitsLoss()
    loss = criterion(outputs, targets)
    return loss

In [15]:
# Adam over *all* model parameters, so the BERT encoder and the linear head
# are both updated during fine-tuning; step size comes from LEARNING_RATE.
optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

# Fine tuning the model

In [16]:
def train(epoch, log_every=5000):
    """Run one optimization pass over ``trainin_loader``.

    Relies on the notebook-level ``model``, ``optimizer``, ``device``,
    ``trainin_loader`` and ``loss_funciton``.

    Args:
        epoch: epoch number, used only in the progress message.
        log_every: print the current loss every this many steps (step 0 included).
    """
    model.train()
    for step, data in enumerate(trainin_loader):
        ids = data['ids'].to(device, dtype=torch.long)
        mask = data['mask'].to(device, dtype=torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
        targets = data['targets'].to(device, dtype=torch.float)

        # Clear gradients left over from the previous step (once is enough).
        optimizer.zero_grad()

        outputs = model(ids, mask, token_type_ids)
        loss = loss_funciton(outputs, targets)
        if step % log_every == 0:
            # .item() prints a plain float instead of the tensor repr.
            print(f'Epoch: {epoch}, Loss: {loss.item()}')

        # Was `loss.backwad()`, which raised AttributeError on the first step.
        loss.backward()
        optimizer.step()
In [17]:
# Fine-tune for EPOCHS full passes over the training loader.
# NOTE(review): the recorded run of this cell ended in a CUDA out-of-memory
# error on a 6 GiB GPU — consider lowering TRAIN_BATCH_SIZE and/or MAX_LENGTH.
for epoch in range(EPOCHS):
    train(epoch)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


RuntimeError: CUDA out of memory. Tried to allocate 38.00 MiB (GPU 0; 6.00 GiB total capacity; 4.09 GiB already allocated; 1.12 MiB free; 4.26 GiB reserved in total by PyTorch)