In [None]:
# fine-tune LM

# python run_mlm.py     
#     --model_name_or_path distilbert-base-uncased     
#     --train_file ./tr.csv     
#     --validation_file ./va.csv     
#     --do_train     
#     --do_eval     
#     --output_dir ./tmp/test-mlm_vocab 
#     --line_by_line 
#     --per_device_train_batch_size 16 
#     --num_train_epochs 1 
#     --fp16

In [2]:
!pip install -q transformers

In [1]:
# Importing stock ml libraries
import numpy as np
import pandas as pd
from sklearn import metrics
import transformers
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import DistilBertTokenizer, DistilBertModel, DistilBertConfig, DistilBertTokenizerFast
import torch.nn as nn
from torch.nn import CrossEntropyLoss
from tokenizers import BertWordPieceTokenizer

from sklearn.feature_extraction.text import CountVectorizer


In [2]:
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'
device

'cuda'

In [3]:
df = pd.read_csv("./train.csv")
df['labels'] = df[df.columns[2:]].values.tolist()
df['text'] = df['comment_text']
df = df[['text', 'labels']].copy()
df['text'] = df['text'].apply(lambda x: 'qwedsa\n' + x)
df.head()

Unnamed: 0,text,labels
0,qwedsa\nExplanation\nWhy the edits made under ...,"[0, 0, 0, 0, 0, 0]"
1,qwedsa\nD'aww! He matches this background colo...,"[0, 0, 0, 0, 0, 0]"
2,"qwedsa\nHey man, I'm really not trying to edit...","[0, 0, 0, 0, 0, 0]"
3,"qwedsa\n""\nMore\nI can't make any real suggest...","[0, 0, 0, 0, 0, 0]"
4,"qwedsa\nYou, sir, are my hero. Any chance you ...","[0, 0, 0, 0, 0, 0]"


In [4]:
vec = CountVectorizer(stop_words=['english'], max_features=7000)
vec.fit_transform(df['text'])

<159571x7000 sparse matrix of type '<class 'numpy.int64'>'
	with 6407728 stored elements in Compressed Sparse Row format>

In [5]:
len(vec.vocabulary_)

7000

In [48]:
v = pd.read_csv('./tmp/test-mlm/checkpoint-500/vocab.txt', sep='delimiter', header=None)

  """Entry point for launching an IPython kernel.


In [None]:
v.values[:,0]

In [7]:
# Sections of config

# Defining some key variables that will be used later on in the training
MAX_LEN = 64
TRAIN_BATCH_SIZE = 32
VALID_BATCH_SIZE = 32
EPOCHS = 1
LEARNING_RATE = 1e-05
# tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
tokenizer = DistilBertTokenizerFast.from_pretrained('./dist_tok2', lowercase=True, max_len=MAX_LEN)

In [8]:
class CustomDataset(Dataset):

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.text
        self.targets = self.data.labels
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        text = str(self.text[index])
        text = " ".join(text.split())

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']


        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }

In [9]:
# Creating the dataset and dataloader for the neural network

train_size = 0.8
train_dataset=df.sample(frac=train_size,random_state=200)
test_dataset=df.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)


print("FULL Dataset: {}".format(df.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

training_set = CustomDataset(train_dataset, tokenizer, MAX_LEN)
testing_set = CustomDataset(test_dataset, tokenizer, MAX_LEN)

FULL Dataset: (159571, 2)
TRAIN Dataset: (127657, 2)
TEST Dataset: (31914, 2)


In [26]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': False,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [39]:
class DistilBertModelClf(torch.nn.Module):
    def __init__(self):
        super(BERTClass, self).__init__()
        self.distilbert = transformers.DistilBertModel.from_pretrained('./tmp/test-mlm_vocab/checkpoint-1000')
        self.pre_classifier = nn.Linear(768, 768)
        self.classifier = nn.Linear(768, 6)
        self.dropout = nn.Dropout(.3)
    
    def forward(self, ids, mask):
        distilbert_output = self.distilbert(ids, attention_mask=mask)
        hidden_state = distilbert_output[0]  # (bs, seq_len, dim)
        pooled_output = hidden_state[:, 0]  # (bs, dim)
        pooled_output = self.pre_classifier(pooled_output)  # (bs, dim)
        pooled_output = nn.ReLU()(pooled_output)  # (bs, dim)
        pooled_output = self.dropout(pooled_output)  # (bs, dim)
        logits = self.classifier(pooled_output)  # (bs, num_labels)
        
        return logits

model = DistilBertModelClf()
model.to(device)

BERTClass(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Linear(in_fea

In [40]:
def loss_fn(outputs, targets):
    return torch.nn.BCEWithLogitsLoss()(outputs, targets)

In [41]:
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

In [42]:
loss_fct = nn.BCEWithLogitsLoss()
#https://colab.research.google.com/github/abhimishra91/transformers-tutorials/blob/master/transformers_multiclass_classification.ipynb#scrollTo=-G-Jojya6jH4
#https://dzlab.github.io/dltips/en/tensorflow/create-bert-vocab/

def train(epoch):
    tr_loss = .0
    nb_tr_steps = 0
    nb_tr_examples = 0
    model.train()
    for idx,data in enumerate(training_loader, 0):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.float)
        
        logits = model(ids, mask)
    
        loss = loss_fct(logits.view(-1), targets.view(-1))
        tr_loss += loss.item()
        
        nb_tr_steps += 1
        nb_tr_examples += targets.size(0)
        if idx%500==0:
            loss_step = tr_loss/nb_tr_steps
            print(f'Epoch: {epoch}, Loss:  {loss_step}')
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

In [43]:
for epoch in range(2):
    train(epoch)

Epoch: 0, Loss:  0.694076418876648
Epoch: 0, Loss:  0.1337188199114151
Epoch: 0, Loss:  0.09560536285564601
Epoch: 0, Loss:  0.08268626467904931
Epoch: 0, Loss:  0.07541574616574388
Epoch: 0, Loss:  0.07046875738786806
Epoch: 0, Loss:  0.06664797075785465
Epoch: 0, Loss:  0.06359285154585136
Epoch: 1, Loss:  0.04991975426673889
Epoch: 1, Loss:  0.041443429269666175
Epoch: 1, Loss:  0.04097720029719451
Epoch: 1, Loss:  0.04128706095525982
Epoch: 1, Loss:  0.0413398045919608
Epoch: 1, Loss:  0.04121532103329132
Epoch: 1, Loss:  0.04109208280957601
Epoch: 1, Loss:  0.04085181092596218


In [None]:
Epoch: 0, Loss:  0.6644080281257629
Epoch: 0, Loss:  0.03623255342245102
Epoch: 0, Loss:  0.04684004932641983
Epoch: 0, Loss:  0.012734170071780682
Epoch: 0, Loss:  0.03388965129852295
Epoch: 0, Loss:  0.04856885224580765
Epoch: 0, Loss:  0.026575788855552673
Epoch: 0, Loss:  0.017798999324440956

In [44]:
def validation(epoch):
    model.eval()
    fin_targets=[]
    fin_outputs=[]
    with torch.no_grad():
        for _, data in enumerate(testing_loader, 0):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.float)

            outputs = model(ids, mask)
            fin_targets.extend(targets.cpu().detach().numpy().tolist())
            fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
    return fin_outputs, fin_targets

In [45]:
for epoch in range(EPOCHS):
    outputs, targets = validation(epoch)
    outputs = np.array(outputs) >= 0.5
    accuracy = metrics.accuracy_score(targets, outputs)
    f1_score_micro = metrics.f1_score(targets, outputs, average='micro')
    f1_score_macro = metrics.f1_score(targets, outputs, average='macro')
    print(f"Accuracy Score = {accuracy}")
    print(f"F1 Score (Micro) = {f1_score_micro}")
    print(f"F1 Score (Macro) = {f1_score_macro}")

Accuracy Score = 0.9227298364354202
F1 Score (Micro) = 0.7774229380727576
F1 Score (Macro) = 0.6597536970366582


In [None]:
Accuracy Score = 0.9181863758851915
F1 Score (Micro) = 0.7672067580667497
F1 Score (Macro) = 0.5775035534966305

Accuracy Score = 0.9235758601240834
F1 Score (Micro) = 0.7640705810769698
F1 Score (Macro) = 0.5330122532324771

Accuracy Score = 0.9211004574794761
F1 Score (Micro) = 0.730064388311045
F1 Score (Macro) = 0.42474141979046487