This contains code for pytorch bert implementation. The code was adapted from 
https://mccormickml.com/2019/07/22/BERT-fine-tuning/

In [4]:
from pathlib import Path
  

In [5]:
import numpy as np
import pandas as pd
import torch
from pytorch_pretrained_bert import BertTokenizer
import logging
import tensorflow as tf
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from pytorch_pretrained_bert import BertTokenizer, BertConfig
from pytorch_pretrained_bert import BertAdam, BertForSequenceClassification
from tqdm import tqdm, trange
import pandas as pd
import io

In [13]:
device = torch.device("cpu")
tokenizer = BertTokenizer.from_pretrained('bert-large-cased')

W0823 10:45:37.154357 140735584027520 tokenization.py:161] The pre-trained model you are loading is a cased model but you have not set `do_lower_case` to False. We are setting `do_lower_case=False` for you but you may want to check this behavior.
100%|██████████| 213450/213450 [00:00<00:00, 828928.42B/s]


In [14]:
fileNameSchiz1 = 'data/dataOut/schiz/annFinalSchiz_1.csv'
fileNameSchiz2 = 'data/dataOut/schiz/annFinalSchiz_2.csv'
fileNameStig = 'data/dataOut/stigma/annFinalStig.csv'

In [15]:
socialDf = pd.read_csv(fileNameSchiz2, encoding='utf-8')
textSchiz2 = socialDf['Tweet']
labelsSchiz2 = socialDf['Classification']

In [16]:
socialDf = pd.read_csv(fileNameSchiz1, encoding='utf-8')
tweetsSchiz1 = socialDf.Tweet.values
labelsSchiz1 = socialDf.Classification.values


Firs we prepare the text in the Bert format with the [CLS] separator and then convert into tensor form. Once we have the data in the right form, including the attention masks we train the model computing a foward pass then using the BertAdam optimizer to minimize our loss. Once this is done we put the model into evaluation mode and evaluate. Can take a little while to run

In [17]:
def getBertFormat(tweets):
    tweets = ['[CLS] ' + s + ' [SEP]' for s in tweets]
    tokens = [tokenizer.tokenize(t) for t in tweets]
    return tokens

In [18]:
def getTorchTensors(**kwargs):
    return {k:torch.tensor(v) for k,v in kwargs.items()}


def getParameters(model):
    
    param_optimizer = list(model.named_parameters())
    
    no_decay = ['bias', 'gamma', 'beta']
    optimizerParameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.0}]
    
    return optimizerParameters
        
        
def trainModel(model, trainDataLoader, optimizer, epochs=4):
        
    for _ in trange(epochs, desc="Epoch"):
        model.train()
        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0
  
        for step, batch in enumerate(trainDataLoader):

            batch = tuple(t.to(device) for t in batch)

            b_input_ids, b_input_mask, b_labels = batch

            optimizer.zero_grad()

            loss = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)   
            loss.backward()
            optimizer.step()

            tr_loss += loss.item()
            nb_tr_examples += b_input_ids.size(0)
            nb_tr_steps += 1

            print("Train loss: {}".format(tr_loss/nb_tr_steps))
        
        
def evalModel(model, optimizer, valDataLoader):
        
    model.eval()
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0

    for batch in valDataLoader:

        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        
    with torch.no_grad():
        logits = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
    
    # Move logits and labels to CPU
    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()

    tmp_eval_accuracy = flat_accuracy(logits, label_ids)
    
    eval_accuracy += tmp_eval_accuracy
    nb_eval_steps += 1

    print("Validation Accuracy: {}".format(eval_accuracy/nb_eval_steps))
   
           

def getBert(text, device, batch=16):
    
    tokens = getBertFormat(text)
    #maxLength = max([len(t) for t in tokens]) + 2
    maxLength = 200
    ids = [tokenizer.convert_tokens_to_ids(t) for t in tokens]
    inputIds = pad_sequences(ids, maxlen=maxLength, dtype='long', truncating='post', padding='post')
    
    attentionMasks = [list(map(lambda x: float(x>0), s)) for s in inputIds]
    
    trainInputs, valInputs, trainLabels, valLabels = train_test_split(inputIds, labelsSchiz1, random_state=42, test_size=0.2)
    trainMasks, valMasks, _, _ = train_test_split(attentionMasks, inputIds, random_state=42, test_size=0.2)
    
    inputs = getTorchTensors(trainInputs=trainInputs, valInputs=valInputs, trainLabels=trainLabels, valLabels=valLabels)
    masks = getTorchTensors(trainMasks=trainMasks, valMasks=valMasks)
    
    train_data = TensorDataset(inputs['trainInputs'], masks['trainMasks'], inputs['trainLabels'])
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch)

    valData = TensorDataset(inputs['valInputs'], masks['valMasks'], inputs['valLabels'])
    valSampler = SequentialSampler(valData)
    valDataLoader = DataLoader(valData, sampler=valSampler, batch_size=batch)
    
    model = BertForSequenceClassification.from_pretrained("bert-large-cased", num_labels=2)
    optimizerParameters = getParameters(model)
    optimizer = BertAdam(optimizerParameters, lr=2e-5, warmup=.1)
        
    model = trainModel(model, valDataLoader, optimizer)
    model = evalModel(model, valDataLoader, optimizer)
  

In [19]:
getBert(tweetsSchiz1, device)

100%|██████████| 1242874899/1242874899 [08:37<00:00, 2402193.65B/s]
W0823 10:54:40.950658 140735584027520 optimization.py:46] t_total value of -1 results in schedule not being applied
Epoch:   0%|          | 0/4 [00:00<?, ?it/s]

Train loss: 0.8509582281112671
Train loss: 0.7697545289993286
Train loss: 0.7695855498313904
Train loss: 0.7783420830965042
Train loss: 0.7576043844223023
Train loss: 0.7294232845306396
Train loss: 0.7202801874705723
Train loss: 0.7079262137413025
Train loss: 0.781946619351705


Epoch:  25%|██▌       | 1/4 [15:58<47:55, 958.59s/it]

Train loss: 0.8015725314617157
Train loss: 0.5950214862823486
Train loss: 0.6569541692733765
Train loss: 0.6484621365865072
Train loss: 0.6931104511022568
Train loss: 0.6918761372566223
Train loss: 0.7162438333034515
Train loss: 0.7241217408861432
Train loss: 0.7201010212302208
Train loss: 0.7144479751586914


Epoch:  50%|█████     | 2/4 [31:56<31:56, 958.49s/it]

Train loss: 0.7020955622196198
Train loss: 0.631625235080719
Train loss: 0.6417377591133118
Train loss: 0.6175366640090942
Train loss: 0.5411438345909119
Train loss: 0.6146926403045654
Train loss: 0.6852958798408508
Train loss: 0.7031082596097674
Train loss: 0.676782812923193
Train loss: 0.6537459426456027


Epoch:  75%|███████▌  | 3/4 [47:54<15:58, 958.18s/it]

Train loss: 0.6243840396404267
Train loss: 0.42754799127578735
Train loss: 0.5440468192100525
Train loss: 0.5130421916643778
Train loss: 0.5658839643001556
Train loss: 0.6946876287460327
Train loss: 0.6330302357673645
Train loss: 0.6203817980630058
Train loss: 0.5989611111581326
Train loss: 0.635890758699841


Epoch: 100%|██████████| 4/4 [1:04:09<00:00, 963.14s/it]

Train loss: 0.5896166175603866





AttributeError: 'NoneType' object has no attribute 'eval'