In [1]:
from __future__ import absolute_import, division, print_function, unicode_literals

import argparse
import logging
import os
import random
from io import open

import numpy as np
import torch
from torch.utils.data import DataLoader, Dataset, RandomSampler
from torch.utils.data.distributed import DistributedSampler
from tqdm import tqdm, trange
import torch.nn as nn
import pandas as pd

from pytorch_transformers import WEIGHTS_NAME, CONFIG_NAME, BertConfig
from pytorch_transformers.modeling_bert import BertPreTrainedModel,BertModel,BertForTokenClassification
from pytorch_transformers.tokenization_bert import BertTokenizer
from pytorch_transformers.optimization import AdamW, WarmupLinearSchedule
from torch.nn import CrossEntropyLoss


In [2]:
class BertForActionItemDetection(BertPreTrainedModel):
    """BERT encoder with a dropout + linear classification head.

    The head is applied to element ``1`` of the encoder's output tuple (the
    pooled output) and produces ``config.num_labels`` logits per example.
    """

    def __init__(self, config):
        super(BertForActionItemDetection, self).__init__(config)
        self.num_labels = config.num_labels

        self.bert = BertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
        self.apply(self.init_weights)

    def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None,
                position_ids=None, head_mask=None):
        """Run the encoder and the classification head.

        Returns a tuple ``(logits, *extras)`` where ``extras`` is whatever the
        encoder returned after its first two elements (hidden states and
        attentions, if present). When ``labels`` is given, the cross-entropy
        loss is prepended: ``(loss, logits, *extras)``.
        """
        encoder_outputs = self.bert(input_ids,
                                    position_ids=position_ids,
                                    token_type_ids=token_type_ids,
                                    attention_mask=attention_mask,
                                    head_mask=head_mask)

        # Element 1 of the encoder output is the pooled representation.
        logits = self.classifier(self.dropout(encoder_outputs[1]))

        # Forward any extra encoder outputs unchanged behind the logits.
        result = (logits,) + encoder_outputs[2:]
        if labels is None:
            return result

        loss = CrossEntropyLoss()(logits.view(-1, self.num_labels), labels.view(-1))
        return (loss,) + result

In [3]:
class InputFeatures(object):
    """Plain container for one encoded example: token ids, attention mask, label."""

    def __init__(self, input_ids, input_mask, label):
        # Values are stored as given; no copying or validation is performed.
        self.input_ids, self.input_mask, self.label = input_ids, input_mask, label
        
class ActionItemDataSet(Dataset):
    """Dataset backed by a CSV file read with ``pandas.read_csv``.

    Column 0 of the CSV is used as the sentence text and column 1 as its label
    (both are read by position, not by name).
    """

    def __init__(self, file_path, max_seq_len, tokenizer):
        self.tokenizer = tokenizer
        self.max_seq_len = max_seq_len
        self.data = pd.read_csv(file_path)

    def __len__(self):
        # One example per CSV row.
        return self.data.shape[0]

    def __getitem__(self, index):
        """Return ``(input_ids, input_mask, label)`` tensors for row ``index``."""
        sentence = self.data.iloc[index, 0]
        target = self.data.iloc[index, 1]

        word_pieces = self.tokenizer.tokenize(sentence)
        feats = convert_example_to_features(word_pieces, target, self.max_seq_len, self.tokenizer)

        return tuple(torch.tensor(field)
                     for field in (feats.input_ids, feats.input_mask, feats.label))

def _truncate_seq_pair(tokenized_text, max_length):
    while True:
        if len(tokenized_text) <= max_length:
            break
        else:
            tokenized_text.pop()
            
def convert_example_to_features(tokenized_text, label, max_seq_len, tokenizer):
    """Encode one tokenized sentence as fixed-length ids plus an attention mask.

    ``tokenized_text`` is truncated in place to ``max_seq_len - 2`` tokens,
    wrapped in ``[CLS]`` / ``[SEP]``, converted to ids by ``tokenizer`` and
    right-padded with zeros up to ``max_seq_len``. The mask is 1 for real
    tokens and 0 for padding.

    Returns:
        InputFeatures holding ``input_ids``, ``input_mask`` and ``label``.
    """
    # Reserve two positions for the [CLS] and [SEP] markers added below.
    _truncate_seq_pair(tokenized_text, max_seq_len-2)
    tokens = ["[CLS]"] + list(tokenized_text) + ["[SEP]"]

    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_mask = [1] * len(input_ids)

    # A non-positive pad length yields an empty list, so nothing is appended
    # when the sequence already fills (or exceeds) max_seq_len.
    pad_len = max_seq_len - len(input_ids)
    input_ids.extend([0] * pad_len)
    input_mask.extend([0] * pad_len)

    return InputFeatures(input_ids=input_ids, input_mask=input_mask, label=label)

def get_ai_probabilities(text):
    """Return the class probabilities the global ``model`` assigns to ``text``.

    Uses the module-level ``tokenizer`` and ``model``. The input tensor is
    created on the CPU and the result is converted with ``.numpy()``, so the
    model is expected to be on the CPU when this is called.

    Args:
        text: a single sentence (string).

    Returns:
        1-D numpy array of softmax probabilities, one entry per label.
    """
    # NOTE(review): training inputs are wrapped in [CLS]/[SEP] by
    # convert_example_to_features; confirm that tokenizer.encode adds the same
    # special tokens in the installed library version.
    input_ids = torch.tensor(tokenizer.encode(text)).unsqueeze(0)

    # Inference only: skip autograd bookkeeping. The softmax is taken over the
    # label dimension explicitly rather than through a global Softmax module
    # that is created in a later notebook cell.
    with torch.no_grad():
        logits = model(input_ids)[0]
        probabilities = torch.softmax(logits, dim=1)
    return probabilities.numpy()[0]

def getMetrics(model_df,focus_label):
    """Print and return precision, recall and hit rate for one label.

    Args:
        model_df: DataFrame with an ``actual_label`` column and a boolean
            ``isCorrect`` column (prediction matched the actual label).
        focus_label: the label treated as the positive class. The opposite
            class is taken to be ``abs(1 - focus_label)``, i.e. labels are
            assumed to be binary 0/1.

    Returns:
        Tuple ``(precision, recall, hit_rate)``. All three are ``0.0`` when
        there are no true positives; ``hit_rate`` is also ``0.0`` when the
        frame contains no rows with ``focus_label`` (previously this raised
        ``ZeroDivisionError``).
    """
    is_focus = model_df['actual_label'] == focus_label
    is_other = model_df['actual_label'] == abs(1 - focus_label)
    is_correct = model_df['isCorrect'] == True
    is_wrong = model_df['isCorrect'] == False

    model_tp = int((is_focus & is_correct).sum())
    # A wrong prediction on the other class is a false positive for this one.
    model_fp = int((is_other & is_wrong).sum())
    model_fn = int((is_focus & is_wrong).sum())

    if model_tp > 0:
        precision = model_tp / (model_tp + model_fp)
        recall = model_tp / (model_tp + model_fn)
    else:
        precision = 0.0
        recall = 0.0
    print('Precision: ', precision)
    print('Recall: ', recall)

    # Missed predictions are exactly the false negatives counted above.
    print('Missed Predictions: ', model_fn)

    focus_total = int(is_focus.sum())
    hit_rate = model_tp / focus_total if focus_total else 0.0
    print('Hit rate: ', hit_rate)
    return precision,recall,hit_rate

def getValidationMetrics(val_csv_path='/home/venkat/hdd/Venkat/action_item_detection/Shubham/training/seed_data/seed_val.csv'):
    """Evaluate the global ``model`` on a validation CSV and print the metrics.

    The CSV must have a ``text`` column and an ``is_action_item`` column.
    Inference runs on the CPU, one sentence at a time, through
    ``get_ai_probabilities``. Afterwards the model is always put back in
    training mode and moved back to the device it was on when this function
    was called; the training loop calls ``model.eval()`` right before this
    function and relies on it to switch back.

    Args:
        val_csv_path: path to the validation CSV. Defaults to the seed
            validation set used by this notebook.

    Returns:
        Tuple ``(ai_precision, ai_recall, f1, ai_hit_rate, nai_hit_rate)``
        where "ai" refers to label 1 and "nai" to label 0.
    """
    df_validation = pd.read_csv(val_csv_path)
    text = list(df_validation['text'])
    label_list = list(df_validation['is_action_item'])
    pred_list = []
    confidence_list = []

    # Remember where the model lives so it can be returned there afterwards.
    train_device = next(model.parameters()).device
    model.eval()
    model.to('cpu')
    try:
        for sent in text:
            res = get_ai_probabilities(sent)
            pred_list.append(np.argmax(res))
            confidence_list.append(max(res))
    finally:
        # Restore the training setup even if inference fails part-way.
        model.train()
        model.to(train_device)

    model_df = pd.DataFrame({'sentence': text,
                        'actual_label': label_list,
                        'predicted_label': pred_list,
                        'confidence': confidence_list})
    model_df['isCorrect'] = model_df['actual_label']==model_df['predicted_label']
    print('Action Item Metrics: ')
    ai_precision,ai_recall,ai_hit_rate = getMetrics(model_df,1)
    print()
    print('Non-Action Item Metrics: ')
    _nai_precision,_nai_recall,nai_hit_rate = getMetrics(model_df,0)

    # F1 is reported for the action-item class only.
    if ai_precision!=0 and ai_recall!=0:
        f1 = 2*(ai_precision*ai_recall)/(ai_precision+ai_recall)
    else:
        f1 = 0.0
    print('F1 score: ', f1)

    return ai_precision,ai_recall,f1,ai_hit_rate,nai_hit_rate

In [4]:
# Run-wide settings: checkpoint name, target device, accumulation factor.
bert_model = 'bert-base-uncased'
device = 'cuda'
gradient_accumulation_steps = 1

# Tokenizer and classifier are both loaded from the same pre-trained checkpoint.
tokenizer = BertTokenizer.from_pretrained(bert_model, do_lower_case=True)
model = BertForActionItemDetection.from_pretrained(bert_model)
model.to(device)

n_gpu = torch.cuda.device_count()
# Multi-GPU wrapping is currently disabled:
# if n_gpu>1:
#     model = torch.nn.DataParallel(model)

In [5]:
# --- Training data: sentences are encoded to a fixed length of 40 tokens. ---
train_dataset = ActionItemDataSet('/home/venkat/hdd/Venkat/action_item_detection/Shubham/training/new_train.csv',40,tokenizer)
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, batch_size=32, sampler=train_sampler)

model.train()

# --- Partial fine-tuning of the encoder. ---
# Embeddings and encoder layers 0-1 stay trainable; layers 2-11 are frozen.
# Parameters matching neither group (e.g. anything outside the encoder stack)
# keep whatever requires_grad value they already had.
# The trailing dot on the trainable layer prefixes keeps "layer.1." from also
# matching layers 10 and 11.
trainable_prefixes = ('embeddings', 'encoder.layer.0.', 'encoder.layer.1.')
frozen_prefixes = tuple('encoder.layer.' + str(layer_idx) for layer_idx in range(2, 12))

for name, param in model.bert.named_parameters():
    if name.startswith(trainable_prefixes):
        param.requires_grad = True
    elif name.startswith(frozen_prefixes):
        param.requires_grad = False

# Show the resulting trainable/frozen flag for every encoder parameter.
for name, param in model.bert.named_parameters():
    print(name,param.requires_grad)

# --- Hyper-parameters. ---
global_step = 0
warmup_steps = 1800
learning_rate = 3e-5
num_train_epochs = 100
gradient_accumulation_steps = 1
train_batch_size = 32 // gradient_accumulation_steps

num_train_optimization_steps = int(
    len(train_dataset) / train_batch_size / gradient_accumulation_steps) * num_train_epochs

# --- Optimizer: weight decay is skipped for biases and LayerNorm parameters. ---
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']

decay_params = []
no_decay_params = []
for param_name, param_tensor in param_optimizer:
    if any(nd in param_name for nd in no_decay):
        no_decay_params.append(param_tensor)
    else:
        decay_params.append(param_tensor)

optimizer_grouped_parameters = [
    {'params': decay_params, 'weight_decay': 0.01},
    {'params': no_decay_params, 'weight_decay': 0.0},
]
optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate, eps=1e-8 )

# The schedule spans 2 * warmup_steps in total: warm-up for the first half,
# linear decay over the second half.
# (alternative for a triangular lr: t_total=num_train_epochs*warmup_steps)
scheduler = WarmupLinearSchedule(optimizer, warmup_steps=warmup_steps, t_total=warmup_steps*2)

embeddings.word_embeddings.weight True
embeddings.position_embeddings.weight True
embeddings.token_type_embeddings.weight True
embeddings.LayerNorm.weight True
embeddings.LayerNorm.bias True
encoder.layer.0.attention.self.query.weight True
encoder.layer.0.attention.self.query.bias True
encoder.layer.0.attention.self.key.weight True
encoder.layer.0.attention.self.key.bias True
encoder.layer.0.attention.self.value.weight True
encoder.layer.0.attention.self.value.bias True
encoder.layer.0.attention.output.dense.weight True
encoder.layer.0.attention.output.dense.bias True
encoder.layer.0.attention.output.LayerNorm.weight True
encoder.layer.0.attention.output.LayerNorm.bias True
encoder.layer.0.intermediate.dense.weight True
encoder.layer.0.intermediate.dense.bias True
encoder.layer.0.output.dense.weight True
encoder.layer.0.output.dense.bias True
encoder.layer.0.output.LayerNorm.weight True
encoder.layer.0.output.LayerNorm.bias True
encoder.layer.1.attention.self.query.weight True
encoder.

In [6]:
infer_steps = 0
# Explicit dim: the softmax is applied to (batch, num_labels) logits, and
# relying on the implicit dimension is deprecated.
sfmax = nn.Softmax(dim=1)

# Training loop: one optimizer/scheduler step per batch, validation every
# 200 steps, and a checkpoint whenever the validation thresholds are met.
# Training stops for good once the scheduler has decayed the lr to zero.
training_finished = False
for epoch in range(num_train_epochs):
    for step, batch in enumerate((train_dataloader)):
        batch = tuple(t.to(device) for t in batch)
        input_ids, input_mask, labels = batch
        loss,scores = model(input_ids, attention_mask = input_mask, labels=labels)
        # NOTE(review): the loss is scaled for gradient accumulation but the
        # optimizer still steps on every batch, which is only equivalent when
        # gradient_accumulation_steps == 1.
        loss = loss / gradient_accumulation_steps

        loss.backward()
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

        current_lr = scheduler.get_lr()[0]
        if current_lr == 0:
            # The schedule is exhausted. Leave BOTH loops: breaking only the
            # inner one would start every remaining epoch just to run a single
            # zero-lr step and break again.
            print(current_lr)
            training_finished = True
            break
        if global_step%200==0:
            print('Step Leanirng rate: ', global_step,'::',scheduler.get_lr())
            model.eval()
            print('current loss: ', loss)
            #validation (getValidationMetrics switches the model back to train mode)
            ai_precision,ai_recall,f1,ai_hit_rate,nai_hit_rate = getValidationMetrics()
            mean_hit_rate = (ai_hit_rate+nai_hit_rate)/2
            if f1>0.9 and mean_hit_rate>0.8 and ai_precision>0.9:
                output_dir = 'models/lr_3e-6_ws_1200'
                # torch.save does not create missing directories.
                if not os.path.isdir(output_dir):
                    os.makedirs(output_dir)
                output_model_file = output_dir+'/bert_ai_classifier_epc_'+str(epoch)+'_step_'+str(global_step)+'_lr_'+str(learning_rate)+'_ws_'+str(warmup_steps)+'_f1'+str(f1)+'.pkl'
                model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self
                torch.save(model_to_save.state_dict(), output_model_file)
                print('Model saved for the epoch: ', epoch,'and step: ', global_step)
            print('----------------------------------------------------------------')
            print('----------------------------------------------------------------')
            print()
        global_step += 1
    if training_finished:
        break

Step Leanirng rate:  0 :: [1.6666666666666667e-08, 1.6666666666666667e-08]
current loss:  tensor(0.6801, device='cuda:0', grad_fn=<DivBackward0>)




Action Item Metrics: 
Precision:  0.46632124352331605
Recall:  0.967741935483871
Missed Predictions:  3
Hit rate:  0.967741935483871

Non-Action Item Metrics: 
Precision:  0.0
Recall:  0.0
Missed Predictions:  103
Hit rate:  0.0
F1 score:  0.6293706293706294
----------------------------------------------------------------
----------------------------------------------------------------

Step Leanirng rate:  200 :: [3.35e-06, 3.35e-06]
current loss:  tensor(0.6293, device='cuda:0', grad_fn=<DivBackward0>)
Action Item Metrics: 
Precision:  0.5058139534883721
Recall:  0.9354838709677419
Missed Predictions:  6
Hit rate:  0.9354838709677419

Non-Action Item Metrics: 
Precision:  0.75
Recall:  0.17475728155339806
Missed Predictions:  85
Hit rate:  0.17475728155339806
F1 score:  0.6566037735849056
----------------------------------------------------------------
----------------------------------------------------------------

Step Leanirng rate:  400 :: [6.6833333333333334e-06, 6.683333333333

Action Item Metrics: 
Precision:  0.9072164948453608
Recall:  0.946236559139785
Missed Predictions:  5
Hit rate:  0.946236559139785

Non-Action Item Metrics: 
Precision:  0.9494949494949495
Recall:  0.912621359223301
Missed Predictions:  9
Hit rate:  0.912621359223301
F1 score:  0.9263157894736843
Model saved for the epoch:  5 and step:  2800
----------------------------------------------------------------
----------------------------------------------------------------

Step Leanirng rate:  3000 :: [9.983333333333333e-06, 9.983333333333333e-06]
current loss:  tensor(0.0006, device='cuda:0', grad_fn=<DivBackward0>)
Action Item Metrics: 
Precision:  0.9072164948453608
Recall:  0.946236559139785
Missed Predictions:  5
Hit rate:  0.946236559139785

Non-Action Item Metrics: 
Precision:  0.9494949494949495
Recall:  0.912621359223301
Missed Predictions:  9
Hit rate:  0.912621359223301
F1 score:  0.9263157894736843
Model saved for the epoch:  5 and step:  3000
--------------------------------