In [1]:
import json
import os

import torch
import torch.nn as nn
import pandas as pd
from transformers import *
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from torch.utils.tensorboard import SummaryWriter
from transformers import BertTokenizer
from tqdm import tqdm_notebook as tqdm

In [2]:
# Hub repository and checkpoint name used to fetch the BERT tokenizer.
GITHUB_REPO = "huggingface/pytorch-pretrained-BERT" # from Huggingface
PRETRAINED_MODEL_NAME = "bert-base-uncased"

# Load BERT tokenizer
# (torch.hub.load is asked for the repo's 'tokenizer' entry with the checkpoint name as its argument)
tokenizer = torch.hub.load(GITHUB_REPO, 'tokenizer', PRETRAINED_MODEL_NAME)

Using cache found in /home/dean/.cache/torch/hub/huggingface_pytorch-pretrained-BERT_master


In [3]:
# Base directory for model checkpoints and logs: the current working directory,
# with 'task1' appended unless the path string already contains 'task1'
# (this is a substring test on the whole path, not a check of the last component).
CWD = os.getcwd()
if 'task1' not in CWD:
    CWD = os.path.join(CWD, 'task1')

In [4]:
# Load the Task 1 training table; dtype=str makes pandas read every column as text.
# The path is relative to the process working directory (CWD above is not used here).
df_train = pd.read_csv('data/task1_trainset.csv', dtype=str)

In [5]:
# Preview the first rows to check which columns were loaded.
df_train.head()

Unnamed: 0,Id,Title,Abstract,Authors,Categories,Created Date,Task 1
0,D00001,A Brain-Inspired Trust Management Model to Ass...,Rapid popularity of Internet of Things (IoT) a...,Mahmud/Kaiser/Rahman/Rahman/Shabut/Al-Mamun/Hu...,cs.CR/cs.AI/q-bio.NC,2018-01-11,BACKGROUND OBJECTIVES METHODS METHODS RESULTS ...
1,D00002,On Efficient Computation of Shortest Dubins Pa...,"In this paper, we address the problem of compu...",Sadeghi/Smith,cs.SY/cs.RO/math.OC,2016-09-21,OBJECTIVES OTHERS METHODS/RESULTS RESULTS RESULTS
2,D00003,Data-driven Upsampling of Point Clouds,High quality upsampling of sparse 3D point clo...,Zhang/Jiang/Yang/Yamakawa/Shimada/Kara,cs.CV,2018-07-07,BACKGROUND OBJECTIVES METHODS METHODS METHODS ...
3,D00004,Accessibility or Usability of InteractSE? A He...,Internet is the main source of information now...,Aqle/Khowaja/Al-Thani,cs.HC,2018-08-29,BACKGROUND BACKGROUND BACKGROUND OBJECTIVES OB...
4,D00005,Spatio-Temporal Facial Expression Recognition ...,Automated Facial Expression Recognition (FER) ...,Hasani/Mahoor,cs.CV,2017-03-20,BACKGROUND BACKGROUND BACKGROUND BACKGROUND ME...


In [6]:
### Remove the columns that are (currently) not used by this notebook.
# The frame is modified in place, so re-running this cell without reloading
# df_train raises a KeyError because the columns are already gone.
df_train.drop(
    columns=['Id', 'Title', 'Categories', 'Created Date', 'Authors'],
    inplace=True,
)
# Lower-casing of the abstracts is left disabled:
#df_train['Abstract'] = df_train['Abstract'].str.lower()

In [7]:
# Hold out 10% of the rows for validation; the fixed random_state makes the split repeatable.
trainset, validset = train_test_split(df_train, test_size=0.1, random_state=42)

In [8]:
def label_to_onehot(raw_labels):
    """Encode the label string of one sentence as a 6-slot multi-hot list.

    Args:
        raw_labels (str): one or more class names joined by '/',
            e.g. 'METHODS/RESULTS'.
    Returns:
        list[int]: six 0/1 flags in the order BACKGROUND, OBJECTIVES,
            METHODS, RESULTS, CONCLUSIONS, OTHERS.
    Raises:
        KeyError: if a name in `raw_labels` is not one of the six classes.
    """
    class_names = ('BACKGROUND', 'OBJECTIVES', 'METHODS', 'RESULTS', 'CONCLUSIONS', 'OTHERS')
    slot_of = {name: slot for slot, name in enumerate(class_names)}

    # Resolve every name first so an unknown label fails with KeyError.
    active_slots = {slot_of[name] for name in raw_labels.split('/')}
    return [1 if slot in active_slots else 0 for slot in range(len(class_names))]


def convert_to_bert_indices(sentence, max_length=512):
    """Tokenize one sentence into BERT input ids plus matching segment ids.

    Uses the module-level `tokenizer`. The word pieces are wrapped as
    ``[CLS] ... [SEP]`` before being mapped to vocabulary ids.

    Args:
        sentence (str): raw sentence text.
        max_length (int | None): upper bound on the returned sequence length,
            including [CLS] and [SEP]. The default of 512 matches
            `max_position_embeddings` of bert-base-uncased; longer inputs
            cannot be embedded by the model. Pass None to disable truncation.
    Returns:
        tuple[list[int], list[int]]: (token ids, segment ids). The segment ids
            are all 0 because each input is a single sentence; both lists have
            the same length.
    """
    tokenized_sent = tokenizer.tokenize(sentence)
    if max_length is not None:
        # Keep two positions free for the [CLS] and [SEP] markers.
        tokenized_sent = tokenized_sent[:max(max_length - 2, 0)]

    word_pieces = ["[CLS]"] + tokenized_sent + ["[SEP]"]
    word_pieces_ids = tokenizer.convert_tokens_to_ids(word_pieces)

    # Built from the id list itself, so the two lengths always agree.
    segment_ids = [0] * len(word_pieces_ids)
    return (word_pieces_ids, segment_ids)


def preprocess_sample(data):
    """Turn one raw row into model-ready token ids and (optionally) labels.

    Args:
        data: mapping-like row with an 'Abstract' string whose sentences are
            separated by '$$$', and optionally a 'Task 1' string holding one
            space-separated label group per sentence.
    Returns:
        dict: 'Abstract' -> list of (token ids, segment ids) per sentence;
            'Label' -> list of multi-hot label lists, present only when the
            row has a 'Task 1' entry.
    """
    # '$$$' is the sentence delimiter inside an abstract.
    sentences = data['Abstract'].split('$$$')
    result = {'Abstract': [convert_to_bert_indices(text) for text in sentences]}

    # Rows without 'Task 1' simply carry no labels.
    if 'Task 1' in data:
        label_groups = data['Task 1'].split(' ')
        result['Label'] = [label_to_onehot(group) for group in label_groups]

    return result


def preprocess_samples(dataset):
    """Preprocess every row of a table.

    Args:
        dataset: object exposing `iterrows()` that yields (index, row) pairs
            (a pandas DataFrame in this notebook).
    Returns:
        list of dict: one `preprocess_sample` result per row, in row order.
    """
    return [preprocess_sample(row) for _, row in dataset.iterrows()]


def get_dataset(raw_dataset):
    """Build the processed sample list used for training or validation.

    Args:
        raw_dataset: table of raw rows, forwarded unchanged to
            `preprocess_samples`.
    Returns:
        list of dict: the processed samples.
    """
    return preprocess_samples(raw_dataset)

In [9]:
# Tokenize both splits once, up front; `train` and `valid` are lists of per-abstract dicts.
print('[INFO] Start processing trainset...')
train = get_dataset(trainset)
print('[INFO] Start processing validset...')
valid = get_dataset(validset)

[INFO] Start processing trainset...
[INFO] Start processing validset...


In [10]:
class AbstractDataset(Dataset):
    """Map-style dataset that serves items straight from an in-memory sequence."""

    def __init__(self, data):
        # Keep a reference to the sequence; nothing is copied.
        self.data = data

    def __len__(self):
        """Number of stored samples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the sample stored at position `idx`."""
        return self.data[idx]

In [11]:
# Wrap the preprocessed lists so a DataLoader can index them.
trainData = AbstractDataset(train)
validData = AbstractDataset(valid)

In [12]:
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence    

def create_mini_batch(samples):
    """Collate preprocessed abstracts into padded tensors (DataLoader collate_fn).

    Every sentence is right-padded with 0 to the longest sentence in the batch,
    and every abstract is padded to the largest sentence count in the batch
    with filler sentences consisting of ``[CLS] [SEP]`` plus padding.

    Args:
        samples (list of dict): items with 'Abstract' -> list of
            (token ids, segment ids) pairs and optionally 'Label' -> list of
            6-element label lists. The samples are not modified.
    Returns:
        tuple:
            batch_abstract (LongTensor): token ids, shape (batch, max_sent, max_len).
            batch_segment (LongTensor): segment ids, same shape.
            masks_tensors (FloatTensor): 1 where the token id is non-zero, else 0.
            batch_label (FloatTensor): labels, shape (batch, max_sent, 6);
                empty when no sample carries 'Label'.
            sent_len (list of int): real (unpadded) sentence count per abstract.
    """
    # Largest number of sentences in any abstract of this batch.
    max_sent = max(len(sample['Abstract']) for sample in samples)
    # Length of the longest single sentence in this batch.
    max_len = max(len(sentence[0]) for sample in samples for sentence in sample['Abstract'])

    # 101 / 102 are the [CLS] / [SEP] ids of the bert-base-uncased vocabulary.
    filler_sentence = [101] + [102] + [0] * (max_len - 2)
    filler_segment = [0] * max_len

    batch_abstract = []
    batch_segment = []
    batch_label = []
    sent_len = []

    for sample in samples:
        # No sentence exceeds max_len, so right-padding is all that is needed.
        pad_abstract = [ids + [0] * (max_len - len(ids)) for ids, _ in sample['Abstract']]
        pad_segment = [seg + [0] * (max_len - len(seg)) for _, seg in sample['Abstract']]

        n_sentences = len(pad_abstract)
        sent_len.append(n_sentences)

        missing = max_sent - n_sentences
        pad_abstract.extend([filler_sentence] * missing)
        pad_segment.extend([filler_segment] * missing)
        batch_abstract.append(pad_abstract)
        batch_segment.append(pad_segment)

        # gather labels
        if 'Label' in sample:
            # Copy first: extending sample['Label'] itself would permanently grow
            # the dataset entry and yield ragged labels in a later, shorter batch.
            pad_label = list(sample['Label'])
            pad_label.extend([[0] * 6] * (max_sent - len(pad_label)))
            batch_label.append(pad_label)

    batch_abstract = torch.LongTensor(batch_abstract)
    batch_segment = torch.LongTensor(batch_segment)
    batch_label = torch.FloatTensor(batch_label)

    # Attention mask: attend to every non-padding position.
    masks_tensors = (batch_abstract != 0).float()

    return batch_abstract, batch_segment, masks_tensors, batch_label, sent_len

In [13]:
# Training hyperparameters and output naming.
batch_size = 2  # abstracts per batch; each abstract expands to several sentences inside the model
NUM_LABELS = 6  # not referenced by the visible cells, which hard-code 6
learning_rate = 2e-5
path_name = 'bert'  # sub-directory under CWD/model used for checkpoints and TensorBoard logs
max_epoch = 5

In [17]:
#from transformers import BertTokenizer, BertModel, BertConfig, BertPreTrainedModel
#BertConfig.from_pretrained('bert-base-uncased')

{
  "attention_probs_dropout_prob": 0.1,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_labels": 2,
  "output_attentions": false,
  "output_hidden_states": false,
  "output_past": true,
  "pruned_heads": {},
  "torchscript": false,
  "type_vocab_size": 2,
  "use_bfloat16": false,
  "vocab_size": 30522
}

In [None]:
# Disabled earlier variant of the classifier, kept as a bare string literal so it is never defined.
'''
class BertForSequenceClassification(BertPreTrainedModel):
    def __init__(self, config):
        super(BertForSequenceClassification, self).__init__(config)
        self.hidden_size = config.hidden_size
        self.bert = BertModel(config)  # load pre-trained BERT
        #self.dropout = nn.Dropout(config.hidden_dropout_prob)
        # add a linear layer
        self.classifier = nn.Linear(config.hidden_size, 6)
        self.sigmoid = torch.nn.Sigmoid()
        

    def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None, position_ids=None):
        b, s, w = input_ids.shape
        input_ids = input_ids.view(b*s, w)
        attention_mask = attention_mask.view(b*s, w)
        token_type_ids = token_type_ids.view(b*s, w)

        outputs = self.bert(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids)
        sequence_outputs = outputs[0]
        #pooled_output = self.dropout(outputs)
        logits = self.sigmoid(self.classifier(sequence_outputs))
        #print(logits[0].shape)

        return logits
'''

In [14]:
class BertForSequenceClassification(BertPreTrainedModel):
    """Per-sentence multi-label classifier on top of BERT.

    Each sentence of an abstract is encoded independently by BERT, pooled to a
    single vector by a mask-weighted mean over its tokens, and passed through a
    hidden linear layer and a 6-way sigmoid output layer.
    """

    def __init__(self, config):
        super(BertForSequenceClassification, self).__init__(config)
        self.hidden_size = config.hidden_size
        self.hidden_dim = 512
        # Builds the architecture only; pretrained weights are present only when
        # the model is created through `from_pretrained`.
        self.bert = BertModel(config)
        self.l1 = nn.Linear(self.hidden_size, self.hidden_dim)
        self.classifier = nn.Linear(self.hidden_dim, 6)
        self.sigmoid = torch.nn.Sigmoid()

    def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None, position_ids=None):
        """Score every sentence of every abstract in the batch.

        Args:
            input_ids: token ids of shape (batch, sentences, tokens).
            token_type_ids: optional segment ids, same shape as `input_ids`.
            attention_mask: optional mask, same shape as `input_ids`, non-zero
                on real tokens. When omitted, all positions count as real.
            labels: accepted for interface compatibility; not used.
            position_ids: forwarded to BERT unchanged.
        Returns:
            Tensor of shape (batch, sentences, 6) with sigmoid probabilities.
        """
        b, s, w = input_ids.shape
        # Fold the sentence axis into the batch axis so BERT sees one sentence per row.
        input_ids = input_ids.reshape(b * s, w)
        if attention_mask is not None:
            attention_mask = attention_mask.reshape(b * s, w)
        if token_type_ids is not None:
            token_type_ids = token_type_ids.reshape(b * s, w)

        outputs = self.bert(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids)
        token_states = outputs[0]  # per-token hidden states, (b*s, w, hidden_size)

        if attention_mask is None:
            sentence_vectors = token_states.mean(dim=1)
        else:
            # Average over real tokens only; a plain mean would mix in the
            # hidden states of padding positions, making the result depend on
            # how much padding the batch happens to need.
            weights = attention_mask.unsqueeze(-1).to(token_states.dtype)
            token_count = weights.sum(dim=1).clamp(min=1.0)
            sentence_vectors = (token_states * weights).sum(dim=1) / token_count

        sentence_vectors = sentence_vectors.view(b, s, self.hidden_size)
        x = torch.relu(self.l1(sentence_vectors))
        logits = self.sigmoid(self.classifier(x))

        return logits

In [15]:
### Helper functions for scoring

class F1():
    """Running F1 score over thresholded multi-label predictions.

    Counts are pooled over every label slot passed to `update`, so the score
    is micro-averaged across all sentences and classes seen since the last
    `reset`.
    """

    def __init__(self):
        self.threshold = 0.5  # probabilities strictly above this count as positive
        self.name = 'F1'
        self.reset()

    def reset(self):
        """Clear all accumulated counts."""
        self.n_precision = 0  # number of predicted positives
        self.n_recall = 0     # number of ground-truth positives
        self.n_corrects = 0   # number of true positives

    def update(self, predicts, groundTruth):
        """Accumulate counts from one batch.

        Args:
            predicts: tensor of predicted probabilities.
            groundTruth: tensor of 0/1 targets with the same shape.
        """
        predicts = predicts > self.threshold
        self.n_precision += torch.sum(predicts).item()
        self.n_recall += torch.sum(groundTruth).item()
        self.n_corrects += torch.sum(groundTruth.type(torch.bool) & predicts).item()

    def get_score(self):
        """Return the F1 score; 0.0 when nothing has been counted yet."""
        # The tiny epsilon guards both ratios against a zero denominator
        # (previously only precision was guarded, so a metric without any
        # positive ground truth raised ZeroDivisionError).
        recall = self.n_corrects / (self.n_recall + 1e-20)
        precision = self.n_corrects / (self.n_precision + 1e-20)
        return 2 * (recall * precision) / (recall + precision + 1e-20)

    def print_score(self):
        """Return the score formatted with five decimals."""
        score = self.get_score()
        return '{:.5f}'.format(score)

In [16]:
def _run_epoch(epoch, mode):
    """Run one pass over the training or validation data and log the results.

    Relies on module-level state: `model`, `opt`, `trainData`, `validData`,
    `batch_size`, `history` and `writer`.

    Args:
        epoch (int): epoch index, used as the TensorBoard step.
        mode (str): "train" to optimise on the training set; any other value
            evaluates on the validation set.
    """
    is_train = mode == "train"
    # Dropout must only be active while training.
    model.train(is_train)

    if is_train:
        description, dataset, split = 'Train', trainData, 'train'
    else:
        description, dataset, split = 'Valid', validData, 'valid'

    dataloader = DataLoader(dataset=dataset,
                            batch_size=batch_size,
                            shuffle=is_train,
                            collate_fn=create_mini_batch,
                            num_workers=8)

    trange = tqdm(enumerate(dataloader), total=len(dataloader), desc=description)
    loss = 0
    f1_score = F1()

    # No autograd graph is needed for validation; skipping it saves memory.
    with torch.set_grad_enabled(is_train):
        for i, (token_tensors, segments_tensors, masks_tensors, label_ids, sent_len) in trange:
            o_labels, batch_losses = _run_iter(token_tensors, segments_tensors, masks_tensors, label_ids, sent_len)

            # One backward pass over the summed per-abstract losses yields the
            # same gradients as separate backward calls, without having to
            # retain the graph between them.
            batch_loss = torch.stack(batch_losses).sum()
            if is_train:
                opt.zero_grad()
                batch_loss.backward()
                opt.step()
            loss += batch_loss.item()

            # Score only the real sentences of each abstract, not the padding.
            predictions = o_labels.detach().cpu()
            for j, n_sent in enumerate(sent_len):
                f1_score.update(predictions[j][:n_sent], label_ids[j][:n_sent])

            trange.set_postfix(loss=loss / (i + 1), f1=f1_score.print_score())
            torch.cuda.empty_cache()

    epoch_loss = loss / len(dataloader)
    epoch_f1 = f1_score.get_score()
    history[split].append({'f1': epoch_f1, 'loss': epoch_loss})
    writer.add_scalar('Loss/' + split, epoch_loss, epoch)
    writer.add_scalar('F1_score/' + split, epoch_f1, epoch)
    trange.close()
    

def _run_iter(tokens_tensors, segments_tensors ,masks_tensors, label_ids, sent_len):
    """Forward one batch through the model and compute a loss per abstract.

    Uses the module-level `model`, `device` and `criteria`.

    Args:
        tokens_tensors, segments_tensors, masks_tensors: batch inputs as
            produced by `create_mini_batch`; moved to `device` here.
        label_ids: target labels for the batch (left on their current device).
        sent_len (list of int): number of real sentences per abstract.
    Returns:
        tuple: (model output for the whole batch, list with one loss per
            abstract computed over its first `sent_len[i]` sentences only).
    """
    o_labels = model(
        input_ids=tokens_tensors.to(device),
        token_type_ids=segments_tensors.to(device),
        attention_mask=masks_tensors.to(device),
    )

    # Predictions are brought to the CPU because that is where the labels live.
    l_losses = [
        criteria(o_labels[idx][:n_real].cpu(), label_ids[idx][:n_real])
        for idx, n_real in enumerate(sent_len)
    ]

    return o_labels, l_losses

def save(epoch, path_name):
    """Write the model weights for `epoch` and the running history to disk.

    Files go to ``CWD/model/<path_name>/``: the state dict as
    ``model.pkl.<epoch>`` and the module-level `history` as ``history.json``
    (overwritten on every call).

    Args:
        epoch (int): epoch index used as the checkpoint file suffix.
        path_name (str): sub-directory name under ``CWD/model``.
    """
    path = os.path.join(CWD, 'model', path_name)
    # exist_ok avoids the check-then-create race and creates missing parents.
    os.makedirs(path, exist_ok=True)
    torch.save(model.state_dict(), os.path.join(path, 'model.pkl.' + str(epoch)))
    with open(os.path.join(path, 'history.json'), 'w') as f:
        json.dump(history, f, indent=4)

In [17]:
# Prefer the GPU when PyTorch reports CUDA as available; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [18]:
# Start from the pretrained checkpoint. Constructing the class from a bare
# config only builds the architecture and leaves BERT randomly initialised.
model = BertForSequenceClassification.from_pretrained(PRETRAINED_MODEL_NAME)
model.to(device)

opt = torch.optim.Adam(model.parameters(), lr=learning_rate)
criteria = torch.nn.BCELoss()
history = {'train':[],'valid':[]}

## Tensorboard
## Logs are written next to the checkpoints, in CWD/model/<path_name>.
tf_path = os.path.join(CWD, 'model', path_name)
# makedirs also creates the parent 'model' directory when it does not exist yet.
os.makedirs(tf_path, exist_ok=True)
writer = SummaryWriter(tf_path)

for epoch in range(max_epoch):
    print('Epoch: {}'.format(epoch))
    _run_epoch(epoch, 'train')
    _run_epoch(epoch, 'valid')
    save(epoch, path_name)

# Flush pending TensorBoard events to disk.
writer.close()

# Plot the training results
# NOTE(review): `plt` is not imported anywhere in the visible notebook; add
# `import matplotlib.pyplot as plt` to the imports cell before running this part.
with open(os.path.join(tf_path, 'history.json'), 'r') as f:
    history = json.load(f)

train_loss = [l['loss'] for l in history['train']]
valid_loss = [l['loss'] for l in history['valid']]
train_f1 = [l['f1'] for l in history['train']]
valid_f1 = [l['f1'] for l in history['valid']]

plt.figure(figsize=(7,5))
plt.title('Loss')
plt.plot(train_loss, label='train')
plt.plot(valid_loss, label='valid')
plt.legend()
plt.show()

plt.figure(figsize=(7,5))
plt.title('F1 Score')
plt.plot(train_f1, label='train')
plt.plot(valid_f1, label='valid')
plt.legend()
plt.show()

# Prints [best validation F1, index of the epoch that reached it].
print('Best F1 score ', max([[l['f1'], idx] for idx, l in enumerate(history['valid'])]))

Epoch: 0


HBox(children=(IntProgress(value=0, description='Train', max=3150, style=ProgressStyle(description_width='init…

RuntimeError: CUDA out of memory. Tried to allocate 42.00 MiB (GPU 0; 7.93 GiB total capacity; 6.85 GiB already allocated; 17.62 MiB free; 521.40 MiB cached)