In [1]:
%env CUDA_DEVICE_ORDER=PCI_BUS_ID
%env CUDA_VISIBLE_DEVICES=1


import datetime
import os

import os
import pandas as pd
import numpy as np
import torch
from pytorch_transformers import BertTokenizer,BertConfig, BertModel, BertForSequenceClassification, AdamW, WarmupLinearSchedule
from tqdm import trange
import logging

from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split





train_file = 'omt_train_tab.csv'
test_file = 'omt_test_tab.csv'
output_dir = 'bert_for_seq_out'
cache_dir = os.path.join(output_dir, 'cache')
task_name = 'omt-classification'
try:
    os.mkdir(output_dir)
    print('Directory', output_dir, ' created')
except FileExistsError:
        print('Directory already exists')
try:
    os.mkdir(cache_dir)
    print('Directory', output_dir, ' created')
except FileExistsError:
        print('Directory already exists')
        

from sklearn.metrics import f1_score, accuracy_score

def acc_and_f1(preds, labels):
    assert len(preds) == len(labels)
    acc = accuracy_score(y_true=labels, y_pred=preds)
    f1 = f1_score(y_true = labels, y_pred = preds, average='micro')
    return{
        'acc': acc,
        'f1': f1,
    }



def tokenize_padd_trunc(tokenizer, corpus, max_len = 32, num_sentences = 1):
    truncate_or_pad = max_len -2
    doclist=[]
    prefix = '[CLS]'
    suffix = '[SEP]'
    if num_sentences == 1:
        i = 0
        for doc in corpus:           
            new_list = []
            new_list.append(prefix)
            doc = tokenizer.tokenize(doc)
            if len(doc)> truncate_or_pad:
                doc = doc[:truncate_or_pad]
            for tok in doc:
                new_list.append(tok)
            new_list.append(suffix)
            doclist.append(new_list)
    if num_sentences == 2:
        return doclist
        #TO DO: Handle the two inputs
    if num_sentences != 1 and num_sentences !=2:
        print('Choose num_sentences between one and two please')
    return doclist

data = pd.read_csv(filepath_or_buffer=train_file, sep='\t', header=None)

MAX_LEN = 64 



basic_tokenizer = BertTokenizer.from_pretrained('bert-base-german-cased', do_lower_case=False)
tokenized_texts = tokenize_padd_trunc(basic_tokenizer, data[0], max_len= MAX_LEN)

labels = data[1].tolist()
labels_vals = list(set(data[1].values))
num_labels = len(labels_vals)
label_map = {label: i for i, label in enumerate(labels_vals)}


batchsize=90
epochs = 5
train_examples = len(tokenized_texts)
num_train_optimization_steps = train_examples // batchsize

#from run_glue.py

learning_rate = 5e-5
adam_epsilon = 1e-8

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
save_steps = 50
logging_steps = 50
max_steps = -1









#data_train, data_dev = train_test_split(data, test_size = 0.1)

env: CUDA_DEVICE_ORDER=PCI_BUS_ID
env: CUDA_VISIBLE_DEVICES=1


Using TensorFlow backend.


Directory already exists
Directory already exists


In [2]:


logging.basicConfig(level=logging.INFO)



input_ids = pad_sequences([basic_tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts],
                         maxlen=MAX_LEN, value = 0, dtype= 'long', truncating='post')
label_ids = [label_map[l] for l in labels]
attention_masks = [[float(i>0) for i in ii] for ii in input_ids]
token_type_ids = [[0 for i in ii] for ii in input_ids]

In [3]:
print(num_labels)

4


In [4]:
from sklearn.model_selection import train_test_split
tr_inputs, val_inputs, tr_label, val_label = train_test_split(input_ids, label_ids, test_size = 0.1, random_state = 2019)
tr_masks, val_masks, _, _ = train_test_split(attention_masks, input_ids, random_state = 2019, test_size = 0.1)
tr_token_type_ids, val_token_type_ids, _, _ = train_test_split(token_type_ids, input_ids, random_state = 2019, test_size = 0.1)

In [5]:
nichtnull = 0
docs = 0
for ii in token_type_ids:
    if(len(ii) == MAX_LEN):
        docs = docs + 1
    for i in ii:
        if i == 0:
            nichtnull = nichtnull+1
print(docs, nichtnull)
 
if (nichtnull / MAX_LEN) == docs:
    print('Max len worked fine, all tokentype id = 0')
        

146579 9381056
Max len worked fine, all tokentype id = 0


In [6]:
tr_inputs_tensor = torch.tensor(tr_inputs, dtype=torch.long)
val_inputs_tensor = torch.tensor(val_inputs, dtype=torch.long)
tr_label_tensor = torch.tensor(tr_label, dtype=torch.long)
val_label_tensor = torch.tensor(val_label, dtype=torch.long)
tr_masks_tensor = torch.tensor(tr_masks, dtype=torch.long)
val_masks_tensor = torch.tensor(val_masks, dtype=torch.long)
tr_token_type_tensor = torch.tensor(tr_token_type_ids, dtype=torch.long)
val_token_type_tensor = torch.tensor(val_token_type_ids, dtype=torch.long)

In [7]:
train_data,train_sampler, train_dataloader = [],[],[]
train_data = TensorDataset(tr_inputs_tensor, tr_masks_tensor, tr_token_type_tensor, tr_label_tensor)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size = batchsize)

valid_data, valid_sampler, valid_dataloader = [],[],[]
valid_data = TensorDataset(val_inputs_tensor, val_masks_tensor, val_token_type_tensor, val_label_tensor)
valid_sampler = SequentialSampler(valid_data)
valid_dataloader = DataLoader(valid_data, sampler= valid_sampler, batch_size=batchsize)

In [8]:
model = []
model = BertForSequenceClassification.from_pretrained('bert-base-german-cased', cache_dir = cache_dir, num_labels = num_labels)
model.cuda()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): BertLayerNorm()
      (dropout): Dropout(p=0.1)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): BertLayerNorm()
              (dropout): Dropout(p=0.1)
            )
          )
          (intermediat

In [9]:
FULL_FINETUNING = True
if FULL_FINETUNING:
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)]}
    ]
else:
    param_optimizer = list(model.classifier.named_parameters())
    optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]
optimizer = AdamW(optimizer_grouped_parameters, lr = learning_rate, eps = 1e-8)

In [10]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
epochs = 8

for _ in trange(epochs, desc = 'Epoch'):
    #train loop
    model.zero_grad()
    tr_loss = 0
    nb_tr_steps = 0
    for step, batch in enumerate(train_dataloader):
        model.train()
        batch = tuple(t.to(device) for t in batch)
        inputs = {'input_ids':     batch[0],
                 'attention_mask': batch[1],
                 'token_type_ids': batch[2],
                 'labels':         batch[3]}
        outputs = model(**inputs)
        loss = outputs[0]    # since "pytorch-transformer" : all outputs are tuple
        
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm= 1)
        tr_loss += loss.item()
        nb_tr_steps += 1
        
        #update params
        
        optimizer.step()
        model.zero_grad()
    
    #print loss per epoch
    print('Train loss: {}'.format(tr_loss/nb_tr_steps))
    
    
    #eval
    
    results = {}
    eval_loss = 0.0
    nb_eval_steps= 0
    nb_eval_examples = 0
    preds, out_label_ids = None,None
    
    for batch in valid_dataloader:
        model.eval()
        batch = tuple(t.to(device) for t in batch)
        
        with torch.no_grad():
            inputs = {'input_ids':      batch[0],
                      'attention_mask': batch[1],
                      'token_type_ids': batch[2],
                      'labels':         batch[3]}
            outputs = model(**inputs)
            tmp_eval_loss, logits = outputs[:2]
            
            eval_loss += tmp_eval_loss.mean().item()
        nb_eval_steps +=1
        if preds is None:
            preds = logits.detach().cpu().numpy()
            out_label_ids = inputs['labels'].detach().cpu().numpy()
        else:
            preds = np.append(preds, logits.detach().cpu().numpy(), axis = 0)
            out_label_ids = np.append(out_label_ids, inputs['labels'].detach().cpu().numpy(), axis = 0)
    
    eval_loss = eval_loss / nb_eval_steps
    preds = np.argmax(preds, axis=1)
    
    result = acc_and_f1(preds, out_label_ids)
    results.update(result)
    
    print("Validation loss: {}".format(eval_loss))
    print("Validation accuracy: {}".format(results['acc']))
    print("F1- Score: {}".format(results['f1']))
    
    

Epoch:   0%|          | 0/8 [00:00<?, ?it/s]

Train loss: 0.4684773057516242


Epoch:  12%|█▎        | 1/8 [23:59<2:47:57, 1439.63s/it]

Validation loss: 0.41748140834591874
Validation accuracy: 0.8490244235229909
F1- Score: 0.8490244235229909
Train loss: 0.3656677764828989


Epoch:  25%|██▌       | 2/8 [48:02<2:24:03, 1440.57s/it]

Validation loss: 0.4216595947559626
Validation accuracy: 0.8446582071223905
F1- Score: 0.8446582071223905
Train loss: 0.28262221003091026


Epoch:  38%|███▊      | 3/8 [1:12:06<2:00:08, 1441.60s/it]

Validation loss: 0.44905696297350106
Validation accuracy: 0.8458179833538
F1- Score: 0.8458179833538001
Train loss: 0.2001722648201096


Epoch:  50%|█████     | 4/8 [1:36:10<1:36:09, 1442.48s/it]

Validation loss: 0.5139184189171879
Validation accuracy: 0.8428162095783872
F1- Score: 0.8428162095783872
Train loss: 0.14544192813133558


Epoch:  62%|██████▎   | 5/8 [2:00:16<1:12:10, 1443.55s/it]

Validation loss: 0.5853581348079845
Validation accuracy: 0.8260335652885796
F1- Score: 0.8260335652885796
Train loss: 0.11243087691820314


Epoch:  75%|███████▌  | 6/8 [2:24:23<48:08, 1444.38s/it]  

Validation loss: 0.6742031523055094
Validation accuracy: 0.8347659980897804
F1- Score: 0.8347659980897804
Train loss: 0.09239296659244044


Epoch:  88%|████████▊ | 7/8 [2:48:25<24:03, 1443.86s/it]

Validation loss: 0.6987789262291844
Validation accuracy: 0.8292400054577705
F1- Score: 0.8292400054577705
Train loss: 0.07875737926533946


Epoch: 100%|██████████| 8/8 [3:12:25<00:00, 1442.67s/it]

Validation loss: 0.7664080061064176
Validation accuracy: 0.8271933415199891
F1- Score: 0.827193341519989





In [11]:
#output_model_file = os.path.join(output_dir, "pytorch_model.bin")
#torch.save(model.bert.state_dict(), output_model_file)

In [12]:
#model.bert.load_state_dict(torch.load(os.path.join(output_dir, "pytorch_model.bin")))

# Test the model

In [13]:
data_test = pd.read_csv(filepath_or_buffer=test_file, sep = '\t', header = None)
tokenized_texts_test = tokenize_padd_trunc(basic_tokenizer, data_test[0], max_len= MAX_LEN)
labels_test = data_test[1].tolist()

input_ids_test = pad_sequences([basic_tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts_test],
                         maxlen=MAX_LEN, value = 0, dtype= 'long', truncating='post')
label_ids_test = [label_map[l] for l in labels_test]
attention_masks_test = [[float(i>0) for i in ii] for ii in input_ids_test]
token_type_ids_test = [[0 for i in ii] for ii in input_ids_test]

test_inputs_tensor = torch.tensor(input_ids_test, dtype=torch.long)
test_label_tensor = torch.tensor(label_ids_test, dtype=torch.long)
test_masks_tensor = torch.tensor(attention_masks_test, dtype=torch.long)
test_token_type_tensor = torch.tensor(token_type_ids_test, dtype=torch.long)

test_data, test_sampler, test_dataloader = [],[],[]
test_data = TensorDataset(test_inputs_tensor, test_masks_tensor, test_token_type_tensor, test_label_tensor)
test_sampler = RandomSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size = 64)


In [14]:
## test the model with unseen test data
model.eval()
test_results = {}
test_loss = 0.0
nb_test_steps= 0
nb_test_examples = 0
preds, out_label_ids = None,None
   
for batch in test_dataloader:
    model.eval()
    batch = tuple(t.to(device) for t in batch)
    
    with torch.no_grad():
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'token_type_ids': batch[2],
                  'labels':         batch[3]}
        outputs = model(**inputs)
        tmp_test_loss, logits = outputs[:2]
            
        test_loss += tmp_test_loss.mean().item()
    nb_test_steps +=1
    if preds is None:
        preds = logits.detach().cpu().numpy()
        out_label_ids = inputs['labels'].detach().cpu().numpy()
    else:
        preds = np.append(preds, logits.detach().cpu().numpy(), axis = 0)
        out_label_ids = np.append(out_label_ids, inputs['labels'].detach().cpu().numpy(), axis = 0)
    
test_loss = test_loss / nb_test_steps
preds = np.argmax(preds, axis=1)
    
result = acc_and_f1(preds, out_label_ids)
test_results.update(result)
    
print("Validation loss: {}".format(test_loss))
print("Validation accuracy: {}".format(test_results['acc']))
print("F1- Score: {}".format(test_results['f1']))
    

Validation loss: 0.7837988262628101
Validation accuracy: 0.8231773320598536
F1- Score: 0.8231773320598536
