In [1]:
import logging
logging.basicConfig(level=logging.ERROR)

In [2]:
!pip install transformers

You should consider upgrading via the '/opt/conda/bin/python3.7 -m pip install --upgrade pip' command.[0m


In [2]:
import transformers
# Display the installed transformers release so the run can be reproduced later.
transformers.__version__

'2.11.0'

In [3]:
from transformers import *

In [4]:
import torch

# Choose the compute device once; later cells move tensors with `.to(device)`.
if not torch.cuda.is_available():
    # CPU fallback.
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    # At least one CUDA device is present: use it and report what was found.
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

There are 1 GPU(s) available.
We will use the GPU: GeForce RTX 2070


## Data Preparation

In [5]:
import pandas as pd

# NOTE(review): header_list is not referenced by any other cell visible in this notebook.
header_list = ['sentence', 'class']
# Read the labelled sentences; header=[0] takes the column names from the first row of the file.
df = pd.read_csv("./data/raw_1709.csv",  header = [0])
# Preview the first rows.
df.head()

Unnamed: 0,sentence,class
0,(1) Cash and cash equivalents,7
1,(1) Cash and cash equivalents,7
2,(2) Financial assets and liabilities at fair v...,0
3,(3) Foreign currency translation Items includ...,8
4,(4) Financial assets measured at cost A.,7


In [6]:
# Confirm the column names that were read from the CSV header.
df.columns

Index(['sentence', 'class'], dtype='object')

In [7]:
# For a DataFrame, keys() reports the column Index (same information as df.columns).
df.keys()

Index(['sentence', 'class'], dtype='object')

In [8]:
# Down-sample class 0 to 1000 rows (fixed seed for repeatability), then recombine
# the sample with every row that carries a non-zero class.
df_notopic = df[df['class'] == 0]
df_notopic_sample = df_notopic.sample(n=1000, random_state=1111)
df_topic = df[df['class'] != 0]
# pd.concat replaces DataFrame.append, which is deprecated and was removed in pandas 2.0.
# ignore_index=False keeps the original row labels, exactly as the append call did.
df1 = pd.concat([df_topic, df_notopic_sample], ignore_index=False, sort=False)

In [9]:
# Inspect the rebalanced frame: the non-zero-class rows followed by the sampled class-0 rows.
df1

Unnamed: 0,sentence,class
0,(1) Cash and cash equivalents,7
1,(1) Cash and cash equivalents,7
3,(3) Foreign currency translation Items includ...,8
4,(4) Financial assets measured at cost A.,7
5,(4) Foreign currency translation currency of ...,8
...,...,...
353,An impairment loss is recognized for the amoun...,0
33020,"Depreciation of property, plant and equipment...",0
38018,i) Fair value hierarchy and measurement method,0
39888,Deferred tax is provided using the liability ...,0


In [10]:
def preprocess_data(df, train_size = 0.80, random_state = 1):
    """Rename the label column and split the frame into train and test parts.

    Args:
        df: DataFrame with one sentence per row. NOTE: its columns are renamed
            IN PLACE ('sent' -> 'sentence', 'class' -> 'label'), so the caller's
            frame is modified as a side effect.
        train_size: Fraction of rows (between 0 and 1) sampled for training.
        random_state: Seed for the row sampling. Defaults to 1, the value that
            used to be hard-coded.

    Returns:
        (df_train, df_test): ``int(train_size * len(df))`` randomly sampled rows,
        and the rows of ``df`` that were not sampled.

    Raises:
        ValueError: if ``train_size`` is not within [0, 1].
    """
    if not 0 <= train_size <= 1:
        raise ValueError("train_size must be between 0 and 1, got {!r}".format(train_size))

    # Kept in place on purpose: callers read the renamed columns from the frame they passed in.
    df.rename(columns = {'sent': 'sentence', 'class': 'label'}, inplace = True)

    df_train = df.sample(int(train_size * len(df)), random_state = random_state)
    # The test rows are selected by index label, so the index is assumed to be unique;
    # duplicated labels would remove more rows than were sampled.
    df_test = df.drop(df_train.index)

    return df_train, df_test
    
# Split the balanced frame with the default train_size; preprocess_data also renames
# df1's 'class' column to 'label' in place, which the following cells rely on.
df_train, df_test = preprocess_data(df1)

In [11]:
# Class distribution of the whole balanced frame.
df1['label'].value_counts()

0     1000
1      642
3      404
5      403
4      306
6      278
10     274
7      206
2      177
8      173
9      145
Name: label, dtype: int64

In [12]:
# Class distribution of the training split.
df_train['label'].value_counts()

0     792
1     532
3     323
5     311
4     237
6     226
10    219
7     159
8     148
2     145
9     114
Name: label, dtype: int64

In [13]:
# Class distribution of the held-out test split.
df_test['label'].value_counts()

0     208
1     110
5      92
3      81
4      69
10     55
6      52
7      47
2      32
9      31
8      25
Name: label, dtype: int64

## Model Preparation

In [14]:
# ---- Run configuration ----
# NOTE(review): TRAIN_VALIDATION_SPLIT and MODEL_NAME are not read by any other
# cell visible in this notebook (the model path is spelled out where it is loaded).
TRAIN_VALIDATION_SPLIT = 0.80
NUM_OUTPUT_LABELS = 11          # passed as num_labels when the model is loaded

#MODEL_NAME = 'bert-large-uncased'
MODEL_NAME = 'albert-large-v1'
MAX_LEN = 250                   # max_length given to tokenizer.encode_plus
BATCH_SIZE = 4                  # batch size of every DataLoader
EPOCHS = 15                     # epochs per fold in train()

# Class id -> human-readable topic name; the position in the list is the class id.
ENCODE_CAT = dict(enumerate([
    'No topic',
    'income_statement_net_sales',
    'Income_Statement_net_profit_related',
    'Income_Statement_profitbeforetax_related',
    'Income_Statement_operating_profit_related',
    'Risk',
    'Non Current Assets',
    'Current Asset',
    'Net Worth / Capital',
    'Cashflow',
    'Total Liabilities',
]))

In [15]:
# Pull the training text and its labels out of the frame as NumPy arrays.
sentences = df_train['sentence'].values
labels = df_train['label'].values

In [16]:
from transformers import AlbertTokenizer

# Load the ALBERT tokenizer from the local model directory (lower-casing enabled).
print('Loading ALBERT tokenizer...')
tokenizer = AlbertTokenizer.from_pretrained('./model/albert-large-v1', do_lower_case=True)

Loading ALBERT tokenizer...


In [17]:
# from transformers import BertTokenizer
# # Load the BERT tokenizer.
# print('Loading BERT tokenizer...')
# tokenizer = BertTokenizer.from_pretrained("../input/bertlargeuncasedpytorch", do_lower_case=True)

In [18]:
# Sanity check: show what the tokenizer does to the first training sentence.

# Print the original sentence.
print(' Original: ', sentences[0])

# Print the sentence split into sub-word tokens.
print('Tokenized: ', tokenizer.tokenize(sentences[0]))

# Print the sentence mapped to vocabulary ids (no special tokens or padding at this point).
print('Token IDs: ', tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sentences[0])))

 Original:  3% following the merger of Global Ship Lease with Poseidon Containers, hence generated a dilution loss amounting to USD 65. 
Tokenized:  ['▁', '3%', '▁following', '▁the', '▁merger', '▁of', '▁global', '▁ship', '▁lease', '▁with', '▁poseidon', '▁containers', ',', '▁hence', '▁generated', '▁a', '▁di', 'lu', 'tion', '▁loss', '▁amount', 'ing', '▁to', '▁us', 'd', '▁65', '.']
Token IDs:  [13, 2560, 249, 14, 6546, 16, 2062, 995, 9140, 29, 29667, 18988, 15, 5796, 6756, 21, 926, 2377, 3309, 1526, 2006, 68, 20, 182, 43, 2074, 9]


In [19]:
%%time
# Tokenize all of the sentences and map the tokens to thier word IDs.
input_ids = []
attention_masks = []

# For every sentence...
for sent in sentences:
    # `encode_plus` will:
    #   (1) Tokenize the sentence.
    #   (2) Prepend the `[CLS]` token to the start.
    #   (3) Append the `[SEP]` token to the end.
    #   (4) Map tokens to their IDs.
    #   (5) Pad or truncate the sentence to `max_length`
    #   (6) Create attention masks for [PAD] tokens.
#     print (sent)
#     print (type(sent))
    encoded_dict = tokenizer.encode_plus(
                        str(sent),                      # Sentence to encode.
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                        max_length = MAX_LEN,           # Pad & truncate all sentences.
                        pad_to_max_length = True,
                        return_attention_mask = True,   # Construct attn. masks.
                        return_tensors = 'pt',     # Return pytorch tensors.
                   )
    
    # Add the encoded sentence to the list.    
    input_ids.append(encoded_dict['input_ids'])
    
    # And its attention mask (simply differentiates padding from non-padding).
    attention_masks.append(encoded_dict['attention_mask'])

# Convert the lists into tensors.
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(labels)

# Print sentence 0, now as a list of IDs.
print('Original: ', sentences[0])
print('Token IDs:', input_ids[0])

Original:  3% following the merger of Global Ship Lease with Poseidon Containers, hence generated a dilution loss amounting to USD 65. 
Token IDs: tensor([    2,    13,  2560,   249,    14,  6546,    16,  2062,   995,  9140,
           29, 29667, 18988,    15,  5796,  6756,    21,   926,  2377,  3309,
         1526,  2006,    68,    20,   182,    43,  2074,     9,     3,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,   

In [20]:
from torch.utils.data import TensorDataset, random_split
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from transformers import get_linear_schedule_with_warmup

In [21]:
# Seed PyTorch's RNG and request deterministic cuDNN algorithms so runs are repeatable.
torch.manual_seed(42)
torch.backends.cudnn.deterministic = True

In [22]:
from transformers import AlbertForSequenceClassification, AdamW, AlbertConfig

# Load AlbertForSequenceClassification: the pretrained ALBERT encoder with a
# classification head on top.
model = AlbertForSequenceClassification.from_pretrained(
    './model/albert-large-v1', # Local directory holding the pretrained albert-large-v1 files.
    num_labels = NUM_OUTPUT_LABELS, # Number of output labels (one per entry of ENCODE_CAT).
    output_attentions = False, # Whether the model returns attentions weights.
    output_hidden_states = False, # Whether the model returns all hidden-states.
)

# Move the model to the device chosen earlier (GPU when available, otherwise CPU).
# `model.cuda()` would fail on a CPU-only machine even though the notebook already
# falls back to torch.device("cpu") in that case.
model.to(device)

AlbertForSequenceClassification(
  (albert): AlbertModel(
    (embeddings): AlbertEmbeddings(
      (word_embeddings): Embedding(30000, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): AlbertTransformer(
      (embedding_hidden_mapping_in): Linear(in_features=128, out_features=1024, bias=True)
      (albert_layer_groups): ModuleList(
        (0): AlbertLayerGroup(
          (albert_layers): ModuleList(
            (0): AlbertLayer(
              (full_layer_layer_norm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
              (attention): AlbertAttention(
                (query): Linear(in_features=1024, out_features=1024, bias=True)
                (key): Linear(in_features=1024, out_features=1024, bias=True)
                (value): Linear(in_features=1024, out_feat

In [23]:
# Collect (name, tensor) pairs for every parameter of the model.
params = list(model.named_parameters())

print('The model has {:} different named parameters.\n'.format(len(params)))

# Heading text paired with the slice of parameters printed under it.
param_sections = (
    ('==== Embedding Layer ====\n', params[0:5]),
    ('\n==== First Transformer ====\n', params[5:21]),
    ('\n==== Output Layer ====\n', params[-4:]),
)

for heading, section_params in param_sections:
    print(heading)
    for param_name, param_tensor in section_params:
        # Name left-aligned in 55 columns, shape right-aligned in 12.
        print("{:<55} {:>12}".format(param_name, str(tuple(param_tensor.size()))))

The model has 27 different named parameters.

==== Embedding Layer ====

albert.embeddings.word_embeddings.weight                (30000, 128)
albert.embeddings.position_embeddings.weight              (512, 128)
albert.embeddings.token_type_embeddings.weight              (2, 128)
albert.embeddings.LayerNorm.weight                            (128,)
albert.embeddings.LayerNorm.bias                              (128,)

==== First Transformer ====

albert.encoder.embedding_hidden_mapping_in.weight        (1024, 128)
albert.encoder.embedding_hidden_mapping_in.bias              (1024,)
albert.encoder.albert_layer_groups.0.albert_layers.0.full_layer_layer_norm.weight      (1024,)
albert.encoder.albert_layer_groups.0.albert_layers.0.full_layer_layer_norm.bias      (1024,)
albert.encoder.albert_layer_groups.0.albert_layers.0.attention.query.weight (1024, 1024)
albert.encoder.albert_layer_groups.0.albert_layers.0.attention.query.bias      (1024,)
albert.encoder.albert_layer_groups.0.albert_layers

In [24]:
# AdamW here is the Hugging Face class imported from transformers, not a PyTorch one.
# It optimizes every parameter of `model`.
optimizer = AdamW(
    model.parameters(),
    lr = 2e-5,   # args.learning_rate - default is 5e-5, our notebook had 2e-5
    eps = 1e-8,  # args.adam_epsilon  - default is 1e-8.
)

In [25]:
import numpy as np

def flat_accuracy(preds, labels):
    """Return the fraction of rows whose arg-max prediction equals the label.

    Args:
        preds: 2-D NumPy array of per-class scores, one row per example.
        labels: NumPy array of true class ids; it is flattened before comparing.

    Returns:
        Number of matches divided by the number of labels.
    """
    predicted_ids = np.argmax(preds, axis=1).ravel()
    expected_ids = labels.ravel()
    matches = predicted_ids == expected_ids
    return np.sum(matches) / len(expected_ids)

In [26]:
import time
import datetime

def format_time(elapsed):
    """Render a duration given in seconds as an ``h:mm:ss`` string.

    The value is rounded to the nearest whole second first; the text is whatever
    ``str(datetime.timedelta(...))`` yields for that many seconds.
    """
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

In [27]:
from torch.utils.data import TensorDataset, random_split

def _train_one_epoch(network, dataloader, scheduler):
    """Run one optimisation pass over ``dataloader``.

    Uses the module-level ``optimizer`` and ``device``.

    Returns:
        (average_loss, elapsed): mean batch loss and the ``format_time`` string
        for the whole pass.
    """
    t0 = time.time()
    total_train_loss = 0
    # `train()` only switches the mode (e.g. dropout active); it does not train anything.
    network.train()
    for step, batch in enumerate(dataloader):
        # Progress update every 40 batches.
        if step % 40 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(dataloader), elapsed))
        # batch = (input ids, attention masks, labels); copy each tensor to the device.
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        # Gradients accumulate by default, so clear them before every backward pass.
        network.zero_grad()
        outputs = network(b_input_ids,
                          token_type_ids=None,
                          attention_mask=b_input_mask,
                          labels=b_labels)
        # Because labels were supplied, the first output is the loss. Indexing is used
        # instead of tuple unpacking so the code does not depend on the output container.
        loss = outputs[0]
        total_train_loss += loss.item()
        loss.backward()
        # Clip the gradient norm to 1.0 to guard against exploding gradients.
        torch.nn.utils.clip_grad_norm_(network.parameters(), 1.0)
        optimizer.step()
        # Advance the learning-rate schedule once per batch.
        scheduler.step()
    avg_train_loss = total_train_loss / len(dataloader)
    return avg_train_loss, format_time(time.time() - t0)


def _evaluate(network, dataloader):
    """Score ``network`` on ``dataloader`` without computing gradients.

    Returns:
        (average_accuracy, average_loss, elapsed): both averages are means over
        batches; ``elapsed`` is the ``format_time`` string for the pass.
    """
    t0 = time.time()
    # Evaluation mode: dropout layers behave differently than during training.
    network.eval()
    total_eval_accuracy = 0
    total_eval_loss = 0
    for batch in dataloader:
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        # No graph is needed for a forward-only pass.
        with torch.no_grad():
            outputs = network(b_input_ids,
                              token_type_ids=None,
                              attention_mask=b_input_mask,
                              labels=b_labels)
        # outputs[0] is the loss, outputs[1] the logits (scores before any softmax).
        loss, logits = outputs[0], outputs[1]
        total_eval_loss += loss.item()
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        total_eval_accuracy += flat_accuracy(logits, label_ids)
    avg_val_accuracy = total_eval_accuracy / len(dataloader)
    avg_val_loss = total_eval_loss / len(dataloader)
    return avg_val_accuracy, avg_val_loss, format_time(time.time() - t0)


def train(network, epochs, save_Model = False, reset_between_folds = False):
    """Fine-tune ``network`` fold by fold and report the mean validation accuracy.

    The data and tooling are read from module-level names: ``kfold``,
    ``input_ids``, ``attention_masks``, ``labels``, ``optimizer``, ``device`` and
    ``BATCH_SIZE``. ``optimizer`` must have been built from
    ``network.parameters()``.

    Args:
        network: Model to train; called as
            ``network(ids, token_type_ids=None, attention_mask=..., labels=...)``.
        epochs: Number of epochs to run in every fold (must be >= 1).
        save_Model: Accepted for compatibility; currently not used.
        reset_between_folds: When False (default, the historical behaviour) the
            weights and optimizer state carry over from one fold to the next, so
            later folds validate on sentences the network already trained on and
            the reported accuracy is optimistic. When True, the weights and the
            optimizer state captured at the start of the call are restored before
            each fold, which gives an independent estimate per fold.

    Returns:
        Mean over folds of the last epoch's validation accuracy, as a fraction
        in [0, 1].

    Raises:
        ValueError: if ``epochs`` is smaller than 1.
    """
    # Local imports so the function does not depend on a later cell having imported them.
    import copy
    import random

    if epochs < 1:
        raise ValueError("epochs must be >= 1, got {!r}".format(epochs))

    n_folds = kfold.get_n_splits()
    if reset_between_folds:
        initial_network_state = copy.deepcopy(network.state_dict())
        initial_optimizer_state = copy.deepcopy(optimizer.state_dict())

    total_acc = 0
    # split(X, y): labels are passed as y (the previous call passed the attention
    # masks as y and the labels as groups).
    for fold, (train_index, val_index) in enumerate(kfold.split(input_ids, labels)):
        if reset_between_folds:
            network.load_state_dict(initial_network_state)
            optimizer.load_state_dict(initial_optimizer_state)

        # Slice the encoded data into this fold's training and validation parts.
        train_dataset = TensorDataset(input_ids[train_index],
                                      attention_masks[train_index],
                                      labels[train_index])
        val_dataset = TensorDataset(input_ids[val_index],
                                    attention_masks[val_index],
                                    labels[val_index])
        train_dataloader = DataLoader(
            train_dataset,
            sampler = RandomSampler(train_dataset),   # random batch order for training
            batch_size = BATCH_SIZE
        )
        validation_dataloader = DataLoader(
            val_dataset,
            sampler = SequentialSampler(val_dataset), # order is irrelevant for validation
            batch_size = BATCH_SIZE
        )

        # Linear decay over every optimisation step of this fold, no warm-up.
        total_steps = len(train_dataloader) * epochs
        scheduler = get_linear_schedule_with_warmup(optimizer,
                                                    num_warmup_steps = 0,
                                                    num_training_steps = total_steps)

        # Re-seed every RNG so each fold starts from the same random state.
        seed_val = 42
        random.seed(seed_val)
        np.random.seed(seed_val)
        torch.manual_seed(seed_val)
        torch.cuda.manual_seed_all(seed_val)

        total_t0 = time.time()
        for epoch_i in range(0, epochs):
            print('\nEpoch {} / {} \nFold number {} / {}'.format(epoch_i + 1, epochs, fold + 1 , n_folds))

            # ---------------- Training ----------------
            print("")
            print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
            print('Training...')
            avg_train_loss, training_time = _train_one_epoch(network, train_dataloader, scheduler)
            print("")
            print("  Average training loss: {0:.2f}".format(avg_train_loss))
            print("  Training epcoh took: {:}".format(training_time))

            # ---------------- Validation ----------------
            print("")
            print("Running Validation...")
            avg_val_accuracy, avg_val_loss, validation_time = _evaluate(network, validation_dataloader)
            print("  Accuracy: {0:.2f}".format(avg_val_accuracy))
            print("  Validation Loss: {0:.2f}".format(avg_val_loss))
            print("  Validation took: {:}".format(validation_time))

        print("")
        print("Training complete!")
        print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))
        # The fold's score is the validation accuracy of its final epoch.
        total_acc += avg_val_accuracy

    total_acc = (total_acc / n_folds)
    # total_acc is a fraction; scale it so the '%' suffix is truthful.
    print('\n\nTotal accuracy cross validation: {:.3f}%'.format(total_acc * 100))
    return total_acc



In [28]:
import random
import sklearn
from sklearn.model_selection import KFold
# Three unshuffled folds; train() reads this module-level object by name.
kfold = KFold(n_splits=3,shuffle=False)

In [29]:
# Run the cross-validated fine-tuning. NOTE: train() accepts save_Model but never reads it.
train(model, EPOCHS, save_Model = False)


Epoch 1 / 15 
Fold number 1 / 3

Training...
  Batch    40  of    535.    Elapsed: 0:00:17.
  Batch    80  of    535.    Elapsed: 0:00:34.
  Batch   120  of    535.    Elapsed: 0:00:51.
  Batch   160  of    535.    Elapsed: 0:01:07.
  Batch   200  of    535.    Elapsed: 0:01:24.
  Batch   240  of    535.    Elapsed: 0:01:41.
  Batch   280  of    535.    Elapsed: 0:01:58.
  Batch   320  of    535.    Elapsed: 0:02:15.
  Batch   360  of    535.    Elapsed: 0:02:32.
  Batch   400  of    535.    Elapsed: 0:02:49.
  Batch   440  of    535.    Elapsed: 0:03:06.
  Batch   480  of    535.    Elapsed: 0:03:23.
  Batch   520  of    535.    Elapsed: 0:03:40.

  Average training loss: 1.71
  Training epcoh took: 0:03:46

Running Validation...
  Accuracy: 0.60
  Validation Loss: 1.26
  Validation took: 0:00:41

Epoch 2 / 15 
Fold number 1 / 3

Training...
  Batch    40  of    535.    Elapsed: 0:00:17.
  Batch    80  of    535.    Elapsed: 0:00:35.
  Batch   120  of    535.    Elapsed: 0:00:52.
  B

  Batch   520  of    535.    Elapsed: 0:03:44.

  Average training loss: 0.08
  Training epcoh took: 0:03:50

Running Validation...
  Accuracy: 0.72
  Validation Loss: 2.20
  Validation took: 0:00:41

Epoch 11 / 15 
Fold number 1 / 3

Training...
  Batch    40  of    535.    Elapsed: 0:00:17.
  Batch    80  of    535.    Elapsed: 0:00:34.
  Batch   120  of    535.    Elapsed: 0:00:52.
  Batch   160  of    535.    Elapsed: 0:01:10.
  Batch   200  of    535.    Elapsed: 0:01:28.
  Batch   240  of    535.    Elapsed: 0:01:46.
  Batch   280  of    535.    Elapsed: 0:02:03.
  Batch   320  of    535.    Elapsed: 0:02:20.
  Batch   360  of    535.    Elapsed: 0:02:38.
  Batch   400  of    535.    Elapsed: 0:02:55.
  Batch   440  of    535.    Elapsed: 0:03:13.
  Batch   480  of    535.    Elapsed: 0:03:30.
  Batch   520  of    535.    Elapsed: 0:03:48.

  Average training loss: 0.05
  Training epcoh took: 0:03:54

Running Validation...
  Accuracy: 0.72
  Validation Loss: 2.27
  Validation too

  Batch   280  of    535.    Elapsed: 0:02:10.
  Batch   320  of    535.    Elapsed: 0:02:29.
  Batch   360  of    535.    Elapsed: 0:02:48.
  Batch   400  of    535.    Elapsed: 0:03:06.
  Batch   440  of    535.    Elapsed: 0:03:25.
  Batch   480  of    535.    Elapsed: 0:03:43.
  Batch   520  of    535.    Elapsed: 0:04:02.

  Average training loss: 0.22
  Training epcoh took: 0:04:09

Running Validation...
  Accuracy: 0.86
  Validation Loss: 0.86
  Validation took: 0:00:44

Epoch 6 / 15 
Fold number 2 / 3

Training...
  Batch    40  of    535.    Elapsed: 0:00:19.
  Batch    80  of    535.    Elapsed: 0:00:37.
  Batch   120  of    535.    Elapsed: 0:00:56.
  Batch   160  of    535.    Elapsed: 0:01:14.
  Batch   200  of    535.    Elapsed: 0:01:33.
  Batch   240  of    535.    Elapsed: 0:01:52.
  Batch   280  of    535.    Elapsed: 0:02:10.
  Batch   320  of    535.    Elapsed: 0:02:29.
  Batch   360  of    535.    Elapsed: 0:02:48.
  Batch   400  of    535.    Elapsed: 0:03:06.
  

  Batch   120  of    535.    Elapsed: 0:00:51.
  Batch   160  of    535.    Elapsed: 0:01:08.
  Batch   200  of    535.    Elapsed: 0:01:25.
  Batch   240  of    535.    Elapsed: 0:01:42.
  Batch   280  of    535.    Elapsed: 0:01:59.
  Batch   320  of    535.    Elapsed: 0:02:16.
  Batch   360  of    535.    Elapsed: 0:02:34.
  Batch   400  of    535.    Elapsed: 0:02:51.
  Batch   440  of    535.    Elapsed: 0:03:08.
  Batch   480  of    535.    Elapsed: 0:03:25.
  Batch   520  of    535.    Elapsed: 0:03:42.

  Average training loss: 0.01
  Training epcoh took: 0:03:48

Running Validation...
  Accuracy: 0.88
  Validation Loss: 0.95
  Validation took: 0:00:41

Training complete!
Total training took 1:09:23 (h:mm:ss)

Epoch 1 / 15 
Fold number 3 / 3

Training...
  Batch    40  of    535.    Elapsed: 0:00:17.
  Batch    80  of    535.    Elapsed: 0:00:34.
  Batch   120  of    535.    Elapsed: 0:00:51.
  Batch   160  of    535.    Elapsed: 0:01:08.
  Batch   200  of    535.    Elapsed: 

  Accuracy: 0.95
  Validation Loss: 0.38
  Validation took: 0:00:40

Epoch 10 / 15 
Fold number 3 / 3

Training...
  Batch    40  of    535.    Elapsed: 0:00:17.
  Batch    80  of    535.    Elapsed: 0:00:34.
  Batch   120  of    535.    Elapsed: 0:00:51.
  Batch   160  of    535.    Elapsed: 0:01:08.
  Batch   200  of    535.    Elapsed: 0:01:25.
  Batch   240  of    535.    Elapsed: 0:01:42.
  Batch   280  of    535.    Elapsed: 0:01:59.
  Batch   320  of    535.    Elapsed: 0:02:16.
  Batch   360  of    535.    Elapsed: 0:02:34.
  Batch   400  of    535.    Elapsed: 0:02:51.
  Batch   440  of    535.    Elapsed: 0:03:08.
  Batch   480  of    535.    Elapsed: 0:03:25.
  Batch   520  of    535.    Elapsed: 0:03:42.

  Average training loss: 0.02
  Training epcoh took: 0:03:48

Running Validation...
  Accuracy: 0.96
  Validation Loss: 0.34
  Validation took: 0:00:40

Epoch 11 / 15 
Fold number 3 / 3

Training...
  Batch    40  of    535.    Elapsed: 0:00:17.
  Batch    80  of    535.  

In [30]:
# Report the number of sentences.
print('Number of test sentences: {:,}\n'.format(df_test.shape[0]))

# NOTE: from here on `sentences`, `labels`, `input_ids` and `attention_masks` refer to
# the TEST split. train() reads `input_ids`, `attention_masks` and `labels` as globals,
# so re-running it after this cell would train on the test data.
sentences = df_test.sentence.values
labels = df_test.label.values

# Tokenize all of the sentences and map the tokens to their word IDs.
input_ids = []
attention_masks = []

# For every sentence...
for sent in sentences:
    # `encode_plus` will:
    #   (1) Tokenize the sentence.
    #   (2) Prepend the `[CLS]` token to the start.
    #   (3) Append the `[SEP]` token to the end.
    #   (4) Map tokens to their IDs.
    #   (5) Pad or truncate the sentence to `max_length`
    #   (6) Create attention masks for [PAD] tokens.
    encoded_dict = tokenizer.encode_plus(
                        str(sent),                 # str() as in the training cell, so a non-string
                                                   # cell (e.g. NaN) is encoded instead of crashing.
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                        max_length = MAX_LEN,           # Pad & truncate all sentences.
                        pad_to_max_length = True,
                        return_attention_mask = True,   # Construct attn. masks.
                        return_tensors = 'pt',     # Return pytorch tensors.
                   )

    # Add the encoded sentence to the list.
    input_ids.append(encoded_dict['input_ids'])

    # And its attention mask (simply differentiates padding from non-padding).
    attention_masks.append(encoded_dict['attention_mask'])

# Convert the lists into tensors.
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(labels)

# Set the batch size.
batch_size = BATCH_SIZE

# Create the DataLoader; sequential order keeps predictions aligned with df_test rows.
prediction_data = TensorDataset(input_ids, attention_masks, labels)
prediction_sampler = SequentialSampler(prediction_data)
prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)

Number of test sentences: 802



In [31]:
# Run the fine-tuned model over the test DataLoader and collect raw outputs.

print('Predicting labels for {:,} test sentences...'.format(len(input_ids)))

# Evaluation mode for inference.
model.eval()

# One NumPy array per batch: model scores and the matching true labels.
predictions, true_labels = [], []

for batch in prediction_dataloader:
    # Each batch is (input ids, attention mask, labels); move all three to the device.
    b_input_ids, b_input_mask, b_labels = (t.to(device) for t in batch)

    # Gradients are not needed for prediction; skipping them saves memory and time.
    with torch.no_grad():
        outputs = model(b_input_ids, token_type_ids=None,
                        attention_mask=b_input_mask)

    # No labels were passed, so the first output holds the logits.
    predictions.append(outputs[0].detach().cpu().numpy())
    true_labels.append(b_labels.to('cpu').numpy())

Predicting labels for 802 test sentences...


In [32]:
# Flatten the per-batch arrays into two plain Python lists of class ids.
# Each entry of `predictions` has one row of scores per sentence; the predicted
# class is the column with the highest score.
y_true = [
    label_id
    for batch_labels in true_labels
    for label_id in batch_labels.tolist()
]
y_pred = [
    predicted_id
    for batch_index in range(len(true_labels))
    for predicted_id in np.argmax(predictions[batch_index], axis=1).flatten().tolist()
]

print("y_true...{}".format(y_true))
print("y_pred...{}".format(y_pred))

y_true...[5, 5, 6, 7, 7, 1, 1, 1, 1, 1, 2, 6, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 4, 4, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 10, 2, 10, 1, 1, 1, 1, 1, 4, 4, 4, 4, 3, 3, 3, 2, 1, 4, 3, 3, 4, 4, 3, 9, 7, 9, 7, 7, 7, 8, 9, 9, 7, 7, 5, 10, 8, 5, 2, 4, 10, 3, 10, 10, 4, 10, 6, 6, 3, 10, 10, 5, 10, 5, 5, 5, 5, 5, 8, 6, 2, 5, 10, 8, 8, 8, 8, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 1, 1, 1, 1, 10, 10, 2, 2, 5, 5, 5, 5, 3, 4, 3, 7, 5, 6, 6, 7, 3, 10, 10, 6, 6, 3, 6, 1, 2, 10, 6, 6, 7, 7, 4, 3, 3, 7, 7, 10, 6, 3, 4, 5, 5, 10, 4, 5, 5, 7, 2, 5, 8, 5, 1, 1, 5, 5, 1, 5, 3, 7, 10, 6, 4, 8, 4, 5, 1, 1, 3, 3, 3, 1, 1, 1, 1, 1, 1, 5, 5, 5, 9, 5, 5, 5, 6, 6, 4, 4, 4, 6, 6, 6, 10, 10, 7, 7, 5, 10, 10, 10, 5, 4, 7, 3, 3, 3, 3, 3, 6, 6, 6, 9, 3, 6, 7, 7, 8, 10, 2, 2, 4, 1, 7, 9, 9, 7, 7, 3, 5, 5, 1, 3, 1, 1, 8, 8, 3, 5, 5, 5, 1, 1, 9, 9, 9, 7, 10, 10, 3, 1, 9, 4, 8, 10, 2, 6, 4, 6, 3, 3, 9, 7, 3, 1, 1, 1, 7, 3, 1, 7, 4, 4, 10, 10, 4, 1, 3, 9, 

In [33]:
# Translate class ids into their readable names via ENCODE_CAT.
# Both lists are indexed over the length of y_true.
sample_positions = range(len(y_true))
y_true_labels = [ENCODE_CAT[y_true[pos]] for pos in sample_positions]
y_pred_labels = [ENCODE_CAT[y_pred[pos]] for pos in sample_positions]


print("y_true...{}".format(y_true_labels))
print("y_pred...{}".format(y_pred_labels))

y_true...['Risk', 'Risk', 'Non Current Assets', 'Current Asset', 'Current Asset', 'income_statement_net_sales', 'income_statement_net_sales', 'income_statement_net_sales', 'income_statement_net_sales', 'income_statement_net_sales', 'Income_Statement_net_profit_related', 'Non Current Assets', 'Income_Statement_profitbeforetax_related', 'Income_Statement_profitbeforetax_related', 'Income_Statement_profitbeforetax_related', 'Income_Statement_profitbeforetax_related', 'Income_Statement_profitbeforetax_related', 'Income_Statement_profitbeforetax_related', 'income_statement_net_sales', 'income_statement_net_sales', 'income_statement_net_sales', 'income_statement_net_sales', 'income_statement_net_sales', 'income_statement_net_sales', 'income_statement_net_sales', 'income_statement_net_sales', 'income_statement_net_sales', 'income_statement_net_sales', 'income_statement_net_sales', 'income_statement_net_sales', 'income_statement_net_sales', 'income_statement_net_sales', 'income_statement_net_s

In [34]:
from sklearn import metrics

# Confusion matrix over the decoded category names.
# Rows are the true categories, columns the predicted ones; with no explicit
# `labels` argument scikit-learn orders them by sorted label value.
print(metrics.confusion_matrix(y_true=y_true_labels, y_pred=y_pred_labels))

[[ 26   0   0   0   1   0   2   1   0   1   0]
 [  2  26   1   1   1   2   1   3   4   5   1]
 [  0   1  23   1   4   1   2   0   0   0   0]
 [  0   0   2  52   2   0   4   0   2   1   6]
 [  0   2   1   5  66   1   2   3   0   1   0]
 [  1   0   0   0   0  19   5   0   0   0   0]
 [  2   2   3   1  11   2 147   5  22   6   7]
 [  1   1   0   2   1   2   5  38   0   1   1]
 [  0   1   0   4   1   0  16   1  57   5   7]
 [  0   2   1   2   2   0   6   3   1  37   1]
 [  1   0   0   2   1   1   3   2   4   0  96]]


In [35]:
# Per-category precision / recall / F1 / support for the ALBERT classifier,
# reported with three decimal places.
print(
    metrics.classification_report(
        y_true=y_true_labels,
        y_pred=y_pred_labels,
        digits=3,
    )
)

                                           precision    recall  f1-score   support

                                 Cashflow      0.788     0.839     0.812        31
                            Current Asset      0.743     0.553     0.634        47
      Income_Statement_net_profit_related      0.742     0.719     0.730        32
Income_Statement_operating_profit_related      0.743     0.754     0.748        69
 Income_Statement_profitbeforetax_related      0.733     0.815     0.772        81
                      Net Worth / Capital      0.679     0.760     0.717        25
                                 No topic      0.762     0.707     0.733       208
                       Non Current Assets      0.679     0.731     0.704        52
                                     Risk      0.633     0.620     0.626        92
                        Total Liabilities      0.649     0.673     0.661        55
               income_statement_net_sales      0.807     0.873     0.838       110

  

In [39]:
!pip install pytorch_transformers

Collecting pytorch_transformers
  Downloading pytorch_transformers-1.2.0-py3-none-any.whl (176 kB)
[K     |████████████████████████████████| 176 kB 3.0 MB/s eta 0:00:01
Installing collected packages: pytorch-transformers
Successfully installed pytorch-transformers-1.2.0
You should consider upgrading via the '/opt/conda/bin/python3.7 -m pip install --upgrade pip' command.[0m


In [40]:
import pytorch_transformers
from pytorch_transformers import WEIGHTS_NAME, CONFIG_NAME
import os

# Persist the model as three artifacts inside one directory:
# the weights (state_dict), the model config as JSON, and the tokenizer vocabulary.
output_dir = './albert_large_v1_250_23092020'

# makedirs(..., exist_ok=True) guarantees the directory exists before anything
# is written into it and, unlike os.mkdir, does not raise FileExistsError when
# the cell is executed again.
os.makedirs(output_dir, exist_ok=True)

output_model_file = os.path.join(output_dir, WEIGHTS_NAME)
output_config_file = os.path.join(output_dir, CONFIG_NAME)

torch.save(model.state_dict(), output_model_file)
model.config.to_json_file(output_config_file)
tokenizer.save_vocabulary(output_dir)

FileNotFoundError: [Errno 2] No such file or directory: './albert_large_v1_250_23092020/pytorch_model.bin'

In [36]:
# Serialize the complete model object (not only its state_dict) into one file.
# A whole-object pickle needs the same model class definitions to be importable
# when the file is loaded again.
torch.save(obj=model, f='./albert_large_v1_24092020.pkl')

In [37]:
# Restore the pickled model object.
# map_location=device remaps the stored tensors onto the device selected at the
# top of the notebook, so a checkpoint written on a GPU machine can also be
# opened on a CPU-only one.
# NOTE(security): torch.load unpickles arbitrary Python objects; only load
# checkpoint files that come from a trusted source.
model_loaded = torch.load('./albert_large_v1_24092020.pkl', map_location=device)

# Put the module in evaluation mode. eval() returns the module itself, so as the
# last expression of the cell it also displays the architecture.
model_loaded.eval()

AlbertForSequenceClassification(
  (albert): AlbertModel(
    (embeddings): AlbertEmbeddings(
      (word_embeddings): Embedding(30000, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): AlbertTransformer(
      (embedding_hidden_mapping_in): Linear(in_features=128, out_features=1024, bias=True)
      (albert_layer_groups): ModuleList(
        (0): AlbertLayerGroup(
          (albert_layers): ModuleList(
            (0): AlbertLayer(
              (full_layer_layer_norm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
              (attention): AlbertAttention(
                (query): Linear(in_features=1024, out_features=1024, bias=True)
                (key): Linear(in_features=1024, out_features=1024, bias=True)
                (value): Linear(in_features=1024, out_feat