In [1]:
!pip install transformers
!pip install datasets

Looking in indexes: http://mirrors.aliyun.com/pypi/simple
Looking in indexes: http://mirrors.aliyun.com/pypi/simple


In [2]:
import torch
import io
import torch.nn.functional as F
import random
import numpy as np
import time
import math
import pandas as pd
import datetime
from tqdm.auto import tqdm
import torch.nn as nn
from transformers import *
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import (set_seed,
                          TrainingArguments,
                          Trainer,
                          GPT2Config,
                          GPT2Tokenizer,
                          AdamW, 
                          get_linear_schedule_with_warmup,
                          GPT2ForSequenceClassification)
#!pip install torch==1.7.1+cu101 torchvision==0.8.2+cu101 -f https://download.pytorch.org/whl/torch_stable.html
#!pip install sentencepiece

##Set random values
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
if torch.cuda.is_available():
  torch.cuda.manual_seed_all(seed_val)

In [3]:
from datasets import load_dataset
dataset = load_dataset("financial_phrasebank",'sentences_50agree',revision='main')

Reusing dataset financial_phrasebank (/root/.cache/huggingface/datasets/financial_phrasebank/sentences_50agree/1.0.0/550bde12e6c30e2674da973a55f57edde5181d53f5a5a34c1531c53f93b7e141)


  0%|          | 0/1 [00:00<?, ?it/s]

In [4]:
train_num=len(dataset['train'])
print(train_num)

4846


In [5]:
def get_dict(data):
  data_dict={}
  for i in range(len(data)):
    data_dict[data[i]['sentence']]=data[i]['label']
  return data_dict
train_dict=get_dict(dataset['train'])

In [6]:
train_df = pd.DataFrame(list(train_dict.items()),columns = ['review_body','label']) 

In [7]:
train_df.groupby(['label']).count()

Unnamed: 0_level_0,review_body
label,Unnamed: 1_level_1
0,604
1,2872
2,1362


In [8]:
len(train_df)

4838

In [9]:
from sklearn.model_selection import StratifiedKFold
label_list = ['UNK_UNK','0','1','2']
n_labels=len(label_list)

## Split train test dataset,80% train, 20% test

In [10]:
def decrease_train(train_df,n_fold):
  cv = StratifiedKFold(n_splits=n_fold,shuffle = True,random_state=42)
  i=0
  train_idx=[]
  test_idx=[]
  for train, test in cv.split(train_df,train_df['label']):
    if i not in [0,5]:
      train_idx=np.concatenate((train_idx,test), axis=None)
      i+=1
    else:
      test_idx=np.concatenate((test_idx,test), axis=None)
      i+=1
  return train_df.loc[train_idx,:],train_df.loc[test_idx,:]
train_example,test_example=decrease_train(train_df,10)

In [11]:
train_example.groupby(['label']).count()

Unnamed: 0_level_0,review_body
label,Unnamed: 1_level_1
0,483
1,2297
2,1090


In [12]:
test_example.groupby(['label']).count()

Unnamed: 0_level_0,review_body
label,Unnamed: 1_level_1
0,121
1,575
2,272


In [13]:
train_example.reset_index(drop=True,inplace=True)
test_example.reset_index(drop=True,inplace=True)

## Use 2% label data, the rest as unlabel data

In [14]:
def get_label_unlabel(train_df):
  cv = StratifiedKFold(n_splits=100,shuffle = True,random_state=42)
  i=0
  unlabel_idx=[]
  label_idx=[]
  for train_idxs, test_idxs in cv.split(train_df,train_df['label']):
    if i in [i for i in range(0,100,50)]:
      label_idx=np.concatenate((label_idx,test_idxs), axis=None)
      i+=1
    # elif i in range(4,400,2):
    #   unlabel_idx=np.concatenate((unlabel_idx,test_idxs), axis=None)
    #   i+=1
    else:
      unlabel_idx=np.concatenate((unlabel_idx,test_idxs), axis=None)
      i+=1
  return train_df.loc[label_idx,:],train_df.loc[unlabel_idx,:]

# def get_test_data(test_df):
#   cv = StratifiedKFold(n_splits=5,shuffle = True,random_state=42)
#   i=0
#   for train_idxs, test_idxs in cv.split(test_df,test_df['stars']):
#     if i==0:
#       label_data=test_df.loc[test_idxs,:]
#   return label_data

In [15]:
label_data,unlabel_data=get_label_unlabel(train_example)

In [16]:
# test_data=get_test_data(test_df)

In [17]:
test_data=test_example

In [18]:
print('label_data：',len(label_data))
print('unlabel_data：',len(unlabel_data))
print('test_data：',len(test_data))

label_data： 78
unlabel_data： 3792
test_data： 968


In [19]:
label_data.groupby('label').count()

Unnamed: 0_level_0,review_body
label,Unnamed: 1_level_1
0,10
1,46
2,22


In [20]:
unlabel_data.groupby('label').count()

Unnamed: 0_level_0,review_body
label,Unnamed: 1_level_1
0,473
1,2251
2,1068


In [21]:
test_data.groupby('label').count()

Unnamed: 0_level_0,review_body
label,Unnamed: 1_level_1
0,121
1,575
2,272


In [22]:
label_data.reset_index(drop=True,inplace=True)
unlabel_data.reset_index(drop=True,inplace=True)
test_data.reset_index(drop=True,inplace=True)

In [23]:
def get_qc_examples(data,label=False):
  """Creates examples for the training and dev sets."""
  examples = []
  if label==True:
    for i in range(len(data)):
      x=data.loc[i,'review_body']
      y=data.loc[i,'label']
      examples.append((x,y))
  else:
    for i in range(len(data)):
      x=data.loc[i,'review_body']
      y='UNK_UNK'
      examples.append((x,y))

  return examples

In [24]:
#Load the examples
labeled_examples = get_qc_examples(label_data,label=True)
unlabeled_examples = get_qc_examples(unlabel_data)
test_examples = get_qc_examples(test_data,label=True)

# original size
# train 688  
# unlabel 14433
# test 972

In [25]:
# If there's a GPU available...
if torch.cuda.is_available():    
    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: NVIDIA RTX A5000


### Input Parameters


In [26]:
#--------------------------------
#  Transformer parameters
#--------------------------------
max_seq_length = 64
batch_size = 32

#--------------------------------
#  GAN-BERT specific parameters
#--------------------------------
# number of hidden layers in the generator, 
# each of the size of the output space
num_hidden_layers_g = 1; 
# number of hidden layers in the discriminator, 
# each of the size of the input space
num_hidden_layers_d = 1; 
# size of the generator's input noisy vectors
noise_size = 100
# dropout to be applied to discriminator's input vectors
out_dropout_rate = 0.2

# Replicate labeled data to balance poorly represented datasets, 
# e.g., less than 1% of labeled material
apply_balance = True

#--------------------------------
#  Optimization parameters
#--------------------------------
learning_rate_discriminator = 5e-5
learning_rate_generator = 5e-5
epsilon = 1e-8
num_train_epochs = 10
multi_gpu = True
# Scheduler
apply_scheduler = False
warmup_proportion = 0.1
# Print
print_each_n_step = 10

#--------------------------------
#  Adopted Tranformer model
#--------------------------------
# Since this version is compatible with Huggingface transformers, you can uncomment
# (or add) transformer models compatible with GAN

# model_name = "bert-base-cased"
model_name = "gpt2"
#model_name = "bert-base-uncased"
#model_name = "roberta-base"
#model_name = "albert-base-v2"
#model_name = "xlm-roberta-base"
#model_name = "amazon/bort"


In [27]:
# Get model configuration.
print('Loading configuraiton...')
model_config = GPT2Config.from_pretrained(pretrained_model_name_or_path=model_name, num_labels=n_labels,output_hidden_states=True)

# Get model's tokenizer.
print('Loading tokenizer...')
tokenizer = GPT2Tokenizer.from_pretrained(pretrained_model_name_or_path=model_name)
# default to left padding
tokenizer.padding_side = "left"
# Define PAD Token = EOS Token = 50256
tokenizer.pad_token = tokenizer.eos_token


# Get the actual model.
print('Loading model...')
transformer = GPT2ForSequenceClassification.from_pretrained(pretrained_model_name_or_path=model_name, config=model_config)

# resize model embedding to match new tokenizer
transformer.resize_token_embeddings(len(tokenizer))

# fix model padding token id
transformer.config.pad_token_id = transformer.config.eos_token_id

# Load model to defined device.
transformer.to(device)
print('Model loaded to `%s`'%device)

Loading configuraiton...
Loading tokenizer...
Loading model...


Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loaded to `cuda`


In [28]:
lengths_dict = {}
from tqdm.auto import tqdm

# get train_label length
lengths = []
tk0 = tqdm(labeled_examples, total=len(labeled_examples))
for text in tk0:
    length = len(tokenizer(text[0], add_special_tokens=False)['input_ids'])
    lengths.append(length)
lengths_dict['label'] = lengths

# get train_unlabel length
lengths = []
tk0 = tqdm(unlabeled_examples, total=len(unlabeled_examples))
for text in tk0:
    length = len(tokenizer(text[0], add_special_tokens=False)['input_ids'])
    lengths.append(length)
lengths_dict['unlabel'] = lengths

# get test length
lengths = []
tk0 = tqdm(test_examples, total=len(test_examples))
for text in tk0:
    length = len(tokenizer(text[0], add_special_tokens=False)['input_ids'])
    lengths.append(length)
lengths_dict['test'] = lengths
    
max_len = max(max(lengths_dict['label']),max(lengths_dict['test']),max(lengths_dict['unlabel'])) + 2 # CLS + SEP 
if max_len>256:
  max_seq_length=256
else:
  max_seq_length = max_len
print(f"max_seq_length: {max_seq_length}")

  0%|          | 0/78 [00:00<?, ?it/s]

  0%|          | 0/3792 [00:00<?, ?it/s]

  0%|          | 0/968 [00:00<?, ?it/s]

max_seq_length: 133


Functions required to convert examples into Dataloader

In [29]:
def generate_data_loader(input_examples, label_masks, label_map, do_shuffle = False, balance_label_examples = False):
  '''
  Generate a Dataloader given the input examples, eventually masked if they are 
  to be considered NOT labeled.
  '''
  examples = []

  # Count the percentage of labeled examples  
  num_labeled_examples = 0
  for label_mask in label_masks:
    if label_mask: 
      num_labeled_examples += 1
  label_mask_rate = num_labeled_examples/len(input_examples)

  # if required it applies the balance
  for index, ex in enumerate(input_examples): 
    if label_mask_rate == 1 or not balance_label_examples:
      examples.append((ex, label_masks[index]))
    else:
      # IT SIMULATE A LABELED EXAMPLE
      if label_masks[index]:
        balance = int(1/label_mask_rate)
        balance = int(math.log(balance,2))
        if balance < 1:
          balance = 1
        for b in range(0, int(balance)):
          examples.append((ex, label_masks[index]))
      else:
        examples.append((ex, label_masks[index]))
  
  #-----------------------------------------------
  # Generate input examples to the Transformer
  #-----------------------------------------------
  input_ids = []
  input_mask_array = []
  label_mask_array = []
  label_id_array = []

  # Tokenization 
  for (text, label_mask) in examples:
    encoded_sent = tokenizer.encode(text[0], add_special_tokens=True, max_length=max_seq_length, padding="max_length", truncation=True)
    input_ids.append(encoded_sent)
    label_id_array.append(label_map[str(text[1])])
    label_mask_array.append(label_mask)
  
  # Attention to token (to ignore padded input wordpieces)
  for sent in input_ids:
    att_mask = [int(token_id >0) for token_id in sent]                          
    input_mask_array.append(att_mask)
  # Convertion to Tensor
  input_ids = torch.tensor(input_ids) 
  input_mask_array = torch.tensor(input_mask_array)
  label_id_array = torch.tensor(label_id_array, dtype=torch.long)
  label_mask_array = torch.tensor(label_mask_array)

  # Building the TensorDataset
  dataset = TensorDataset(input_ids, input_mask_array, label_id_array, label_mask_array)

  if do_shuffle:
    sampler = RandomSampler
  else:
    sampler = SequentialSampler

  # Building the DataLoader
  return DataLoader(
              dataset,  # The training samples.
              sampler = sampler(dataset), 
              batch_size = batch_size) # Trains with this batch size.

def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))
    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))

Convert the input examples into DataLoader

In [30]:
label_map = {}
for (i, label) in enumerate(label_list):
  label_map[label] = i
#------------------------------
#   Load the train dataset
#------------------------------
train_examples = labeled_examples
#The labeled (train) dataset is assigned with a mask set to True
train_label_masks = np.ones(len(labeled_examples), dtype=bool)
#If unlabel examples are available
if unlabeled_examples:
  train_examples = train_examples + unlabeled_examples
  #The unlabeled (train) dataset is assigned with a mask set to False
  tmp_masks = np.zeros(len(unlabeled_examples), dtype=bool)
  train_label_masks = np.concatenate([train_label_masks,tmp_masks])

train_dataloader = generate_data_loader(train_examples, train_label_masks, label_map, do_shuffle = True, balance_label_examples = apply_balance)

#------------------------------
#   Load the test dataset
#------------------------------
#The labeled (test) dataset is assigned with a mask set to True
test_label_masks = np.ones(len(test_examples), dtype=bool)

test_dataloader = generate_data_loader(test_examples, test_label_masks, label_map, do_shuffle = False, balance_label_examples = False)

  label_mask_array = torch.tensor(label_mask_array)


In [31]:
len(train_label_masks)

3870

In [32]:
len(unlabeled_examples)

3792

We define the Generator and Discriminator as discussed in https://www.aclweb.org/anthology/2020.acl-main.191/

In [33]:
#------------------------------
#   The Generator as in 
#   https://www.aclweb.org/anthology/2020.acl-main.191/
#   https://github.com/crux82/ganbert
#------------------------------
class Generator(nn.Module):
    def __init__(self, noise_size=100, output_size=512, hidden_sizes=[512], dropout_rate=0.1):
        super(Generator, self).__init__()
        layers = []
        hidden_sizes = [noise_size] + hidden_sizes
        for i in range(len(hidden_sizes)-1):
            layers.extend([nn.Linear(hidden_sizes[i], hidden_sizes[i+1]), nn.LeakyReLU(0.2, inplace=True), nn.Dropout(dropout_rate)])

        layers.append(nn.Linear(hidden_sizes[-1],output_size))
        self.layers = nn.Sequential(*layers)

    def forward(self, noise):
        output_rep = self.layers(noise)
        return output_rep

#------------------------------
#   The Discriminator
#   https://www.aclweb.org/anthology/2020.acl-main.191/
#   https://github.com/crux82/ganbert
#------------------------------
class Discriminator(nn.Module):
    def __init__(self, input_size=512, hidden_sizes=[512], num_labels=2, dropout_rate=0.1):
        super(Discriminator, self).__init__()
        self.input_dropout = nn.Dropout(p=dropout_rate)
        layers = []
        hidden_sizes = [input_size] + hidden_sizes
        for i in range(len(hidden_sizes)-1):
            layers.extend([nn.Linear(hidden_sizes[i], hidden_sizes[i+1]), nn.LeakyReLU(0.2, inplace=True), nn.Dropout(dropout_rate)])

        self.layers = nn.Sequential(*layers) #per il flatten
        self.logit = nn.Linear(hidden_sizes[-1],num_labels+1) # +1 for the probability of this sample being fake/real.
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, input_rep):
        input_rep = self.input_dropout(input_rep)
        last_rep = self.layers(input_rep)
        logits = self.logit(last_rep)
        probs = self.softmax(logits)
        return last_rep, logits, probs

In [34]:
model_config.hidden_size

768

We instantiate the Discriminator and Generator

In [35]:
# The config file is required to get the dimension of the vector produced by 
# the underlying transformer
hidden_size = int(model_config.hidden_size)
# Define the number and width of hidden layers
hidden_levels_g = [hidden_size for i in range(0, num_hidden_layers_g)]
hidden_levels_d = [hidden_size for i in range(0, num_hidden_layers_d)]

#-------------------------------------------------
#   Instantiate the Generator and Discriminator
#-------------------------------------------------
generator = Generator(noise_size=noise_size, output_size=hidden_size, hidden_sizes=hidden_levels_g, dropout_rate=out_dropout_rate)
discriminator = Discriminator(input_size=hidden_size, hidden_sizes=hidden_levels_d, num_labels=len(label_list), dropout_rate=out_dropout_rate)

# Put everything in the GPU if available
if torch.cuda.is_available():    
  generator.cuda()
  discriminator.cuda()
  transformer.cuda()
  if multi_gpu:
    transformer = torch.nn.DataParallel(transformer)

# print(config)

Let's go with the training procedure

In [36]:
from sklearn.metrics import f1_score
training_stats = []

# Measure the total training time for the whole run.
total_t0 = time.time()

#models parameters
transformer_vars = [i for i in transformer.parameters()]
d_vars = transformer_vars + [v for v in discriminator.parameters()]
g_vars = [v for v in generator.parameters()]

#optimizer
dis_optimizer = torch.optim.AdamW(d_vars, lr=learning_rate_discriminator)
gen_optimizer = torch.optim.AdamW(g_vars, lr=learning_rate_generator) 

#scheduler
if apply_scheduler:
  num_train_examples = len(train_examples)
  num_train_steps = int(num_train_examples / batch_size * num_train_epochs)
  num_warmup_steps = int(num_train_steps * warmup_proportion)

  scheduler_d = get_constant_schedule_with_warmup(dis_optimizer, 
                                           num_warmup_steps = num_warmup_steps)
  scheduler_g = get_constant_schedule_with_warmup(gen_optimizer, 
                                           num_warmup_steps = num_warmup_steps)

# For each epoch...
for epoch_i in range(0, num_train_epochs):
    # ========================================
    #               Training
    # ========================================
    # Perform one full pass over the training set.
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, num_train_epochs))
    print('Training...')

    # Measure how long the training epoch takes.
    t0 = time.time()

    # Reset the total loss for this epoch.
    tr_g_loss = 0
    tr_d_loss = 0

    # Put the model into training mode.
    transformer.train() 
    generator.train()
    discriminator.train()

    # For each batch of training data...
    for step, batch in enumerate(train_dataloader):

        # Progress update every print_each_n_step batches.
        if step % print_each_n_step == 0 and not step == 0:
            # Calculate elapsed time in minutes.
            elapsed = format_time(time.time() - t0)
            
            # Report progress.
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        # Unpack this training batch from our dataloader. 
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        b_label_mask = batch[3].to(device)

        real_batch_size = b_input_ids.shape[0]
     
        # Encode real data in the Transformer
        model_outputs = transformer(b_input_ids, attention_mask=b_input_mask)
        hidden_states = model_outputs.hidden_states[-1][:,-1,:]
        
        # Generate fake data that should have the same distribution of the ones
        # encoded by the transformer. 
        # First noisy input are used in input to the Generator
        noise = torch.zeros(real_batch_size, noise_size, device=device).uniform_(0, 1)
        # Gnerate Fake data
        gen_rep = generator(noise)

        # Generate the output of the Discriminator for real and fake data.
        # First, we put together the output of the tranformer and the generator
        disciminator_input = torch.cat([hidden_states, gen_rep], dim=0)
        # Then, we select the output of the disciminator
        features, logits, probs = discriminator(disciminator_input)

        # Finally, we separate the discriminator's output for the real and fake
        # data
        features_list = torch.split(features, real_batch_size)
        D_real_features = features_list[0]
        D_fake_features = features_list[1]
      
        logits_list = torch.split(logits, real_batch_size)
        D_real_logits = logits_list[0]
        D_fake_logits = logits_list[1]
        
        probs_list = torch.split(probs, real_batch_size)
        D_real_probs = probs_list[0]
        D_fake_probs = probs_list[1]

        #---------------------------------
        #  LOSS evaluation
        #---------------------------------
        # Generator's LOSS estimation
        g_loss_d = -1 * torch.mean(torch.log(1 - D_fake_probs[:,-1] + epsilon))
        g_feat_reg = torch.mean(torch.pow(torch.mean(D_real_features, dim=0) - torch.mean(D_fake_features, dim=0), 2))
        g_loss = g_loss_d + g_feat_reg
  
        # Disciminator's LOSS estimation
        logits = D_real_logits[:,0:-1]
        log_probs = F.log_softmax(logits, dim=-1)
        # The discriminator provides an output for labeled and unlabeled real data
        # so the loss evaluated for unlabeled data is ignored (masked)
        label2one_hot = torch.nn.functional.one_hot(b_labels, len(label_list))
        per_example_loss = -torch.sum(label2one_hot * log_probs, dim=-1)
        per_example_loss = torch.masked_select(per_example_loss, b_label_mask.to(device))
        labeled_example_count = per_example_loss.type(torch.float32).numel()

        # It may be the case that a batch does not contain labeled examples, 
        # so the "supervised loss" in this case is not evaluated
        if labeled_example_count == 0:
          D_L_Supervised = 0
        else:
          D_L_Supervised = torch.div(torch.sum(per_example_loss.to(device)), labeled_example_count)
                 
        D_L_unsupervised1U = -1 * torch.mean(torch.log(1 - D_real_probs[:, -1] + epsilon))
        D_L_unsupervised2U = -1 * torch.mean(torch.log(D_fake_probs[:, -1] + epsilon))
        d_loss = D_L_Supervised + D_L_unsupervised1U + D_L_unsupervised2U

        #---------------------------------
        #  OPTIMIZATION
        #---------------------------------
        # Avoid gradient accumulation
        gen_optimizer.zero_grad()
        dis_optimizer.zero_grad()

        # Calculate weigth updates
        # retain_graph=True is required since the underlying graph will be deleted after backward
        g_loss.backward(retain_graph=True)
        d_loss.backward() 
        
        # Apply modifications
        gen_optimizer.step()
        dis_optimizer.step()

        # A detail log of the individual losses
        #print("{0:.4f}\t{1:.4f}\t{2:.4f}\t{3:.4f}\t{4:.4f}".
        #      format(D_L_Supervised, D_L_unsupervised1U, D_L_unsupervised2U,
        #             g_loss_d, g_feat_reg))

        # Save the losses to print them later
        tr_g_loss += g_loss.item()
        tr_d_loss += d_loss.item()

        # Update the learning rate with the scheduler
        if apply_scheduler:
          scheduler_d.step()
          scheduler_g.step()

    # Calculate the average loss over all of the batches.
    avg_train_loss_g = tr_g_loss / len(train_dataloader)
    avg_train_loss_d = tr_d_loss / len(train_dataloader)             
    
    # Measure how long this epoch took.
    training_time = format_time(time.time() - t0)

    print("")
    print("  Average training loss generetor: {0:.3f}".format(avg_train_loss_g))
    print("  Average training loss discriminator: {0:.3f}".format(avg_train_loss_d))
    print("  Training epcoh took: {:}".format(training_time))
        
    # ========================================
    #     TEST ON THE EVALUATION DATASET
    # ========================================
    # After the completion of each training epoch, measure our performance on
    # our test set.
    print("")
    print("Running Test...")

    t0 = time.time()

    # Put the model in evaluation mode--the dropout layers behave differently
    # during evaluation.
    transformer.eval() #maybe redundant
    discriminator.eval()
    generator.eval()

    # Tracking variables 
    total_test_accuracy = 0
   
    total_test_loss = 0
    nb_test_steps = 0

    all_preds = []
    all_labels_ids = []

    #loss
    nll_loss = torch.nn.CrossEntropyLoss(ignore_index=-1)

    # Evaluate data for one epoch
    for batch in test_dataloader:
        
        # Unpack this training batch from our dataloader. 
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        
        # Tell pytorch not to bother with constructing the compute graph during
        # the forward pass, since this is only needed for backprop (training).
        with torch.no_grad():        
            model_outputs = transformer(b_input_ids, attention_mask=b_input_mask)
            hidden_states = model_outputs.hidden_states[-1][:,-1,:]
            _, logits, probs = discriminator(hidden_states)
            ###log_probs = F.log_softmax(probs[:,1:], dim=-1)
            filtered_logits = logits[:,0:-1]
            # Accumulate the test loss.
            total_test_loss += nll_loss(filtered_logits, b_labels)
            
        # Accumulate the predictions and the input labels
        _, preds = torch.max(filtered_logits, 1)
        all_preds += preds.detach().cpu()
        all_labels_ids += b_labels.detach().cpu()

    # Report the final accuracy for this validation run.
    all_preds = torch.stack(all_preds).numpy()
    print('all_preds')
    print(all_preds)
    all_labels_ids = torch.stack(all_labels_ids).numpy()
    print('all_labels')
    print(all_labels_ids)
    test_f1 = f1_score(all_labels_ids,all_preds,average='weighted')
    print("  F1 score: {0:.3f}".format(test_f1))

    # Calculate the average loss over all of the batches.
    avg_test_loss = total_test_loss / len(test_dataloader)
    avg_test_loss = avg_test_loss.item()
    
    # Measure how long the validation run took.
    test_time = format_time(time.time() - t0)
    
    print("  Test Loss: {0:.3f}".format(avg_test_loss))
    print("  Test took: {:}".format(test_time))

    # Record all statistics from this epoch.
    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss generator': avg_train_loss_g,
            'Training Loss discriminator': avg_train_loss_d,
            'Valid. Loss': avg_test_loss,
            'Valid. F1.': test_f1,
            'Training Time': training_time,
            'Test Time': test_time
        }
    )


Training...
  Batch    10  of    131.    Elapsed: 0:00:06.
  Batch    20  of    131.    Elapsed: 0:00:12.
  Batch    30  of    131.    Elapsed: 0:00:18.
  Batch    40  of    131.    Elapsed: 0:00:23.
  Batch    50  of    131.    Elapsed: 0:00:29.
  Batch    60  of    131.    Elapsed: 0:00:35.
  Batch    70  of    131.    Elapsed: 0:00:41.
  Batch    80  of    131.    Elapsed: 0:00:47.
  Batch    90  of    131.    Elapsed: 0:00:53.
  Batch   100  of    131.    Elapsed: 0:00:59.
  Batch   110  of    131.    Elapsed: 0:01:05.
  Batch   120  of    131.    Elapsed: 0:01:11.
  Batch   130  of    131.    Elapsed: 0:01:16.

  Average training loss generetor: 0.955
  Average training loss discriminator: 1.965
  Training epcoh took: 0:01:17

Running Test...
all_preds
[2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2

In [37]:
for stat in training_stats:
  print(stat)

print("\nTraining complete!")

print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))

{'epoch': 1, 'Training Loss generator': 0.9552972580640371, 'Training Loss discriminator': 1.9653472550042712, 'Valid. Loss': 0.9574716687202454, 'Valid. F1.': 0.4427151679405259, 'Training Time': '0:01:17', 'Test Time': '0:00:04'}
{'epoch': 2, 'Training Loss generator': 0.7483785666582239, 'Training Loss discriminator': 1.6172780640252673, 'Valid. Loss': 0.9824129939079285, 'Valid. F1.': 0.49528235092890266, 'Training Time': '0:01:17', 'Test Time': '0:00:04'}
{'epoch': 3, 'Training Loss generator': 0.754077912741945, 'Training Loss discriminator': 1.1289995926937073, 'Valid. Loss': 1.0812162160873413, 'Valid. F1.': 0.5759032267276156, 'Training Time': '0:01:17', 'Test Time': '0:00:04'}
{'epoch': 4, 'Training Loss generator': 0.746691677406544, 'Training Loss discriminator': 0.8542967734446052, 'Valid. Loss': 1.2715877294540405, 'Valid. F1.': 0.5835600609144288, 'Training Time': '0:01:17', 'Test Time': '0:00:04'}
{'epoch': 5, 'Training Loss generator': 0.7466824628014601, 'Training Los