Let's GO!

Required Imports.

In [1]:
!pip install transformers
!pip install datasets

Looking in indexes: http://mirrors.aliyun.com/pypi/simple
Looking in indexes: http://mirrors.aliyun.com/pypi/simple


In [2]:
import torch
import io
import torch.nn.functional as F
import random
import numpy as np
import time
import math
import pandas as pd
import datetime
from tqdm.auto import tqdm
import torch.nn as nn
from transformers import *
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import (set_seed,
                          TrainingArguments,
                          Trainer,
                          GPT2Config,
                          GPT2Tokenizer,
                          AdamW, 
                          get_linear_schedule_with_warmup,
                          GPT2ForSequenceClassification)
#!pip install torch==1.7.1+cu101 torchvision==0.8.2+cu101 -f https://download.pytorch.org/whl/torch_stable.html
#!pip install sentencepiece

##Set random values
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
if torch.cuda.is_available():
  torch.cuda.manual_seed_all(seed_val)

In [3]:
from datasets import load_dataset
dataset = load_dataset("amazon_reviews_multi",'en')

Reusing dataset amazon_reviews_multi (/root/.cache/huggingface/datasets/amazon_reviews_multi/en/1.0.0/724e94f4b0c6c405ce7e476a6c5ef4f87db30799ad49f765094cf9770e0f7609)


  0%|          | 0/3 [00:00<?, ?it/s]

In [4]:
dataset['train'][0]

{'review_id': 'en_0964290',
 'product_id': 'product_en_0740675',
 'reviewer_id': 'reviewer_en_0342986',
 'stars': 1,
 'review_body': "Arrived broken. Manufacturer defect. Two of the legs of the base were not completely formed, so there was no way to insert the casters. I unpackaged the entire chair and hardware before noticing this. So, I'll spend twice the amount of time boxing up the whole useless thing and send it back with a 1-star review of part of a chair I never got to sit in. I will go so far as to include a picture of what their injection molding and quality assurance process missed though. I will be hesitant to buy again. It makes me wonder if there aren't missing structures and supports that don't impede the assembly process.",
 'review_title': "I'll spend twice the amount of time boxing up the whole useless thing and send it back with a 1-star review ...",
 'language': 'en',
 'product_category': 'furniture'}

In [5]:
train_num=len(dataset['train'])
test_num=len(dataset['test'])
val_num=len(dataset['validation'])

In [6]:
print(train_num)
print(test_num)
print(val_num)

200000
5000
5000


In [7]:
def get_dict(data):
  data_dict={}
  for i in range(len(data)):
    data_dict[data[i]['review_body']]=data[i]['stars']
  return data_dict
train_dict=get_dict(dataset['train'])
test_dict=get_dict(dataset['test'])
val_dict=get_dict(dataset['validation'])

In [8]:
train_df = pd.DataFrame(list(train_dict.items()),columns = ['review_body','stars']) 
test_df  = pd.DataFrame(list(test_dict.items()),columns = ['review_body','stars'])
val_df   = pd.DataFrame(list(test_dict.items()),columns = ['review_body','stars'])

In [9]:
val_df.groupby(['stars']).count()

Unnamed: 0_level_0,review_body
stars,Unnamed: 1_level_1
1,1000
2,1000
3,1000
4,1000
5,1000


In [10]:
train_df.groupby(['stars']).count()

Unnamed: 0_level_0,review_body
stars,Unnamed: 1_level_1
1,39841
2,39930
3,39925
4,39861
5,39869


In [11]:
test_df.groupby(['stars']).count()

Unnamed: 0_level_0,review_body
stars,Unnamed: 1_level_1
1,1000
2,1000
3,1000
4,1000
5,1000


In [12]:
len(train_df)

199426

In [13]:
from sklearn.model_selection import StratifiedKFold
label_list = ['1','2','3','4','5']

## Use 5% label data, the rest as unlabel data

In [14]:
def get_label_unlabel(train_df):
  cv = StratifiedKFold(n_splits=100,shuffle = True,random_state=42)
  i=0
  unlabel_idx=[]
  label_idx=[]
  for train_idxs, test_idxs in cv.split(train_df,train_df['stars']):
    if i in [i for i in range(0,100,20)]:
      label_idx=np.concatenate((label_idx,test_idxs), axis=None)
      i+=1
    # elif i in range(4,400,2):
    #   unlabel_idx=np.concatenate((unlabel_idx,test_idxs), axis=None)
    #   i+=1
    else:
      unlabel_idx=np.concatenate((unlabel_idx,test_idxs), axis=None)
      i+=1
  return train_df.loc[label_idx,:],train_df.loc[unlabel_idx,:]

In [15]:
train_df,unlabel_data=get_label_unlabel(train_df)

In [16]:
print('label_data：',len(train_df))
print('test_data：',len(test_df))

label_data： 9972
test_data： 5000


In [17]:
train_df.reset_index(drop=True,inplace=True)
test_df.reset_index(drop=True,inplace=True)

In [18]:
def get_qc_examples(data,label=False):
  """Creates examples for the training and dev sets."""
  examples = []
  if label==True:
    for i in range(len(data)):
      x=data.loc[i,'review_body']
      y=data.loc[i,'stars']
      examples.append((x,y))
  else:
    for i in range(len(data)):
      x=data.loc[i,'review_body']
      y='UNK_UNK'
      examples.append((x,y))

  return examples

In [19]:
#Load the examples
labeled_examples = get_qc_examples(train_df,label=True)
test_examples = get_qc_examples(test_df,label=True)

# original size
# train 688  
# unlabel 14433
# test 972

In [20]:
len(test_examples)

5000

In [21]:
# If there's a GPU available...
if torch.cuda.is_available():    
    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: NVIDIA RTX A5000


### Input Parameters


In [22]:
#--------------------------------
#  Transformer parameters
#--------------------------------
max_seq_length = 64
batch_size = 32


# Replicate labeled data to balance poorly represented datasets, 
# e.g., less than 1% of labeled material
apply_balance = True

#--------------------------------
#  Optimization parameters
#--------------------------------
learning_rate_discriminator = 5e-5
epsilon = 1e-8
num_train_epochs = 5
multi_gpu = True
# Scheduler
apply_scheduler = False
warmup_proportion = 0.1
# Print
print_each_n_step = 10

#--------------------------------
#  Adopted Tranformer model
#--------------------------------

model_name = "gpt2"
#model_name = "bert-base-uncased"
#model_name = "roberta-base"
#model_name = "albert-base-v2"
#model_name = "xlm-roberta-base"
#model_name = "amazon/bort"


In [23]:
# Get model configuration.
print('Loading configuraiton...')
model_config = GPT2Config.from_pretrained(pretrained_model_name_or_path=model_name, num_labels=len(label_list))

# Get model's tokenizer.
print('Loading tokenizer...')
tokenizer = GPT2Tokenizer.from_pretrained(pretrained_model_name_or_path=model_name,return_tensors="pt")
# default to left padding
tokenizer.padding_side = "left"
# Define PAD Token = EOS Token = 50256
tokenizer.pad_token = tokenizer.eos_token


# Get the actual model.
print('Loading model...')
transformer = GPT2ForSequenceClassification.from_pretrained(pretrained_model_name_or_path=model_name, config=model_config)

# resize model embedding to match new tokenizer
transformer.resize_token_embeddings(len(tokenizer))

# fix model padding token id
transformer.config.pad_token_id = transformer.config.eos_token_id

# Load model to defined device.
transformer.to(device)
print('Model loaded to `%s`'%device)

Loading configuraiton...
Loading tokenizer...
Loading model...


Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loaded to `cuda`


In [24]:
lengths_dict = {}
from tqdm.auto import tqdm

# get train_label length
lengths = []
tk0 = tqdm(labeled_examples, total=len(labeled_examples))
for text in tk0:
    length = len(tokenizer(text[0], add_special_tokens=False)['input_ids'])
    lengths.append(length)
lengths_dict['label'] = lengths


# get test length
lengths = []
tk0 = tqdm(test_examples, total=len(test_examples))
for text in tk0:
    length = len(tokenizer(text[0], add_special_tokens=False)['input_ids'])
    lengths.append(length)
lengths_dict['test'] = lengths
    
max_len = max(max(lengths_dict['label']),max(lengths_dict['test'])) + 2 # CLS + SEP 
if max_len>512:
  max_seq_length=256
else:
  max_seq_length = max_len
print(f"max_seq_length: {max_seq_length}")

  0%|          | 0/9972 [00:00<?, ?it/s]

  0%|          | 0/5000 [00:00<?, ?it/s]

max_seq_length: 256


In [25]:
max_seq_length

256

Functions required to convert examples into Dataloader

In [26]:
def generate_data_loader(input_examples, label_map, do_shuffle = False, balance_label_examples = False):
  '''
  Generate a Dataloader given the input examples, eventually masked if they are 
  to be considered NOT labeled.
  '''
  
  #-----------------------------------------------
  # Generate input examples to the Transformer
  #-----------------------------------------------
  input_ids = []
  input_mask_array = []
  label_id_array = []

  # Tokenization 
  for text in input_examples:
    encoded_sent = tokenizer.encode(text[0], add_special_tokens=True, max_length=max_seq_length, padding="max_length", truncation=True)
    input_ids.append(encoded_sent)
    label_id_array.append(label_map[str(text[1])])
  
  # Attention to token (to ignore padded input wordpieces)
  for sent in input_ids:
    att_mask = [int(token_id !=50256) for token_id in sent] 
    input_mask_array.append(att_mask)
  # Convertion to Tensor
  input_ids = torch.tensor(input_ids) 
  input_mask_array = torch.tensor(input_mask_array)
  label_id_array = torch.tensor(label_id_array, dtype=torch.long)

  # Building the TensorDataset
  dataset = TensorDataset(input_ids, input_mask_array, label_id_array)

  if do_shuffle:
    sampler = RandomSampler
  else:
    sampler = SequentialSampler

  # Building the DataLoader
  return DataLoader(
              dataset,  # The training samples.
              sampler = sampler(dataset), 
              batch_size = batch_size) # Trains with this batch size.

def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))
    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))

Convert the input examples into DataLoader

In [27]:
label_map = {}
for (i, label) in enumerate(label_list):
  label_map[label] = i
#------------------------------
#   Load the train dataset
#------------------------------
train_examples = labeled_examples
#The labeled (train) dataset is assigned with a mask set to True


train_dataloader = generate_data_loader(train_examples, label_map, do_shuffle = True, balance_label_examples = apply_balance)

#------------------------------
#   Load the test dataset
#------------------------------
#The labeled (test) dataset is assigned with a mask set to True

test_dataloader = generate_data_loader(test_examples, label_map, do_shuffle = False, balance_label_examples = False)

We define the Generator and Discriminator as discussed in https://www.aclweb.org/anthology/2020.acl-main.191/

We instantiate the Discriminator and Generator

In [28]:
# The config file is required to get the dimension of the vector produced by 
# the underlying transformer
config = AutoConfig.from_pretrained(model_name)
hidden_size = int(config.hidden_size)
# Define the number and width of hidden layers

# Put everything in the GPU if available
if torch.cuda.is_available():    
  transformer.cuda()
  if multi_gpu:
    transformer = torch.nn.DataParallel(transformer)

# print(config)

Let's go with the training procedure

In [29]:
import gc
gc.collect()
torch.cuda.empty_cache()
gc.collect()
torch.cuda.empty_cache()
gc.collect()
torch.cuda.empty_cache()
gc.collect()
torch.cuda.empty_cache()
gc.collect()
torch.cuda.empty_cache()

In [30]:
training_stats = []

# Measure the total training time for the whole run.
total_t0 = time.time()

#models parameters
transformer_vars = [i for i in transformer.parameters()]


#optimizer
optimizer = torch.optim.AdamW(transformer_vars, lr=learning_rate_discriminator)

#scheduler
if apply_scheduler:
  num_train_examples = len(train_examples)
  num_train_steps = int(num_train_examples / batch_size * num_train_epochs)
  num_warmup_steps = int(num_train_steps * warmup_proportion)
  scheduler = get_constant_schedule_with_warmup(optimizer, 
                                           num_warmup_steps = num_warmup_steps)

# For each epoch...
for epoch_i in range(0, num_train_epochs):
    # ========================================
    #               Training
    # ========================================
    # Perform one full pass over the training set.
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, num_train_epochs))
    print('Training...')

    # Measure how long the training epoch takes.
    t0 = time.time()

    # Reset the total loss for this epoch.
    tr_loss  = 0

    # Put the model into training mode.
    transformer.train() 

    # For each batch of training data...
    for step, batch in enumerate(train_dataloader):

        # Progress update every print_each_n_step batches.
        if step % print_each_n_step == 0 and not step == 0:
            # Calculate elapsed time in minutes.
            elapsed = format_time(time.time() - t0)
            
            # Report progress.
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        # Unpack this training batch from our dataloader. 
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        real_batch_size = b_input_ids.shape[0]
     
        # Encode real data in the Transformer
        model_outputs = transformer(b_input_ids, attention_mask=b_input_mask,labels=b_labels)
        step_loss=model_outputs[0]
        

        #---------------------------------
        #  OPTIMIZATION
        #---------------------------------
        # Avoid gradient accumulation
        optimizer.zero_grad()

        # Calculate weigth updates
        # retain_graph=True is required since the underlying graph will be deleted after backward
        step_loss.backward() 
                
        # Apply modifications
        optimizer.step()
        # A detail log of the individual losses
        #print("{0:.4f}\t{1:.4f}\t{2:.4f}\t{3:.4f}\t{4:.4f}".
        #      format(D_L_Supervised, D_L_unsupervised1U, D_L_unsupervised2U,
        #             g_loss_d, g_feat_reg))

        # Save the losses to print them later
        tr_loss += step_loss.item()

        # Update the learning rate with the scheduler
        if apply_scheduler:
          scheduler.step()

    # Calculate the average loss over all of the batches.
    avg_train_loss = tr_loss / len(train_dataloader)
    
    # Measure how long this epoch took.
    training_time = format_time(time.time() - t0)

    print("")
    print("  Training epcoh took: {:}".format(training_time))
        
    # ========================================
    #     TEST ON THE EVALUATION DATASET
    # ========================================
    # After the completion of each training epoch, measure our performance on
    # our test set.
    print("")
    print("Running Test...")

    t0 = time.time()

    # Put the model in evaluation mode--the dropout layers behave differently
    # during evaluation.
    transformer.eval() #maybe redundant

    # Tracking variables 
    total_test_accuracy = 0
   
    total_test_loss = 0
    nb_test_steps = 0

    all_preds = []
    all_labels_ids = []

    #loss
    nll_loss = torch.nn.CrossEntropyLoss(ignore_index=-1)

    # Evaluate data for one epoch
    for batch in test_dataloader:
        
        # Unpack this training batch from our dataloader. 
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        
        # Tell pytorch not to bother with constructing the compute graph during
        # the forward pass, since this is only needed for backprop (training).
        with torch.no_grad():    
            output=transformer(b_input_ids, attention_mask=b_input_mask,labels=b_labels)
            tmp_eval_loss, logits = output[0],output[1]
            # Accumulate the test loss.            
        # Accumulate the predictions and the input labels
        preds = np.argmax(logits.to('cpu'), axis=1)
        all_preds += preds.detach().cpu()
        all_labels_ids += b_labels.detach().cpu()
        total_test_loss+=tmp_eval_loss.item()

    # Report the final accuracy for this validation run.
    all_preds = torch.stack(all_preds).numpy()
    print('all_preds')
    print(all_preds)
    all_labels_ids = torch.stack(all_labels_ids).numpy()
    print('all_labels')
    print(all_labels_ids)
    test_accuracy = np.sum(all_preds == all_labels_ids) / len(all_preds)
    print("  Accuracy: {0:.3f}".format(test_accuracy))

    # Calculate the average loss over all of the batches.
    avg_test_loss = total_test_loss / len(test_dataloader)
    avg_test_loss = avg_test_loss
    
    # Measure how long the validation run took.
    test_time = format_time(time.time() - t0)
    
    print("  Test Loss: {0:.3f}".format(avg_test_loss))
    print("  Test took: {:}".format(test_time))

    # Record all statistics from this epoch.
    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Valid. Loss': avg_test_loss,
            'Valid. Accur.': test_accuracy,
            'Training Time': training_time,
            'Test Time': test_time
        }
    )


Training...
  Batch    10  of    312.    Elapsed: 0:00:03.
  Batch    20  of    312.    Elapsed: 0:00:06.
  Batch    30  of    312.    Elapsed: 0:00:09.
  Batch    40  of    312.    Elapsed: 0:00:12.
  Batch    50  of    312.    Elapsed: 0:00:16.
  Batch    60  of    312.    Elapsed: 0:00:19.
  Batch    70  of    312.    Elapsed: 0:00:22.
  Batch    80  of    312.    Elapsed: 0:00:25.
  Batch    90  of    312.    Elapsed: 0:00:28.
  Batch   100  of    312.    Elapsed: 0:00:31.
  Batch   110  of    312.    Elapsed: 0:00:34.
  Batch   120  of    312.    Elapsed: 0:00:37.
  Batch   130  of    312.    Elapsed: 0:00:40.
  Batch   140  of    312.    Elapsed: 0:00:44.
  Batch   150  of    312.    Elapsed: 0:00:47.
  Batch   160  of    312.    Elapsed: 0:00:50.
  Batch   170  of    312.    Elapsed: 0:00:53.
  Batch   180  of    312.    Elapsed: 0:00:56.
  Batch   190  of    312.    Elapsed: 0:00:59.
  Batch   200  of    312.    Elapsed: 0:01:02.
  Batch   210  of    312.    Elapsed: 0:01:06.


In [31]:
for stat in training_stats:
  print(stat)

print("\nTraining complete!")

print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))

{'epoch': 1, 'Valid. Loss': 1.2845545972988104, 'Valid. Accur.': 0.4188, 'Training Time': '0:01:37', 'Test Time': '0:00:17'}
{'epoch': 2, 'Valid. Loss': 1.171353978336237, 'Valid. Accur.': 0.4768, 'Training Time': '0:01:38', 'Test Time': '0:00:17'}
{'epoch': 3, 'Valid. Loss': 1.2208026575434738, 'Valid. Accur.': 0.4546, 'Training Time': '0:01:38', 'Test Time': '0:00:17'}
{'epoch': 4, 'Valid. Loss': 1.1437237737285104, 'Valid. Accur.': 0.5022, 'Training Time': '0:01:38', 'Test Time': '0:00:17'}
{'epoch': 5, 'Valid. Loss': 1.219955934081108, 'Valid. Accur.': 0.502, 'Training Time': '0:01:38', 'Test Time': '0:00:17'}

Training complete!
Total training took 0:09:32 (h:mm:ss)
