# Train reward model with human feedback

In [None]:
import boto3
import sagemaker
import pandas as pd

# Resolve the AWS region, the execution role, the SageMaker session and its default bucket
region = boto3.Session().region_name
role = sagemaker.get_execution_role()
sess = sagemaker.Session()
bucket = sess.default_bucket()

In [None]:
import io
import json
import uuid
import time
import boto3
import botocore

# boto3 service clients for S3, the Augmented AI (A2I) runtime and SageMaker.
# NOTE: the name `sagemaker` is rebound here from the imported SageMaker SDK
# module to a boto3 client.
s3 = boto3.client("s3", region_name=region)
a2i = boto3.client("sagemaker-a2i-runtime")
sagemaker = boto3.client("sagemaker", region_name=region)

In [None]:
import os
import glob
import numpy as np
import argparse
import pprint
from collections import defaultdict

import torch
import torch.distributed as dist
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data
import torch.utils.data.distributed
from torch.utils.data import Dataset, DataLoader

from transformers import AutoConfig, AutoModelForSequenceClassification
from transformers import AdamW, get_linear_schedule_with_warmup

# Retrieve the `human_loop_name`

In [None]:
# %store -r human_loop_name

In [None]:
# try:
#     human_loop_name
# except NameError:
#     print("*** PLEASE RUN PREVIOUS NOTEBOOK BEFORE CONTINUING ***")

In [None]:
# print(human_loop_name)

# Verify the Human Loops are Completed

In [None]:
# import time

# completed_human_loops = []
# for human_loop_name in human_loops_started:
#     resp = a2i.describe_human_loop(HumanLoopName=human_loop_name)
#     print(f"HumanLoop Name: {human_loop_name}")
#     print(f'HumanLoop Status: {resp["HumanLoopStatus"]}')
#     print(f'HumanLoop Output Destination: {resp["HumanLoopOutput"]}')
#     print("")
#     while resp["HumanLoopStatus"] != "Completed":
#         print(f"Waiting for HumanLoop to complete.")
#         time.sleep(10)
#         resp = a2i.describe_human_loop(HumanLoopName=human_loop_name)
#     if resp["HumanLoopStatus"] == "Completed":
#         completed_human_loops.append(resp)
#         print(f"Completed!")
#         print("")

# View Human Labels  

Once the work is complete, Amazon A2I stores the results in the specified S3 bucket and sends a CloudWatch event. Let's check the S3 contents.

In [None]:
# import re
# import pprint

# pp = pprint.PrettyPrinter(indent=4)

# human_feedback_items = []
# human
# for resp in completed_human_loops:
#     human_feedback_s3_uri = "s3://" + bucket + "/", resp["HumanLoopOutput"]["OutputS3Uri"]
#     split_string = re.split(human_feedback_s3_uri)
#     output_bucket_key = split_string[1]

#     response = s3.get_object(Bucket=bucket, Key=output_bucket_key)
#     content = response["Body"].read().decode("utf-8")
#     json_output = json.loads(content)
#     print(json_output)

#     input_content = json_output["inputContent"]
#     human_answer = json_output["humanAnswers"][0]["answerContent"]
#     human_feedback_item = {"input_content": input_content, "human_answer": human_answer, s3_uri: "s3_uri"}
#     human_feedback_items.append(human_feedback_item)

# Prepare the Data for Re-training

In [None]:
# df_human_feedback_items = pd.DataFrame(human_feedback_items)
# df_human_feedback_items.head()

# Train a reward model with human preference, instruction, and alignment data
This is typically a language model initialized from the supervised-fine-tuned (SFT) model (trained in a previous notebook), but with an additional binary-classification layer placed on top.  This reward model is used to train the reinforcement-learning model in the next step.  The reinforcement-learning model is what is deployed into production to serve applications.

In [None]:
%store -r model_checkpoint

In [None]:
# Fail loudly (but without raising) when `model_checkpoint` was not restored
# into the notebook namespace.
try:
    model_checkpoint
except NameError:
    print(
        "+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++",
        "[ERROR] Please run the notebooks in the previous section before you continue.",
        "+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++",
        sep="\n",
    )

In [None]:
# Show the checkpoint name restored by `%store -r model_checkpoint`
print(model_checkpoint)

In [None]:
%store -r supervised_fine_tuned_model_path

In [None]:
# Fail loudly (but without raising) when `supervised_fine_tuned_model_path`
# was not restored into the notebook namespace.
try:
    supervised_fine_tuned_model_path
except NameError:
    print(
        "+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++",
        "[ERROR] Please run the notebooks in the previous section before you continue.",
        "+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++",
        sep="\n",
    )

In [None]:
# Show the model path restored by `%store -r supervised_fine_tuned_model_path`
print(supervised_fine_tuned_model_path)

In [None]:
def create_list_input_files(path):
    """Return (and print) the ``*.parquet`` files found directly under ``path``.

    Args:
        path: Directory to search; it is interpolated into a glob pattern.

    Returns:
        list[str]: Matching file paths, in the order ``glob.glob`` yields them.
    """
    pattern = '{}/*.parquet'.format(path)
    parquet_files = glob.glob(pattern)
    print(parquet_files)
    return parquet_files

def save_transformer_model(model, model_dir):
    """Save ``model`` via ``save_pretrained`` into ``<model_dir>/transformer``.

    The target directory is created if it does not exist yet.

    Args:
        model: Object exposing ``save_pretrained(path)``.
        model_dir: Parent directory for the ``transformer`` sub-directory.
    """
    transformer_dir = os.path.join(model_dir, 'transformer')
    os.makedirs(transformer_dir, exist_ok=True)
    print('Saving Transformer model to {}'.format(transformer_dir))
    model.save_pretrained(transformer_dir)

def save_pytorch_model(model, model_checkpoint, model_dir):
    """Write ``model.state_dict()`` to a file inside ``model_dir``.

    The file name is ``model_checkpoint`` with every ``/`` replaced by ``-``,
    so a hub-style name such as ``org/name`` becomes ``org-name``.

    Args:
        model: Module whose ``state_dict()`` is serialized with ``torch.save``.
        model_checkpoint: Checkpoint name used to derive the file name.
        model_dir: Output directory; created if missing.
    """
    os.makedirs(model_dir, exist_ok=True)
    print('Saving PyTorch model to {}'.format(model_dir))
    checkpoint_file = model_checkpoint.replace('/', '-')
    torch.save(model.state_dict(), os.path.join(model_dir, checkpoint_file))

# PyTorch Dataset and DataLoader 

In [None]:
# PyTorch dataset retrieves the dataset’s features and labels one sample at a time
# Create a custom Dataset class for the reviews
class ReviewDataset(Dataset):
    """Map-style dataset pairing JSON-encoded token-id lists with label ids.

    ``input_ids_list[i]`` must be a JSON string that decodes to a list of
    integers; ``label_id_list[i]`` is the matching label id.
    """

    def __init__(self, input_ids_list, label_id_list):
        self.input_ids_list, self.label_id_list = input_ids_list, label_id_list

    def __len__(self):
        # One sample per entry of the token-id sequence
        return len(self.input_ids_list)

    def __getitem__(self, item):
        """Return ``(input_ids, label_id)`` for sample ``item`` as long tensors."""
        # Decode the JSON string into a plain list of token ids
        token_ids = json.loads(self.input_ids_list[item])
        raw_label = self.label_id_list[item]

        return (
            torch.LongTensor(token_ids),
            torch.tensor(raw_label, dtype=torch.long),
        )

    
# PyTorch DataLoader helps to organise the input training data into “minibatches” and reshuffles the data at every epoch
# It takes a Dataset as input
def create_data_loader(path, batch_size):
    """Build a shuffling ``DataLoader`` over every parquet file found in ``path``.

    The files are located with ``create_list_input_files`` and read with
    ``pd.read_parquet``. Each file must provide ``input_ids`` and ``label_id``
    columns, which are wrapped in a ``ReviewDataset``.

    Args:
        path: Directory containing the ``*.parquet`` input files.
        batch_size: Number of samples per batch.

    Returns:
        tuple: ``(data_loader, df)`` where ``data_loader`` shuffles and drops
        the last incomplete batch, and ``df`` is the concatenation of all
        files that were read (an empty frame with the two expected columns
        when no file was found).
    """
    print("Get data loader")

    frames = []
    for file in create_list_input_files(path):
        df_temp = pd.read_parquet(file)
        frames.append(df_temp)
        print('adding df_temp: {}'.format(df_temp))

    # `DataFrame.append` was removed in pandas 2.0; a single concat also avoids
    # re-copying the accumulated frame once per file.
    if frames:
        df = pd.concat(frames)
    else:
        df = pd.DataFrame(columns=['input_ids', 'label_id'])

    ds = ReviewDataset(
        input_ids_list=df.input_ids.to_numpy(),
        label_id_list=df.label_id.to_numpy(),
    )

    return DataLoader(
        ds,
        batch_size=batch_size,
        shuffle=True,
        drop_last=True,
    ), df



# Configure the model

In [None]:
# TODO:  Change this to binary classification
#        where 1 is assigned to the human-selected (presumably-correct) label
#        and 0 is assigned to all of other labels

def get_model_config():
    """Load the SFT model's config, set up as a 5-class classifier.

    The config is read from ``supervised_fine_tuned_model_path``. Class labels
    1..5 are mapped to the zero-based ids 0..4 (and back), and attention
    outputs are enabled on the returned config.

    Returns:
        The configuration object produced by ``AutoConfig.from_pretrained``.
    """
    classes = [1, 2, 3, 4, 5]

    # id -> label is simply the position of each class in the list
    id2label = dict(enumerate(classes))
    label2id = {label: idx for idx, label in id2label.items()}

    config = AutoConfig.from_pretrained(
        supervised_fine_tuned_model_path,
        num_labels=len(classes),
        id2label=id2label,
        label2id=label2id,
    )
    config.output_attentions = True

    return config

# Train the reward model

In [None]:
def _model_device(model):
    """Return the device holding ``model``'s parameters (CPU if it has none)."""
    try:
        return next(model.parameters()).device
    except StopIteration:
        return torch.device('cpu')


def _run_validation(model, val_data_loader, loss_function, device):
    """Return ``(mean_loss, accuracy_percent)`` of ``model`` on ``val_data_loader``.

    Switches the model to eval mode and disables gradient tracking. An empty
    loader yields ``(nan, 0.0)`` instead of dividing by zero.
    """
    model.eval()
    correct = 0
    total = 0
    loss_sum = 0.0

    with torch.no_grad():
        for sent, label in val_data_loader:
            sent = sent.squeeze(0).to(device)
            label = label.to(device)
            output = model(sent)[0]
            _, predicted = torch.max(output, 1)

            batch_size = label.size(0)
            # Weight each batch by its size so the result is a per-sample mean
            loss_sum += loss_function(output, label).item() * batch_size
            total += batch_size
            correct += (predicted == label).sum().item()

    if total == 0:
        return float('nan'), 0.0
    return loss_sum / total, 100.00 * correct / total


def train_model(model,
                train_data_loader,
                df_train,
                val_data_loader,
                df_val,
                args):
    """Train ``model`` with Adam and cross-entropy loss, optionally validating.

    Args:
        model: Module whose call returns a sequence with the logits at index 0.
        train_data_loader: Iterable of ``(input_ids, label)`` training batches.
        df_train: Unused; kept for interface compatibility.
        val_data_loader: Iterable of ``(input_ids, label)`` validation batches.
        df_val: Unused; kept for interface compatibility.
        args: Namespace providing ``learning_rate``, ``freeze_base_layers``,
            ``epochs``, ``train_steps_per_epoch``, ``run_validation`` and
            ``validation_steps_per_epoch``.

    Returns:
        The same ``model`` instance after training.
    """
    loss_function = nn.CrossEntropyLoss()
    optimizer = optim.Adam(params=model.parameters(), lr=args.learning_rate)

    if args.freeze_base_layers:
        print('Freezing base layers...')
        for name, param in model.named_parameters():
            if 'classifier' not in name:  # everything except the classifier layer
                param.requires_grad = False
        print('Set base (non-classifier) layers to `param.requires_grad=False`.')

    # Move batches to wherever the model lives. Checking CUDA availability
    # instead would break when a GPU exists but the model was kept on the CPU.
    device = _model_device(model)

    for epoch in range(args.epochs):
        print('EPOCH -- {}'.format(epoch))

        for i, (sent, label) in enumerate(train_data_loader):
            # Cap the number of optimisation steps taken per epoch
            if i >= args.train_steps_per_epoch:
                break

            print('i: {}'.format(i))
            print('sent: {}'.format(sent))
            print('label: {}'.format(label))

            # Validation below switches to eval mode, so re-enable training mode
            model.train()
            optimizer.zero_grad()
            # squeeze(0) is a no-op unless the leading dimension has size 1
            sent = sent.squeeze(0).to(device)
            label = label.to(device)
            output = model(sent)[0]

            loss = loss_function(output, label)
            loss.backward()
            optimizer.step()

            # `validation_steps_per_epoch` acts as the interval (in training
            # steps) between passes over the full validation loader.
            if args.run_validation and i % args.validation_steps_per_epoch == 0:
                print('RUNNING VALIDATION:')
                val_loss, accuracy = _run_validation(
                    model, val_data_loader, loss_function, device)
                print('[epoch/step: {0}/{1}] val_loss: {2:.2f} - val_acc: {3:.2f}%'.format(
                    epoch, i, val_loss, accuracy))

    print('TRAINING COMPLETED.')
    return model

In [None]:
#if __name__ == '__main__':
    
# Parse args


# Emulate the SM_* variables of a SageMaker training container so the script
# below can run locally inside the notebook.
os.environ.update({
    'SM_HOSTS': '{"hosts": ["algo-1"]}',
    'SM_CURRENT_HOST': 'algo-1',
    'SM_NUM_GPUS': '0',
    'SM_MODEL_DIR': './model/reward_model/',
    'SM_CHANNEL_TRAIN': './data/train',
    'SM_CHANNEL_VALIDATION': './data/validation',
    'SM_OUTPUT_DIR': './model_output/',
})


def _parse_hosts(value):
    """Parse a JSON host specification into a list of host names.

    Accepts a JSON list (``'["algo-1"]'``), a JSON object with a ``hosts``
    key (``'{"hosts": ["algo-1"]}'``) or a single JSON string. Invalid JSON
    raises ``ValueError``, which argparse reports as an invalid argument.
    """
    parsed = json.loads(value)
    if isinstance(parsed, dict):
        parsed = parsed.get('hosts', [])
    if isinstance(parsed, str):
        return [parsed]
    return list(parsed)


parser = argparse.ArgumentParser()

# CLI args
# NOTE(review): the boolean flags below use `type=eval`, which executes the
# supplied text as Python code. Only pass trusted values (`True` / `False`).

parser.add_argument('--train_batch_size',
                    type=int,
                    default=64)

parser.add_argument('--train_steps_per_epoch',
                    type=int,
                    default=64)

parser.add_argument('--validation_batch_size',
                    type=int,
                    default=64)

parser.add_argument('--validation_steps_per_epoch',
                    type=int,
                    default=64)

parser.add_argument('--epochs',
                    type=int,
                    default=10)

parser.add_argument('--freeze_base_layers',
                    type=eval,
                    default=False)

parser.add_argument('--learning_rate',
                    type=float,
                    default=0.01)

parser.add_argument('--momentum',
                    type=float,
                    default=0.5)

parser.add_argument('--seed',
                    type=int,
                    default=42)

parser.add_argument('--log_interval',
                    type=int,
                    default=100)

parser.add_argument('--backend',
                    type=str,
                    default=None)

parser.add_argument('--run_validation',
                    type=eval,
                    default=False)

parser.add_argument('--model-checkpoint',
                    type=str,
                    default=model_checkpoint)


# Container environment

# The default is passed as the raw string so argparse runs it through
# `_parse_hosts`; `args.hosts` is therefore always a list, which the later
# `len(args.hosts)` / `args.hosts.index(...)` calls require.
parser.add_argument('--hosts',
                    type=_parse_hosts,
                    default=os.environ['SM_HOSTS'])

parser.add_argument('--current_host',
                    type=str,
                    default=os.environ['SM_CURRENT_HOST'])

parser.add_argument('--model_dir',
                    type=str,
                    default=os.environ['SM_MODEL_DIR'])

parser.add_argument('--train_data',
                    type=str,
                    default=os.environ['SM_CHANNEL_TRAIN'])

parser.add_argument('--validation_data',
                    type=str,
                    default=os.environ['SM_CHANNEL_VALIDATION'])

parser.add_argument('--output_dir',
                    type=str,
                    default=os.environ['SM_OUTPUT_DIR'])

# String defaults are converted by argparse with `type`, so this becomes an int
parser.add_argument('--num_gpus',
                    type=int,
                    default=os.environ['SM_NUM_GPUS'])

# Debugger args

parser.add_argument("--save-frequency",
                    type=int,
                    default=10,
                    help="frequency with which to save steps")

parser.add_argument("--smdebug_path",
                    type=str,
                    help="output directory to save data in",
                    default="/opt/ml/output/tensors",)

parser.add_argument("--hook-type",
                    type=str,
                    choices=["saveall", "module-input-output", "weights-bias-gradients"],
                    default="saveall",)

# parse_known_args ignores unrelated arguments (e.g. those of the notebook kernel)
args, _ = parser.parse_known_args()


# Echo the parsed arguments so the run configuration is visible in the output
print('Loaded arguments:')
print(args)

# Get environment variables
# NOTE(review): this prints every environment variable of the process —
# confirm no secrets end up in shared notebook output.

env_var = os.environ 
print('Environment variables:')
pprint.pprint(dict(env_var), width = 1) 

# Check if distributed training
# Distributed mode needs more than one entry in args.hosts AND an explicit --backend

is_distributed = len(args.hosts) > 1 and args.backend is not None

print("Distributed training - {}".format(is_distributed))
use_cuda = args.num_gpus > 0
print("Number of gpus available - {}".format(args.num_gpus))
# DataLoader-style options; not referenced again in this cell — presumably
# meant to be forwarded to the data loaders (TODO confirm)
kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}

# The device is chosen from --num_gpus, not from torch.cuda.is_available()
device = torch.device('cuda' if use_cuda else 'cpu')

# Initialize the distributed environment.
# World size is the number of hosts; rank is this host's position in args.hosts.

if is_distributed:
    world_size = len(args.hosts)
    os.environ['WORLD_SIZE'] = str(world_size)
    host_rank = args.hosts.index(args.current_host)
    os.environ['RANK'] = str(host_rank)
    dist.init_process_group(backend=args.backend, rank=host_rank, world_size=world_size)
    print('Initialized the distributed environment: \'{}\' backend on {} nodes. '.format(
        args.backend, dist.get_world_size()) + 'Current host rank is {}. Number of gpus: {}'.format(
        dist.get_rank(), args.num_gpus))

# Set the seed for generating random numbers
# The CUDA generator is seeded separately, and only when GPUs are in use

torch.manual_seed(args.seed)
if use_cuda:
    torch.cuda.manual_seed(args.seed) 

# Instantiate model

# `random` is needed for the retry back-off below and is not imported elsewhere
import random

config = None
model = None

successful_download = False
retries = 0
max_retries = 5
last_error = None

# Loading the model can fail transiently, so retry a few times with a random
# back-off instead of giving up on the first error.
while retries < max_retries and not successful_download:
    try:
        # Setup model
        config = get_model_config()
        model = AutoModelForSequenceClassification.from_pretrained(
            supervised_fine_tuned_model_path,
            config=config
        )

        model.to(device)
        successful_download = True
        print('Sucessfully downloaded after {} retries.'.format(retries))

    except Exception as err:
        # Keep the cause so the final failure can report it
        last_error = err
        retries = retries + 1
        random_sleep = random.randint(1, 30)
        print('Model setup failed: {!r}'.format(err))
        print('Retry #{}.  Sleeping for {} seconds'.format(retries, random_sleep))
        time.sleep(random_sleep)

if not successful_download:
    # Stop here: continuing would only fail later with a less helpful error
    print('Not properly initialized...')
    raise RuntimeError(
        'Model could not be initialized after {} attempts.'.format(retries)
    ) from last_error

# Create data loaders

# create_data_loader returns (DataLoader, DataFrame) for each channel directory
train_data_loader, df_train = create_data_loader(args.train_data, args.train_batch_size)
val_data_loader, df_val = create_data_loader(args.validation_data, args.validation_batch_size)

# Report how many samples the sampler covers relative to the dataset size
print("Processes {}/{} ({:.0f}%) of train data".format(
    len(train_data_loader.sampler), len(train_data_loader.dataset),
    100. * len(train_data_loader.sampler) / len(train_data_loader.dataset)
))

print("Processes {}/{} ({:.0f}%) of validation data".format(
    len(val_data_loader.sampler), len(val_data_loader.dataset),
    100. * len(val_data_loader.sampler) / len(val_data_loader.dataset)
)) 

print('model_dir: {}'.format(args.model_dir))    
#print('model summary: {}'.format(model))

# Not referenced again in this cell — presumably leftovers (TODO confirm)
callbacks = []
initial_epoch_number = 0

# Start training
# train_model returns the model it was given, so `model` is rebound to the trained instance

model = train_model(
    model,
    train_data_loader,
    df_train,
    val_data_loader, 
    df_val,
    args
)

In [None]:
# Persist the trained model twice under args.model_dir: once via
# `save_pretrained` (in the `transformer` sub-directory) and once as a raw
# PyTorch state dict named after the checkpoint.
save_transformer_model(model, args.model_dir)
save_pytorch_model(model, args.model_checkpoint, args.model_dir)

In [None]:
from transformers import TextClassificationPipeline
from transformers import pipeline

# Directory written by save_transformer_model(). This must be a plain string:
# a trailing comma here would turn it into a one-element tuple, which
# `pipeline` cannot load as a model path.
reward_model_path = os.path.join(args.model_dir, 'transformer')

device = torch.device('cuda' if use_cuda else 'cpu')

# Load the saved reward model as a text-classification pipeline
inference_pipeline = pipeline("text-classification",
                              model=reward_model_path,
                             )


In [None]:
# %%html

# <p><b>Shutting down your kernel for this notebook to release resources.</b></p>
# <button class="sm-command-button" data-commandlinker-command="kernelmenu:shutdown" style="display:none;">Shutdown Kernel</button>
        
# <script>
# try {
#     els = document.getElementsByClassName("sm-command-button");
#     els[0].click();
# }
# catch(err) {
#     // NoOp
# }    
# </script>