# Fine-tune the model with the Amazon Customer Reviews Dataset and a set of prompts

In [2]:
import psutil

notebook_memory = psutil.virtual_memory()
print(notebook_memory)

if notebook_memory.total < 32 * 1000 * 1000 * 1000:
    print('*******************************************')    
    print('YOU ARE NOT USING THE CORRECT INSTANCE TYPE')
    print('PLEASE CHANGE INSTANCE TYPE TO  m5.2xlarge ')
    print('*******************************************')
else:
    correct_instance_type=True

svmem(total=33242578944, available=31007662080, percent=6.7, used=1821356032, free=13780353024, active=1140105216, inactive=16285843456, buffers=2768896, cached=17638100992, shared=847872, slab=1533464576)


In [3]:
#model_checkpoint = "facebook/opt-350m"
model_checkpoint = "bigscience/bloomz-560m"

# Create prompts for few-shot, one-shot, zero-shot inference on sample data

In [5]:
import pandas as pd
import csv
file = './data-tsv/amazon_reviews_us_Digital_Video_Games_v1_00.tsv.gz'

# Read the file
df = pd.read_csv(file, delimiter="\t", quoting=csv.QUOTE_NONE, compression="gzip")

df.isna().values.any()
df = df.dropna()
df = df.reset_index(drop=True)    

print("Shape of dataframe {}".format(df.shape))

# Convert Pandas dataframes into Datasets
import datasets
from datasets import Dataset

# Create Dataset objects (Arrow PyTables) from Pandas dataframes
dataset = Dataset.from_pandas(df)

# Apply prompt    
from promptsource.templates import DatasetTemplates
prompt_templates = DatasetTemplates('amazon_us_reviews/Wireless_v1_00') 

for template in prompt_templates.templates.values():
    print(template.get_name())

prompt = prompt_templates["Given the review body return a categorical rating"]
print(prompt.answer_choices)    
print(prompt.__dict__)

dataset = dataset.select([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]).map(lambda row : {'prompt': prompt.apply(row)[0], 'label': prompt.apply(row)[1]})
prompt = dataset[0]['prompt']
label = dataset[0]['label']
print(prompt)
print(label)

Shape of dataframe (145427, 15)
Generate review headline based on review body
Generate review based on rating and category
Given the review headline return a categorical rating
Generate review headline based on rating
Given the review body return a categorical rating
1 ||| 2 ||| 3 ||| 4 ||| 5
{'answer_choices': '1 ||| 2 ||| 3 ||| 4 ||| 5', 'id': 'e6a1bbde-715d-4dad-9178-e2bcfaf5c646', 'jinja': "Given the following review:\n{{review_body}}\npredict the associated rating from the following choices (1 being lowest and 5 being highest)\n- {{ answer_choices | join('\\n- ') }} \n|||\n{{answer_choices[star_rating-1]}}", 'metadata': <promptsource.templates.Template.Metadata object at 0x7f0a69f8d350>, 'name': 'Given the review body return a categorical rating', 'reference': 'Given the review body, return a categorical rating. '}


  0%|          | 0/10 [00:00<?, ?ex/s]

Given the following review:
I keep buying madden every year hoping they get back to football. This years version is a little better than last years -- but that's not saying much.The game looks great. The only thing wrong with the animation, is the way the players are always tripping on each other.<br /><br />The gameplay is still slowed down by the bloated pre-play controls. What used to take two buttons is now a giant PITA to get done before an opponent snaps the ball or the play clock runs out.<br /><br />The turbo button is back, but the player movement is still slow and awkward. If you liked last years version, I'm guessing you'll like this too. I haven't had a chance to play anything other than training and a few online games, so I'm crossing my fingers and hoping the rest is better.<br /><br />The one thing I can recommend is NOT TO BUY THE MADDEN BUNDLE. The game comes as a download. So if you hate it, there's no trading it in at Gamestop.
predict the associated rating from the 

In [6]:
prompt0 = dataset[0]
prompt1 = dataset[1]
prompt2 = dataset[2]
prompt3 = dataset[3]

few_shot_prompt = 'PROMPT: ' + prompt0['prompt'] + '\nRESPONSE: ' + prompt0['label'] + '\n\nPROMPT: ' + prompt1['prompt'] + '\nRESPONSE: ' + prompt1['label'] + '\n\nPROMPT: ' + prompt2['prompt'] + '\nRESPONSE: ' + prompt2['label'] + '\n\nPROMPT: ' + prompt3['prompt'] + '\nRESPONSE:'
one_shot_prompt = 'PROMPT: ' + prompt0['prompt'] + '\nRESPONSE: ' + prompt0['label'] + '\n\nPROMPT: ' + prompt1['prompt'] + '\nRESPONSE:'
zero_shot_prompt = 'PROMPT: ' + prompt0['prompt'] + '\nRESPONSE:'

# Perform few-shot, one-shot, zero-shot inference BEFORE fine-tuning

To tokenize all our texts with the same vocabulary that was used when training the model, we have to download a pretrained tokenizer. This is all done by the `AutoTokenizer` class:

In [7]:
from transformers import AutoTokenizer
    
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

We can now call the tokenizer on all our texts. This is very simple, using the [`map`](https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map) method from the Datasets library. First we define a function that call the tokenizer on our texts:

Now that the data has been loaded, we're ready to instantiate our `Trainer`. We will retrieve our pre-trained model:

In [8]:
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(model_checkpoint)

# Few-shot

In [9]:
inputs = tokenizer(few_shot_prompt, return_tensors='pt')

print(tokenizer.decode(model.generate(inputs["input_ids"], 
                       max_new_tokens=1,
                       do_sample=True, 
                       top_k=50, 
                       top_p=0.9
                      )[0]))

print('EXPECTED RESPONSE: {}'.format(prompt3['label']))

PROMPT: Given the following review:
I keep buying madden every year hoping they get back to football. This years version is a little better than last years -- but that's not saying much.The game looks great. The only thing wrong with the animation, is the way the players are always tripping on each other.<br /><br />The gameplay is still slowed down by the bloated pre-play controls. What used to take two buttons is now a giant PITA to get done before an opponent snaps the ball or the play clock runs out.<br /><br />The turbo button is back, but the player movement is still slow and awkward. If you liked last years version, I'm guessing you'll like this too. I haven't had a chance to play anything other than training and a few online games, so I'm crossing my fingers and hoping the rest is better.<br /><br />The one thing I can recommend is NOT TO BUY THE MADDEN BUNDLE. The game comes as a download. So if you hate it, there's no trading it in at Gamestop.
predict the associated rating f

# One-shot

In [10]:
inputs = tokenizer(one_shot_prompt, return_tensors='pt')

print(tokenizer.decode(model.generate(inputs["input_ids"], 
                       max_new_tokens=1,
                       do_sample=True, 
                       top_k=50, 
                       top_p=0.9
                      )[0]))

print('EXPECTED RESPONSE: {}'.format(prompt1['label']))

PROMPT: Given the following review:
I keep buying madden every year hoping they get back to football. This years version is a little better than last years -- but that's not saying much.The game looks great. The only thing wrong with the animation, is the way the players are always tripping on each other.<br /><br />The gameplay is still slowed down by the bloated pre-play controls. What used to take two buttons is now a giant PITA to get done before an opponent snaps the ball or the play clock runs out.<br /><br />The turbo button is back, but the player movement is still slow and awkward. If you liked last years version, I'm guessing you'll like this too. I haven't had a chance to play anything other than training and a few online games, so I'm crossing my fingers and hoping the rest is better.<br /><br />The one thing I can recommend is NOT TO BUY THE MADDEN BUNDLE. The game comes as a download. So if you hate it, there's no trading it in at Gamestop.
predict the associated rating f

# Zero-shot

In [11]:
inputs = tokenizer(zero_shot_prompt, return_tensors='pt')

print(tokenizer.decode(model.generate(inputs["input_ids"], 
                       max_new_tokens=1,
                       do_sample=True, 
                       top_k=50, 
                       top_p=0.9
                      )[0]))

print('EXPECTED RESPONSE: {}'.format(prompt0['label']))

PROMPT: Given the following review:
I keep buying madden every year hoping they get back to football. This years version is a little better than last years -- but that's not saying much.The game looks great. The only thing wrong with the animation, is the way the players are always tripping on each other.<br /><br />The gameplay is still slowed down by the bloated pre-play controls. What used to take two buttons is now a giant PITA to get done before an opponent snaps the ball or the play clock runs out.<br /><br />The turbo button is back, but the player movement is still slow and awkward. If you liked last years version, I'm guessing you'll like this too. I haven't had a chance to play anything other than training and a few online games, so I'm crossing my fingers and hoping the rest is better.<br /><br />The one thing I can recommend is NOT TO BUY THE MADDEN BUNDLE. The game comes as a download. So if you hate it, there's no trading it in at Gamestop.
predict the associated rating f

# Fine-tune the model with the Amazon Customer Reviews Data

In [4]:
from datasets import Dataset

lm_dataset_train = Dataset.from_parquet('./data/train/*.parquet')
print(lm_dataset_train.shape)

Using custom data configuration default-0b7843339d711258
Found cached dataset parquet (/root/.cache/huggingface/datasets/parquet/default-0b7843339d711258/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


(36, 3)


In [12]:
from transformers import TrainingArguments
import torch

model_name = model_checkpoint.split("/")[-1]

training_args = TrainingArguments(
    f"{model_name}-finetuned-amazon-customer-reviews",
    learning_rate=2e-5,
    weight_decay=0.01, 
    max_steps=10,
    num_train_epochs=1.0,
    no_cuda=not torch.cuda.is_available()    
)

We pass along all of those to the `Trainer` class:

In [13]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_dataset_train
)

max_steps is given, it will override any value given in num_train_epochs


In [14]:
train_results = trainer.train()
train_results

***** Running training *****
  Num examples = 36
  Num Epochs = 2
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 10
  Number of trainable parameters = 559214592


Step,Training Loss




Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=10, training_loss=9.286512756347657, metrics={'train_runtime': 117.0914, 'train_samples_per_second': 0.683, 'train_steps_per_second': 0.085, 'total_flos': 16716725747712.0, 'train_loss': 9.286512756347657, 'epoch': 2.0})

# Save fine-tuned model

In [15]:
model_path = './tmp_models/{}/'.format(model_checkpoint)

model.save_pretrained(model_path)

Configuration saved in ./tmp_models/bigscience/bloomz-560m/config.json
Configuration saved in ./tmp_models/bigscience/bloomz-560m/generation_config.json
Model weights saved in ./tmp_models/bigscience/bloomz-560m/pytorch_model.bin


# Perform few-shot, one-shot, zero-shot inference AFTER fine-tuning

In [16]:
import transformers
from transformers import AutoModelForCausalLM
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

model = AutoModelForCausalLM.from_pretrained(model_path)

loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--bigscience--bloomz-560m/snapshots/64a6f1765615d2c38a7fe4474b8bff5a6c0e14e6/tokenizer.json
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at /root/.cache/huggingface/hub/models--bigscience--bloomz-560m/snapshots/64a6f1765615d2c38a7fe4474b8bff5a6c0e14e6/special_tokens_map.json
loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--bigscience--bloomz-560m/snapshots/64a6f1765615d2c38a7fe4474b8bff5a6c0e14e6/tokenizer_config.json
loading configuration file ./tmp_models/bigscience/bloomz-560m/config.json
Model config BloomConfig {
  "_name_or_path": "./tmp_models/bigscience/bloomz-560m/",
  "apply_residual_connection_post_layernorm": false,
  "architectures": [
    "BloomForCausalLM"
  ],
  "attention_dropout": 0.0,
  "attention_softmax_in_fp32": true,
  "bias_dropout_fusion": true,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_

This model also supports many advanced parameters while performing inference including the following:

**max_length**: Model generates text until the output length (which includes the input context length) reaches max_length. If specified, it must be a positive integer.

**num_return_sequences**: Number of output sequences returned. If specified, it must be a positive integer.

**num_beams**: Number of beams used in the greedy search. If specified, it must be integer greater than or equal to num_return_sequences.

**no_repeat_ngram_size**: Model ensures that a sequence of words of no_repeat_ngram_size is not repeated in the output sequence. If specified, it must be a positive integer greater than 1.

**temperature**: Controls the randomness in the output. Higher temperature results in output sequence with low-probability words and lower temperature results in output sequence with high-probability words. If temperature -> 0, it results in greedy decoding. If specified, it must be a positive float.

**early_stopping**: If True, text generation is finished when all beam hypotheses reach the end of stence token. If specified, it must be boolean.

**do_sample**: If True, sample the next word as per the likelyhood. If specified, it must be boolean.

**top_k**: In each step of text generation, sample from only the top_k most likely words. If specified, it must be a positive integer.

**top_p**: In each step of text generation, sample from the smallest possible set of words with cumulative probability top_p. If specified, it must be a float between 0 and 1.

**seed**: Fix the randomized state for reproducibility. If specified, it must be an integer.

In [17]:
inputs = tokenizer(few_shot_prompt, return_tensors='pt')

print(tokenizer.decode(model.generate(inputs["input_ids"], 
                       max_new_tokens=1,
                       do_sample=True, 
                       top_k=50, 
                       top_p=0.9
                      )[0]))

print('EXPECTED RESPONSE: {}'.format(prompt3['label']))

Generate config GenerationConfig {
  "bos_token_id": 1,
  "eos_token_id": 2,
  "pad_token_id": 3,
  "transformers_version": "4.26.1"
}



PROMPT: Given the following review:
I keep buying madden every year hoping they get back to football. This years version is a little better than last years -- but that's not saying much.The game looks great. The only thing wrong with the animation, is the way the players are always tripping on each other.<br /><br />The gameplay is still slowed down by the bloated pre-play controls. What used to take two buttons is now a giant PITA to get done before an opponent snaps the ball or the play clock runs out.<br /><br />The turbo button is back, but the player movement is still slow and awkward. If you liked last years version, I'm guessing you'll like this too. I haven't had a chance to play anything other than training and a few online games, so I'm crossing my fingers and hoping the rest is better.<br /><br />The one thing I can recommend is NOT TO BUY THE MADDEN BUNDLE. The game comes as a download. So if you hate it, there's no trading it in at Gamestop.
predict the associated rating f

In [18]:
inputs = tokenizer(one_shot_prompt, return_tensors='pt')

print(tokenizer.decode(model.generate(inputs["input_ids"], 
                       max_new_tokens=1,
                       do_sample=True, 
                       top_k=50, 
                       top_p=0.9
                      )[0]))

print('EXPECTED RESPONSE: {}'.format(prompt1['label']))

Generate config GenerationConfig {
  "bos_token_id": 1,
  "eos_token_id": 2,
  "pad_token_id": 3,
  "transformers_version": "4.26.1"
}



PROMPT: Given the following review:
I keep buying madden every year hoping they get back to football. This years version is a little better than last years -- but that's not saying much.The game looks great. The only thing wrong with the animation, is the way the players are always tripping on each other.<br /><br />The gameplay is still slowed down by the bloated pre-play controls. What used to take two buttons is now a giant PITA to get done before an opponent snaps the ball or the play clock runs out.<br /><br />The turbo button is back, but the player movement is still slow and awkward. If you liked last years version, I'm guessing you'll like this too. I haven't had a chance to play anything other than training and a few online games, so I'm crossing my fingers and hoping the rest is better.<br /><br />The one thing I can recommend is NOT TO BUY THE MADDEN BUNDLE. The game comes as a download. So if you hate it, there's no trading it in at Gamestop.
predict the associated rating f

In [19]:
inputs = tokenizer(zero_shot_prompt, return_tensors='pt')

print(tokenizer.decode(model.generate(inputs["input_ids"], 
                       max_new_tokens=1,
                       do_sample=True, 
                       top_k=50, 
                       top_p=0.9
                      )[0]))

print('EXPECTED RESPONSE: {}'.format(prompt0['label']))

Generate config GenerationConfig {
  "bos_token_id": 1,
  "eos_token_id": 2,
  "pad_token_id": 3,
  "transformers_version": "4.26.1"
}



PROMPT: Given the following review:
I keep buying madden every year hoping they get back to football. This years version is a little better than last years -- but that's not saying much.The game looks great. The only thing wrong with the animation, is the way the players are always tripping on each other.<br /><br />The gameplay is still slowed down by the bloated pre-play controls. What used to take two buttons is now a giant PITA to get done before an opponent snaps the ball or the play clock runs out.<br /><br />The turbo button is back, but the player movement is still slow and awkward. If you liked last years version, I'm guessing you'll like this too. I haven't had a chance to play anything other than training and a few online games, so I'm crossing my fingers and hoping the rest is better.<br /><br />The one thing I can recommend is NOT TO BUY THE MADDEN BUNDLE. The game comes as a download. So if you hate it, there's no trading it in at Gamestop.
predict the associated rating f

In [23]:
import transformers
from transformers import AutoModelForCausalLM
from transformers import AutoTokenizer

print(model.get_memory_footprint())

quantized_model = AutoModelForCausalLM.from_pretrained(model_path, load_in_8bit=True, device_map='auto')

print(quantized_model.get_memory_footprint())


loading configuration file ./tmp_models/bigscience/bloomz-560m/config.json
Model config BloomConfig {
  "_name_or_path": "./tmp_models/bigscience/bloomz-560m/",
  "apply_residual_connection_post_layernorm": false,
  "architectures": [
    "BloomForCausalLM"
  ],
  "attention_dropout": 0.0,
  "attention_softmax_in_fp32": true,
  "bias_dropout_fusion": true,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_dropout": 0.0,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "masked_softmax_fusion": true,
  "model_type": "bloom",
  "n_head": 16,
  "n_inner": null,
  "n_layer": 24,
  "offset_alibi": 100,
  "pad_token_id": 3,
  "pretraining_tp": 1,
  "seq_length": 2048,
  "skip_bias_add": true,
  "skip_bias_add_qkv": false,
  "slow_but_exact": false,
  "torch_dtype": "float32",
  "transformers_version": "4.26.1",
  "unk_token_id": 0,
  "use_cache": true,
  "vocab_size": 250880
}

Loading the model in mixed int8 - forcing the weights to be casted in float16


2236858368


Instantiating BloomForCausalLM model under default dtype torch.float16.


NameError: name 'init_empty_weights' is not defined

# EXTRAS (work-in-progress)

In [None]:
# # Create Dataset objects (Arrow PyTables) from Pandas dataframes
# dataset = Dataset.from_pandas(df)

# # Apply prompt    
# from promptsource.templates import DatasetTemplates
# prompt_templates = DatasetTemplates('amazon_us_reviews/Wireless_v1_00') 

# for template in prompt_templates.templates.values():
#     print(template.get_name())

# prompt = prompt_templates["Given the review body return a categorical rating"]
# print(prompt.answer_choices)    
# print(prompt.__dict__)

# dataset = dataset.select([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]).map(lambda row : {'prompt': prompt.apply(row)[0], 'label': prompt.apply(row)[1]})
# prompt = dataset[0]['prompt']
# label = dataset[0]['label']
# print(prompt)
# print(label)

In [None]:
# prompt0 = dataset[0]
# prompt1 = dataset[1]
# prompt2 = dataset[2]
# prompt3 = dataset[3]

# #few_shot_prompt = prompt0['prompt'] + '\nFor the previous review, is the sentiment positive, neutral, or negative?' + prompt0['label'] + '\n\nPROMPT: ' + prompt1['prompt'] + '\nRESPONSE: ' + prompt1['label'] + '\n\nPROMPT: ' + prompt2['prompt'] + '\nRESPONSE: ' + prompt2['label'] + '\n\nPROMPT: ' + prompt3['prompt'] + '\nRESPONSE:'
# #one_shot_prompt = 'PROMPT: ' + prompt0['prompt'] + '\nRESPONSE: ' + prompt0['label'] + '\n\nPROMPT: ' + prompt1['prompt'] + '\nRESPONSE:'
# zero_shot_prompt = prompt0['prompt'] + '\nFor the previous review, is the sentiment positive, neutral, or negative?'

In [None]:
# inputs = tokenizer(zero_shot_prompt, return_tensors='pt')

# print(tokenizer.decode(model.generate(inputs["input_ids"], 
#                        max_new_tokens=1,
#                        do_sample=True, 
#                        top_k=50, 
#                        top_p=0.9
#                       )[0]))

# print('EXPECTED RESPONSE: {}'.format(prompt0['label']))

In [None]:
# %%html

# <p><b>Shutting down your kernel for this notebook to release resources.</b></p>
# <button class="sm-command-button" data-commandlinker-command="kernelmenu:shutdown" style="display:none;">Shutdown Kernel</button>
        
# <script>
# try {
#     els = document.getElementsByClassName("sm-command-button");
#     els[0].click();
# }
# catch(err) {
#     // NoOp
# }    
# </script>