In [1]:
import torch
from datasets import load_dataset
from torch.utils.data import Dataset
from tqdm import tqdm
from transformers import AutoTokenizer, Trainer, TrainingArguments
from reward_model import GPTRewardModel
import os
from typing import List
from transformers import GPT2LMHeadModel, GPT2Tokenizer

import torch
from datasets import load_dataset
from tqdm import tqdm
from transformers import AutoTokenizer

import trlx_big
from configs import (
    ModelConfig,
    OptimizerConfig,
    SchedulerConfig,
    TokenizerConfig,
    TrainConfig,
    TRLConfig,
)

from modeling_ppo import PPOConfig

2024-05-25 12:46:33.981611: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


[2024-05-25 12:46:35,371] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/home/rfaro/anaconda3/compiler_compat/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status




2024-05-25 12:46:36,796	INFO util.py:154 -- Outdated packages:
  ipywidgets==7.6.5 found, needs ipywidgets>=8
Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.
2024-05-25 12:46:36,841	INFO util.py:154 -- Outdated packages:
  ipywidgets==7.6.5 found, needs ipywidgets>=8
Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


In [2]:
# Tokenizer for the reward model; the end-of-sequence token doubles as the padding token.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

# Initialize the reward model from the GPT-2 model (optionally SFT GPT-2)
model = GPTRewardModel("gpt2")

# Freeze the first 70% of the hidden layers of the reward model backbone
layers = model.transformer.h
num_layers = len(layers)
num_unfrozen = int(0.3 * num_layers)
# Use an explicit end index: `layers[:-num_unfrozen]` collapses to `layers[:0]`
# (nothing frozen) whenever num_unfrozen rounds down to 0.
for layer in layers[: num_layers - num_unfrozen]:
    layer.requires_grad_(False)



In [3]:
class PairwiseDataset(Dataset):
    """Eagerly tokenized (chosen, rejected) text pairs.

    Every pair is tokenized once in ``__init__``; indexing returns the stored
    encodings as ``(chosen_input_ids, chosen_attention_mask,
    rejected_input_ids, rejected_attention_mask)``.
    """

    def __init__(self, pairs, tokenizer, max_length):
        """Tokenize the ``"chosen"`` and ``"rejected"`` text of every pair.

        Args:
            pairs: iterable of mappings with ``"chosen"`` and ``"rejected"`` strings.
            tokenizer: callable returning a mapping with ``"input_ids"`` and
                ``"attention_mask"``; it is asked to truncate and pad to
                ``max_length`` and to return ``"pt"`` tensors.
            max_length: target length passed to the tokenizer.
        """
        self.chosen_input_ids = []
        self.chosen_attn_masks = []
        self.rejected_input_ids = []
        self.rejected_attn_masks = []

        def encode(text):
            # Each text is wrapped in the start/end markers before tokenization.
            return tokenizer(
                "<|startoftext|>" + text + "<|endoftext|>",
                truncation=True,
                max_length=max_length,
                padding="max_length",
                return_tensors="pt",
            )

        for example in tqdm(pairs):
            chosen_text, rejected_text = example["chosen"], example["rejected"]
            chosen_enc = encode(chosen_text)
            rejected_enc = encode(rejected_text)
            self.chosen_input_ids.append(chosen_enc["input_ids"])
            self.chosen_attn_masks.append(chosen_enc["attention_mask"])
            self.rejected_input_ids.append(rejected_enc["input_ids"])
            self.rejected_attn_masks.append(rejected_enc["attention_mask"])

    def __len__(self):
        """Number of stored pairs."""
        return len(self.chosen_input_ids)

    def __getitem__(self, idx):
        """Return the four stored encodings for pair ``idx``."""
        columns = (
            self.chosen_input_ids,
            self.chosen_attn_masks,
            self.rejected_input_ids,
            self.rejected_attn_masks,
        )
        return tuple(column[idx] for column in columns)


class DataCollatorReward:
    """Collate 4-tuples of tensors into one batch: chosen rows first, rejected rows second."""

    def __call__(self, data):
        """Build the batch dict from a list of items.

        Each item is indexed as ``(chosen_ids, chosen_mask, rejected_ids,
        rejected_mask)``. Tensors are concatenated along dim 0; ``labels`` is
        0 for every chosen row and 1 for every rejected row.
        """
        ids_parts, mask_parts = [], []
        # First pass gathers the chosen slots (0, 1), second pass the rejected slots (2, 3).
        for ids_slot, mask_slot in ((0, 1), (2, 3)):
            for item in data:
                ids_parts.append(item[ids_slot])
                mask_parts.append(item[mask_slot])

        n_items = len(data)
        return {
            "input_ids": torch.cat(ids_parts),
            "attention_mask": torch.cat(mask_parts),
            "labels": torch.tensor([0] * n_items + [1] * n_items),
        }


def compute_metrics(eval_preds):
    """Pairwise accuracy: the fraction of pairs whose chosen score exceeds the rejected score.

    Args:
        eval_preds: object whose ``predictions[0]`` holds the chosen scores and
            ``predictions[1]`` the rejected scores (assumed to be array-likes
            supporting element-wise ``>``, e.g. numpy arrays — confirm).

    Returns:
        dict with the single key ``"accuracy"``.
    """
    chosen_end_scores, rejected_end_scores = eval_preds.predictions[0], eval_preds.predictions[1]
    num_correct = sum(chosen_end_scores > rejected_end_scores)
    return {"accuracy": num_correct / len(rejected_end_scores)}


In [4]:
# Load the preference dataset from disk into `data`.
import json

# Name the encoding explicitly: without it open() falls back to the platform's
# locale default, which is not guaranteed to be UTF-8.
with open("dpo_dataset_RL.json", encoding="utf-8") as f:
    data = json.load(f)


In [5]:
# One record per example: the chosen and rejected completions share the same
# prompt, joined to it by a newline.
pairs = [
    {
        'chosen': data["prompt"][idx] + '\n' + data["chosen"][idx],
        'rejected': data["prompt"][idx] + '\n' + data["rejected"][idx],
    }
    for idx in range(len(data["prompt"]))
]

In [6]:
max_length = 550

# Sequential 80/20 split (no shuffling): the first 80% of pairs go to training.
train_size = int(0.8 * len(pairs))
train_pairs, val_pairs = pairs[:train_size], pairs[train_size:]

# Tokenization happens eagerly inside the PairwiseDataset constructor.
train_dataset, val_dataset = (
    PairwiseDataset(split, tokenizer, max_length) for split in (train_pairs, val_pairs)
)

data_collator = DataCollatorReward()

100%|██████████| 707/707 [00:00<00:00, 1575.56it/s]
100%|██████████| 177/177 [00:00<00:00, 1328.29it/s]


# Train Language Model

In [4]:
# Restrict this process to a single visible GPU and pick the compute device,
# falling back to the CPU when CUDA is unavailable.
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"  # 0 is 1, 2 is 0 (translated author's note; presumably maps CUDA indices to physical GPUs on this machine — confirm)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [5]:
# Policy tokenizer and model: GPT-2 large extended with a "<pad>" special token
# and a "<bot>: " token. This rebinds the global `tokenizer` and `model` names.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2-large")
tokenizer.add_special_tokens({"pad_token": "<pad>"})
tokenizer.add_tokens(["<bot>: "])
model = GPT2LMHeadModel.from_pretrained("gpt2-large")
# Resize the embeddings to the enlarged vocabulary before loading the
# fine-tuned weights (presumably so the checkpoint's tensor shapes match).
model.resize_token_embeddings(len(tokenizer))
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a
# trusted source. The absolute path is specific to the author's machine.
model.load_state_dict(torch.load("/home/rfaro/anaconda3/EPFL_ML/ml-project-2-betterthanpoli/model_state_2_large_v2.pt", map_location=torch.device(device)))


<All keys matched successfully>

In [8]:
# Location of the trained reward-model weights. This constant is the single
# source of truth for the load below (it previously went unused while the load
# repeated the path as a literal).
REWARD_CHECKPOINT_PATH = "/home/rfaro/anaconda3/Marco/RL/reward_model.pth"
#SFT_MODEL_PATH = "gpt2-large"

# Full PPO run configuration handed to the trainer.
config = TRLConfig(
    train=TrainConfig(
        seq_length=550,
        epochs=50,
        total_steps=100000,
        batch_size=4,
        checkpoint_interval=10000,
        eval_interval=200,
        pipeline="PromptPipeline",
        trainer="AcceleratePPOTrainer",
        save_best=False,
    ),
    model=ModelConfig(
        # The already-loaded policy model object is passed instead of a path string.
        model_path=model,
        num_layers_unfrozen=8,
    ),
    tokenizer=TokenizerConfig(
        # NOTE(review): this is the stock "gpt2" tokenizer, while the policy's
        # embeddings were resized for a tokenizer with extra tokens — confirm
        # the mismatch is intended.
        tokenizer_path="gpt2",
        truncation_side="right",
    ),
    optimizer=OptimizerConfig(
        name="adamw",
        kwargs={
            "lr": 5.0e-6,
            "betas": [0.9, 0.999],
            "eps": 1.0e-8,
            "weight_decay": 0.01,
        },
    ),
    scheduler=SchedulerConfig(
        name="cosine_annealing",
        kwargs={
            "T_max": 100000,
            # NOTE(review): eta_min equals the optimizer lr above; if this
            # anneals between lr and eta_min the schedule is flat — confirm.
            "eta_min": 5.0e-6,
        },
    ),
    method=PPOConfig(
        name="PPOConfig",
        num_rollouts=128,
        chunk_size=16,
        ppo_epochs=4,
        init_kl_coef=0.1,
        target=6,
        horizon=10000,
        gamma=1,
        lam=0.95,
        cliprange=0.2,
        cliprange_value=0.2,
        vf_coef=0.2,
        scale_reward=None,
        ref_mean=None,
        ref_std=None,
        cliprange_reward=10,
        gen_kwargs={
            "max_new_tokens": 50,
        },
    ),
)


# Load the pre-trained reward model
rw_tokenizer = AutoTokenizer.from_pretrained("gpt2")
rw_tokenizer.pad_token = rw_tokenizer.eos_token
rw_model = GPTRewardModel('gpt2').to(device)
# NOTE(review): torch.load unpickles the file — only load trusted checkpoints.
rw_model.load_state_dict(torch.load(REWARD_CHECKPOINT_PATH, map_location=device))
# Half precision is only applied on CUDA; `device` may have fallen back to the
# CPU, where fp16 inference is not reliably supported.
if device.type == "cuda":
    rw_model.half()
rw_model.eval()
# Token budget left for the prompt once room for the generated tokens is reserved.
max_length_input = config.train.seq_length - config.method.gen_kwargs["max_new_tokens"]
# rw_device = torch.device("cuda:{}".format(1))  # set reward model device
# rw_model.to(rw_device)

In [9]:
import json

# Name the encoding explicitly instead of relying on the locale default.
with open("data_finetuned.json", encoding="utf-8") as f:
    d = json.load(f)

# One (query, answer) tuple per entry, aligned by position in the two lists.
dataset = [(d["queries"][idx], d["answers"][idx]) for idx in range(len(d["queries"]))]

print(dataset[0])
    

('What are the main applications of heat pump systems?', ' Heat pump systems are extensively used for residential heating and cooling in the European Union, particularly in the UK, where they are a significant energy-saving option. They are also widely used in the US, Australia, and other regions for commercial and industrial purposes. ')


In [10]:
# Sequential 80/20 split of the (query, answer) tuples, then unzip each split
# into parallel tuples of queries and answers.
split_index = int(0.8 * len(dataset))
train_set, val_set = dataset[:split_index], dataset[split_index:]
train_queries, train_answers = zip(*train_set)
val_queries, val_answers = zip(*val_set)

In [11]:
def get_scores(samples: List[str], batch_size: int = 2):
    """Score texts with the reward model.

    Args:
        samples: texts to score; each is wrapped in the start/end markers
            before tokenization.
        batch_size: number of texts scored per forward pass (default 2, the
            value that used to be hard-coded).

    Returns:
        A tensor with the ``"chosen_end_scores"`` of every mini-batch
        concatenated along dim 0; an empty tensor when ``samples`` is empty.
    """
    if len(samples) == 0:
        # torch.cat raises on an empty list, so return an empty result instead.
        return torch.empty(0, device=device)

    scores_list = []
    for start in range(0, len(samples), batch_size):
        sub_samples = [
            "<|startoftext|>" + text + "<|endoftext|>"
            for text in samples[start : start + batch_size]
        ]
        encodings_dict = rw_tokenizer(
            sub_samples,
            truncation=True,
            max_length=config.train.seq_length,
            padding="max_length",
            return_tensors="pt",
        )
        # The batch is duplicated along dim 0 and only "chosen_end_scores" is
        # read back — presumably the reward model splits its input into chosen
        # and rejected halves (TODO confirm against GPTRewardModel).
        input_ids = encodings_dict["input_ids"].to(device).repeat(2, 1)
        attn_masks = encodings_dict["attention_mask"].to(device).repeat(2, 1)
        with torch.no_grad():
            sub_scores = rw_model(input_ids=input_ids, attention_mask=attn_masks)
        scores_list.append(sub_scores["chosen_end_scores"])
    return torch.cat(scores_list, dim=0)

def get_prompt_dataset(prompts, max_length):
    """
    Format prompts as "<prompt>", a newline, then "Answer:".

    Each prompt is truncated to ``max_length`` tokens by a tokenize/decode
    round trip through the global ``tokenizer``, the answer marker is appended,
    and the result is round-tripped once more. The returned strings are later
    used as lookup keys for the reference answers.
    """

    def roundtrip(text):
        # Encode with truncation and no special tokens, then decode and strip.
        token_ids = tokenizer(
            text,
            truncation=True,
            max_length=max_length,
            add_special_tokens=False,
        )["input_ids"]
        return tokenizer.decode(token_ids, skip_special_tokens=True).strip()

    formatted_prompts = []
    for idx in tqdm(range(len(prompts))):
        truncated = roundtrip(prompts[idx])
        formatted_prompts.append(roundtrip(truncated + "\nAnswer:"))
    return formatted_prompts

# Map every formatted prompt (train and validation) to its reference answer.
answers_dict = {}
train_prompts = get_prompt_dataset(train_queries, max_length_input)
answers_dict.update(zip(train_prompts, train_answers))
val_prompts = get_prompt_dataset(val_queries, max_length_input)
answers_dict.update(zip(val_prompts, val_answers))

def reward_fn(samples: List[str], **kwargs):
    """Reward each sample relative to the reference answer for its prompt.

    The prompt is everything up to and including the first "Answer:" marker.
    The reference text is that prompt followed by the answer stored in
    ``answers_dict``. The reward is the sample's score minus the reference's.

    Raises:
        KeyError: if a sample's prompt is not a key of ``answers_dict``.
    """
    reference_prompts = [sample.split("Answer:")[0] + "Answer:" for sample in samples]
    reference_samples = []
    for prompt in reference_prompts:
        reference_samples.append(prompt + answers_dict[prompt.strip()])

    original_scores = get_scores(reference_samples)
    scores = get_scores(samples)
    return scores - original_scores

100%|██████████| 707/707 [00:00<00:00, 1472.52it/s]
100%|██████████| 177/177 [00:00<00:00, 1159.70it/s]


In [12]:
# Authenticate with Weights & Biases; the cell displays the value login() returns.
import wandb
wandb.login()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mrobinfaro[0m ([33mdeepsensing[0m). Use [1m`wandb login --relogin`[0m to force relogin


True

In [13]:
# Launch PPO training: rewards come from reward_fn, rollouts use the training
# prompts, and evaluation uses the validation prompts.
# NOTE(review): the notebook's first cell imports `trlx_big`, but training goes
# through `trlx` — confirm which package is intended.
import trlx
trainer = trlx.train(
    reward_fn=reward_fn,
    prompts=train_prompts,
    eval_prompts=val_prompts, 
    config=config,
)

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
[RANK 0] Initializing model: GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50259, 1280)
    (wpe): Embedding(1024, 1280)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-35): 36 x GPT2Block(
        (ln_1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((1280,),

[RANK 0] Starting training
[RANK 0] Collecting rollouts
You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  torch.tensor(score, dtype=torch.float, device=device).view(
[RANK 0] Evaluating model


[generation sweep 0/1 | eval batch 0/12]:   0%|          | 0/12 [00:00<?, ?it/s]

[RANK 0] Computing rewards
[RANK 0] Summarizing evaluation


  0%|          | 0/6400 [00:00<?, ?it/s]

[RANK 0] Collecting rollouts
[RANK 0] Evaluating model


[generation sweep 0/1 | eval batch 0/12]:   0%|          | 0/12 [00:00<?, ?it/s]

[RANK 0] Computing rewards
[RANK 0] Summarizing evaluation


[RANK 0] Collecting rollouts
[RANK 0] Collecting rollouts
[RANK 0] Evaluating model


[generation sweep 0/1 | eval batch 0/12]:   0%|          | 0/12 [00:00<?, ?it/s]

[RANK 0] Computing rewards
[RANK 0] Summarizing evaluation


[RANK 0] Collecting rollouts
[RANK 0] Evaluating model


[generation sweep 0/1 | eval batch 0/12]:   0%|          | 0/12 [00:00<?, ?it/s]

[RANK 0] Computing rewards
[RANK 0] Summarizing evaluation


[RANK 0] Collecting rollouts
[RANK 0] Collecting rollouts
[RANK 0] Evaluating model


[generation sweep 0/1 | eval batch 0/12]:   0%|          | 0/12 [00:00<?, ?it/s]

[RANK 0] Computing rewards
[RANK 0] Summarizing evaluation


[RANK 0] Collecting rollouts
[RANK 0] Evaluating model


[generation sweep 0/1 | eval batch 0/12]:   0%|          | 0/12 [00:00<?, ?it/s]

[RANK 0] Computing rewards
[RANK 0] Summarizing evaluation


[RANK 0] Collecting rollouts
[RANK 0] Collecting rollouts
[RANK 0] Evaluating model


[generation sweep 0/1 | eval batch 0/12]:   0%|          | 0/12 [00:00<?, ?it/s]

[RANK 0] Computing rewards
[RANK 0] Summarizing evaluation


[RANK 0] Collecting rollouts
[RANK 0] Evaluating model


[generation sweep 0/1 | eval batch 0/12]:   0%|          | 0/12 [00:00<?, ?it/s]

[RANK 0] Computing rewards
[RANK 0] Summarizing evaluation


[RANK 0] Collecting rollouts
[RANK 0] Collecting rollouts
[RANK 0] Evaluating model


[generation sweep 0/1 | eval batch 0/12]:   0%|          | 0/12 [00:00<?, ?it/s]

[RANK 0] Computing rewards
[RANK 0] Summarizing evaluation


[RANK 0] Collecting rollouts
[RANK 0] Evaluating model


[generation sweep 0/1 | eval batch 0/12]:   0%|          | 0/12 [00:00<?, ?it/s]

[RANK 0] Computing rewards
[RANK 0] Summarizing evaluation


[RANK 0] Collecting rollouts
[RANK 0] Collecting rollouts
[RANK 0] Evaluating model


[generation sweep 0/1 | eval batch 0/12]:   0%|          | 0/12 [00:00<?, ?it/s]

[RANK 0] Computing rewards
[RANK 0] Summarizing evaluation


[RANK 0] Collecting rollouts
[RANK 0] Collecting rollouts
[RANK 0] Evaluating model


[generation sweep 0/1 | eval batch 0/12]:   0%|          | 0/12 [00:00<?, ?it/s]

[RANK 0] Computing rewards
[RANK 0] Summarizing evaluation


[RANK 0] Collecting rollouts
[RANK 0] Evaluating model


[generation sweep 0/1 | eval batch 0/12]:   0%|          | 0/12 [00:00<?, ?it/s]

[RANK 0] Computing rewards
[RANK 0] Summarizing evaluation


[RANK 0] Collecting rollouts
[RANK 0] Collecting rollouts
[RANK 0] Evaluating model


[generation sweep 0/1 | eval batch 0/12]:   0%|          | 0/12 [00:00<?, ?it/s]

[RANK 0] Computing rewards
[RANK 0] Summarizing evaluation


[RANK 0] Collecting rollouts
[RANK 0] Evaluating model


[generation sweep 0/1 | eval batch 0/12]:   0%|          | 0/12 [00:00<?, ?it/s]

[RANK 0] Computing rewards
[RANK 0] Summarizing evaluation


[RANK 0] Collecting rollouts
[RANK 0] Collecting rollouts
[RANK 0] Evaluating model


[generation sweep 0/1 | eval batch 0/12]:   0%|          | 0/12 [00:00<?, ?it/s]

[RANK 0] Computing rewards
[RANK 0] Summarizing evaluation


[RANK 0] Collecting rollouts
[RANK 0] Evaluating model


[generation sweep 0/1 | eval batch 0/12]:   0%|          | 0/12 [00:00<?, ?it/s]

[RANK 0] Computing rewards
[RANK 0] Summarizing evaluation


[RANK 0] Collecting rollouts
[RANK 0] Collecting rollouts
[RANK 0] Evaluating model


[generation sweep 0/1 | eval batch 0/12]:   0%|          | 0/12 [00:00<?, ?it/s]

[RANK 0] Computing rewards
[RANK 0] Summarizing evaluation


[RANK 0] Collecting rollouts
[RANK 0] Evaluating model


[generation sweep 0/1 | eval batch 0/12]:   0%|          | 0/12 [00:00<?, ?it/s]

[RANK 0] Computing rewards
[RANK 0] Summarizing evaluation


[RANK 0] Collecting rollouts
[RANK 0] Collecting rollouts
[RANK 0] Evaluating model


[generation sweep 0/1 | eval batch 0/12]:   0%|          | 0/12 [00:00<?, ?it/s]

[RANK 0] Computing rewards
[RANK 0] Summarizing evaluation


[RANK 0] Collecting rollouts
[RANK 0] Collecting rollouts
[RANK 0] Evaluating model


[generation sweep 0/1 | eval batch 0/12]:   0%|          | 0/12 [00:00<?, ?it/s]

[RANK 0] Computing rewards
[RANK 0] Summarizing evaluation


[RANK 0] Collecting rollouts
[RANK 0] Evaluating model


[generation sweep 0/1 | eval batch 0/12]:   0%|          | 0/12 [00:00<?, ?it/s]

[RANK 0] Computing rewards
[RANK 0] Summarizing evaluation


[RANK 0] Collecting rollouts
[RANK 0] Collecting rollouts
[RANK 0] Evaluating model


[generation sweep 0/1 | eval batch 0/12]:   0%|          | 0/12 [00:00<?, ?it/s]

[RANK 0] Computing rewards
[RANK 0] Summarizing evaluation


[RANK 0] Collecting rollouts
[RANK 0] Evaluating model


[generation sweep 0/1 | eval batch 0/12]:   0%|          | 0/12 [00:00<?, ?it/s]

[RANK 0] Computing rewards
[RANK 0] Summarizing evaluation


[RANK 0] Collecting rollouts
[RANK 0] Collecting rollouts
[RANK 0] Evaluating model


[generation sweep 0/1 | eval batch 0/12]:   0%|          | 0/12 [00:00<?, ?it/s]

[RANK 0] Computing rewards
[RANK 0] Summarizing evaluation


[RANK 0] Collecting rollouts
[RANK 0] Evaluating model


[generation sweep 0/1 | eval batch 0/12]:   0%|          | 0/12 [00:00<?, ?it/s]

[RANK 0] Computing rewards
[RANK 0] Summarizing evaluation


[RANK 0] Collecting rollouts
[RANK 0] Collecting rollouts
[RANK 0] Evaluating model


[generation sweep 0/1 | eval batch 0/12]:   0%|          | 0/12 [00:00<?, ?it/s]

[RANK 0] Computing rewards
[RANK 0] Summarizing evaluation


[RANK 0] Collecting rollouts
[RANK 0] Evaluating model


[generation sweep 0/1 | eval batch 0/12]:   0%|          | 0/12 [00:00<?, ?it/s]

[RANK 0] Computing rewards
[RANK 0] Summarizing evaluation


[RANK 0] Collecting rollouts
[RANK 0] Collecting rollouts
[RANK 0] Evaluating model


[generation sweep 0/1 | eval batch 0/12]:   0%|          | 0/12 [00:00<?, ?it/s]

[RANK 0] Computing rewards
[RANK 0] Summarizing evaluation


[RANK 0] Collecting rollouts
[RANK 0] Collecting rollouts
[RANK 0] Evaluating model


[generation sweep 0/1 | eval batch 0/12]:   0%|          | 0/12 [00:00<?, ?it/s]

[RANK 0] Computing rewards
[RANK 0] Summarizing evaluation


[RANK 0] Collecting rollouts
[RANK 0] Evaluating model


[generation sweep 0/1 | eval batch 0/12]:   0%|          | 0/12 [00:00<?, ?it/s]

[RANK 0] Computing rewards
[RANK 0] Summarizing evaluation


[RANK 0] Collecting rollouts
[RANK 0] Collecting rollouts
[RANK 0] Evaluating model


[generation sweep 0/1 | eval batch 0/12]:   0%|          | 0/12 [00:00<?, ?it/s]

[RANK 0] Computing rewards
[RANK 0] Summarizing evaluation


[RANK 0] Collecting rollouts
[RANK 0] Saving pretrained model into ckpts/checkpoint_6400/hf_model
[RANK 0] Evaluating model


[generation sweep 0/1 | eval batch 0/12]:   0%|          | 0/12 [00:00<?, ?it/s]

[RANK 0] Computing rewards
[RANK 0] Summarizing evaluation


In [14]:
# Rebind the global `model` to the model held by the trainer (presumably the
# PPO-trained policy); later cells call model.generate on this name.
model = trainer.model

In [15]:
# Rebuild the fine-tuned baseline (GPT-2 large resized to the current tokenizer,
# then loaded from the checkpoint file) to compare against the trained model.
# NOTE(review): torch.load unpickles the file — only load trusted checkpoints.
model_compare = GPT2LMHeadModel.from_pretrained("gpt2-large").to(device)
model_compare.resize_token_embeddings(len(tokenizer))
model_compare.load_state_dict(torch.load("/home/rfaro/anaconda3/EPFL_ML/ml-project-2-betterthanpoli/model_state_2_large_v2.pt", map_location=torch.device(device)))

# Walk both parameter lists in order and stop at the first pair that differs
# in any element; nothing is printed when every compared pair matches.
models_differ = any(
    bool(baseline_param.data.ne(trained_param.data).any())
    for baseline_param, trained_param in zip(model_compare.parameters(), model.parameters())
)
if models_differ:
    print("Models are not equal")



Models are not equal


In [16]:
# load questions from json file
import json

# Name the encoding explicitly instead of relying on the locale default.
with open('heat_pump_questions.json', encoding='utf-8') as json_file:
    q = json.load(json_file)

# The questions are stored as a list under the "questions" key.
q_list = q['questions']


In [20]:
# Spot-check: display the second loaded question.
q_list[1]

'What are the main types of heat pumps?'

In [24]:
# Generate one answer per evaluation question with the current `model`.
answers = []
# The loop variable is not called `q`, so the dict loaded from the questions
# file under that name is not overwritten.
for question_text in q_list:
    # Tokenize once and reuse both tensors.
    encoded = tokenizer(question_text, return_tensors="pt")
    inputs = encoded["input_ids"].to(device)
    attention_mask = encoded["attention_mask"].to(device)
    outputs = model.generate(
        inputs=inputs,
        attention_mask=attention_mask,
        max_length=150,
        # Same value generate() falls back to on its own; passing it explicitly
        # silences the per-call "Setting `pad_token_id`" warning.
        pad_token_id=tokenizer.eos_token_id,
    )
    # Decode only the newly generated tokens. Splitting the decoded text on the
    # question raises IndexError when the decoded prompt is not a verbatim copy
    # and cuts the answer short if the question reappears in it.
    answer = tokenizer.decode(outputs[0][inputs.shape[1]:], skip_special_tokens=True)
    answers.append(answer)

answers

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end gene

[' A heat pump operates on a refrigerant fluid, typically a refrigerant derived from fossil fuels, such as natural gas, propane, or oil-derived hydrocarbon blends_ ',
 ' Air source heat pumps, such as air-source heat pumps (ASHPs) and ground source or geothermal heat pumps, provide efficient heating and cooling in indoor environments. Ground source or geothermal heat pumps, which operate on the ground, provide heat for outdoor environments. ',
 ' Heat pumps are significantly more efficient than traditional heating methods, offering energy savings of 50-80% compared to electric heaters. ',
 '  Heat pumps are highly efficient, offering energy efficiency in both heating and cooling operations. Their thermodynamic cycle, comprising a refrigerant flow-through cycle and an air-source heat pump_ ',
 '  Heat pumps are typically less efficient in heating and cooling than electric heaters, with energy usage typically comprising between 50% and 60% of the energy cost.  ',
 ' Ground-source heat pu

In [26]:
# save answers in a json file
# Use a dedicated name: rebinding `answers_dict` would overwrite the
# prompt -> reference-answer table that reward_fn looks prompts up in.
ppo_answers_payload = {'answers': answers}

with open('ppo_heat_pump_answers.json', 'w', encoding='utf-8') as json_file:
    json.dump(ppo_answers_payload, json_file)



In [27]:
# Sanity check: number of generated answers.
print(len(answers))

100


In [32]:
# generate outputs from model
question = "What is a heat pump?"

# Tokenize once and reuse both tensors.
encoded = tokenizer(question, return_tensors="pt")
inputs = encoded['input_ids'].to(device)
attention_mask = encoded['attention_mask'].to(device)
outputs = model.generate(
    inputs=inputs,
    attention_mask=attention_mask,
    max_length=150,
    # Same value generate() falls back to on its own; passing it explicitly
    # silences the "Setting `pad_token_id`" warning.
    pad_token_id=tokenizer.eos_token_id,
)
# Decode only the generated continuation. Splitting the decoded text on the
# question fails when the decoded prompt is not a verbatim copy of it.
answer = tokenizer.decode(outputs[0][inputs.shape[1]:], skip_special_tokens=True)
answer

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


" A heat pump is a mechanical device that extracts heat from a cooler environment to warm a warmer one. It operates by extracting heat from a cooler environment to produce heat, a process that's more efficient than generating heat. "

In [24]:
# Save the state dict of the trainer's model to disk.
# NOTE(review): the absolute path is specific to the author's machine.
torch.save(trainer.model.state_dict(), "/home/rfaro/anaconda3/Marco/RL/ppo_model.pth")

In [29]:
# Export the trained model to the local "ppo_model" directory.
# NOTE(review): reloading this directory with AutoModelForCausalLM reports the
# checkpoint's "base_model.*" weights as unused — verify the export layout.
trainer.save_pretrained("ppo_model")

In [20]:
from transformers import AutoModelForCausalLM

# Reload the exported PPO model. output_loading_info=True makes from_pretrained
# also return which keys were missing or unexpected, so a checkpoint that does
# not match the architecture can be detected here.
ppo_model, ppo_loading_info = AutoModelForCausalLM.from_pretrained(
    "ppo_model",
    output_loading_info=True,
)
# Keep both names bound to the reloaded model, as before.
model = ppo_model

# The checkpoint stores its weights under a "base_model." prefix, which the
# loader reports as unused. Any parameter it could not find is left at its
# freshly initialised value, so say so loudly.
if ppo_loading_info["missing_keys"]:
    import warnings

    warnings.warn(
        "from_pretrained('ppo_model') could not find "
        f"{len(ppo_loading_info['missing_keys'])} parameter(s) in the checkpoint "
        f"(e.g. {ppo_loading_info['missing_keys'][:3]}); "
        f"{len(ppo_loading_info['unexpected_keys'])} checkpoint key(s) were unused. "
        "The reloaded model may not contain the PPO-trained weights."
    )
#ppo_model.resize_token_embeddings(len(tokenizer))

Some weights of the model checkpoint at ppo_model were not used when initializing GPT2LMHeadModel: ['base_model.transformer.h.0.attn.c_attn.bias', 'base_model.transformer.h.0.attn.c_attn.weight', 'base_model.transformer.h.0.attn.c_proj.bias', 'base_model.transformer.h.0.attn.c_proj.weight', 'base_model.transformer.h.0.ln_1.bias', 'base_model.transformer.h.0.ln_1.weight', 'base_model.transformer.h.0.ln_2.bias', 'base_model.transformer.h.0.ln_2.weight', 'base_model.transformer.h.0.mlp.c_fc.bias', 'base_model.transformer.h.0.mlp.c_fc.weight', 'base_model.transformer.h.0.mlp.c_proj.bias', 'base_model.transformer.h.0.mlp.c_proj.weight', 'base_model.transformer.h.1.attn.c_attn.bias', 'base_model.transformer.h.1.attn.c_attn.weight', 'base_model.transformer.h.1.attn.c_proj.bias', 'base_model.transformer.h.1.attn.c_proj.weight', 'base_model.transformer.h.1.ln_1.bias', 'base_model.transformer.h.1.ln_1.weight', 'base_model.transformer.h.1.ln_2.bias', 'base_model.transformer.h.1.ln_2.weight', 'bas