In [2]:
import comet_ml
import torch
import json
import linecache
import random
import json
import transformers
import einops
import os
from tqdm import tqdm

In [3]:
# Device every tensor and the model are moved to. The notebook was written for
# the third GPU of a multi-GPU box; fall back to the CPU when that device does
# not exist so the remaining cells still run instead of failing on `.to(DEVICE)`.
DEVICE = (
    "cuda:2"
    if torch.cuda.is_available() and torch.cuda.device_count() > 2
    else "cpu"
)

In [3]:
def create_tldr_dataset(
    n_samples=100_000,
    raw_path="tldr/raw_tldr.jsonl",
    cleaned_path="tldr/cleaned_tldr.jsonl",
    output_path="tldr/tldr.jsonl",
):
    """Filter the raw TL;DR dump and write a random subset of it.

    Two files are produced:

    1. ``cleaned_path`` receives every line of ``raw_path`` whose
       ``"summary_len"`` field is strictly between 24 and 48.
    2. ``output_path`` receives ``n_samples`` distinct lines drawn uniformly
       at random (without replacement) from ``cleaned_path``.

    Args:
        n_samples: Number of posts to write to ``output_path``.
        raw_path: JSON-lines input file, one post per line.
        cleaned_path: Where the length-filtered posts are written.
        output_path: Where the sampled posts are written.

    Raises:
        AssertionError: If fewer than ``n_samples`` posts pass the filter.
    """
    linecount = 0
    with open(raw_path, "r") as raw_file, open(cleaned_path, "w") as cleaned_file:
        for json_str in tqdm(raw_file):
            post = json.loads(json_str)
            if 24 < post["summary_len"] < 48:
                cleaned_file.write(json_str)
                linecount += 1

    assert n_samples <= linecount, (
        f"Requested {n_samples} samples but only {linecount} posts passed the filter."
    )

    # linecache keeps file contents in memory; drop any copy cached before the
    # file was rewritten above so we never sample from stale data.
    linecache.checkcache(cleaned_path)

    # linecache line numbers are 1-based: sampling from range(linecount) would
    # return "" for index 0 and could never select the last line.
    sampled_linenos = random.sample(range(1, linecount + 1), k=n_samples)
    with open(output_path, "w") as file:
        for lineno in tqdm(sampled_linenos):
            file.write(linecache.getline(cleaned_path, lineno))


# create_tldr_dataset()

In [4]:
class TLDRDataset(torch.utils.data.Dataset):
    """Map-style dataset over a JSON-lines file of posts.

    Each line of the file is a JSON object; an item is the value of its
    ``"normalizedBody"`` key. Lines are fetched through ``linecache``.

    Indices wrap around modulo the dataset length, so ``dataset[len(dataset)]``
    is ``dataset[0]`` and negative indices count from the end. Because of the
    wrap-around, indexing only raises ``IndexError`` for an empty dataset.
    """

    def __init__(self, path_to_tldr_dataset):
        """Remember the file path; the file is not opened until it is needed."""
        self.fname = path_to_tldr_dataset
        self.len = None  # number of lines, computed on first use

    def __len__(self):
        """Return the number of lines in the file (counted once, then cached)."""
        if self.len is None:
            with open(self.fname) as f:
                self.len = sum(1 for _ in f)
        return self.len

    def __getitem__(self, i):
        """Return the ``"normalizedBody"`` of post ``i`` (modulo the length).

        Raises:
            IndexError: If the dataset file contains no lines.
        """
        num_posts = len(self)
        if num_posts == 0:
            # Without this guard the modulo below raises ZeroDivisionError.
            raise IndexError(
                f"Tried to retrieve sample at index {i}, but the dataset at "
                f"{self.fname!r} is empty."
            )
        # linecache line numbers are 1-based.
        line = linecache.getline(self.fname, lineno=(i % num_posts) + 1)
        post = json.loads(line)
        return post["normalizedBody"]


# Dataset over the sampled posts file; constructing it only stores the path,
# the file itself is read on the first len()/indexing call.
dataset = TLDRDataset(path_to_tldr_dataset="tldr/tldr.jsonl")

In [5]:
# Peek at a single post body to check what the dataset returns.
dataset[6]

"So I'm a guy (24) and finally found someone I'm really happy with, but more importantly, comfortable with. We're dating for a month now and we seem very close to each other and seem to be overall a bit similar. \n Still she's very experienced, having had 4 boyfriends already. Her having had more experience doesn't bother me, however my lack off scares me. I don't know how good I am, and I'm terrified that I just suck so much it will be a dealbreaker for her. \n To add to the situation, I don't really tolerate touches, because of a traumatic past. I've worked on this, but none of this was on my intimate zones. I'm scared that I'll tense up and panick and generally make sex a lot more difficult... \n Because this is my first real relationship, I don't really know what the best way to deal with this. \n tl;dr: I'm a virgin who, because of his past, can't really tolerate touches. I'm really scared my lack of skill and intimacy is going to be a big dealbreaker! \n"

In [6]:
tokenizer = transformers.GPT2Tokenizer.from_pretrained('gpt2')
# GPT-2 has no dedicated padding token, so reuse the end-of-text token. The pad
# token must be a token string: -100 is the *label* value the loss ignores, not
# a vocabulary entry, and must not be assigned here.
tokenizer.pad_token = tokenizer.eos_token

# Pass the same id to the model so generation pads with it as well.
model = transformers.GPT2LMHeadModel.from_pretrained('gpt2', pad_token_id=tokenizer.eos_token_id).to(DEVICE)

In [9]:
def generate_sample(model, prompt=None, generate_length=10):
    """Greedily continue a prompt with ``model`` and return the decoded text.

    Args:
        model: Causal language model exposing ``generate``; it must live on
            ``DEVICE``.
        prompt: Text to continue. Defaults to a fixed Reddit post ending in
            "TLDR" so that samples are comparable between calls.
        generate_length: At least this many and at most ten times this many
            new tokens are generated.

    Returns:
        The prompt followed by the generated continuation, as one string.
    """
    if prompt is None:
        prompt = "So I'm a guy (24) and finally found someone I'm really happy with, but more importantly, comfortable with. We're dating for a month now and we seem very close to each other and seem to be overall a bit similar. \n Still she's very experienced, having had 4 boyfriends already. Her having had more experience doesn't bother me, however my lack off scares me. I don't know how good I am, and I'm terrified that I just suck so much it will be a dealbreaker for her. \n To add to the situation, I don't really tolerate touches, because of a traumatic past. I've worked on this, but none of this was on my intimate zones. I'm scared that I'll tense up and panick and generally make sex a lot more difficult... \n Because this is my first real relationship, I don't really know what the best way to deal with this. \n TLDR"

    encoded = tokenizer(
        [prompt],
        max_length=256,
        padding="longest",
        truncation=True,
        return_tensors="pt",
    )
    input_ids = encoded.input_ids.to(DEVICE)
    attention_mask = encoded.attention_mask.to(DEVICE)

    # This function is also called from inside the training loop; switch to
    # eval mode so dropout does not perturb the sample, then restore the mode.
    was_training = model.training
    model.eval()
    try:
        response_ids = model.generate(
            input_ids,
            attention_mask=attention_mask,
            min_length=input_ids.shape[-1] + generate_length,
            max_length=input_ids.shape[-1] + 10 * generate_length,
            # Plain greedy decoding; this is what sampling with a temperature
            # of 1e-8 over the full vocabulary was emulating.
            do_sample=False,
        )
    finally:
        model.train(was_training)

    [decoded] = tokenizer.batch_decode(response_ids)

    return decoded


# Show what the current model generates for the built-in prompt.
generate_sample(model)

"So I'm a guy (24) and finally found someone I'm really happy with, but more importantly, comfortable with. We're dating for a month now and we seem very close to each other and seem to be overall a bit similar. \n Still she's very experienced, having had 4 boyfriends already. Her having had more experience doesn't bother me, however my lack off scares me. I don't know how good I am, and I'm terrified that I just suck so much it will be a dealbreaker for her. \n To add to the situation, I don't really tolerate touches, because of a traumatic past. I've worked on this, but none of this was on my intimate zones. I'm scared that I'll tense up and panick and generally make sex a lot more difficult... \n Because this is my first real relationship, I don't really know what the best way to deal with this. \n TLDR: I'm a guy (24) and finally found someone I'm really happy with, but more importantly, comfortable with. We're dating for a month now and we seem very close to each other and seem to

In [8]:
def collate_fn(batch):
    """Turn a list of raw post strings into one tokenized batch.

    Posts are truncated to at most 256 tokens and padded to the longest post in
    the batch; the tokenizer output is returned as PyTorch tensors.
    """
    tokenizer_options = {
        "return_tensors": "pt",
        "truncation": True,
        "padding": "longest",
        "max_length": 256,
    }
    encoded_batch = tokenizer(batch, **tokenizer_options)
    return encoded_batch

# Settings shared by both loaders; the held-out loader overrides "shuffle" below.
data_loader_config = {
    "batch_size": 16,
    "shuffle": True,
    "collate_fn": collate_fn,
}

# 95% of the posts are used for training, the remainder is held out.
num_train = int(0.95 * len(dataset))
num_test = len(dataset) - num_train

# Seed the split so the same posts are held out on every kernel restart;
# otherwise a checkpoint saved in one session would be evaluated on posts it
# was trained on in another.
data_train, data_val = torch.utils.data.random_split(
    dataset,
    (num_train, num_test),
    generator=torch.Generator().manual_seed(0),
)
train_data_loader = torch.utils.data.DataLoader(data_train, **data_loader_config)
# Keep the held-out batches in a fixed order so repeated evaluations see
# identical batches and their losses are directly comparable.
test_data_loader = torch.utils.data.DataLoader(data_val, **{**data_loader_config, "shuffle": False})

In [9]:
def train(
    model,
    train_data_loader,
    val_data_loader,
    epochs=1,
    lr=1e-3,
    comet_experiment=None,
    checkpoint_path="models/baseline.pt",
    save_every=1_000,
    val_every=10,
):
    """Fine-tune ``model`` as a causal language model with Adam.

    Args:
        model: Module returning an object with a ``.loss`` attribute when
            called as ``model(input_ids, attention_mask=..., labels=...)``.
        train_data_loader: Iterable of batches with ``"input_ids"`` and
            ``"attention_mask"`` tensors.
        val_data_loader: Same batch format; used for the periodic validation
            loss.
        epochs: Number of passes over ``train_data_loader``.
        lr: Adam learning rate.
        comet_experiment: Optional Comet experiment. When given, it is tagged
            and receives the training loss and a generated sample every step
            and the validation loss every ``val_every`` steps, and it is ended
            when training finishes. When ``None`` nothing is logged and the
            validation pass (whose only consumer is the logger) is skipped.
        checkpoint_path: File the model ``state_dict`` is saved to.
        save_every: Save a checkpoint every this many steps (including step 0).
        val_every: Compute the validation loss every this many steps.
    """
    # Run on whatever device the model already lives on.
    device = next(model.parameters()).device

    def batch_loss(inputs):
        """Language-modelling loss of one batch, ignoring padding positions."""
        input_ids = inputs["input_ids"].to(device)
        attention_mask = inputs["attention_mask"].to(device)
        # Label -100 is skipped by the loss; without this the model would also
        # be trained to predict padding tokens.
        labels = input_ids.masked_fill(attention_mask == 0, -100)
        return model(input_ids, attention_mask=attention_mask, labels=labels).loss

    model.train()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    if comet_experiment is not None:
        comet_experiment.add_tag("baseline_model")

    # torch.save does not create missing directories.
    checkpoint_dir = os.path.dirname(checkpoint_path)
    if checkpoint_dir:
        os.makedirs(checkpoint_dir, exist_ok=True)

    # Counts optimizer steps across epochs so that logged steps never repeat.
    global_step = 0
    for _ in range(epochs):
        for inputs in train_data_loader:
            optimizer.zero_grad()

            #  I    am    a    dog    [EOS]    <--- original
            #  I    am    a    dog             <--- inputs (shifted internally)
            #  am   a     dog  [EOS]           <--- targets (shifted internally)
            loss = batch_loss(inputs)
            loss.backward()
            optimizer.step()

            if comet_experiment is not None:
                comet_experiment.log_metric('train loss', float(loss), step=global_step)
                # Generate with dropout disabled, then resume training mode.
                model.eval()
                comet_experiment.log_text(generate_sample(model))
                model.train()

            if global_step % save_every == 0:
                torch.save(model.state_dict(), checkpoint_path)

            if comet_experiment is not None and global_step % val_every == 0:
                model.eval()
                val_loss = 0.0
                num_val_batches = 0
                with torch.no_grad():
                    for val_inputs in val_data_loader:
                        val_loss += float(batch_loss(val_inputs))
                        num_val_batches += 1
                model.train()

                if num_val_batches > 0:
                    comet_experiment.log_metric('val loss', val_loss / num_val_batches, step=global_step)

            global_step += 1

    if comet_experiment is not None:
        comet_experiment.end()


# Start a Comet run for the baseline model. The API key is read from the
# COMET_API_KEY environment variable; CPU and GPU environment logging are off.
experiment = comet_ml.Experiment(
    workspace="danesherbs",
    project_name="learning-to-summarise-using-human-feedback",
    api_key=os.environ.get("COMET_API_KEY"),
    log_env_gpu=False,
    log_env_cpu=False,
)

# Fine-tune with the held-out loader serving as the validation set.
train(
    model=model,
    train_data_loader=train_data_loader,
    val_data_loader=test_data_loader,
    lr=3e-5,
    comet_experiment=experiment,
)

COMET INFO: Experiment is live on comet.ml https://www.comet.ml/danesherbs/learning-to-summarise-using-human-feedback/2e4928ffe20343d5b4917f48f0c49317



In [None]:
# Peek at another post body from the dataset.
dataset[3]

In [None]:
def evaluate(model, data_loader, comet_experiment=None):
    """Compute the mean language-modelling loss of ``model`` over a loader.

    Args:
        model: Module returning an object with a ``.loss`` attribute when
            called with ``input_ids``, ``attention_mask`` and ``labels``.
        data_loader: Iterable of batches with ``"input_ids"`` and
            ``"attention_mask"`` tensors.
        comet_experiment: Optional Comet experiment. When given, each batch
            loss is logged as ``'batch test loss'``, the mean as
            ``'test loss'``, and the experiment is ended afterwards.

    Returns:
        The mean of the per-batch losses as a float, or ``nan`` when
        ``data_loader`` yields no batches.
    """
    model.eval()
    # Batches come off the loader on the CPU; move them to the model's device.
    device = next(model.parameters()).device
    total_loss = 0.0
    num_batches = 0

    with torch.no_grad():
        for inputs in data_loader:
            input_ids = inputs["input_ids"].to(device)
            attention_mask = inputs["attention_mask"].to(device)
            # Label -100 is skipped by the loss, so padding does not count.
            labels = input_ids.masked_fill(attention_mask == 0, -100)
            batch_loss = float(
                model(input_ids=input_ids, attention_mask=attention_mask, labels=labels).loss
            )
            total_loss += batch_loss
            num_batches += 1

            if comet_experiment is not None:
                # Log this batch's loss, not the running sum.
                comet_experiment.log_metric('batch test loss', batch_loss)

    mean_loss = total_loss / num_batches if num_batches else float("nan")

    if comet_experiment is not None:
        if num_batches:
            comet_experiment.log_metric('test loss', mean_loss)
        comet_experiment.end()

    return mean_loss


# evaluate(model, test_data_loader, comet_experiment=experiment)

In [None]:
# Scratch check: comparing a tensor with 0 gives an element-wise boolean mask
# that is True exactly where the entry is 0 (here only the last position).
import torch
torch.tensor([1, 1, 1, 0]) == 0