In [None]:
!pip install tiktoken
!pip install torchbnn

Collecting tiktoken
  Downloading tiktoken-0.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Downloading tiktoken-0.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.1 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m50.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tiktoken
Successfully installed tiktoken-0.7.0
Collecting torchbnn
  Downloading torchbnn-1.2-py3-none-any.whl.metadata (7.1 kB)
Downloading torchbnn-1.2-py3-none-any.whl (12 kB)
Installing collected packages: torchbnn
Successfully installed torchbnn-1.2


In [None]:
import inspect
import math
import matplotlib.pyplot as plt
import os
import pandas as pd
import requests
import tiktoken
import time
import torch
import torch.distributed as dist
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchbnn as bnn

from dataclasses import dataclass
from torch.distributed import init_process_group, destroy_process_group
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from transformers import GPT2LMHeadModel

## Model Architecture

In [None]:
@dataclass
class GPT2Config:
    """Hyperparameters used to build a ``BayesGPT`` model.

    The defaults are what the notebook pairs with the ``'gpt2'`` checkpoint; the
    medium and large variants override ``n_layer``, ``n_head`` and ``n_embd``.
    """
    # Maximum sequence length: number of rows in the position-embedding table and
    # side length of the causal mask.
    block_size: int = 1024
    # Number of rows in the token-embedding table and of output logits.
    vocab_size: int = 50257
    # Number of stacked transformer blocks.
    n_layer: int = 12
    # Number of attention heads; must divide n_embd.
    n_head: int = 12
    # Width of the embeddings / residual stream.
    n_embd: int = 768
    # Prior mean handed to every bnn.BayesLinear layer.
    prior_mu: float = 0.0
    # Prior standard deviation handed to every bnn.BayesLinear layer.
    prior_sigma: float = 0.0075

In [None]:
class BayesMultiHeadAttention(nn.Module):
    """Causal multi-head self-attention sub-layer.

    Despite the class name, both projections are plain ``nn.Linear`` layers in this
    version. When ``scaled_dot_product_attention`` is available it is used with
    ``is_causal=True``; otherwise attention is computed manually with a
    lower-triangular ``mask`` buffer.
    """

    def __init__(self, config: GPT2Config):
        super().__init__()
        assert config.n_embd % config.n_head == 0
        width = config.n_embd
        # One fused projection produces queries, keys and values side by side.
        self.c_attn = nn.Linear(width, 3 * width)
        self.c_proj = nn.Linear(width, width)
        self.n_head = config.n_head
        self.n_embd = width
        self.flash = hasattr(F, 'scaled_dot_product_attention')
        if not self.flash:
            # The explicit causal mask is only needed on the manual attention path.
            ctx = config.block_size
            causal = torch.tril(torch.ones(ctx, ctx).view(1, 1, ctx, ctx))
            self.register_buffer('mask', causal)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply causal self-attention to ``x`` of shape (B, T, C); returns (B, T, C)."""
        B, T, C = x.size()
        head_dim = C // self.n_head

        def to_heads(t: torch.Tensor) -> torch.Tensor:
            # (B, T, C) -> (B, n_head, T, head_dim)
            return t.view(B, T, self.n_head, head_dim).transpose(1, 2)

        q, k, v = (to_heads(part) for part in self.c_attn(x).split(self.n_embd, dim=2))

        if self.flash:
            y = F.scaled_dot_product_attention(q, k, v, attn_mask=None, dropout_p=0.0, is_causal=True)
        else:
            scale = 1.0 / math.sqrt(head_dim)
            scores = torch.matmul(q, k.transpose(-2, -1)) * scale
            # Positions after the query position are excluded before the softmax.
            scores = scores.masked_fill(self.mask[:, :, :T, :T] == 0, float('-inf'))
            y = torch.matmul(F.softmax(scores, dim=-1), v)

        # Merge the heads back into one feature dimension, then project.
        merged = y.transpose(1, 2).contiguous().view(B, T, C)
        return self.c_proj(merged)

In [None]:
class BayesMLP(nn.Module):
    """Feed-forward sub-layer built from two ``bnn.BayesLinear`` layers.

    Features are expanded from ``n_embd`` to ``4 * n_embd``, passed through a
    tanh-approximated GELU, and projected back to ``n_embd``.
    """

    def __init__(self, config: GPT2Config):
        super().__init__()
        hidden = 4 * config.n_embd
        # Both layers share the prior taken from the config.
        prior = dict(prior_mu=config.prior_mu, prior_sigma=config.prior_sigma)
        self.c_fc = bnn.BayesLinear(in_features=config.n_embd, out_features=hidden, **prior)
        self.gelu = nn.GELU(approximate='tanh')
        self.c_proj = bnn.BayesLinear(in_features=hidden, out_features=config.n_embd, **prior)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return ``c_proj(gelu(c_fc(x)))``."""
        return self.c_proj(self.gelu(self.c_fc(x)))

In [None]:
class BayesBlock(nn.Module):
    """Pre-LayerNorm transformer block: attention and MLP, each with a residual connection."""

    def __init__(self, config: GPT2Config):
        super().__init__()
        width = config.n_embd
        self.ln_1 = nn.LayerNorm(width)
        self.attn = BayesMultiHeadAttention(config)
        self.ln_2 = nn.LayerNorm(width)
        self.mlp = BayesMLP(config)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Add the attention output, then the MLP output, to the residual stream."""
        after_attn = x + self.attn(self.ln_1(x))
        return after_attn + self.mlp(self.ln_2(after_attn))

In [None]:
class BayesGPT(nn.Module):
    """GPT-2 style decoder-only language model with Bayesian MLP and output layers.

    The model is a token embedding plus a learned position embedding, followed by
    ``config.n_layer`` ``BayesBlock`` modules, a final LayerNorm and a
    ``bnn.BayesLinear`` head that maps to ``config.vocab_size`` logits.
    """

    def __init__(self, config: GPT2Config):
        super(BayesGPT, self).__init__()
        self.config = config

        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(config.vocab_size, config.n_embd),
            wpe = nn.Embedding(config.block_size, config.n_embd),
            h = nn.ModuleList([BayesBlock(config) for _ in range(config.n_layer)]),
            ln_f = nn.LayerNorm(config.n_embd),
        ))
        # The output head has its own parameters; it is not tied to `wte`.
        self.lm_head = bnn.BayesLinear(prior_mu=config.prior_mu, prior_sigma=config.prior_sigma, in_features=config.n_embd, out_features=config.vocab_size, bias=False)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Compute next-token logits.

        Args:
            x: Token ids of shape (B, T) with ``T <= config.block_size``.

        Returns:
            Logits of shape (B, T, config.vocab_size).
        """
        _, T = x.size()
        assert T <= self.config.block_size, f'Cannot forward sequence of length {T}, block size is only {self.config.block_size}'
        pos = torch.arange(0, T, dtype=torch.long, device=x.device)
        pos_emb = self.transformer.wpe(pos)
        tok_emb = self.transformer.wte(x)
        # pos_emb is (T, n_embd) and broadcasts over the batch dimension.
        x = tok_emb + pos_emb
        for block in self.transformer.h:
            x = block(x)
        x = self.transformer.ln_f(x)
        logits = self.lm_head(x)
        return logits

    def configure_optimizers(self, weight_decay: float, learning_rate: float, device_type: str) -> optim.AdamW:
        """Build an AdamW optimizer with weight decay on tensors of rank >= 2 only.

        Args:
            weight_decay: Decay applied to trainable parameters with ``dim() >= 2``.
            learning_rate: Initial learning rate for both parameter groups.
            device_type: ``'cuda'`` selects the fused AdamW kernel when this torch
                build exposes the ``fused`` argument.

        Returns:
            The configured ``optim.AdamW`` instance.
        """
        trainable = [p for _, p in self.named_parameters() if p.requires_grad]
        # NOTE(review): every rank >= 2 parameter is decayed, which includes whatever
        # matrices bnn.BayesLinear registers besides the mean weights; confirm that
        # decaying those is intended.
        decay_params = [p for p in trainable if p.dim() >= 2]
        nodecay_params = [p for p in trainable if p.dim() < 2]
        optim_groups = [
            {'params': decay_params, 'weight_decay': weight_decay},
            {'params': nodecay_params, 'weight_decay': 0.0}
        ]
        fused_available = 'fused' in inspect.signature(optim.AdamW).parameters
        use_fused = fused_available and device_type == 'cuda'
        # Only pass `fused` when AdamW accepts it; passing it unconditionally made the
        # availability probe pointless and raised TypeError on builds without it.
        extra_args = {'fused': use_fused} if fused_available else {}
        optimizer = optim.AdamW(optim_groups, lr=learning_rate, betas=(0.9, 0.95), eps=1e-8, **extra_args)
        return optimizer

    def load_pretrained_model(self, pretrained_model: nn.Module) -> None:
        """Copy the weights of a pretrained GPT-2 style model into this model.

        Name mapping applied to ``pretrained_model.state_dict()``:

        * ``lm_head.weight`` goes to ``lm_head.weight_mu``.
        * ``mlp.c_fc`` / ``mlp.c_proj`` weights and biases go to the matching
          ``*_mu`` entries.
        * ``attn.c_attn`` / ``attn.c_proj`` keep their names.
        * Weights of those four linear layers are transposed before copying.
        * Entries whose name contains ``'logvar'`` or ``'mask'``, and per-block
          ``attn.bias`` buffers, are skipped.
        * Everything else is copied under the same name.

        Only the entries listed above are written; any other parameter of this
        model keeps its initial value.

        Args:
            pretrained_model: Source model, e.g. a ``GPT2LMHeadModel``.

        Raises:
            KeyError: If a source entry has no counterpart in this model.
        """
        # state_dict() rebuilds the whole mapping on every call, so build it once.
        # Its tensors share storage with the live parameters, so copy_ updates the model.
        own_state = self.state_dict()
        with torch.no_grad():
            for name, param in pretrained_model.state_dict().items():
                if 'logvar' in name or 'mask' in name:
                    continue
                if name.endswith('.attn.bias'):
                    # Some transformers releases keep the causal-mask buffer of each
                    # attention layer in the state dict; this model has no such entry.
                    continue

                is_attn_linear = 'attn.c_attn.' in name or 'attn.c_proj.' in name
                is_mlp_linear = 'mlp.c_fc.' in name or 'mlp.c_proj.' in name

                if name == 'lm_head.weight':
                    target = 'lm_head.weight_mu'
                elif is_mlp_linear:
                    # weight -> weight_mu, bias -> bias_mu
                    target = f'{name}_mu'
                else:
                    target = name
                # The pretrained checkpoint stores these weights transposed relative
                # to the layers used here, so they are flipped on the way in.
                transpose = (is_attn_linear or is_mlp_linear) and name.endswith('.weight')

                if target not in own_state:
                    raise KeyError(f'No parameter {target!r} in BayesGPT for pretrained entry {name!r}')
                own_state[target].copy_(param.T if transpose else param)

## Data Processing

In [None]:
tokenizer = tiktoken.get_encoding('gpt2')

# A RANK environment variable marks a distributed (DDP) launch.
ddp = int(os.environ.get('RANK', -1)) != -1
if ddp:
    assert torch.cuda.is_available()
    init_process_group(backend='nccl')
    ddp_rank = int(os.environ['RANK'])
    ddp_local_rank = int(os.environ['LOCAL_RANK'])
    ddp_world_size = int(os.environ['WORLD_SIZE'])
    device = f'cuda:{ddp_local_rank}'
    torch.cuda.set_device(device)
    # Only rank 0 is meant to do logging and checkpointing.
    master_process = ddp_rank == 0
else:
    ddp_rank = 0
    ddp_local_rank = 0
    ddp_world_size = 1
    master_process = True
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print(f'Using device: {device}')

# Derive the autocast device type from the device chosen above, and keep the
# per-rank index (cuda:N) rather than replacing it with an index-less device.
device_type = 'cuda' if device.startswith('cuda') else 'cpu'
device = torch.device(device)

Using device: cuda


In [None]:
class QADataset(Dataset):
    """Next-token-prediction dataset over a pandas Series of text examples.

    Each item is a pair ``(input_ids, target_ids)`` of equal length, where
    ``target_ids`` is ``input_ids`` shifted left by one position and completed with
    ``stop_token``.

    Args:
        texts: One string per example; indexed positionally with ``.iloc``.
        tokenizer: Object whose ``encode(text)`` returns a list of token ids.
        stop_token: Token id used as the target of the last input token.
        max_length: Optional cap on the number of tokens per example. When set,
            inputs and targets are both cut to this length *after* shifting, so the
            last target is the true next token rather than ``stop_token``. Pass the
            model's block size to avoid its sequence-length assertion. ``None``
            (default) keeps every token.

    Raises:
        ValueError: If ``max_length`` is given and is not positive.
    """

    def __init__(self, texts: pd.Series, tokenizer: tiktoken.Encoding, stop_token: int, max_length: int | None = None):
        if max_length is not None and max_length <= 0:
            raise ValueError(f'max_length must be positive, got {max_length}')
        self.texts = texts
        self.tokenizer = tokenizer
        self.stop_token = stop_token
        self.max_length = max_length

    def __len__(self) -> int:
        return len(self.texts)

    def __getitem__(self, idx: int) -> tuple[torch.Tensor, torch.Tensor]:
        text = self.texts.iloc[idx]
        tokens = self.tokenizer.encode(text)
        input_ids = tokens
        target_ids = tokens[1:] + [self.stop_token]
        if self.max_length is not None:
            input_ids = input_ids[:self.max_length]
            target_ids = target_ids[:self.max_length]
        return torch.tensor(input_ids, dtype=torch.long), torch.tensor(target_ids, dtype=torch.long)

In [None]:
squad_df = pd.read_parquet('hf://datasets/rajpurkar/squad/plain_text/train-00000-of-00001.parquet')

# One training string per row: "<context> <question> <first answer text>".
contexts = squad_df['context'].map(str.strip)
questions = squad_df['question'].map(str.strip)
first_answers = squad_df['answers'].map(lambda answer: answer['text'][0])
qa_df = contexts + ' ' + questions + ' ' + first_answers

qa_dataset = QADataset(qa_df, tokenizer, tokenizer.eot_token)
# Examples are fed one at a time: the GPT-2 tokenizer has no pad token, so
# variable-length sequences cannot be batched together as-is.
qa_loader = DataLoader(qa_dataset, batch_size=1, shuffle=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


## Model Finetuning

In [None]:
use_large = False
use_medium = True

# Pick the Hugging Face checkpoint and the matching architecture; `use_large`
# takes precedence over `use_medium`, and the fallback is the base model.
if use_large:
    checkpoint_name = 'gpt2-large'
    bayes_config = GPT2Config(n_layer=36, n_head=20, n_embd=1280)
elif use_medium:
    checkpoint_name = 'gpt2-medium'
    bayes_config = GPT2Config(n_layer=24, n_head=16, n_embd=1024)
else:
    checkpoint_name = 'gpt2'
    bayes_config = GPT2Config()

pretrained_model = GPT2LMHeadModel.from_pretrained(checkpoint_name)
model = BayesGPT(bayes_config)

# Initialise the Bayesian model from the pretrained weights.
model.load_pretrained_model(pretrained_model)

config.json:   0%|          | 0.00/718 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [None]:
torch.set_float32_matmul_precision('high')

# Move the model before wrapping it and before building the optimizer:
# DDP with `device_ids` expects the module to already live on that GPU, and a
# fused AdamW expects parameters on a supported device when it is constructed.
model.to(device)

if ddp:
    model = DDP(model, device_ids=[ddp_local_rank])
# `raw_model` is the unwrapped module, used for optimizer setup and checkpoints.
raw_model = model.module if ddp else model

weight_decay = 0.1
learning_rate = 6e-4

optimizer = raw_model.configure_optimizers(weight_decay, learning_rate, device_type)
# Training objective = token cross-entropy + weighted KL term of the Bayesian layers.
ce_criterion = nn.CrossEntropyLoss()
kl_criterion = bnn.BKLLoss(reduction='mean', last_layer_only=False)

In [None]:
def get_lr(it: int, max_lr: float, min_lr: float, warmup_steps: int, max_steps: int) -> float:
    """Learning rate at step ``it``: linear warmup followed by cosine decay.

    Args:
        it: Zero-based optimizer step.
        max_lr: Peak learning rate, reached at the end of warmup.
        min_lr: Floor returned from ``max_steps`` onwards.
        warmup_steps: Number of linear warmup steps (0 disables warmup).
        max_steps: Step at which the cosine decay reaches ``min_lr``.

    Returns:
        The learning rate to use for step ``it``.
    """
    if it < warmup_steps:
        return max_lr * (it + 1) / warmup_steps
    # `>=` rather than `>`: at it == max_steps the cosine term is exactly zero, so
    # the value is unchanged, but this avoids dividing by zero when
    # max_steps == warmup_steps.
    if it >= max_steps:
        return min_lr
    decay_ratio = (it - warmup_steps) / (max_steps - warmup_steps)
    assert 0 <= decay_ratio <= 1
    # coeff goes from 1 down to 0 as decay_ratio goes from 0 to 1.
    coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio))
    return min_lr + coeff * (max_lr - min_lr)

In [None]:
# Logging location; the log file is created (or emptied) at the start of a run.
log_dir = 'log'
os.makedirs(log_dir, exist_ok=True)
log_file = os.path.join(log_dir, 'log.txt')
open(log_file, 'w').close()

# Weight of the KL term relative to the cross-entropy loss.
kl_weight = 0.01

# Learning-rate schedule: no warmup, decay from max_lr to 10% of it over max_steps.
max_lr = 6e-4
min_lr = max_lr * 0.1
warmup_steps, max_steps = 0, 300

# Micro-batches accumulated per optimizer step.
grad_accum_steps = 64

model.to(device)

BayesGPT(
  (transformer): ModuleDict(
    (wte): Embedding(50257, 1024)
    (wpe): Embedding(1024, 1024)
    (h): ModuleList(
      (0-23): 24 x BayesBlock(
        (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (attn): BayesMultiHeadAttention(
          (c_attn): Linear(in_features=1024, out_features=3072, bias=True)
          (c_proj): Linear(in_features=1024, out_features=1024, bias=True)
        )
        (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (mlp): BayesMLP(
          (c_fc): BayesLinear(prior_mu=0.0, prior_sigma=0.0075, in_features=1024, out_features=4096, bias=True)
          (gelu): GELU(approximate='tanh')
          (c_proj): BayesLinear(prior_mu=0.0, prior_sigma=0.0075, in_features=4096, out_features=1024, bias=True)
        )
      )
    )
    (ln_f): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): BayesLinear(prior_mu=0.0, prior_sigma=0.0075, in_features=1024, out_features=50257, bias=True)
)

In [None]:
for step in range(max_steps):
    last_step = (step == max_steps - 1)

    # Checkpoints are written by the master process only, so DDP ranks do not
    # overwrite the same file concurrently.
    # NOTE: the save happens before this step's update, so `model_{step}` holds
    # the weights after `step` updates and the final update is never saved.
    if master_process and step > 0 and (step % 100 == 0 or last_step):
        checkpoint_path = os.path.join(log_dir, f'model_{step:05d}.pt')
        torch.save(raw_model.state_dict(), checkpoint_path)

    model.train()
    optimizer.zero_grad()
    ce_loss_accum = 0.0
    kl_loss_accum = 0.0
    loss_accum = 0.0
    t0 = time.time()
    # `range` comes first in the zip so that iteration stops after exactly
    # `grad_accum_steps` micro-batches without fetching an extra one.
    # Each optimizer step starts a fresh, reshuffled pass over the loader.
    for grad_accum_step, (x, y) in zip(range(1, grad_accum_steps + 1), qa_loader):
        x, y = x.to(device), y.to(device)
        if ddp:
            # The counter is 1-based, so the last micro-batch is number
            # `grad_accum_steps`; gradients are all-reduced on that backward only.
            model.require_backward_grad_sync = (grad_accum_step == grad_accum_steps)
        with torch.autocast(device_type=device_type, dtype=torch.bfloat16):
            logits = model(x)
            ce_loss = ce_criterion(logits.view(-1, logits.size(-1)), y.view(-1))
            kl_loss = kl_weight * kl_criterion(model)
        # Scale so the accumulated gradient is the mean over the micro-batches.
        ce_loss /= grad_accum_steps
        kl_loss /= grad_accum_steps
        ce_loss_accum += ce_loss.detach()
        kl_loss_accum += kl_loss.detach()
        loss = ce_loss + kl_loss
        loss_accum += loss.detach()
        loss.backward()
    if ddp:
        # Average all three reported losses across ranks so the log line is consistent.
        for accum in (ce_loss_accum, kl_loss_accum, loss_accum):
            dist.all_reduce(accum, op=dist.ReduceOp.AVG)
    norm = torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
    lr = get_lr(step, max_lr, min_lr, warmup_steps, max_steps)
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr
    optimizer.step()
    if device_type == 'cuda':
        # Wait for queued GPU work so the timing below covers the whole step.
        torch.cuda.synchronize()
    t1 = time.time()
    dt = t1 - t0
    if master_process:
        log_line = f'step {step:5d} | ce_loss: {ce_loss_accum.item():.6f} | kl_loss: {kl_loss_accum.item():.6f} | loss: {loss_accum.item():.6f} | lr {lr:.4e} | norm: {norm:.4f} | dt: {dt * 1000:.2f}ms'
        print(log_line)
        with open(log_file, 'a') as f:
            f.write(log_line + '\n')

if ddp:
    destroy_process_group()

step     0 | ce_loss: 4.741160 | kl_loss: 1.045498 | loss: 5.786659 | lr 6.0000e-04 | norm: 10.6107 | dt: 8438.12ms
step     1 | ce_loss: 6.284096 | kl_loss: 1.044902 | loss: 7.328999 | lr 5.9999e-04 | norm: 33.4092 | dt: 6707.45ms
step     2 | ce_loss: 4.295778 | kl_loss: 1.044437 | loss: 5.340215 | lr 5.9994e-04 | norm: 6.1617 | dt: 6779.15ms
step     3 | ce_loss: 5.526963 | kl_loss: 1.043927 | loss: 6.570889 | lr 5.9987e-04 | norm: 26.8878 | dt: 6578.37ms
step     4 | ce_loss: 3.559840 | kl_loss: 1.043527 | loss: 4.603368 | lr 5.9976e-04 | norm: 1.9570 | dt: 6560.51ms
step     5 | ce_loss: 3.488786 | kl_loss: 1.043060 | loss: 4.531846 | lr 5.9963e-04 | norm: 1.2526 | dt: 6616.60ms
step     6 | ce_loss: 3.279382 | kl_loss: 1.042515 | loss: 4.321898 | lr 5.9947e-04 | norm: 1.1324 | dt: 6604.24ms
step     7 | ce_loss: 3.291870 | kl_loss: 1.041927 | loss: 4.333797 | lr 5.9927e-04 | norm: 0.8394 | dt: 6651.13ms
step     8 | ce_loss: 3.212125 | kl_loss: 1.041303 | loss: 4.253428 | lr 5.99

## Model Inference

In [None]:
def generate_text(model: nn.Module,
                  tokenizer: tiktoken.Encoding,
                  prompt: str,
                  stop_token: int,
                  max_length: int = 30,
                  temperature: float = 0.0,
                  device: str = 'cpu'
                  ) -> str:
    """Autoregressively continue ``prompt`` and return only the newly generated text.

    The model is switched to eval mode and moved to ``device``. At most
    ``max_length`` tokens are produced; generation ends early when ``stop_token``
    is predicted (the stop token itself is not included in the result).

    Args:
        model: Callable on a (1, T) tensor of token ids. It may return the logits
            tensor directly or an object exposing them as ``.logits``.
        tokenizer: Provides ``encode`` and ``decode``.
        prompt: Text to condition on.
        stop_token: Token id that terminates generation.
        max_length: Maximum number of new tokens.
        temperature: 0 selects the arg-max token; a positive value samples from
            the softmax of the logits divided by it.
        device: Device on which the model and tokens are placed.

    Returns:
        The decoded continuation with newlines replaced by spaces and outer
        whitespace stripped.
    """
    assert temperature >= 0.0, 'Temperature must be non-negative'
    model.eval()
    model.to(device)

    prompt_ids = torch.tensor(tokenizer.encode(prompt), dtype=torch.long)
    sequence = prompt_ids.unsqueeze(0).to(device)
    prompt_length = sequence.size(1)
    sample = temperature > 0.0

    with torch.no_grad():
        for _ in range(max_length):
            output = model(sequence)
            # Works with both this notebook's model (tensor) and models that wrap
            # their logits in an output object.
            if type(output) != torch.Tensor:
                output = output.logits
            last_logits = output[:, -1, :]

            if sample:
                probabilities = torch.softmax(last_logits / temperature, dim=-1)
                next_token = torch.multinomial(probabilities, num_samples=1)
            else:
                next_token = last_logits.argmax(dim=-1, keepdim=True)

            if next_token.item() == stop_token:
                break
            sequence = torch.cat((sequence, next_token), dim=1)

    # Decode the continuation only, not the prompt.
    continuation = sequence[0, prompt_length:].tolist()
    return tokenizer.decode(continuation).replace('\n', ' ').strip()

In [None]:
def load_model_from_path(path: str, device: str) -> nn.Module:
    """Build a ``BayesGPT`` of the right size for ``path`` and load its weights.

    The architecture is chosen from the directory the checkpoint lives in, relative
    to the current working directory: ``models/gpt2/`` selects the default config
    and ``models/gpt2-medium/`` the 24-layer, 16-head, 1024-wide config.

    Args:
        path: Checkpoint path; must start with one of the two directories above.
        device: ``map_location`` passed to ``torch.load``.

    Returns:
        The model with the checkpoint's state dict loaded.

    Raises:
        AssertionError: If ``path`` is under neither model directory.
    """
    models_root = f'{os.getcwd()}/models'
    # (directory prefix, GPT2Config overrides) for every supported model size.
    presets = (
        (f'{models_root}/gpt2/', {}),
        (f'{models_root}/gpt2-medium/', dict(n_layer=24, n_head=16, n_embd=1024)),
    )
    for prefix, overrides in presets:
        if path.startswith(prefix):
            model = BayesGPT(GPT2Config(**overrides))
            # NOTE(security): torch.load unpickles the file; only load checkpoints
            # from a trusted source.
            model.load_state_dict(torch.load(path, map_location=device))
            return model
    # Raised explicitly (same exception type as before) so the check still runs
    # when Python strips `assert` statements under -O.
    raise AssertionError(f'No models found in {path}')

In [None]:
# Inference runs on the GPU when one is present, otherwise on the CPU.
device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'

tokenizer = tiktoken.get_encoding('gpt2')

# Rebuild the medium-sized architecture and restore the checkpoint saved at step 100.
model = BayesGPT(GPT2Config(n_layer=24, n_head=16, n_embd=1024))
checkpoint_state = torch.load('/content/log/model_00100.pt', map_location=device)
model.load_state_dict(checkpoint_state)

  model.load_state_dict(torch.load('/content/log/model_00100.pt', map_location=device))


<All keys matched successfully>

In [None]:
# Unmodified GPT-2 checkpoints used as baselines in the comparisons below.
pretrained_model, pretrained_model_large = (
    GPT2LMHeadModel.from_pretrained(checkpoint_name)
    for checkpoint_name in ('gpt2', 'gpt2-large')
)

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/666 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.25G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [None]:
# Same question asked with and without the supporting context sentence.
prompt_with_context = 'X is the sixth studio album by American singer Chris Brown. It was released on September 16, 2014. When was X released?'
prompt_without_context = 'When was X released?'
generation_kwargs = dict(max_length=20, temperature=0, device=device)

for position, user_prompt in enumerate((prompt_with_context, prompt_without_context)):
    if position > 0:
        print('----------------------------------------------------------------------------------')
    generated_text_bayes = generate_text(model, tokenizer, user_prompt, tokenizer.eot_token, **generation_kwargs)
    generated_text_pretrained = generate_text(pretrained_model, tokenizer, user_prompt, tokenizer.eot_token, **generation_kwargs)
    generated_text_pretrained_large = generate_text(pretrained_model_large, tokenizer, user_prompt, tokenizer.eot_token, **generation_kwargs)
    print('\n'.join([
        f'> User:                    {user_prompt}',
        f'> Bayes Assistant:         {generated_text_bayes}',
        f'> GPT2 Assistant:          {generated_text_pretrained}',
        f'> GPT2-Large Assistant:    {generated_text_pretrained_large}',
    ]))

> User:                    X is the sixth studio album by American singer Chris Brown. It was released on September 16, 2014. When was X released?
> Bayes Assistant:         September 16, 2014
> GPT2 Assistant:          Chris Brown: I was born on September 16, 1979 in New York City. I was
> GPT2-Large Assistant:    Chris Brown's X was released on September 16, 2014. When was X released?
----------------------------------------------------------------------------------
> User:                    When was X released?
> Bayes Assistant:         November, 2004
> GPT2 Assistant:          X was released on May 1st, 2013.  What is the difference between X
> GPT2-Large Assistant:    X was released on the 1st of January, 2013.  What platforms is it


In [None]:
# Same question asked with and without the supporting context sentence.
prompt_with_context = 'Damon was born in Atlanta, GA on July 17, 2002. Where was Damon born?'
prompt_without_context = 'Where was Damon born?'
generation_kwargs = dict(max_length=20, temperature=0, device=device)

for position, user_prompt in enumerate((prompt_with_context, prompt_without_context)):
    if position > 0:
        print('----------------------------------------------------------------------------------')
    generated_text_bayes = generate_text(model, tokenizer, user_prompt, tokenizer.eot_token, **generation_kwargs)
    generated_text_pretrained = generate_text(pretrained_model, tokenizer, user_prompt, tokenizer.eot_token, **generation_kwargs)
    generated_text_pretrained_large = generate_text(pretrained_model_large, tokenizer, user_prompt, tokenizer.eot_token, **generation_kwargs)
    print('\n'.join([
        f'> User:                    {user_prompt}',
        f'> Bayes Assistant:         {generated_text_bayes}',
        f'> GPT2 Assistant:          {generated_text_pretrained}',
        f'> GPT2-Large Assistant:    {generated_text_pretrained_large}',
    ]))

> User:                    Damon was born in Atlanta, GA on July 17, 2002. Where was Damon born?
> Bayes Assistant:         Atlanta, GA
> GPT2 Assistant:          Damon was born in Atlanta, GA on July 17, 2002. Where was Damon born
> GPT2-Large Assistant:    Damon was born in Atlanta, GA. Damon was born in Atlanta, GA. Damon
----------------------------------------------------------------------------------
> User:                    Where was Damon born?
> Bayes Assistant:         Damon, Connecticut
> GPT2 Assistant:          Damon was born in the United States on January 1, 1973. He was born in
> GPT2-Large Assistant:    He was born in the United States.  What was his childhood like?


In [None]:
# Same question asked with and without the supporting context sentence.
prompt_with_context = 'Lebron James was born in Akron, Ohio. Where was Lebron James born?'
prompt_without_context = 'Where was Lebron James born?'
generation_kwargs = dict(max_length=20, temperature=0, device=device)

for position, user_prompt in enumerate((prompt_with_context, prompt_without_context)):
    if position > 0:
        print('----------------------------------------------------------------------------------')
    generated_text_bayes = generate_text(model, tokenizer, user_prompt, tokenizer.eot_token, **generation_kwargs)
    generated_text_pretrained = generate_text(pretrained_model, tokenizer, user_prompt, tokenizer.eot_token, **generation_kwargs)
    generated_text_pretrained_large = generate_text(pretrained_model_large, tokenizer, user_prompt, tokenizer.eot_token, **generation_kwargs)
    print('\n'.join([
        f'> User:                    {user_prompt}',
        f'> Bayes Assistant:         {generated_text_bayes}',
        f'> GPT2 Assistant:          {generated_text_pretrained}',
        f'> GPT2-Large Assistant:    {generated_text_pretrained_large}',
    ]))

> User:                    Lebron James was born in Akron, Ohio. Where was Lebron James born?
> Bayes Assistant:         Ohio
> GPT2 Assistant:          Lebron James: Akron, Ohio.  James: I was born in Akron
> GPT2-Large Assistant:    Akron, Ohio. Where is Lebron James from? Akron, Ohio. Where is Leb
----------------------------------------------------------------------------------
> User:                    Where was Lebron James born?
> Bayes Assistant:         New York
> GPT2 Assistant:          Lebron James was born in 1829 in the town of Lebron, England
> GPT2-Large Assistant:    Lebron James was born in Akron, Ohio on June 12, 1984.


In [None]:
# Same question asked with and without the supporting context sentence.
prompt_with_context = 'Michael Jordan was a basketball player on the Chicago Bulls. Which sport did Michael Jordan play?'
prompt_without_context = 'Which sport did Michael Jordan play?'
generation_kwargs = dict(max_length=20, temperature=0, device=device)

for position, user_prompt in enumerate((prompt_with_context, prompt_without_context)):
    if position > 0:
        print('----------------------------------------------------------------------------------')
    generated_text_bayes = generate_text(model, tokenizer, user_prompt, tokenizer.eot_token, **generation_kwargs)
    generated_text_pretrained = generate_text(pretrained_model, tokenizer, user_prompt, tokenizer.eot_token, **generation_kwargs)
    generated_text_pretrained_large = generate_text(pretrained_model_large, tokenizer, user_prompt, tokenizer.eot_token, **generation_kwargs)
    print('\n'.join([
        f'> User:                    {user_prompt}',
        f'> Bayes Assistant:         {generated_text_bayes}',
        f'> GPT2 Assistant:          {generated_text_pretrained}',
        f'> GPT2-Large Assistant:    {generated_text_pretrained_large}',
    ]))

> User:                    Michael Jordan was a basketball player on the Chicago Bulls. Which sport did Michael Jordan play?
> Bayes Assistant:         basketball
> GPT2 Assistant:          Basketball.  Michael Jordan was a basketball player on the Chicago Bulls. Which sport did Michael Jordan
> GPT2-Large Assistant:    Michael Jordan played basketball.  Michael Jordan played basketball.  Michael Jordan played basketball
----------------------------------------------------------------------------------
> User:                    Which sport did Michael Jordan play?
> Bayes Assistant:         basketball
> GPT2 Assistant:          I don't know. I don't know. I don't know. I don't
> GPT2-Large Assistant:    Michael Jordan played basketball.  What was the name of the first basketball game played?


In [None]:
# Counterfactual context: checks whether each model answers from the prompt itself.
prompt = 'Elon Musk is the president in 2030. Who is the president in 2030?'
generation_kwargs = dict(max_length=20, temperature=0, device=device)

generated_text_bayes = generate_text(model, tokenizer, prompt, tokenizer.eot_token, **generation_kwargs)
generated_text_pretrained = generate_text(pretrained_model, tokenizer, prompt, tokenizer.eot_token, **generation_kwargs)
generated_text_pretrained_large = generate_text(pretrained_model_large, tokenizer, prompt, tokenizer.eot_token, **generation_kwargs)

print('\n'.join([
    f'> User:                    {prompt}',
    f'> Bayes Assistant:         {generated_text_bayes}',
    f'> GPT2 Assistant:          {generated_text_pretrained}',
    f'> GPT2-Large Assistant:    {generated_text_pretrained_large}',
]))

> User:                    Elon Musk is the president in 2030. Who is the president in 2030?
> Bayes Assistant:         Elon Musk
> GPT2 Assistant:          The president in 2030 is the president in 2030.  The president in 2030 is the
> GPT2-Large Assistant:    The president in 2030 is Elon Musk.  The president in 2030 is Elon Musk.


In [None]:
# Benchmark 1, prompts that already contain the answer: one prompt per line.
with open(f'{os.getcwd()}/benchmarks/questions_answers1.txt', 'r') as f:
    prompts = [line.strip() for line in f]

# Create the output directory up front (as is done for the log directory) so the
# run does not fail with FileNotFoundError on a fresh checkout.
output_dir = f'{os.getcwd()}/outputs'
os.makedirs(output_dir, exist_ok=True)

# Each prompt is answered this many times.
# NOTE(review): presumably repeated because the Bayesian layers resample their
# weights on every forward pass, so greedy decoding can still vary - confirm
# against torchbnn.
samples_per_prompt = 25

with open(f'{output_dir}/questions_answers1_response.txt', 'w') as f:
    for prompt in tqdm(prompts):
        f.write(f'> User:                    {prompt}\n')
        for _ in range(samples_per_prompt):
            generated_text_bayes = generate_text(model, tokenizer, prompt, tokenizer.eot_token, max_length=25, temperature=0, device=device)
            f.write(f'> Bayes Assistant:         {generated_text_bayes}\n')
        f.write(f'----------------------------------------------------------------------------------\n')

100%|██████████| 50/50 [07:59<00:00,  9.59s/it]


In [None]:
# Benchmark 1, questions only (no answer in the prompt): one prompt per line.
with open(f'{os.getcwd()}/benchmarks/questions1.txt', 'r') as f:
    prompts = [line.strip() for line in f]

# Create the output directory up front (as is done for the log directory) so the
# run does not fail with FileNotFoundError on a fresh checkout.
output_dir = f'{os.getcwd()}/outputs'
os.makedirs(output_dir, exist_ok=True)

# Each prompt is answered this many times.
# NOTE(review): presumably repeated because the Bayesian layers resample their
# weights on every forward pass, so greedy decoding can still vary - confirm
# against torchbnn.
samples_per_prompt = 25

with open(f'{output_dir}/questions_no_answers1_response.txt', 'w') as f:
    for prompt in tqdm(prompts):
        f.write(f'> User:                    {prompt}\n')
        for _ in range(samples_per_prompt):
            generated_text_bayes = generate_text(model, tokenizer, prompt, tokenizer.eot_token, max_length=25, temperature=0, device=device)
            f.write(f'> Bayes Assistant:         {generated_text_bayes}\n')
        f.write(f'----------------------------------------------------------------------------------\n')

100%|██████████| 50/50 [08:00<00:00,  9.62s/it]


In [37]:
# Benchmark 2, answers and questions stored in separate, line-aligned files.
with open(f'{os.getcwd()}/benchmarks/answers2.txt', 'r') as f:
    answers = [line.strip() for line in f]

with open(f'{os.getcwd()}/benchmarks/questions2.txt', 'r') as f:
    questions = [line.strip() for line in f]

# Each prompt is "<answer statement> <question>". `zip` pairs lines by position and
# stops at the shorter file, so the two files are expected to have equal length.
prompts = [answer + ' ' + question for answer, question in zip(answers, questions)]

# Create the output directory up front (as is done for the log directory) so the
# run does not fail with FileNotFoundError on a fresh checkout.
output_dir = f'{os.getcwd()}/outputs'
os.makedirs(output_dir, exist_ok=True)

# Each prompt is answered this many times.
# NOTE(review): presumably repeated because the Bayesian layers resample their
# weights on every forward pass, so greedy decoding can still vary - confirm
# against torchbnn.
samples_per_prompt = 25

with open(f'{output_dir}/questions_answers2_response.txt', 'w') as f:
    for prompt in tqdm(prompts):
        f.write(f'> User:                    {prompt}\n')
        for _ in range(samples_per_prompt):
            generated_text_bayes = generate_text(model, tokenizer, prompt, tokenizer.eot_token, max_length=25, temperature=0, device=device)
            f.write(f'> Bayes Assistant:         {generated_text_bayes}\n')
        f.write(f'----------------------------------------------------------------------------------\n')

100%|██████████| 50/50 [01:48<00:00,  2.17s/it]


In [38]:
# Benchmark 2, questions only (no answer in the prompt): one prompt per line.
with open(f'{os.getcwd()}/benchmarks/questions2.txt', 'r') as f:
    prompts = [line.strip() for line in f]

# Create the output directory up front (as is done for the log directory) so the
# run does not fail with FileNotFoundError on a fresh checkout.
output_dir = f'{os.getcwd()}/outputs'
os.makedirs(output_dir, exist_ok=True)

# Each prompt is answered this many times.
# NOTE(review): presumably repeated because the Bayesian layers resample their
# weights on every forward pass, so greedy decoding can still vary - confirm
# against torchbnn.
samples_per_prompt = 25

with open(f'{output_dir}/questions_no_answers2_response.txt', 'w') as f:
    for prompt in tqdm(prompts):
        f.write(f'> User:                    {prompt}\n')
        for _ in range(samples_per_prompt):
            generated_text_bayes = generate_text(model, tokenizer, prompt, tokenizer.eot_token, max_length=25, temperature=0, device=device)
            f.write(f'> Bayes Assistant:         {generated_text_bayes}\n')
        f.write(f'----------------------------------------------------------------------------------\n')

100%|██████████| 50/50 [01:23<00:00,  1.67s/it]
