In [1]:
import inspect
import math
import matplotlib.pyplot as plt
import os
import pandas as pd
import requests
import tiktoken
import time
import torch
import torch.distributed as dist
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchbnn as bnn

from dataclasses import dataclass
from torch.distributed import init_process_group, destroy_process_group
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from transformers import GPT2LMHeadModel

  from .autonotebook import tqdm as notebook_tqdm


## Model Architecture

In [2]:
@dataclass
class GPT2Config:
    """Size hyperparameters for the GPT-2 style model built in this notebook.

    The defaults are the values this notebook pairs with the pretrained
    'gpt2' checkpoint; larger checkpoints override n_layer/n_head/n_embd.
    """
    block_size: int = 1024  # maximum sequence length; sizes the position embedding table
    vocab_size: int = 50257  # number of rows in the token embedding table / output logits
    n_layer: int = 12  # number of transformer blocks
    n_head: int = 12  # attention heads per block; must divide n_embd evenly
    n_embd: int = 768  # embedding / hidden width

In [3]:
class BayesMultiHeadAttention(nn.Module):
    """Causal multi-head self-attention with a fused q/k/v projection.

    Both projections are plain ``nn.Linear`` layers (not ``bnn.BayesLinear``).
    When ``scaled_dot_product_attention`` is available it is used; otherwise
    attention is computed manually with a lower-triangular ``mask`` buffer.
    """

    def __init__(self, config: GPT2Config):
        super().__init__()
        assert config.n_embd % config.n_head == 0
        # One matmul produces queries, keys and values side by side.
        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd)
        # Output projection applied after the heads are merged back together.
        self.c_proj = nn.Linear(config.n_embd, config.n_embd)
        self.n_head = config.n_head
        self.n_embd = config.n_embd
        self.flash = hasattr(F, 'scaled_dot_product_attention')
        if not self.flash:
            # The causal mask is only needed by the manual fallback path.
            lower_triangle = torch.tril(torch.ones(config.block_size, config.block_size))
            self.register_buffer('mask', lower_triangle.view(1, 1, config.block_size, config.block_size))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Attend over ``x`` of shape (B, T, C) and return a tensor of the same shape."""
        batch, seq_len, width = x.size()
        head_dim = width // self.n_head

        def to_heads(t: torch.Tensor) -> torch.Tensor:
            # (B, T, C) -> (B, n_head, T, head_dim)
            return t.view(batch, seq_len, self.n_head, head_dim).transpose(1, 2)

        q, k, v = (to_heads(part) for part in self.c_attn(x).split(self.n_embd, dim=2))

        if self.flash:
            y = F.scaled_dot_product_attention(q, k, v, attn_mask=None, dropout_p=0.0, is_causal=True)
        else:
            scores = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(head_dim))
            # Positions above the diagonal are future tokens; remove them before softmax.
            scores = scores.masked_fill(self.mask[:, :, :seq_len, :seq_len] == 0, float('-inf'))
            y = F.softmax(scores, dim=-1) @ v

        # (B, n_head, T, head_dim) -> (B, T, C)
        merged = y.transpose(1, 2).contiguous().view(batch, seq_len, width)
        return self.c_proj(merged)

In [4]:
class BayesMLP(nn.Module):
    """Feed-forward sub-layer built from two ``bnn.BayesLinear`` layers.

    Expands the width by 4x, applies tanh-approximated GELU, and projects back.
    """

    def __init__(self, config: GPT2Config):
        super().__init__()
        hidden_width = 4 * config.n_embd
        self.c_fc = bnn.BayesLinear(prior_mu=0, prior_sigma=0.005, in_features=config.n_embd, out_features=hidden_width)
        self.gelu = nn.GELU(approximate='tanh')
        self.c_proj = bnn.BayesLinear(prior_mu=0, prior_sigma=0.005, in_features=hidden_width, out_features=config.n_embd)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply expand -> GELU -> project to ``x``."""
        return self.c_proj(self.gelu(self.c_fc(x)))

In [5]:
class BayesBlock(nn.Module):
    """One pre-LayerNorm transformer block: attention then MLP, each with a residual connection."""

    def __init__(self, config: GPT2Config):
        super().__init__()
        self.ln_1 = nn.LayerNorm(config.n_embd)
        self.attn = BayesMultiHeadAttention(config)
        self.ln_2 = nn.LayerNorm(config.n_embd)
        self.mlp = BayesMLP(config)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return ``x`` after the attention and MLP residual updates."""
        attended = x + self.attn(self.ln_1(x))
        return attended + self.mlp(self.ln_2(attended))

In [6]:
class BayesGPT(nn.Module):
    """GPT-2 style language model whose MLP layers and output head are ``bnn.BayesLinear``.

    Submodule names mirror the GPT-2 checkpoint layout (``transformer.wte``,
    ``transformer.h.N.attn.c_attn`` ...) so pretrained weights can be copied
    in by name with ``load_pretrained_model``.
    """

    def __init__(self, config: GPT2Config):
        super(BayesGPT, self).__init__()
        self.config = config

        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(config.vocab_size, config.n_embd),
            wpe = nn.Embedding(config.block_size, config.n_embd),
            h = nn.ModuleList([BayesBlock(config) for _ in range(config.n_layer)]),
            ln_f = nn.LayerNorm(config.n_embd),
        ))
        self.lm_head = bnn.BayesLinear(prior_mu=0, prior_sigma=0.005, in_features=config.n_embd, out_features=config.vocab_size, bias=False)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Map token ids of shape (B, T) to next-token logits of shape (B, T, vocab_size).

        Raises:
            AssertionError: if T exceeds ``config.block_size``.
        """
        _, T = x.size()
        assert T <= self.config.block_size, f'Cannot forward sequence of length {T}, block size is only {self.config.block_size}'
        pos = torch.arange(0, T, dtype=torch.long, device=x.device)
        pos_emb = self.transformer.wpe(pos)
        tok_emb = self.transformer.wte(x)
        x = tok_emb + pos_emb  # (T, C) position embeddings broadcast over the batch
        for block in self.transformer.h:
            x = block(x)
        x = self.transformer.ln_f(x)
        logits = self.lm_head(x)
        return logits

    def configure_optimizers(self, weight_decay: float, learning_rate: float, device_type: str) -> optim.AdamW:
        """Build an AdamW optimizer with weight decay applied only to >=2-D parameters.

        Args:
            weight_decay: decay coefficient for matrices/embeddings; 1-D parameters
                (biases, LayerNorm gains) get 0.0.
            learning_rate: initial learning rate.
            device_type: 'cuda' enables the fused AdamW kernel when this torch build has one.
        """
        trainable = [p for _, p in self.named_parameters() if p.requires_grad]
        decay_params = [p for p in trainable if p.dim() >= 2]
        nodecay_params = [p for p in trainable if p.dim() < 2]
        optim_groups = [
            {'params': decay_params, 'weight_decay': weight_decay},
            {'params': nodecay_params, 'weight_decay': 0.0}
        ]
        fused_available = 'fused' in inspect.signature(optim.AdamW).parameters
        use_fused = fused_available and device_type == 'cuda'
        # Only pass `fused` when it is actually requested: older torch builds whose
        # AdamW has no such argument would otherwise raise TypeError.
        extra_args = {'fused': True} if use_fused else {}
        optimizer = optim.AdamW(optim_groups, lr=learning_rate, betas=(0.9, 0.95), eps=1e-8, **extra_args)
        return optimizer

    def load_pretrained_model(self, pretrained_model: nn.Module) -> None:
        """Copy weights from a pretrained GPT-2 (``GPT2LMHeadModel``) into this model in place.

        Bayesian layers receive the pretrained values as their ``*_mu`` (mean)
        tensors; their sigma tensors are left at initialisation. Attention and
        MLP projection weights are transposed on the way in, since the source
        stores them the other way round than the layers used here.

        Raises:
            KeyError: if the source contains a parameter with no counterpart here.
        """
        # Names whose pretrained layout is the transpose of ours.
        transposed_suffixes = (
            'attn.c_attn.weight', 'attn.c_proj.weight',
            'mlp.c_fc.weight', 'mlp.c_proj.weight',
        )
        # Names that map onto the mean tensor of a BayesLinear layer.
        bayes_mean_suffixes = (
            'mlp.c_fc.weight', 'mlp.c_fc.bias',
            'mlp.c_proj.weight', 'mlp.c_proj.bias',
        )
        # state_dict() tensors share storage with the live parameters, so one
        # snapshot is enough; rebuilding it for every entry is wasted work.
        own_state = self.state_dict()
        with torch.no_grad():
            for name, param in pretrained_model.state_dict().items():
                # Skip non-weight buffers. Some transformers versions persist the
                # causal mask as '<block>.attn.bias' / '.attn.masked_bias'; there is
                # no matching entry here, so copying it would raise KeyError.
                if 'logvar' in name or 'mask' in name or name.endswith('.attn.bias'):
                    continue
                if name == 'lm_head.weight':
                    target_name = 'lm_head.weight_mu'
                elif name.endswith(bayes_mean_suffixes):
                    target_name = name + '_mu'
                else:
                    target_name = name
                source = param.T if name.endswith(transposed_suffixes) else param
                own_state[target_name].copy_(source)

## Data Processing

In [7]:
tokenizer = tiktoken.get_encoding('gpt2')

# A RANK environment variable (expected to come from the distributed launcher)
# switches the notebook into DistributedDataParallel mode.
ddp = int(os.environ.get('RANK', -1)) != -1
if ddp:
    assert torch.cuda.is_available()
    init_process_group(backend='nccl')
    ddp_rank = int(os.environ['RANK'])
    ddp_local_rank = int(os.environ['LOCAL_RANK'])
    ddp_world_size = int(os.environ['WORLD_SIZE'])
    device = f'cuda:{ddp_local_rank}'
    torch.cuda.set_device(device)
    master_process = ddp_rank == 0  # rank 0 is the one meant to log / checkpoint
else:
    ddp_rank = 0
    ddp_local_rank = 0
    ddp_world_size = 1
    master_process = True
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print(f'Using device: {device}')

# device_type is the bare backend name ('cuda' / 'cpu') used by autocast and the
# optimizer; `device` keeps the per-rank index chosen above instead of being
# collapsed to a bare 'cuda'.
device_type = 'cuda' if device.startswith('cuda') else 'cpu'
device = torch.device(device)

Using device: cpu


In [8]:
def filter_short_texts(df: pd.DataFrame, tokenizer: tiktoken.Encoding, min_length: int) -> pd.DataFrame:
    """Return the rows of ``df`` whose 'text' encodes to at least ``min_length`` tokens.

    The input frame is left untouched: token counts are kept in a separate
    Series rather than written into ``df`` as a temporary column.

    Args:
        df: frame with a 'text' column of strings.
        tokenizer: object whose ``encode(text)`` returns a sequence of tokens.
        min_length: minimum token count (inclusive) for a row to be kept.

    Returns:
        A new DataFrame with the qualifying rows and their original index labels.
    """
    encoded_length = df['text'].apply(lambda text: len(tokenizer.encode(text)))
    return df[encoded_length >= min_length]

In [9]:
# Download the selected GPT-2 output-dataset splits into ./data (skipping files already present).
subdir = 'data'
os.makedirs(subdir, exist_ok=True)

file_options = [
    'webtext',
    'small-117M',  'small-117M-k40',
    'medium-345M', 'medium-345M-k40',
    'large-762M',  'large-762M-k40',
    'xl-1542M',    'xl-1542M-k40',
]

file_option = file_options[0]

for split in ['train', 'valid', 'test']:
    filename = file_option + '.' + split + '.jsonl'
    target_path = os.path.join(subdir, filename)
    if os.path.exists(target_path):
        continue

    # Stream into a '.part' file and rename on success, so an interrupted
    # download is never mistaken for a complete file on the next run.
    partial_path = target_path + '.part'
    with requests.get('https://openaipublic.azureedge.net/gpt-2/output-dataset/v1/' + filename, stream=True) as r:
        # Fail loudly on 4xx/5xx instead of saving the error page as the dataset.
        r.raise_for_status()
        content_length = r.headers.get('content-length')
        file_size = int(content_length) if content_length is not None else None
        chunk_size = 1000
        with open(partial_path, 'wb') as f:
            with tqdm(ncols=100, desc='Fetching ' + filename, total=file_size, unit_scale=True) as pbar:
                for chunk in r.iter_content(chunk_size=chunk_size):
                    f.write(chunk)
                    # The final chunk is usually shorter than chunk_size.
                    pbar.update(len(chunk))
    os.replace(partial_path, target_path)

In [10]:
class TextDataset(Dataset):
    """Next-token prediction dataset: one (input, target) pair per text.

    Each text is tokenized on access and truncated to ``block_size`` tokens;
    the input is that window without its last token and the target is the
    same window shifted left by one. ``texts`` is indexed with ``.iloc[idx]``
    and the result is passed straight to ``tokenizer.encode``.
    """

    def __init__(self, texts: pd.DataFrame, tokenizer: tiktoken.Encoding, block_size: int = 128):
        self.texts, self.tokenizer, self.block_size = texts, tokenizer, block_size

    def __len__(self) -> int:
        """Number of texts available."""
        return len(self.texts)

    def __getitem__(self, idx: int) -> tuple[torch.Tensor, torch.Tensor]:
        """Return (input_ids, target_ids) as long tensors for the text at position ``idx``."""
        window = self.tokenizer.encode(self.texts.iloc[idx])[:self.block_size]
        inputs = torch.tensor(window[:-1], dtype=torch.long)
        targets = torch.tensor(window[1:], dtype=torch.long)
        return inputs, targets

In [None]:
block_size = 128

train_path = f'{os.getcwd()}/{subdir}/{file_option}.train.jsonl'
try:
    train_df = pd.read_json(train_path, lines=True)
    # Keep only texts with at least block_size tokens, so every row of the df
    # is a full-length training example when passed into the TextDataset.
    train_df_filtered = filter_short_texts(train_df, tokenizer, block_size)
except FileNotFoundError:
    train_df_filtered = None
    # The message previously claimed the data was "found" on this failure path.
    print(f'Training data not found at {train_path}')

In [56]:
# T: token window handed to TextDataset as block_size; B: examples per DataLoader batch.
T = block_size
B = 64
# NOTE(review): train_df_filtered is None when the training file was not found
# in the previous cell, in which case the next line raises AttributeError.
train_dataset = TextDataset(train_df_filtered.text, tokenizer, block_size=T)
train_loader = DataLoader(train_dataset, batch_size=B, shuffle=True)

## Model Finetuning

In [39]:
use_large = False
use_medium = True

# Pick the pretrained checkpoint and the matching BayesGPT dimensions.
# use_large wins over use_medium; with both off the base 'gpt2' defaults apply.
if use_large:
    checkpoint_name, size_overrides = 'gpt2-large', dict(n_layer=36, n_head=20, n_embd=1280)
elif use_medium:
    checkpoint_name, size_overrides = 'gpt2-medium', dict(n_layer=24, n_head=16, n_embd=1024)
else:
    checkpoint_name, size_overrides = 'gpt2', dict()

pretrained_model = GPT2LMHeadModel.from_pretrained(checkpoint_name)
model = BayesGPT(GPT2Config(**size_overrides))

# Seed the Bayesian model with the pretrained weights.
model.load_pretrained_model(pretrained_model)

In [16]:
torch.set_float32_matmul_precision('high')

# Move the model before anything is built on top of it: DDP with device_ids and
# fused AdamW both require the parameters to already live on the target device.
# (Module.to is in place and idempotent, so a later model.to(device) is harmless.)
model.to(device)

if ddp:
    model = DDP(model, device_ids=[ddp_local_rank])
# The unwrapped module: used for the optimizer and for saving state dicts.
raw_model = model.module if ddp else model

weight_decay = 0.1
learning_rate = 6e-4

optimizer = raw_model.configure_optimizers(weight_decay, learning_rate, device_type)
ce_criterion = nn.CrossEntropyLoss()
# KL term over the Bayesian layers of the whole model (not just the last layer).
kl_criterion = bnn.BKLLoss(reduction='mean', last_layer_only=False)

In [17]:
def get_lr(it: int, max_lr: float, min_lr: float, warmup_steps: int, max_steps: int) -> float:
    """Learning rate for step ``it``: linear warmup followed by cosine decay.

    Args:
        it: zero-based step index.
        max_lr: peak learning rate, reached at the end of warmup.
        min_lr: floor returned once ``it`` reaches ``max_steps``.
        warmup_steps: number of linear warmup steps (0 disables warmup).
        max_steps: step at which the cosine decay bottoms out at ``min_lr``.

    Returns:
        ``max_lr * (it + 1) / warmup_steps`` during warmup, ``min_lr`` from
        ``max_steps`` onwards, and a cosine interpolation in between.
    """
    if it < warmup_steps:
        return max_lr * (it + 1) / warmup_steps
    # `>=` rather than `>`: at it == max_steps the cosine term is exactly zero
    # anyway, and this guard also avoids dividing by zero below when
    # max_steps == warmup_steps.
    if it >= max_steps:
        return min_lr
    decay_ratio = (it - warmup_steps) / (max_steps - warmup_steps)
    assert 0 <= decay_ratio <= 1
    coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio))  # goes 1 -> 0 over the decay
    return min_lr + coeff * (max_lr - min_lr)

In [None]:
# Training run configuration. The log file is opened in 'w' mode only to
# truncate it, so each run starts with an empty log.
log_dir = 'log'
os.makedirs(log_dir, exist_ok=True)
log_file = os.path.join(log_dir, 'log.txt')
with open(log_file, 'w') as f:
    pass

kl_weight = 0.01  # multiplier applied to the KL term in the training loop
max_lr = 6e-4
min_lr = max_lr * 0.1  # floor of the cosine schedule
warmup_steps = 0  # no linear warmup
max_steps = 1000  # number of optimizer steps

# grad_accum_steps = len(train_dataset) // (B * T * ddp_world_size)
grad_accum_steps = 8  # micro-batches accumulated per optimizer step

model.to(device)

In [49]:
# Fine-tuning loop: each step accumulates gradients over grad_accum_steps
# micro-batches of (cross-entropy + weighted KL) loss, clips, and updates.
for step in range(max_steps):
    t0 = time.time()
    last_step = (step == max_steps - 1)

    # Periodic checkpoint (taken before this step's update). Only the master
    # process writes, so DDP ranks do not race on the same file.
    if master_process and step > 0 and (step % 100 == 0 or last_step):
        checkpoint_path = os.path.join(log_dir, f'model_{step:05d}.pt')
        torch.save(raw_model.state_dict(), checkpoint_path)

    model.train()
    optimizer.zero_grad()
    ce_loss_accum = 0.0
    kl_loss_accum = 0.0
    loss_accum = 0.0
    grad_accum_step = 0
    # A fresh (shuffled) pass over the loader is started every step and cut
    # off after grad_accum_steps micro-batches.
    for x, y in train_loader:
        grad_accum_step += 1
        if grad_accum_step > grad_accum_steps:
            break
        x, y = x.to(device), y.to(device)
        if ddp:
            # grad_accum_step is 1-based here, so the final micro-batch is the one
            # equal to grad_accum_steps; gradients must be all-reduced on that one.
            model.require_backward_grad_sync = (grad_accum_step == grad_accum_steps)
        with torch.autocast(device_type=device_type, dtype=torch.bfloat16):
            logits = model(x)
            ce_loss = ce_criterion(logits.view(-1, logits.size(-1)), y.view(-1))
            kl_loss = kl_weight * kl_criterion(model)
        # Scale so the accumulated gradient is the mean over micro-batches.
        ce_loss /= grad_accum_steps
        kl_loss /= grad_accum_steps
        ce_loss_accum += ce_loss.detach()
        kl_loss_accum += kl_loss.detach()
        loss = ce_loss + kl_loss
        loss_accum += loss.detach()
        loss.backward()
    if ddp:
        # Average every reported quantity across ranks, not just the total.
        dist.all_reduce(ce_loss_accum, op=dist.ReduceOp.AVG)
        dist.all_reduce(kl_loss_accum, op=dist.ReduceOp.AVG)
        dist.all_reduce(loss_accum, op=dist.ReduceOp.AVG)
    norm = torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
    lr = get_lr(step, max_lr, min_lr, warmup_steps, max_steps)
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr
    optimizer.step()
    if device_type == 'cuda':
        # Wait for queued GPU work so the timing below is meaningful.
        torch.cuda.synchronize()
    t1 = time.time()
    dt = t1 - t0
    tokens_processed = B * T * grad_accum_steps * ddp_world_size
    tokens_per_sec = tokens_processed / dt
    if master_process:
        log_line = f'step {step:5d} | ce_loss: {ce_loss_accum.item():.6f} | kl_loss: {kl_loss_accum.item():.6f} | loss: {loss_accum.item():.6f} | lr {lr:.4e} | norm: {norm:.4f} | dt: {dt * 1000:.2f}ms | tok/sec: {tokens_per_sec:.2f}'
        print(log_line)
        with open(log_file, 'a') as f:
            # One record per line; without the newline all steps run together.
            f.write(log_line + '\n')

if ddp:
    destroy_process_group()

step     0 | loss: 6.051754 | lr 6.0000e-04 | norm: 0.5479 | dt: 2926.50ms | tok/sec: 2799.25
step     1 | loss: 6.042659 | lr 6.0000e-04 | norm: 0.5707 | dt: 2856.49ms | tok/sec: 2867.86
step     2 | loss: 5.818207 | lr 5.9999e-04 | norm: 0.5865 | dt: 2885.74ms | tok/sec: 2838.78
step     3 | loss: 5.938889 | lr 5.9999e-04 | norm: 0.6309 | dt: 2893.19ms | tok/sec: 2831.47
step     4 | loss: 5.859262 | lr 5.9998e-04 | norm: 0.6302 | dt: 2903.23ms | tok/sec: 2821.68
step     5 | loss: 6.100172 | lr 5.9997e-04 | norm: 0.6612 | dt: 2909.45ms | tok/sec: 2815.65
step     6 | loss: 6.063196 | lr 5.9995e-04 | norm: 0.7159 | dt: 2906.29ms | tok/sec: 2818.71
step     7 | loss: 5.983959 | lr 5.9993e-04 | norm: 0.6396 | dt: 2918.45ms | tok/sec: 2806.97
step     8 | loss: 5.976865 | lr 5.9991e-04 | norm: 0.6391 | dt: 2922.14ms | tok/sec: 2803.43
step     9 | loss: 5.862965 | lr 5.9989e-04 | norm: 0.6391 | dt: 2939.90ms | tok/sec: 2786.49
step    10 | loss: 5.841560 | lr 5.9987e-04 | norm: 0.5843 |

## Model Inference

In [11]:
def generate_text(model: nn.Module, tokenizer: tiktoken.Encoding, prompt: str, max_length: int = 30, temperature: float = 0.0, device: str = 'cpu') -> str:
    """Continue ``prompt`` token by token and return only the newly generated text.

    Works both with models that return a logits tensor directly and with models
    that return an output object exposing ``.logits``.

    Args:
        model: the language model; it is switched to eval mode and moved to ``device``.
        tokenizer: provides ``encode`` / ``decode``.
        prompt: text to condition on.
        max_length: number of tokens to generate. The context is not cropped, so
            prompt length plus ``max_length`` has to fit the model's context window.
        temperature: 0 selects the arg-max token at every step; values > 0 sample
            from the softmax of ``logits / temperature``.
        device: device used for the forward passes.

    Returns:
        The decoded continuation with newlines replaced by spaces and outer
        whitespace stripped.

    Raises:
        AssertionError: if ``temperature`` is negative.
    """
    assert temperature >= 0.0, 'Temperature must be non-negative'
    model.eval()
    model.to(device)
    tokens = torch.tensor(tokenizer.encode(prompt), dtype=torch.long).unsqueeze(0).to(device)
    prompt_length = tokens.size(1)

    # A single no_grad context for the whole loop instead of re-entering it per token.
    with torch.no_grad():
        for _ in range(max_length):
            output = model(tokens)
            # isinstance (not an exact type comparison) so Tensor subclasses are
            # still treated as raw logits rather than as an output object.
            logits = output if isinstance(output, torch.Tensor) else output.logits
            logits = logits[:, -1, :]  # only the last position predicts the next token

            if temperature > 0.0:
                probabilities = torch.softmax(logits / temperature, dim=-1)
                next_token = torch.multinomial(probabilities, num_samples=1)
            else:
                next_token = logits.argmax(dim=-1, keepdim=True)

            tokens = torch.cat((tokens, next_token), dim=1)

    decoded = tokenizer.decode(tokens[0, prompt_length:].tolist())
    return decoded.replace('\n', ' ').strip()


def get_first_sentence(text: str) -> str:
    """Return ``text`` up to and including its first '.', or all of it if there is none."""
    # partition yields (text, '', '') when no '.' is present, so head + dot is
    # then simply the whole input.
    head, dot, _ = text.partition('.')
    return head + dot

In [14]:
def load_model_from_path(path: str, device: str) -> nn.Module:
    """Build a BayesGPT sized for the checkpoint's directory and load its weights.

    The model size is inferred from where the file lives under the current
    working directory: ``models/gpt2/`` uses the default config and
    ``models/gpt2-medium/`` the 24-layer / 16-head / 1024-wide one.

    Args:
        path: checkpoint path; must start with one of the two directories above.
        device: passed to ``torch.load`` as ``map_location``.

    Raises:
        AssertionError: if ``path`` is under neither directory.
    """
    models_root = f'{os.getcwd()}/models'
    # (directory prefix, GPT2Config overrides), checked in this order.
    variants = (
        (f'{models_root}/gpt2/', {}),
        (f'{models_root}/gpt2-medium/', {'n_layer': 24, 'n_head': 16, 'n_embd': 1024}),
    )
    assert any(path.startswith(prefix) for prefix, _ in variants), f'No models found in {path}'
    for prefix, overrides in variants:
        if path.startswith(prefix):
            model = BayesGPT(GPT2Config(**overrides))
            # NOTE(review): torch.load unpickles the file; only use trusted checkpoints.
            model.load_state_dict(torch.load(path, map_location=device))
            return model
    return None

In [15]:
# Inference setup: choose the device, then load the saved BayesGPT checkpoint
# stored under models/gpt2-medium/ (which selects the medium-sized config).
device = 'cuda' if torch.cuda.is_available() else 'cpu'

tokenizer = tiktoken.get_encoding('gpt2')
model = load_model_from_path(f'{os.getcwd()}/models/gpt2-medium/model-v1.pt', device)

In [16]:
# Pretrained GPT-2 baselines (base and large) used for the side-by-side comparisons below.
pretrained_model = GPT2LMHeadModel.from_pretrained('gpt2')
pretrained_model_large = GPT2LMHeadModel.from_pretrained('gpt2-large')

In [17]:
# Ask the same question with and without the supporting sentence and print the
# first sentence each model produces (greedy decoding, 20 new tokens).
prompt_with_context = 'X is the sixth studio album by American singer Chris Brown. It was released on September 16, 2014. When was X released?'
prompt_without_context = 'When was X released?'

assistant_models = (model, pretrained_model, pretrained_model_large)
assistant_labels = ('> Bayes Assistant:         ', '> GPT2 Assistant:          ', '> GPT2-Large Assistant:    ')

for round_index, question in enumerate((prompt_with_context, prompt_without_context)):
    if round_index > 0:
        print(f'----------------------------------------------------------------------------------')
    replies = [generate_text(candidate, tokenizer, question, max_length=20, temperature=0, device=device) for candidate in assistant_models]
    generated_text_bayes, generated_text_pretrained, generated_text_pretrained_large = replies
    print(f'> User:                    {question}')
    for label, reply in zip(assistant_labels, replies):
        print(f'{label}{get_first_sentence(reply)}')

> User:                    X is the sixth studio album by American singer Chris Brown. It was released on September 16, 2014. When was X released?
> Bayes Assistant:         When was X released?  Released: September 16, 2014  Released: September
> GPT2 Assistant:          Chris Brown: I was born on September 16, 1979 in New York City.
> GPT2-Large Assistant:    Chris Brown's X was released on September 16, 2014.
----------------------------------------------------------------------------------
> User:                    When was X released?
> Bayes Assistant:         X was released on March 17, 2012.
> GPT2 Assistant:          X was released on May 1st, 2013.
> GPT2-Large Assistant:    X was released on the 1st of January, 2013.


In [18]:
# Ask the same question with and without the supporting sentence and print the
# first sentence each model produces (greedy decoding, 20 new tokens).
prompt_with_context = 'Damon was born in Atlanta, GA on July 17, 2002. Where was Damon born?'
prompt_without_context = 'Where was Damon born?'

assistant_models = (model, pretrained_model, pretrained_model_large)
assistant_labels = ('> Bayes Assistant:         ', '> GPT2 Assistant:          ', '> GPT2-Large Assistant:    ')

for round_index, question in enumerate((prompt_with_context, prompt_without_context)):
    if round_index > 0:
        print(f'----------------------------------------------------------------------------------')
    replies = [generate_text(candidate, tokenizer, question, max_length=20, temperature=0, device=device) for candidate in assistant_models]
    generated_text_bayes, generated_text_pretrained, generated_text_pretrained_large = replies
    print(f'> User:                    {question}')
    for label, reply in zip(assistant_labels, replies):
        print(f'{label}{get_first_sentence(reply)}')

> User:                    Damon was born in Atlanta, GA on July 17, 2002. Where was Damon born?
> Bayes Assistant:         Damon is from Atlanta, GA.
> GPT2 Assistant:          Damon was born in Atlanta, GA on July 17, 2002.
> GPT2-Large Assistant:    Damon was born in Atlanta, GA.
----------------------------------------------------------------------------------
> User:                    Where was Damon born?
> Bayes Assistant:         Damon was born in London, England, on January 6, 1987.
> GPT2 Assistant:          Damon was born in the United States on January 1, 1973.
> GPT2-Large Assistant:    He was born in the United States.


In [19]:
# Ask the same question with and without the supporting sentence and print the
# first sentence each model produces (greedy decoding, 20 new tokens).
prompt_with_context = 'Lebron James was born in Akron, Ohio, to Gloria Marie James, who was only 16 at the time of his birth. Where was Lebron James born?'
prompt_without_context = 'Where was Lebron James born?'

assistant_models = (model, pretrained_model, pretrained_model_large)
assistant_labels = ('> Bayes Assistant:         ', '> GPT2 Assistant:          ', '> GPT2-Large Assistant:    ')

for round_index, question in enumerate((prompt_with_context, prompt_without_context)):
    if round_index > 0:
        print(f'----------------------------------------------------------------------------------')
    replies = [generate_text(candidate, tokenizer, question, max_length=20, temperature=0, device=device) for candidate in assistant_models]
    generated_text_bayes, generated_text_pretrained, generated_text_pretrained_large = replies
    print(f'> User:                    {question}')
    for label, reply in zip(assistant_labels, replies):
        print(f'{label}{get_first_sentence(reply)}')

> User:                    Lebron James was born in Akron, Ohio, to Gloria Marie James, who was only 16 at the time of his birth. Where was Lebron James born?
> Bayes Assistant:         Lebron James was born on August 4, 1988, in Akron, Ohio.
> GPT2 Assistant:          Lebron James was born in Akron, Ohio, to Gloria Marie James, who was
> GPT2-Large Assistant:    Lebron James was born in Akron, Ohio, to Gloria Marie James, who was
----------------------------------------------------------------------------------
> User:                    Where was Lebron James born?
> Bayes Assistant:         Lebron James was born in Cleveland, Ohio on February 6, 1992.
> GPT2 Assistant:          Lebron James was born in 1829 in the town of Lebron, England
> GPT2-Large Assistant:    Lebron James was born in Akron, Ohio on June 12, 1984.


In [110]:
# Ask the same question with and without the supporting sentence and print the
# first sentence each model produces (greedy decoding, 20 new tokens).
prompt_with_context = 'Michael Jordan was a basketball player on the Chicago Bulls. Which sport did Michael Jordan play?'
prompt_without_context = 'Which sport did Michael Jordan play?'

assistant_models = (model, pretrained_model, pretrained_model_large)
assistant_labels = ('> Bayes Assistant:         ', '> GPT2 Assistant:          ', '> GPT2-Large Assistant:    ')

for round_index, question in enumerate((prompt_with_context, prompt_without_context)):
    if round_index > 0:
        print(f'----------------------------------------------------------------------------------')
    replies = [generate_text(candidate, tokenizer, question, max_length=20, temperature=0, device=device) for candidate in assistant_models]
    generated_text_bayes, generated_text_pretrained, generated_text_pretrained_large = replies
    print(f'> User:                    {question}')
    for label, reply in zip(assistant_labels, replies):
        print(f'{label}{get_first_sentence(reply)}')

> User:                    Michael Jordan was a basketball player on the Chicago Bulls. Which sport did Michael Jordan play?
> Bayes Assistant:         Basketball.
> GPT2 Assistant:          Basketball.
> GPT2-Large Assistant:    Michael Jordan played basketball.
----------------------------------------------------------------------------------
> User:                    Which sport did Michael Jordan play?
> Bayes Assistant:         The answer is yes.
> GPT2 Assistant:          I don't know.
> GPT2-Large Assistant:    Michael Jordan played basketball.


In [20]:
# Counterfactual context: check whether each model answers from the prompt
# rather than from what it memorised (greedy decoding, 20 new tokens).
prompt = 'Travis Scott was the US President in 2030. Who was the US President in 2030?'

assistant_models = (model, pretrained_model, pretrained_model_large)
assistant_labels = ('> Bayes Assistant:         ', '> GPT2 Assistant:          ', '> GPT2-Large Assistant:    ')

replies = [generate_text(candidate, tokenizer, prompt, max_length=20, temperature=0, device=device) for candidate in assistant_models]
generated_text_bayes, generated_text_pretrained, generated_text_pretrained_large = replies

print(f'> User:                    {prompt}')
for label, reply in zip(assistant_labels, replies):
    print(f'{label}{get_first_sentence(reply)}')

> User:                    Travis Scott was the US President in 2030. Who was the US President in 2030?
> Bayes Assistant:         The US President in 2030 was a man named Travis Scott.
> GPT2 Assistant:          The US President in 2030 was Barack Obama.
> GPT2-Large Assistant:    The answer is: Travis Scott.


In [22]:
# Benchmark 1: each line of the file is a complete prompt. Every prompt is
# answered 25 times by the Bayesian model and all answers are written out.
with open(f'{os.getcwd()}/benchmarks/questions_answers1.txt', 'r') as f:
    prompts = [line.strip() for line in f]
# Blank lines (e.g. a trailing newline) would become empty prompts, which
# encode to zero tokens and make generation fail; drop them.
prompts = [prompt for prompt in prompts if prompt]

# The output directory is not created anywhere else in the notebook.
os.makedirs(f'{os.getcwd()}/outputs', exist_ok=True)
with open(f'{os.getcwd()}/outputs/questions_answers1_response.txt', 'w') as f:
    for prompt in tqdm(prompts):
        f.write(f'> User:                    {prompt}\n')
        for _ in range(25):
            generated_text_bayes = generate_text(model, tokenizer, prompt, max_length=25, temperature=0, device=device)
            f.write(f'> Bayes Assistant:         {generated_text_bayes}\n')
        f.write(f'----------------------------------------------------------------------------------\n')

  0%|          | 0/50 [00:06<?, ?it/s]


KeyboardInterrupt: 

In [28]:
# Benchmark 2 (with context): line i of answers2.txt is prepended to line i of
# questions2.txt. Every prompt is answered 25 times by the Bayesian model.
with open(f'{os.getcwd()}/benchmarks/answers2.txt', 'r') as f:
    answers = [line.strip() for line in f]

with open(f'{os.getcwd()}/benchmarks/questions2.txt', 'r') as f:
    questions = [line.strip() for line in f]

# Pair by line number first (so alignment is preserved), then drop pairs in
# which both lines are blank, e.g. trailing empty lines.
prompts = [answer + ' ' + question for answer, question in zip(answers, questions) if answer or question]

# The output directory is not created anywhere else in the notebook.
os.makedirs(f'{os.getcwd()}/outputs', exist_ok=True)
with open(f'{os.getcwd()}/outputs/questions_answers2_response.txt', 'w') as f:
    for prompt in tqdm(prompts):
        f.write(f'> User:                    {prompt}\n')
        for _ in range(25):
            generated_text_bayes = generate_text(model, tokenizer, prompt, max_length=25, temperature=0, device=device)
            f.write(f'> Bayes Assistant:         {generated_text_bayes}\n')
        f.write(f'----------------------------------------------------------------------------------\n')

  0%|          | 0/50 [00:02<?, ?it/s]


KeyboardInterrupt: 

In [30]:
# Benchmark 2 (no context): the questions alone, without the answer sentences.
# Every prompt is answered 25 times by the Bayesian model.
with open(f'{os.getcwd()}/benchmarks/questions2.txt', 'r') as f:
    prompts = [line.strip() for line in f]
# Blank lines (e.g. a trailing newline) would become empty prompts, which
# encode to zero tokens and make generation fail; drop them.
prompts = [prompt for prompt in prompts if prompt]

# The output directory is not created anywhere else in the notebook.
os.makedirs(f'{os.getcwd()}/outputs', exist_ok=True)
with open(f'{os.getcwd()}/outputs/questions_no_answers2_response.txt', 'w') as f:
    for prompt in tqdm(prompts):
        f.write(f'> User:                    {prompt}\n')
        for _ in range(25):
            generated_text_bayes = generate_text(model, tokenizer, prompt, max_length=25, temperature=0, device=device)
            f.write(f'> Bayes Assistant:         {generated_text_bayes}\n')
        f.write(f'----------------------------------------------------------------------------------\n')

  0%|          | 0/50 [00:02<?, ?it/s]


KeyboardInterrupt: 

## Visualizations