In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math

class CausalSelfAttention(nn.Module):

    def __init__(self, config):
        super().__init__()
        assert config['hidden_size'] % config['num_attention_heads'] == 0
        # key, query, value projections for all heads, but in a batch
        self.c_attn = nn.Linear(config['hidden_size'], 3 * config['hidden_size'])
        # output projection
        self.c_proj = nn.Linear(config['hidden_size'], config['hidden_size'])
        self.c_proj.NANGPT_SCALE_INIT = 1
        # regularization
        self.num_attention_heads = config['num_attention_heads']
        self.hidden_size = config['hidden_size']
        self.register_buffer("bias", torch.tril(torch.ones(config['max_position_embeddings'], config['max_position_embeddings'])).view(1, 1, config['max_position_embeddings'], config['max_position_embeddings']))

    def forward(self, x):
        B, T, C = x.size() # batch size, sequence length, embedding dimensionality (hidden_size)
        # calculate query, key, values for all heads in batch and move head forward to be the batch dim
        # nh is "number of heads", hs is "head size", and C (number of channels) = nh * hs
        # e.g. in GPT-2 (124M), num_attention_heads=12, hs=64, so nh*hs=C=768 channels in the Transformer
        qkv = self.c_attn(x)
        q, k, v = qkv.split(self.hidden_size, dim=2)
        k = k.view(B, T, self.num_attention_heads, C // self.num_attention_heads).transpose(1, 2) # (B, nh, T, hs)
        q = q.view(B, T, self.num_attention_heads, C // self.num_attention_heads).transpose(1, 2) # (B, nh, T, hs)
        v = v.view(B, T, self.num_attention_heads, C // self.num_attention_heads).transpose(1, 2) # (B, nh, T, hs)

        att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
        att = att.masked_fill(self.bias[:, :, :T, :T] == 0, float('-inf'))
        att = F.softmax(att, dim=-1)
        y = att @ v # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)

        y = y.transpose(1, 2).contiguous().view(B, T, C) # re-assemble all head outputs side by side
        # output projection
        y = self.c_proj(y)
        return y

class TransformerModel(nn.Module):
    def __init__(self, config):
        super(TransformerModel, self).__init__()
        
        # Load model configuration from YAML
        self.hidden_size = config['hidden_size']
        self.vocab_size = config['vocab_size']
        self.num_hidden_layers = config['num_hidden_layers']
        self.num_attention_heads = config['num_attention_heads']
        self.intermediate_size = config['intermediate_size']
        self.max_position_embeddings = config['max_position_embeddings']
        self.rms_norm_eps = config['rms_norm_eps']
        self.tie_word_embeddings = config['tie_word_embeddings']

        # Embedding layers
        self.token_embeddings = nn.Embedding(self.vocab_size, self.hidden_size)
        self.position_embeddings = nn.Embedding(self.max_position_embeddings, self.hidden_size)

        # Transformer layers
        self.layers = nn.ModuleList([
            TransformerLayer(config,self.hidden_size, self.num_attention_heads, self.intermediate_size, self.rms_norm_eps)
            for _ in range(self.num_hidden_layers)
        ])

        # Output layer
        self.lm_head = nn.Linear(self.hidden_size, self.vocab_size)
        if self.tie_word_embeddings:
            self.lm_head.weight = self.token_embeddings.weight

    def forward(self, input_ids):
        # Input embeddings
        seq_length = input_ids.size(1)
        position_ids = torch.arange(0, seq_length, dtype=torch.long, device=input_ids.device)
        position_ids = position_ids.unsqueeze(0).expand_as(input_ids)

        x = self.token_embeddings(input_ids) + self.position_embeddings(position_ids)

        # Pass through Transformer layers
        for layer in self.layers:
            x = layer(x)

        # Final linear layer
        logits = self.lm_head(x)
        return logits

class TransformerLayer(nn.Module):
    def __init__(self, config,hidden_size, num_attention_heads, intermediate_size, rms_norm_eps):
        super(TransformerLayer, self).__init__()
        
        self.hidden_size = hidden_size
        self.num_attention_heads = num_attention_heads
        self.intermediate_size = intermediate_size
        self.head_dim = hidden_size // num_attention_heads

        assert hidden_size % num_attention_heads == 0, "hidden_size must be divisible by num_attention_heads"

        # Self-attention
        self.self_attn = CausalSelfAttention(config)
        self.ln_1 = nn.LayerNorm(hidden_size)

        # Feedforward layers
        self.linear1 = nn.Linear(hidden_size, intermediate_size)
        self.activation = F.silu  # Activation function (SiLU)
        self.linear2 = nn.Linear(intermediate_size, hidden_size)
        self.linear2.NANOGPT_SCALE_INIT = 1

        # Normalization
        self.norm1 = nn.LayerNorm(hidden_size, eps=rms_norm_eps)

    def forward(self, x):
        # Self-attention block
        attn_output = self.self_attn(self.ln_1(x))
        x = x + attn_output  # Residual connection
        x = self.norm1(x)

        # Feedforward block
        ff_output = self.linear2(self.activation(self.linear1(x)))
        x = x + ff_output  # Residual connection

        return x

# Load model configuration from YAML-like dictionary (parsed from YAML)
config = {
    "hidden_size": 576,
    "vocab_size": 49152,
    "num_hidden_layers": 30,
    "num_attention_heads": 9,
    "intermediate_size": 1536,
    "max_position_embeddings": 2048,
    "rms_norm_eps": 1e-5,
    "tie_word_embeddings": True
}


In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
#from model import TransformerModel, config
from tokenizers import Tokenizer
from tokenizers.models import WordPiece
from tokenizers.trainers import WordPieceTrainer
from tokenizers.pre_tokenizers import Whitespace
import os
from torchsummary import summary
from tqdm import tqdm
import time

# Function to train a new tokenizer
def train_tokenizer(data_files, vocab_size, save_path):
    """
    Args:
        data_files: List of file paths containing text data.
        vocab_size: Desired vocabulary size.
        save_path: Path to save the trained tokenizer.
    """
    tokenizer = Tokenizer(WordPiece())
    tokenizer.pre_tokenizer = Whitespace()

    trainer = WordPieceTrainer(
    vocab_size=vocab_size,
    special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"],
    min_frequency=1  # Lower threshold to include more tokens
    )

    tokenizer.train(data_files, trainer)

    if not os.path.exists(save_path):
        os.makedirs(save_path)

    tokenizer.save(os.path.join(save_path, "tokenizer.json"))
    return tokenizer

# Function to save a checkpoint
def save_checkpoint(model, optimizer, scheduler, current_batch, checkpoint_path):
    checkpoint = {
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'scheduler_state_dict': scheduler.state_dict(),
        'current_batch': current_batch
    }
    torch.save(checkpoint, checkpoint_path)
    #print(f"Checkpoint saved at batch {current_batch} to {checkpoint_path}")

# Function to load a checkpoint
def load_checkpoint(checkpoint_path, model, optimizer, scheduler):
    if os.path.exists(checkpoint_path):
        checkpoint = torch.load(checkpoint_path, map_location=torch.device(device))
        model.load_state_dict(checkpoint['model_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
        current_batch = checkpoint['current_batch']
        print(f"Checkpoint loaded from {checkpoint_path} at batch {current_batch}")
        return current_batch
    else:
        print(f"No checkpoint found at {checkpoint_path}, starting fresh training.")
        return 0

class DataLoaderLite:
    def __init__(self, tokenizer, data_files, B, T):
        self.B = B
        self.T = T

        self.tokenizer = tokenizer

        # Load and tokenize data
        self.data = []
        for file_path in data_files:
            with open(file_path, 'r', encoding='utf-8') as f:
                text = f.read()
                tokens = tokenizer.encode(text).ids
                self.data.extend(tokens)

        print(f'loaded {len(self.data)} tokens')
        print(f'1 epoch = {len(self.data) // (B * T)} batches')

        # state
        self.current_position = 0
    
    def next_batch(self):
        B, T = self.B, self.T
        buf = self.data[self.current_position: self.current_position + B * T + 1]
        
        # Convert list to tensor
        buf = torch.tensor(buf, dtype=torch.long)

        x = buf[:-1].view(B, T)  # inputs
        y = buf[1:].view(B, T)   # targets
        
        # advance the position in the tensor
        self.current_position += B * T

        # if loading the next batch would be out of bounds, reset
        if self.current_position + (B * T + 1) > len(self.data):
            self.current_position = 0

        return x, y

device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'
elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
    device = "mps"
print(f"using device: {device}")

# SEED
torch.manual_seed(1337)
if torch.cuda.is_available():
    torch.cuda.manual_seed(1337)

data_files = ["/kaggle/input/input1-txt/input.txt"]  # Replace with your text file paths
vocab_size = 49152
tokenizer_save_path = "/kaggle/working/"

# Train a new tokenizer if not already trained
if not os.path.exists(os.path.join(tokenizer_save_path, "tokenizer.json")):
    tokenizer = train_tokenizer(data_files, vocab_size, tokenizer_save_path)
else:
    tokenizer = Tokenizer.from_file(os.path.join(tokenizer_save_path, "tokenizer.json"))

batch_size = 16
tokens_per_batch = 256

train_loader = DataLoaderLite(tokenizer,data_files,B = batch_size, T = tokens_per_batch)

# Function to generate predictions
def generate_predictions(model, tokenizer, text, max_tokens=100, top_k=50):
    model.eval()
    with torch.no_grad():
        input_ids = tokenizer.encode(text).ids
        input_ids = torch.tensor(input_ids, dtype=torch.long, device=device).unsqueeze(0)  # Add batch dimension

        generated_ids = input_ids
        for _ in range(max_tokens):
            outputs = model(generated_ids)
            next_token_logits = outputs[:, -1, :]  # Get logits for the last token
            probs = torch.nn.functional.softmax(next_token_logits, dim=-1)

            if top_k is not None:
                # Top-k sampling
                top_k_probs, top_k_indices = torch.topk(probs, top_k, dim=-1)
                next_token = top_k_indices.gather(
                    dim=-1, index=torch.multinomial(top_k_probs, num_samples=1)
                )
            else:
                # Greedy decoding
                next_token = torch.argmax(probs, dim=-1, keepdim=True)

            # Ensure next_token has the correct shape for concatenation
            next_token = next_token.view(1, 1)  # (batch_size=1, sequence_length=1)

            # Concatenate next_token with generated_ids
            generated_ids = torch.cat([generated_ids, next_token], dim=1)

            # Stop if end-of-sequence token is generated
            if next_token.item() == tokenizer.token_to_id("[SEP]"):
                break

        generated_text = tokenizer.decode(generated_ids.squeeze().tolist())
    model.train()
    return generated_text



config["vocab_size"] = len(tokenizer.get_vocab())
print(f"New Vocab Size is: {config['vocab_size']}")
# Initialize model, optimizer, and scheduler
model = TransformerModel(config)
model.to(device)

def initialize_weights(m):
    if isinstance(m, torch.nn.Linear) or isinstance(m, torch.nn.Embedding):
        torch.nn.init.xavier_uniform_(m.weight)

model.apply(initialize_weights)

# Print the model architecture
print(model)

# Calculate and print the total number of trainable parameters
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Total trainable parameters: {trainable_params}")

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4, weight_decay=0.01)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.65)

# Define checkpoint path
checkpoint_path = "/kaggle/working/checkpoint.pth"

# Load checkpoint if exists
start_batch = load_checkpoint(checkpoint_path, model, optimizer, scheduler)

# Fixed text for predictions
fixed_text = "He that will give good words to thee will flatter"

# Training loop with tqdm progress bar
total_batches = 5000
checkpoint_interval = len(train_loader.data) // (batch_size * tokens_per_batch)

for i in range(start_batch, total_batches, checkpoint_interval):
    loss_list = []
    start_time = time.time()
    with tqdm(total=checkpoint_interval, desc=f"Training Batches {i}-{i+checkpoint_interval-1}", unit="batch") as pbar:
        for batch_idx in range(checkpoint_interval):
            global_batch = i + batch_idx
            if global_batch >= total_batches:
                break

            x, y = train_loader.next_batch()
            x, y = x.to(device), y.to(device)
            pred = model(x)

            # Reshape predictions and targets for loss computation
            pred = pred.view(-1, pred.size(-1))  # Flatten predictions
            y = y.view(-1)  # Flatten targets

            # Compute loss
            loss = criterion(pred, y)

            # Backward pass and optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Update progress bar with current loss
            pbar.set_postfix(loss=loss.item())
            pbar.update(1)
            loss_list.append(loss.item())

            # Generate text and save checkpoint at the last batch of the interval
            if (batch_idx + 1) == checkpoint_interval or global_batch == total_batches - 1:
                generated_text = generate_predictions(model, tokenizer, fixed_text, max_tokens=100)
                print(f"\nGenerated text at batch {global_batch}: {generated_text}")
                save_checkpoint(model, optimizer, scheduler, global_batch, checkpoint_path)

    # Scheduler step
    scheduler.step()
    elapsed_time = time.time() - start_time
    epoch_loss = sum(loss_list) / len(loss_list)
    print(f"Loss: {epoch_loss:.4f}, Time: {elapsed_time:.2f}s")

using device: cuda
loaded 261973 tokens
1 epoch = 63 batches
New Vocab Size is: 19248
TransformerModel(
  (token_embeddings): Embedding(19248, 576)
  (position_embeddings): Embedding(2048, 576)
  (layers): ModuleList(
    (0-29): 30 x TransformerLayer(
      (self_attn): CausalSelfAttention(
        (c_attn): Linear(in_features=576, out_features=1728, bias=True)
        (c_proj): Linear(in_features=576, out_features=576, bias=True)
      )
      (ln_1): LayerNorm((576,), eps=1e-05, elementwise_affine=True)
      (linear1): Linear(in_features=576, out_features=1536, bias=True)
      (linear2): Linear(in_features=1536, out_features=576, bias=True)
      (norm1): LayerNorm((576,), eps=1e-05, elementwise_affine=True)
    )
  )
  (lm_head): Linear(in_features=576, out_features=19248, bias=True)
)
Total trainable parameters: 105384624
No checkpoint found at /kaggle/working/checkpoint.pth, starting fresh training.


Training Batches 0-62: 100%|██████████| 63/63 [00:33<00:00,  1.87batch/s, loss=6.87]


Generated text at batch 62: He that will give good words to thee will flatter : , me ? with . what , , he , she to my he is ' And that good will my , ! know - the me : you as , , , ' , not I for his have not as my : , , the And ' I be , : : to have , the , is , to ? as the for , , is , what , , ! I . ' I , by . . is : you , ? ? the , And of have . : to and by ,


Training Batches 0-62: 100%|██████████| 63/63 [00:37<00:00,  1.67batch/s, loss=6.87]


Loss: 7.2082, Time: 37.68s


Training Batches 63-125: 100%|██████████| 63/63 [00:33<00:00,  1.85batch/s, loss=6.81]


Generated text at batch 125: He that will give good words to thee will flatter me a your , : I , have have so , I a , a by have not : And ; my is , : him you ! is : ; I ' be him to with he I , . , ? a , , but ' , ' and , : she - I she , this . I . , me you my not , , of and , are To : him with ; what is , she . is in good , be sir my to : , , to that ' , . :


Training Batches 63-125: 100%|██████████| 63/63 [00:39<00:00,  1.59batch/s, loss=6.81]


Loss: 6.5409, Time: 39.62s


Training Batches 126-188: 100%|██████████| 63/63 [00:33<00:00,  1.85batch/s, loss=6.8] 


Generated text at batch 188: He that will give good words to thee will flatter you not of . with you have no . And not : , it not s no the , have the you ; ; your d be will , ' his him the , as , it of she : , but to so , ? not I his , I ; a I of him ? , ? , you . . the , ' , it ; , : she have as of ? I I , not , ? , his , , me : , ' : : , I good ! : , he not


Training Batches 126-188: 100%|██████████| 63/63 [00:39<00:00,  1.59batch/s, loss=6.8]


Loss: 6.5201, Time: 39.62s


Training Batches 189-251: 100%|██████████| 63/63 [00:33<00:00,  1.85batch/s, loss=6.78]


Generated text at batch 251: He that will give good words to thee will flatter a this I , ; , do I : : . his and it you , . as , is you I for is : ; a a : , : it , : you , me him you , for , : a no , , a , And her . , his him s ' your in I , . for , . the is her to sir of your my to of . the , , , my ' , : not a for : to ' of ? he be a ? , , and ,


Training Batches 189-251: 100%|██████████| 63/63 [00:39<00:00,  1.59batch/s, loss=6.78]


Loss: 6.5118, Time: 39.72s


Training Batches 252-314: 100%|██████████| 63/63 [00:33<00:00,  1.85batch/s, loss=6.76]


Generated text at batch 314: He that will give good words to thee will flatter : it , is : . I the but . , ; by And so to I your him I and . to it I , me my my , . ? ' , her . your of , I will so , ; the : I , with so you , a to not to : : , , , , a . , you you . have - . do what , . s of , not ' this , ; is you , ' ' so , my : . : . : a the it my


Training Batches 252-314: 100%|██████████| 63/63 [00:39<00:00,  1.59batch/s, loss=6.76]


Loss: 6.5066, Time: 39.74s


Training Batches 315-377: 100%|██████████| 63/63 [00:33<00:00,  1.85batch/s, loss=6.73]


Generated text at batch 377: He that will give good words to thee will flatter it his of as , with is the to , : I . a with , , , you : be ? the him me : : , to And , is the , , s be is me thou ' my ? , so be ; ! : no , I that him I , you d I me , his the s thy d your : ? , is to , for you that it be to , that a , ; ! ll to me , for ? s . : but , I not . :


Training Batches 315-377: 100%|██████████| 63/63 [00:39<00:00,  1.58batch/s, loss=6.73]


Loss: 6.4874, Time: 39.77s


Training Batches 378-440: 100%|██████████| 63/63 [00:34<00:00,  1.85batch/s, loss=6.73]


Generated text at batch 440: He that will give good words to thee will flatter To : , ; , s , : your he I not your my me , the . : ; ! it and your I . to , , ' ' : , I , , not , ; in , it he this , ; do by : , : . ' you my : thou ! And , is . a , shall for to and , this it , : : ' to you for and , , I her the , , , , to : , of . . of . , ' your by


Training Batches 378-440: 100%|██████████| 63/63 [00:39<00:00,  1.58batch/s, loss=6.73]


Loss: 6.4802, Time: 39.81s


Training Batches 441-503: 100%|██████████| 63/63 [00:33<00:00,  1.85batch/s, loss=6.72]


Generated text at batch 503: He that will give good words to thee will flatter d of ll that you and To ! so , ; . of for it : and , thy , a my , to , is him to s d . him with : me not do you of you . and as this for ! he : you , you , I : to to for ; : so thou : for , , you - of . it thou to her , you your s - , . , have : : to : , . , , in ! the to is the you you a :


Training Batches 441-503: 100%|██████████| 63/63 [00:39<00:00,  1.59batch/s, loss=6.72]


Loss: 6.4786, Time: 39.72s


Training Batches 504-566: 100%|██████████| 63/63 [00:34<00:00,  1.85batch/s, loss=6.72]


Generated text at batch 566: He that will give good words to thee will flatter : , , , by , : and of And your shall . I s , this if ! ll I to : that d ' : : I be to with To I . a d me ; . . ' And me : his I your do , , as : : , for ? I ' , . in a it this , thy : , you be : , , him ' , s : , is , ! it have a the , : for . with . the your and ' I - it


Training Batches 504-566: 100%|██████████| 63/63 [00:39<00:00,  1.58batch/s, loss=6.72]


Loss: 6.4775, Time: 39.79s


Training Batches 567-629: 100%|██████████| 63/63 [00:34<00:00,  1.85batch/s, loss=6.72]


Generated text at batch 629: He that will give good words to thee will flatter ' as , be no it . , ! , I . , . : with be . that the is ! , this ! - you this not by , , with , and thou not : ! , I . that and him , , not ' I ll : not you : , . not , a of it of to : thou if , : thy : shall the it you , me . ' is , if you : . And his d it And for . I me , me . To I .


Training Batches 567-629: 100%|██████████| 63/63 [00:39<00:00,  1.58batch/s, loss=6.72]


Loss: 6.4766, Time: 39.77s


Training Batches 630-692: 100%|██████████| 63/63 [00:34<00:00,  1.85batch/s, loss=6.69]


Generated text at batch 692: He that will give good words to thee will flatter the ; , the have of with him and me And with s all it be in : ' : I : ' so I with thou my for . : : the ; ? have him you . my be me it : with , d And his : his : : as with : thee : you of , , not all you s s I in his my : but his And , ' and of I ' : : be ; I ! . , not , ' ' be you this and To be the


Training Batches 630-692: 100%|██████████| 63/63 [00:39<00:00,  1.58batch/s, loss=6.69]


Loss: 6.4660, Time: 39.83s


Training Batches 693-755: 100%|██████████| 63/63 [00:34<00:00,  1.85batch/s, loss=6.69]


Generated text at batch 755: He that will give good words to thee will flatter , that a s : with to ! is her to , s ; d : him you it you for . , : the that of ! : : by of . be have my , of of of . ? and . : , . . a ' And : the that my me : as ' : . . , the by : as that : me ; but . to . The ! to him . ! but ; the no you it you . I d not : . have you as you d ,


Training Batches 693-755: 100%|██████████| 63/63 [00:39<00:00,  1.60batch/s, loss=6.69]


Loss: 6.4638, Time: 39.35s


Training Batches 756-818: 100%|██████████| 63/63 [00:34<00:00,  1.85batch/s, loss=6.69]


Generated text at batch 818: He that will give good words to thee will flatter . : , have . , , shall , a not , thou : , ? me , thee thou not ! s a to we : , will , and , do d to for to And is to . I ; of And d , , not , : a , , I we . and , , : my do me . And you , have as . ; so . this , shall my for have of , the you , I . , . not , the not and , : not have : :


Training Batches 756-818: 100%|██████████| 63/63 [00:39<00:00,  1.61batch/s, loss=6.69]


Loss: 6.4635, Time: 39.21s


Training Batches 819-881: 100%|██████████| 63/63 [00:34<00:00,  1.85batch/s, loss=6.69]


Generated text at batch 881: He that will give good words to thee will flatter , , And this ' And for , not do : by ; her for ' ? have , ' s a and we for as s : a s , the with the : I in will him ; s ? , ' ; . it - ! my ' his and , the : a , - and s ; : , , ? , And . and that thy not . be my be The . the not to to not all , be , , , To ' . all to in to for , you


Training Batches 819-881: 100%|██████████| 63/63 [00:39<00:00,  1.61batch/s, loss=6.69]


Loss: 6.4633, Time: 39.22s


Training Batches 882-944: 100%|██████████| 63/63 [00:34<00:00,  1.85batch/s, loss=6.69]


Generated text at batch 944: He that will give good words to thee will flatter s thy , : and have ' have of The , ' ! the you . as that be , you of ? do that of , of with , : the - ' ? you to . and ' : be . To you in the him and the with , have . and : in be , . you as that : d to . a , do . that thou to s ' a not thee we your : , ' , ; of ? , ? : the thy : and , the , my be


Training Batches 882-944: 100%|██████████| 63/63 [00:39<00:00,  1.61batch/s, loss=6.69]


Loss: 6.4631, Time: 39.26s


Training Batches 945-1007: 100%|██████████| 63/63 [00:34<00:00,  1.85batch/s, loss=6.69]


Generated text at batch 1007: He that will give good words to thee will flatter that this , but ; . the ' : ; I the : the , d ! in do the : s . , . it you , the so of : , , ' my I the I of his , is of the ; thee - you ' . : : of , he the I The , but your so and as : ! thou : you ' I so , : but you I ! no of ' is and of . ; . ' is the I : to . his be we . And


Training Batches 945-1007: 100%|██████████| 63/63 [00:39<00:00,  1.61batch/s, loss=6.69]


Loss: 6.4544, Time: 39.26s


Training Batches 1008-1070: 100%|██████████| 63/63 [00:34<00:00,  1.85batch/s, loss=6.69]


Generated text at batch 1070: He that will give good words to thee will flatter do to so as to . ; your ' to . my ? and to to , , , to , me be thy ; her I me , as . I ! not ' , ' . , to ' me : , him his have : , thy ' . I your in of , s and , , ' : . by not he I the , I The ' I . ? . ' . I that to is as of a and but , for it it , all s : with not d .


Training Batches 1008-1070: 100%|██████████| 63/63 [00:39<00:00,  1.60batch/s, loss=6.69]


Loss: 6.4537, Time: 39.30s


Training Batches 1071-1133: 100%|██████████| 63/63 [00:34<00:00,  1.85batch/s, loss=6.69]


Generated text at batch 1133: He that will give good words to thee will flatter a , , the him this . to you my shall , my the , her I for his a , to , to , of a with my ? - and I have he this in ' that the I , . him that is a me do , his I , And is s that a , that ' ! is it , : , ! your you so with and : the . shall to your ? me : : ' : all And to ' that , , by , the . of : To him


Training Batches 1071-1133: 100%|██████████| 63/63 [00:39<00:00,  1.61batch/s, loss=6.69]


Loss: 6.4535, Time: 39.23s


Training Batches 1134-1196: 100%|██████████| 63/63 [00:34<00:00,  1.85batch/s, loss=6.69]


Generated text at batch 1196: He that will give good words to thee will flatter ? of ' for ' you ; that not - : this as , , we , thy : be , s the , : his and in : so ; ' s with but no his d , ' thou thou . me it my by in to so . ' , in in , he , me you do , ! , is , ! the to : d : thee I , ? be , with be for : this , ; , . shall is , : as ! not , : of , : ;


Training Batches 1134-1196: 100%|██████████| 63/63 [00:39<00:00,  1.61batch/s, loss=6.69]


Loss: 6.4534, Time: 39.22s


Training Batches 1197-1259: 100%|██████████| 63/63 [00:34<00:00,  1.85batch/s, loss=6.69]


Generated text at batch 1259: He that will give good words to thee will flatter is : ' ' thou of , . , this he ' with : ' I , : . - : I no with his , : , will , me the . the have I not a him of the ; - to . : . a . be , of to ' I , , , , this , : as , have and To for . shall . ' your d to d that you but ' And be ; a him I : my : as I not that that , your thy to , ,


Training Batches 1197-1259: 100%|██████████| 63/63 [00:39<00:00,  1.60batch/s, loss=6.69]


Loss: 6.4533, Time: 39.28s


Training Batches 1260-1322: 100%|██████████| 63/63 [00:34<00:00,  1.85batch/s, loss=6.68]


Generated text at batch 1322: He that will give good words to thee will flatter and . be And , - ! my you , his as ' thee you and as he To to ! thy you my , his a : thou me s to s ? : I I his And thee : ; , d The I of ; , but ; : , me your he ' with ' to my are : the a ' And for have him ' : . for him , to to to to : and , the to the ; : you : ' to , , , the her , , And


Training Batches 1260-1322: 100%|██████████| 63/63 [00:39<00:00,  1.61batch/s, loss=6.68]


Loss: 6.4469, Time: 39.25s


Training Batches 1323-1385: 100%|██████████| 63/63 [00:34<00:00,  1.85batch/s, loss=6.68]


Generated text at batch 1385: He that will give good words to thee will flatter of ! for : ' ' I ; thee I you , : , with , : me it the and . the : : we , me . ' in ? I we the , of that : : the ' : and will not the to ' this a ' not , , a the the : . so this her is I be , we thee , : : that it : s ! , . : all d all I you , you me , of And ' , d and it all that I you


Training Batches 1323-1385: 100%|██████████| 63/63 [00:39<00:00,  1.61batch/s, loss=6.68]


Loss: 6.4465, Time: 39.18s


Training Batches 1386-1448: 100%|██████████| 63/63 [00:34<00:00,  1.85batch/s, loss=6.68]


Generated text at batch 1448: He that will give good words to thee will flatter I your , ? he of , , of the : ' this to the my a that ' and ? ; the will . ' your ' I s but as are I d with and for is to in my to , be ' . to ; , I , is To , the , : to for by , , is as ? ; me ' , ; . : the you , . . ' ' I and , ? ! you that to I ' : that I , - ; of , of .


Training Batches 1386-1448: 100%|██████████| 63/63 [00:39<00:00,  1.61batch/s, loss=6.68]


Loss: 6.4465, Time: 39.17s


Training Batches 1449-1511: 100%|██████████| 63/63 [00:34<00:00,  1.85batch/s, loss=6.68]


Generated text at batch 1511: He that will give good words to thee will flatter with ' his him ! : . it of and I ' the And , you : , in to ' , to not to I ' a you of and , is , in : ; this , he - this : to . . but the : will : , ; ? , , shall the , , . me thy : : the , be ' . so this thou And - - And I , ' not this I : , ? not d but ' : : is . , - do I by ,


Training Batches 1449-1511: 100%|██████████| 63/63 [00:39<00:00,  1.60batch/s, loss=6.68]


Loss: 6.4465, Time: 39.27s


Training Batches 1512-1574: 100%|██████████| 63/63 [00:34<00:00,  1.85batch/s, loss=6.68]


Generated text at batch 1574: He that will give good words to thee will flatter the is this To a . . : ? and to : . all : of you have that the is , , the to the be in ! . to , my to to I I thou for ? I , : ' . ? ' be I are you I , , ' I is ' be . - and ? , , and . - d ' I and ! this this do your you . have : : s is be to to : with , to he of her , ? I : as his


Training Batches 1512-1574: 100%|██████████| 63/63 [00:39<00:00,  1.60batch/s, loss=6.68]


Loss: 6.4465, Time: 39.33s


Training Batches 1575-1637: 100%|██████████| 63/63 [00:34<00:00,  1.85batch/s, loss=6.67]


Generated text at batch 1637: He that will give good words to thee will flatter so the is , : ; , ! , you . the his to shall it I , ; in . To . in by ' and , ! me I And . not , , that . to with I ' thee as ' and and , ; d that do me thee ' by it of : : the not that : , , , , ; is have : a . . ' her not is : And : - that , ; and : of ; ! a : To , ? have . , '


Training Batches 1575-1637: 100%|██████████| 63/63 [00:39<00:00,  1.61batch/s, loss=6.67]


Loss: 6.4425, Time: 39.22s


Training Batches 1638-1700: 100%|██████████| 63/63 [00:34<00:00,  1.85batch/s, loss=6.67]


Generated text at batch 1700: He that will give good words to thee will flatter , in : is : And and to ; ' . my and . ! the To I a , d ' , a . d he for with ' to of by , this , I it : , is : that , for my I ; the ; are the as , , . thy , her ' of me , a this with it thee my as , be your with to . ' ' . . to I I the of to I And in . me you : , - your , . be .


Training Batches 1638-1700: 100%|██████████| 63/63 [00:39<00:00,  1.60batch/s, loss=6.67]


Loss: 6.4424, Time: 39.26s


Training Batches 1701-1763: 100%|██████████| 63/63 [00:34<00:00,  1.85batch/s, loss=6.67]


Generated text at batch 1763: He that will give good words to thee will flatter - , : the so my my the . the in and , I I a of ; my thy , I be with all shall ' have The of to . , by the , shall , , ' s ' and the s , in to as her my : so as ' will we thou ; ' , And be and . . : with this for not : . for : , , . ; : ' , of to ; I ! . have not not you your with you s it , : .


Training Batches 1701-1763: 100%|██████████| 63/63 [00:39<00:00,  1.61batch/s, loss=6.67]


Loss: 6.4423, Time: 39.22s


Training Batches 1764-1826: 100%|██████████| 63/63 [00:34<00:00,  1.85batch/s, loss=6.67]


Generated text at batch 1826: He that will give good words to thee will flatter I you I but , you ' . in but to , thou : the , : but shall that ' you of you the of in and - , . ' by , my and : this I of ' my this for , is it you my The for : ! , , as will ! in I for a in : . do and , is , , is thee not - of that . the do in , , , and . you a , , I d the - I , , , : the


Training Batches 1764-1826: 100%|██████████| 63/63 [00:39<00:00,  1.60batch/s, loss=6.67]


Loss: 6.4423, Time: 39.32s


Training Batches 1827-1889: 100%|██████████| 63/63 [00:34<00:00,  1.85batch/s, loss=6.67]


Generated text at batch 1889: He that will give good words to thee will flatter , his . , my , : his your I . have ! my him . , of I my ; your s I is I d - , to : : ! To will me thy . the : , is my . , your of to you with ; shall I : my : : I not as her have my to not thou , : , not , : : . her my ' : . to of my . for so ? s of the , , - for that it ; that a . '


Training Batches 1827-1889: 100%|██████████| 63/63 [00:39<00:00,  1.60batch/s, loss=6.67]


Loss: 6.4423, Time: 39.26s


Training Batches 1890-1952: 100%|██████████| 63/63 [00:34<00:00,  1.85batch/s, loss=6.67]


Generated text at batch 1952: He that will give good words to thee will flatter : , ' . and thou . ' , ' ' so : s , my he not and ' my , . : thy in And , , ? , of , , you : the To , ! a the ; but , that the his . this s that , is ' ' to , of , with , me and I thee that a his him And to the , thou - . : , have it , , : with he I : with of is thee d to to not him so : d


Training Batches 1890-1952: 100%|██████████| 63/63 [00:39<00:00,  1.61batch/s, loss=6.67]


Loss: 6.4398, Time: 39.24s


Training Batches 1953-2015: 100%|██████████| 63/63 [00:34<00:00,  1.85batch/s, loss=6.67]


Generated text at batch 2015: He that will give good words to thee will flatter ; by The ' a he , , a s . your . the with ' : in be to the thou the : . the but , ? ' the To s . I a ; , not : , d but as a it I : him : ! , the : and me her The : by , : The ! will him : by ! and is ' and s : , ; . do your s : , ? and I a : he ' and ? ' ' , to And he . ,


Training Batches 1953-2015: 100%|██████████| 63/63 [00:39<00:00,  1.61batch/s, loss=6.67]


Loss: 6.4397, Time: 39.24s


Training Batches 2016-2078: 100%|██████████| 63/63 [00:34<00:00,  1.85batch/s, loss=6.67]


Generated text at batch 2078: He that will give good words to thee will flatter , s , : in : , and me ' ? To the , , And and him as , is your are do of will of . in you s d I his all , , I you to : . have the my you have . ? , ? his by have to her . : her with , I I his s , And the your , his is and ; ' will , it to ' ; . that : . not . this for , him of , the the in with to , he


Training Batches 2016-2078: 100%|██████████| 63/63 [00:39<00:00,  1.61batch/s, loss=6.67]


Loss: 6.4397, Time: 39.22s


Training Batches 2079-2141: 100%|██████████| 63/63 [00:34<00:00,  1.85batch/s, loss=6.67]


Generated text at batch 2141: He that will give good words to thee will flatter not , : and And ' with that : : but , . ' , , with I , the and in you the , , is my , we : : , : , with And I to of thou do to : of and your is be : the to . . And me this him his ! of thy . I , And are the thee his , s of to her will him , s s To ' is ; I and : the is but , ? . ' , . To I me thy


Training Batches 2079-2141: 100%|██████████| 63/63 [00:39<00:00,  1.61batch/s, loss=6.67]


Loss: 6.4396, Time: 39.23s


Training Batches 2142-2204: 100%|██████████| 63/63 [00:34<00:00,  1.85batch/s, loss=6.67]


Generated text at batch 2204: He that will give good words to thee will flatter and ; be , he s you : ; and shall , my . , so , ' s that . ' s ' for I that you ? have this but but , ' we . not : . , by in of to the with : will of it : thy and my the and this as : he . d . ; all his that ' thou to : but ; this And all me he : ' . as ' , ? be s , , not thou ' to do not . The ? shall


Training Batches 2142-2204: 100%|██████████| 63/63 [00:39<00:00,  1.61batch/s, loss=6.67]


Loss: 6.4396, Time: 39.20s


Training Batches 2205-2267: 100%|██████████| 63/63 [00:34<00:00,  1.85batch/s, loss=6.67]


Generated text at batch 2267: He that will give good words to thee will flatter my her do ; s in , ' , to : to ? to your but ' it your and ' thy , . the ; but shall . ! ' by ' , The the . , the : , , : the you not her ' of , my not . ; me be all and thy ; in , . - ; , a be ' a my , , , ' s - his . , , of and have ' thou : of my . not a with of I with , s and is


Training Batches 2205-2267: 100%|██████████| 63/63 [00:39<00:00,  1.60batch/s, loss=6.67]


Loss: 6.4379, Time: 39.27s


Training Batches 2268-2330: 100%|██████████| 63/63 [00:34<00:00,  1.85batch/s, loss=6.67]


Generated text at batch 2330: He that will give good words to thee will flatter your with : you to I the ' her all : a it And , . The ' . , , I : , my , as to : . not And , ' : , and his d it To of ' he thy this s with . s , : : . him d so . this ' and are will of ; : the : to , : my And ! a ' to the and he the , a the ' , the and in , . will be : my it I and a ,


Training Batches 2268-2330: 100%|██████████| 63/63 [00:39<00:00,  1.61batch/s, loss=6.67]


Loss: 6.4378, Time: 39.21s


Training Batches 2331-2393: 100%|██████████| 63/63 [00:34<00:00,  1.85batch/s, loss=6.67]


Generated text at batch 2393: He that will give good words to thee will flatter thou for not my that him ! not it ; the he ! he of : , my , shall ; it ' in . , of , , ' to . of : of ? : thou your of ! but . I ; : . his he ? : : . . you ? : : . ' : . : are ' in ' the this , And , . And ' , , of and to s I a and : - , s , ? for ' . I my , and I I my


Training Batches 2331-2393: 100%|██████████| 63/63 [00:39<00:00,  1.61batch/s, loss=6.67]


Loss: 6.4378, Time: 39.25s


Training Batches 2394-2456: 100%|██████████| 63/63 [00:34<00:00,  1.85batch/s, loss=6.67]


Generated text at batch 2456: He that will give good words to thee will flatter : his . I I ? to : , , this . a , I ; , , to in ' I ? . that the and me the , of The ; . for , the to : a : my that have to ; d we a : ? . : of the ; shall in the as , , , be . that ' : ' I I , , to in . , to the , this you . . , of you me with be . with , all be in : the s you


Training Batches 2394-2456: 100%|██████████| 63/63 [00:39<00:00,  1.61batch/s, loss=6.67]


Loss: 6.4378, Time: 39.22s


Training Batches 2457-2519: 100%|██████████| 63/63 [00:34<00:00,  1.85batch/s, loss=6.67]


Generated text at batch 2519: He that will give good words to thee will flatter ; , : I for s , thee I a with To , , thou the to this , and ! this his I to and are to is the d , and thou in my . have me for that and the d shall me shall . , so . ? . : and ' . a . : he of the . : to you , a : by have , And as I : To d his as the : To : ; we your and ' him . , thou your in that . thy ,


Training Batches 2457-2519: 100%|██████████| 63/63 [00:39<00:00,  1.61batch/s, loss=6.67]


Loss: 6.4378, Time: 39.21s


Training Batches 2520-2582: 100%|██████████| 63/63 [00:34<00:00,  1.85batch/s, loss=6.67]


Generated text at batch 2582: He that will give good words to thee will flatter you , : he the , ' is . to for have and the be to to , that , so , a ' I : ' : ! I the but all not your to that , the your of shall his him , : ' all by your to . the , me : ' , , the ! , ' . d will , The ? ? And , , . you I to I . will : ! the I have : ' that in to ! his . , and ' and s , the


Training Batches 2520-2582: 100%|██████████| 63/63 [00:39<00:00,  1.61batch/s, loss=6.67]


Loss: 6.4366, Time: 39.19s


Training Batches 2583-2645: 100%|██████████| 63/63 [00:34<00:00,  1.86batch/s, loss=6.67]


Generated text at batch 2645: He that will give good words to thee will flatter be to your as and s is as , ' and thee : my are thy The : : your have : my ! me , in I ' And ' , me ! , is ; : , ; ! the . thy I we , him of . : is I the that the ' , ; I ? in To is your ' it , ; you s will : the , , that , ! : that and of the will have , , , is his ; , to I . thee and to have


Training Batches 2583-2645: 100%|██████████| 63/63 [00:39<00:00,  1.60batch/s, loss=6.67]


Loss: 6.4365, Time: 39.30s


Training Batches 2646-2708: 100%|██████████| 63/63 [00:34<00:00,  1.85batch/s, loss=6.67]


Generated text at batch 2708: He that will give good words to thee will flatter be ' you the but you me have your , you To thee ; me : s , I , you ; me is a is by and you , , , I of for my are : with ? . the be , ! my I , . you to ' me , the d a , . that , - you The : ' : as I ' : . : The have I And To shall are . : . of ' , ? . the your And , thou , , and And And my a


Training Batches 2646-2708: 100%|██████████| 63/63 [00:39<00:00,  1.61batch/s, loss=6.67]


Loss: 6.4365, Time: 39.19s


Training Batches 2709-2771: 100%|██████████| 63/63 [00:34<00:00,  1.85batch/s, loss=6.67]


Generated text at batch 2771: He that will give good words to thee will flatter in a to his in , not I the thou it , the this ? I To a , ' ' ' . , : , ' in s of will , me , s of and , ! . that I him to a it , that ; the the this I . The that but your . . ' , of I . a the her this and ' , as to for , in ! the is a is , , , ' to , ; ; ' : s I And , To so you ,


Training Batches 2709-2771: 100%|██████████| 63/63 [00:39<00:00,  1.61batch/s, loss=6.67]


Loss: 6.4365, Time: 39.15s


Training Batches 2772-2834: 100%|██████████| 63/63 [00:34<00:00,  1.85batch/s, loss=6.67]


Generated text at batch 2834: He that will give good words to thee will flatter . with . as thou ' ; me I , , be . him , s do : that ' for . have , : : he : . do , , be , ; , ; , , of , it to I the we I , And , ' the him him s : with : , not : ; , my thee him ' , ? : for : ; - : , : , I thy ; your my your , ' and . , this this I and so the will the of to :


Training Batches 2772-2834: 100%|██████████| 63/63 [00:39<00:00,  1.61batch/s, loss=6.67]


Loss: 6.4365, Time: 39.19s


Training Batches 2835-2897:  65%|██████▌   | 41/63 [00:22<00:11,  1.85batch/s, loss=6.48]

In [7]:
import os
import shutil

# Path to the working folder
working_folder = "/kaggle/working"

# Delete all files and subfolders in the working folder
for filename in os.listdir(working_folder):
    file_path = os.path.join(working_folder, filename)
    try:
        if os.path.isfile(file_path) or os.path.islink(file_path):
            os.unlink(file_path)  # Remove file
        elif os.path.isdir(file_path):
            shutil.rmtree(file_path)  # Remove folder
    except Exception as e:
        print(f"Failed to delete {file_path}. Reason: {e}")

print("All files and folders in /kaggle/working have been deleted.")

All files and folders in /kaggle/working have been deleted.
