<a href="https://colab.research.google.com/github/csr117/BTE-320/blob/main/TrainingLLMCase1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import json

def filter_json_to_txt(input_json_file='Case1.json',
                       output_txt_file='Case1.txt',
                       keys_to_keep=('casebody', 'opinions', 'text')):
    """Copy selected top-level entries of a JSON file into a plain-text file.

    For every name in ``keys_to_keep`` that exists at the top level of the
    parsed JSON document, a section of the form ``"<Key>:\\n<value>\\n\\n"`` is
    written, where ``<Key>`` is the capitalised key and ``<value>`` is
    ``str()`` of the stored value. Missing keys are skipped silently.

    Only the top level is inspected (``key in data``); a key that lives inside
    a nested object is not found on its own and only appears as part of the
    string form of its parent value.

    Args:
        input_json_file: Path of the JSON file to read.
        output_txt_file: Path of the text file to (over)write.
        keys_to_keep: Keys to export, in output order.

    Returns:
        None. A confirmation line naming the output file is printed.
    """
    # Explicit UTF-8 so the result does not depend on the platform's default
    # encoding.
    with open(input_json_file, 'r', encoding='utf-8') as infile:
        data = json.load(infile)

    filtered_text = "".join(
        f"{key.capitalize()}:\n{data[key]}\n\n"
        for key in keys_to_keep
        if key in data
    )

    with open(output_txt_file, 'w', encoding='utf-8') as outfile:
        outfile.write(filtered_text)

    print(f"Filtered data saved to {output_txt_file}")

# Run the extraction: reads Case1.json and writes Case1.txt (both relative paths).
filter_json_to_txt()

Filtered data saved to Case1.txt


In [None]:
import re
!pip install tiktoken
import tiktoken

# GPT-2 byte-pair encoder used to turn the cleaned text into token ids.
tokenizer = tiktoken.get_encoding("gpt2")


# Explicit UTF-8 so reading does not depend on the platform default encoding.
with open("Case1.txt", "r", encoding="utf-8") as f:
  raw_text = f.read()

# Flatten to a single line: non-breaking spaces, real newlines and literal
# backslash-n sequences all become plain spaces.
raw_text = raw_text.replace(u'\xa0', u' ').replace('\n', ' ').replace('\\n', ' ')

# Lower-case and keep only letters, whitespace, '.' and ','. The punctuation
# has to survive this step, otherwise the sentence split and the [,.] split
# below can never match anything.
cleaned_text = re.sub(r'[^a-z\s.,]', '', raw_text.lower())

# Collapse whitespace AFTER stripping characters: removing digits and symbols
# leaves runs of spaces that would otherwise be tokenised as separate tokens.
cleaned_text = ' '.join(cleaned_text.split())

# One sentence per paragraph.
formatted_text = cleaned_text.replace('. ', '.\n\n')

# Word/punctuation-level split, with whitespace-only pieces dropped.
split_text = re.split(r'([,.]|\s)', formatted_text)
split_text = [item for item in split_text if item.strip()]

enc_text = tokenizer.encode(formatted_text, allowed_special={"<|endoftext|>"})
print(len(enc_text))
print(max(enc_text))

Collecting tiktoken
  Downloading tiktoken-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading tiktoken-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m14.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tiktoken
Successfully installed tiktoken-0.9.0
4075
48287


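In the next cell, we define a sliding-window dataset (`GPTDatasetV1`) that pairs each chunk of token IDs with the same chunk shifted by one position, and a helper (`create_dataloader_v1`) that wraps it in a PyTorch `DataLoader`.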


In [None]:
import torch
from torch.utils.data import Dataset, DataLoader

class GPTDatasetV1(Dataset):
  """Sliding-window next-token dataset built from a single text.

  The text is encoded once with ``tokenizer.encode``. A window of
  ``max_length`` ids is taken every ``stride`` positions; each sample pairs
  the window with the same window shifted right by one id. Window starts run
  over ``range(0, len(token_ids) - max_length, stride)``, so a text with no
  more than ``max_length`` ids yields an empty dataset.
  """

  def __init__(self, txt, tokenizer, max_length, stride):
    encoded = tokenizer.encode(txt)
    window_starts = range(0, len(encoded) - max_length, stride)

    # Model inputs: ids [s, s + max_length)
    self.input_ids = [
        torch.tensor(encoded[s:s + max_length]) for s in window_starts
    ]
    # Prediction targets: the same window moved one id to the right.
    self.target_ids = [
        torch.tensor(encoded[s + 1:s + max_length + 1]) for s in window_starts
    ]

  def __len__(self):
    """Number of (input, target) windows."""
    return len(self.input_ids)

  def __getitem__(self, idx):
    """Return the ``(input_ids, target_ids)`` tensor pair at ``idx``."""
    return self.input_ids[idx], self.target_ids[idx]

def create_dataloader_v1(txt, batch_size=4, max_length=256, stride=128, shuffle=True, drop_last=True,num_workers=0):
  """Build a ``DataLoader`` of sliding-window samples from ``txt``.

  The text is tokenised with tiktoken's "gpt2" encoding and windowed by
  ``GPTDatasetV1(txt, tokenizer, max_length, stride)``. The remaining
  arguments are forwarded unchanged to ``torch.utils.data.DataLoader``.
  """
  gpt2_encoding = tiktoken.get_encoding("gpt2")
  windows = GPTDatasetV1(txt, gpt2_encoding, max_length, stride)
  return DataLoader(
    windows,
    batch_size=batch_size,
    shuffle=shuffle,
    drop_last=drop_last,
    num_workers=num_workers,
  )

In [None]:
# The embedding table needs one row per token id the tokenizer can produce.
# The GPT-2 BPE vocabulary has 50257 entries, and the tokenisation cell's
# recorded output shows a maximum id of 48287, so a 47636-row table raises an
# index-out-of-range error as soon as a batch contains one of the higher ids.
vocab_size = 50257
output_dim = 256
token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)


# Non-overlapping windows: stride equals the window length.
max_length = 128
dataloader = create_dataloader_v1(
  formatted_text, batch_size=16, max_length=max_length,
  stride=max_length, shuffle=False
)
data_iter = iter(dataloader)
inputs, targets = next(data_iter)
token_embeddings = token_embedding_layer(inputs)

# One learned vector per position in the window; it broadcasts over the batch
# dimension when added to the token embeddings.
context_length = max_length
pos_embedding_layer = torch.nn.Embedding(context_length, output_dim)
pos_embeddings = pos_embedding_layer(torch.arange(context_length))
input_embeddings = token_embeddings + pos_embeddings


print("Token IDs:\n", inputs)
print("\nInputs shape:\n", inputs.shape)
print("\nToken embeddings shape:\n",token_embeddings.shape)
print("\nPos embeddings shape:\n",pos_embeddings.shape)
print("\nInput embeddings shape:\n", input_embeddings.shape)

Token IDs:
 tensor([[ 7442,  2618, 10266,  ...,   262,  1176,   550],
        [  587,   379,   439,  ...,   262,  1295,   286],
        [  290, 31601,   329,  ...,   285,   658,   287],
        ...,
        [ 1230,   287,   781,  ...,  8681,  6325,   220],
        [  743,   220,   262,  ...,   262, 16503,  2585],
        [  286,   262,  5640,  ...,   286,   262, 16503]])

Inputs shape:
 torch.Size([16, 128])

Token embeddings shape:
 torch.Size([16, 128, 256])

Pos embeddings shape:
 torch.Size([128, 256])

Input embeddings shape:
 torch.Size([16, 128, 256])


In [None]:
import torch.nn as nn

class MultiHeadAttention(nn.Module):
    """Causal multi-head self-attention.

    The input is projected to queries, keys and values of width ``d_out``,
    split into ``num_heads`` heads of ``d_out // num_heads`` features each,
    attended with an upper-triangular mask (a position cannot see later
    positions), merged back to ``d_out`` and passed through an output
    projection.

    Args:
        d_in: Feature size of the input's last dimension.
        d_out: Total output feature size; must be divisible by ``num_heads``.
        context_length: Side length of the stored causal mask.
        dropout: Dropout probability applied to the attention weights.
        num_heads: Number of attention heads.
        qkv_bias: Whether the query/key/value projections use a bias term.
    """

    def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=False):
        super().__init__()
        assert d_out % num_heads == 0, "d_out must be divisible by num_heads"

        self.d_in = d_in
        self.d_out = d_out
        self.num_heads = num_heads
        # Per-head width, chosen so the concatenated heads are d_out wide.
        self.head_dim = d_out // num_heads

        self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)
        # Mixes the concatenated head outputs.
        self.out_proj = nn.Linear(d_out, d_out)
        self.dropout = nn.Dropout(dropout)

        # Ones strictly above the diagonal mark the "future" positions.
        future_positions = torch.triu(
            torch.ones(context_length, context_length), diagonal=1
        )
        self.register_buffer("mask", future_positions)

    def forward(self, x):
        """Map ``x`` of shape (b, num_tokens, d_in) to (b, num_tokens, d_out)."""
        batch, seq_len, _ = x.shape

        def split_heads(projected):
            # (batch, seq_len, d_out) -> (batch, num_heads, seq_len, head_dim)
            per_head = projected.view(batch, seq_len, self.num_heads, self.head_dim)
            return per_head.transpose(1, 2)

        k = split_heads(self.W_key(x))
        q = split_heads(self.W_query(x))
        v = split_heads(self.W_value(x))

        # Dot product of every query with every key, per head.
        scores = q @ k.transpose(2, 3)

        # Hide future positions; the mask is cropped to the actual length.
        causal_mask = self.mask.bool()[:seq_len, :seq_len]
        scores.masked_fill_(causal_mask, -torch.inf)

        scale = k.shape[-1] ** 0.5
        weights = self.dropout(torch.softmax(scores / scale, dim=-1))

        # (batch, heads, seq_len, head_dim) -> (batch, seq_len, d_out)
        merged = (weights @ v).transpose(1, 2)
        merged = merged.contiguous().view(batch, seq_len, self.d_out)

        return self.out_proj(merged)


# Attention output width; it must be divisible by the 16 heads used below.
d_out = 128
# Input feature size; it has to equal the last dimension of token_embeddings.
d_in = 256
# Mask size taken from the sequence length of the batch created earlier.
context_length = token_embeddings.shape[1]
mha = MultiHeadAttention(d_in, d_out, context_length, dropout=0.01, num_heads=16)
context_vecs = mha(token_embeddings)
print(context_vecs)
print("context_vecs.shape:", context_vecs.shape)

tensor([[[-0.0596,  0.1777,  0.4631,  ..., -0.1256,  0.4974,  0.2660],
         [ 0.3156,  0.0715,  0.2052,  ..., -0.2403,  0.4659,  0.0227],
         [ 0.3292, -0.0441,  0.1039,  ..., -0.1359,  0.3005,  0.0284],
         ...,
         [-0.1569,  0.0323,  0.0059,  ..., -0.2372,  0.1098, -0.0150],
         [-0.1659,  0.0246, -0.0187,  ..., -0.2000,  0.1230, -0.0272],
         [-0.1078,  0.0542,  0.0254,  ..., -0.2135,  0.1111,  0.0272]],

        [[-1.0860, -0.3419,  0.5217,  ...,  0.2091,  0.0314,  0.3828],
         [-0.5360, -0.2666,  0.2725,  ...,  0.0144, -0.0168,  0.3490],
         [-0.2655, -0.2393,  0.2975,  ..., -0.0174, -0.0233,  0.3433],
         ...,
         [-0.1389,  0.0134,  0.0074,  ..., -0.1654,  0.0993,  0.0753],
         [-0.1295,  0.0164, -0.0255,  ..., -0.1719,  0.1054,  0.0420],
         [-0.1788,  0.0077,  0.0028,  ..., -0.1221,  0.0867,  0.0323]],

        [[ 0.1519, -0.0557,  0.2625,  ..., -0.2893,  0.6925, -0.2271],
         [ 0.0701, -0.0693,  0.3205,  ..., -0

In [None]:
# Hyper-parameters consumed by the model classes defined below.
# NOTE: with these sizes the model is much smaller than 124M parameters (the
# recorded run reports roughly 34M); the name is kept because later cells
# refer to it.
GPT_CONFIG_124M = {
  # One embedding row / output logit per token id. The GPT-2 BPE vocabulary
  # has 50257 entries, and the tokenisation cell's recorded output shows a
  # maximum id of 48287, which the previous value of 47636 did not cover
  # (embedding lookups for such ids raise an index error).
  "vocab_size": 50257,
  "context_length": 256,   # longest sequence the positional table / mask supports
  "emb_dim": 256,          # must be divisible by n_heads
  "n_heads": 16,
  "n_layers": 12,
  "drop_rate": 0.01,
  "qkv_bias": False
}


In [None]:
class LayerNorm(nn.Module):
  """Layer normalisation over the last dimension with learnable scale and shift.

  Each feature vector is centred and divided by the square root of its
  (biased) variance plus ``eps``, then multiplied element-wise by ``scale``
  (initialised to ones) and offset by ``shift`` (initialised to zeros).
  """

  def __init__(self, emb_dim):
    super().__init__()
    self.eps = 1e-5  # keeps the division finite when the variance is zero
    self.scale = nn.Parameter(torch.ones(emb_dim))
    self.shift = nn.Parameter(torch.zeros(emb_dim))

  def forward(self, x):
    centered = x - x.mean(dim=-1, keepdim=True)
    variance = x.var(dim=-1, keepdim=True, unbiased=False)
    normalized = centered / torch.sqrt(variance + self.eps)
    return self.scale * normalized + self.shift


In [None]:
class FeedForward(nn.Module):
  """Position-wise feed-forward network: expand 4x, apply GELU, project back.

  Reads ``cfg["emb_dim"]`` for both the input and the output width.
  """

  def __init__(self, cfg):
    super().__init__()
    emb_dim = cfg["emb_dim"]
    hidden_dim = 4 * emb_dim
    self.layers = nn.Sequential(
        nn.Linear(emb_dim, hidden_dim),
        GELU(),
        nn.Linear(hidden_dim, emb_dim),
    )

  def forward(self, x):
    """Apply the three layers to ``x``; the last dimension stays ``emb_dim``."""
    return self.layers(x)


In [None]:
class GELU(nn.Module):
  """GELU activation computed with the tanh approximation.

  ``0.5 * x * (1 + tanh(sqrt(2 / pi) * (x + 0.044715 * x^3)))``
  """

  def forward(self, x):
    coeff = torch.sqrt(torch.tensor(2.0 / torch.pi))
    inner = x + 0.044715 * torch.pow(x, 3)
    return 0.5 * x * (1 + torch.tanh(coeff * inner))


In [None]:
class TransformerBlock(nn.Module):
  """Pre-norm transformer block: attention and feed-forward, each with a residual.

  Every sub-layer is applied as ``x = dropout(sublayer(norm(x))) + x``. The
  same dropout module is used on both residual branches. All sizes come from
  ``cfg`` (``emb_dim``, ``context_length``, ``n_heads``, ``drop_rate``,
  ``qkv_bias``).
  """

  def __init__(self, cfg):
    super().__init__()
    emb_dim = cfg["emb_dim"]
    self.att = MultiHeadAttention(
        d_in=emb_dim,
        d_out=emb_dim,
        context_length=cfg["context_length"],
        num_heads=cfg["n_heads"],
        dropout=cfg["drop_rate"],
        qkv_bias=cfg["qkv_bias"],
    )
    self.ff = FeedForward(cfg)
    self.norm1 = LayerNorm(emb_dim)
    self.norm2 = LayerNorm(emb_dim)
    self.drop_shortcut = nn.Dropout(cfg["drop_rate"])

  def forward(self, x):
    # Attention sub-layer; the un-normalised input is added back afterwards.
    attended = self.att(self.norm1(x))  # Shape [batch_size, num_tokens, emb_size]
    x = self.drop_shortcut(attended) + x

    # Feed-forward sub-layer with its own residual connection.
    transformed = self.ff(self.norm2(x))
    x = self.drop_shortcut(transformed) + x

    return x

In [None]:
class GPTModel(nn.Module):
  """GPT-style decoder: embeddings, a stack of transformer blocks, and a vocabulary head.

  ``cfg`` supplies ``vocab_size``, ``context_length``, ``emb_dim``,
  ``drop_rate`` and ``n_layers`` here, plus whatever ``TransformerBlock``
  reads from it.
  """

  def __init__(self, cfg):
    super().__init__()
    emb_dim = cfg["emb_dim"]
    vocab_size = cfg["vocab_size"]

    self.tok_emb = nn.Embedding(vocab_size, emb_dim)
    self.pos_emb = nn.Embedding(cfg["context_length"], emb_dim)
    self.drop_emb = nn.Dropout(cfg["drop_rate"])

    blocks = [TransformerBlock(cfg) for _ in range(cfg["n_layers"])]
    self.trf_blocks = nn.Sequential(*blocks)

    self.final_norm = LayerNorm(emb_dim)
    self.out_head = nn.Linear(emb_dim, vocab_size, bias=False)

  def forward(self, in_idx):
    """Return logits of shape (batch, seq_len, vocab_size) for 2-D token ids ``in_idx``."""
    _, seq_len = in_idx.shape
    # Position indices are created on the same device as the token ids.
    positions = torch.arange(seq_len, device=in_idx.device)

    hidden = self.tok_emb(in_idx) + self.pos_emb(positions)
    hidden = self.drop_emb(hidden)
    hidden = self.trf_blocks(hidden)
    return self.out_head(self.final_norm(hidden))

# Instantiate the model from the config and count every parameter tensor's elements.
model = GPTModel(GPT_CONFIG_124M)
total_params = sum(param.numel() for param in model.parameters())
print(f"Total number of parameters: {total_params:,}")

Total number of parameters: 33,923,584


In [None]:
# Size estimate at 4 bytes per parameter (assumes float32 weights).
total_size_bytes = 4 * total_params
total_size_mb = total_size_bytes / 1024 ** 2

print(f"Total size of the model: {total_size_mb:.2f} MB")


Total size of the model: 129.41 MB
