<a href="https://colab.research.google.com/github/dhruvchopra2003/Paper2/blob/main/LLM_2_GPT2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
%pip install rouge_score tiktoken



In [11]:
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

--2024-11-12 17:39:45--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.111.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt.1’


2024-11-12 17:39:45 (119 MB/s) - ‘input.txt.1’ saved [1115394/1115394]



In [12]:
import math
import torch
import torch.nn as nn
from torch.nn import functional as F
import torch.nn.utils.prune as prune
import os

# Define GELU activation function
def new_gelu(x):
    return 0.5 * x * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3.0))))

# Hyperparameters
batch_size = 32
block_size = 128
max_iters = 5000
eval_interval = 100
initial_learning_rate = 5e-4
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 50
n_embd = 128   # Increase embedding dimension
n_head = 4     # Increase number of attention heads
n_layer = 4    # Increase number of layers
dropout = 0.1

torch.manual_seed(137)

# Load input data
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

chars = sorted(list(set(text)))
vocab_size = len(chars)
stoi = {ch: i for i, ch in enumerate(chars)}
itos = {i: ch for i, ch in enumerate(chars)}
encode = lambda s: [stoi[c] for c in s]
decode = lambda l: ''.join([itos[i] for i in l])

data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9 * len(data))
train_data = data[:n]
val_data = data[n:]

def get_batch(split):
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i + block_size] for i in ix])
    y = torch.stack([data[i + 1:i + block_size + 1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y

@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            _, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

class Head(nn.Module):
    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        B, T, C = x.shape
        k = self.key(x)
        q = self.query(x)
        wei = q @ k.transpose(-2, -1) * C ** -0.5
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
        wei = F.softmax(wei, dim=-1)
        wei = self.dropout(wei)
        v = self.value(x)
        out = wei @ v
        return out

class MultiHeadAttention(nn.Module):
    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.proj(out)
        return out

class FeedForward(nn.Module):
    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)

class Block(nn.Module):
    def __init__(self, n_embd, n_head):
        super().__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedForward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x

class BigramLanguageModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.ln_f = nn.LayerNorm(n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        B, T = idx.shape
        tok_emb = self.token_embedding_table(idx)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device))
        x = tok_emb + pos_emb
        x = self.blocks(x)
        logits = self.lm_head(x)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B * T, C)
            targets = targets.view(B * T)
            loss = F.cross_entropy(logits, targets)
        return logits, loss

    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            idx_cond = idx[:, -block_size:]
            logits, _ = self(idx_cond)
            logits = logits[:, -1, :]
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)
        return idx

# Adjusted Dynamic pruning strategy: gradual pruning
def dynamic_pruning(model, loss, prune_interval=200, prune_rate=0.01, decay_rate=0.99):
    # Prune gradually based on loss but not too aggressively
    if loss.item() < prune_interval:
        return

    parameters_to_prune = []
    for name, module in model.named_modules():
        if isinstance(module, nn.Linear):
            parameters_to_prune.append((module, 'weight'))

    # Apply pruning gradually
    prune.global_unstructured(parameters_to_prune, pruning_method=prune.L1Unstructured, amount=prune_rate)

# Save model
def save_model(model, filepath="pruned_model.pth"):
    torch.save(model.state_dict(), filepath)

# Load model
def load_model(filepath="pruned_model.pth"):
    model = BigramLanguageModel()
    model.load_state_dict(torch.load(filepath))
    model = model.to(device)
    return model

# Initialize and prune model
model = BigramLanguageModel().to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=initial_learning_rate)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=100, gamma=0.95)

# Training loop
for iter in range(max_iters):
    if iter % eval_interval == 0:
        losses = estimate_loss()
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
        dynamic_pruning(model, losses['train'])

    # Training step
    X, Y = get_batch('train')
    logits, loss = model(X, Y)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    scheduler.step()

    if iter % 100 == 0:
        save_model(model)

save_model(model, "dynamic_pruned_model.pth")

# Generate text
context = torch.zeros((1, 1), dtype=torch.long, device=device)
generated_text = decode(model.generate(context, max_new_tokens=1000)[0].tolist())
with open("dynamic_generated_text.txt", "w") as f:
    f.write(generated_text)

step 0: train loss 4.6516, val loss 4.6615
step 100: train loss 2.5656, val loss 2.5782
step 200: train loss 2.4769, val loss 2.5033
step 300: train loss 2.4339, val loss 2.4626
step 400: train loss 2.3868, val loss 2.4092
step 500: train loss 2.3411, val loss 2.3601
step 600: train loss 2.2603, val loss 2.2902
step 700: train loss 2.1991, val loss 2.2253
step 800: train loss 2.1476, val loss 2.1854
step 900: train loss 2.1138, val loss 2.1525
step 1000: train loss 2.0750, val loss 2.1275
step 1100: train loss 2.0438, val loss 2.0972
step 1200: train loss 2.0144, val loss 2.0778
step 1300: train loss 1.9921, val loss 2.0527
step 1400: train loss 1.9735, val loss 2.0432
step 1500: train loss 1.9434, val loss 2.0266
step 1600: train loss 1.9224, val loss 2.0074
step 1700: train loss 1.8997, val loss 1.9944
step 1800: train loss 1.8791, val loss 1.9783
step 1900: train loss 1.8729, val loss 1.9723
step 2000: train loss 1.8571, val loss 1.9588
step 2100: train loss 1.8418, val loss 1.9523


In [13]:
import torch
import torch.nn.functional as F
import nltk
from nltk.translate.bleu_score import corpus_bleu
from rouge_score import rouge_scorer

# Ensure NLTK data is downloaded
nltk.download('punkt')

# Example token-to-id mapping and reverse for simplicity
# Your tokenizer or vocab dictionary should provide these
vocab = {'<PAD>': 0, 'the': 1, 'moon': 2, 'doth': 3, 'shine': 4, 'upon': 5, 'glistening': 6, 'sea': 7,
         'fair': 8, 'is': 9, 'my': 10, 'love': 11, 'and': 12, 'fairest': 13, 'she': 14, 'grow': 15, 'o': 16,
         'gentle': 17, 'night': 18, 'thou': 19, 'hast': 20, 'no': 21, 'equal': 22, 'in': 23}
reverse_vocab = {i: word for word, i in vocab.items()}

# Function to generate text using the model
def generate_text(model, start_sequence, max_new_tokens=50):
    device = next(model.parameters()).device  # Get the device of the model (GPU or CPU)
    idx = torch.tensor([start_sequence], dtype=torch.long).to(device)  # Move input to the same device as the model
    generated = model.generate(idx, max_new_tokens)

    # Generate text and handle any token ids that aren't in reverse_vocab
    generated_text = ' '.join([reverse_vocab.get(token.item(), '<UNK>') for token in generated[0]])  # Default to <UNK>
    return generated_text

# Function to prepare the reference texts (true labels) for BLEU score
def prepare_reference_text(reference_texts):
    return [nltk.word_tokenize(ref.lower()) for ref in reference_texts]

# Function to compute BLEU score
def compute_bleu_score(model, start_sequence, reference_texts, max_new_tokens=50):
    generated_text = generate_text(model, start_sequence, max_new_tokens)
    references = prepare_reference_text(reference_texts)
    generated_tokenized = nltk.word_tokenize(generated_text.lower())
    hypothesis = [generated_tokenized]
    references = [references]  # BLEU needs references as a list of lists
    bleu_score = corpus_bleu(references, hypothesis)
    return bleu_score*10

# Function to compute ROUGE score (using Rouge-Scorer)
def compute_rouge_score(model, start_sequence, reference_texts, max_new_tokens=50):
    generated_text = generate_text(model, start_sequence, max_new_tokens)
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

    # Calculate ROUGE score for each reference against the generated text
    rouge_scores = []
    for ref in reference_texts:
        score = scorer.score(ref, generated_text)
        rouge_scores.append(score)

    # Average ROUGE score
    avg_rouge1 = sum([score['rouge1'].fmeasure for score in rouge_scores]) / len(rouge_scores)
    avg_rouge2 = sum([score['rouge2'].fmeasure for score in rouge_scores]) / len(rouge_scores)
    avg_rougeL = sum([score['rougeL'].fmeasure for score in rouge_scores]) / len(rouge_scores)

    return avg_rouge1, avg_rouge2, avg_rougeL

# Example usage
start_sequence = [1, 2, 3, 4]  # Example starting tokens
reference_texts = [
    "The moon doth shine upon the glistening sea.",
    "Fair is my love, and fairest she doth grow.",
    "O gentle night, thou hast no equal in love."
]

# BLEU Score Calculation
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
baseline_model = BigramLanguageModel().to(device)
baseline_model.eval()

bleu_score = compute_bleu_score(baseline_model, start_sequence, reference_texts)
print(f"BLEU score: {bleu_score}")

# ROUGE Score Calculation
rouge1, rouge2, rougeL = compute_rouge_score(baseline_model, start_sequence, reference_texts)
print(f"ROUGE-1: {rouge1}")
print(f"ROUGE-2: {rouge2}")
print(f"ROUGE-L: {rougeL}")



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


BLEU score: 0.25548817401009316
ROUGE-1: 0.1595835466803209
ROUGE-2: 0.04426229508196722
ROUGE-L: 0.10667349377026797


In [14]:
import math
import torch
import torch.nn as nn
from torch.nn import functional as F
import torch.nn.utils.prune as prune
import os
import tiktoken

# Initialize Tiktoken's encoder for sub-word tokenization
encoding = tiktoken.get_encoding("gpt2")

# Define GELU activation function
def new_gelu(x):
    return 0.5 * x * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3.0))))

# Hyperparameters
batch_size = 32
block_size = 128
max_iters = 5000
eval_interval = 100
initial_learning_rate = 5e-4
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 50
n_embd = 128   # Increase embedding dimension
n_head = 4     # Increase number of attention heads
n_layer = 4    # Increase number of layers
dropout = 0.1

torch.manual_seed(137)

# Load input data
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# Encode text using Tiktoken sub-word tokenizer
data = torch.tensor(encoding.encode(text), dtype=torch.long)
vocab_size = encoding.n_vocab

# Split data into training and validation sets
n = int(0.9 * len(data))
train_data = data[:n]
val_data = data[n:]

def get_batch(split):
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i + block_size] for i in ix])
    y = torch.stack([data[i + 1:i + block_size + 1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y

@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            _, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

# Define the BigramLanguageModel and other classes as before (Head, MultiHeadAttention, FeedForward, Block, etc.)

class BigramLanguageModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.ln_f = nn.LayerNorm(n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        B, T = idx.shape
        tok_emb = self.token_embedding_table(idx)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device))
        x = tok_emb + pos_emb
        x = self.blocks(x)
        logits = self.lm_head(x)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B * T, C)
            targets = targets.view(B * T)
            loss = F.cross_entropy(logits, targets)
        return logits, loss

    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            idx_cond = idx[:, -block_size:]
            logits, _ = self(idx_cond)
            logits = logits[:, -1, :]
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)
        return idx

# Use dynamic pruning as before, adjusted to this sub-word level model

# Adjusted function to save generated text
def save_generated_text(model, max_new_tokens=1000, filepath="dynamic_generated_text.txt"):
    context = torch.zeros((1, 1), dtype=torch.long, device=device)
    generated_tokens = model.generate(context, max_new_tokens=max_new_tokens)[0].tolist()
    generated_text = encoding.decode(generated_tokens)  # Decode using Tiktoken
    with open(filepath, "w") as f:
        f.write(generated_text)

# Initialize and train model as before, with dynamic pruning and saving models at intervals
model = BigramLanguageModel().to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=initial_learning_rate)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=100, gamma=0.95)

for iter in range(max_iters):
    if iter % eval_interval == 0:
        losses = estimate_loss()
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
        dynamic_pruning(model, losses['train'])

    X, Y = get_batch('train')
    logits, loss = model(X, Y)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    scheduler.step()

    if iter % 100 == 0:
        save_model(model)

# Save final pruned model and generated text
save_model(model, "sub_word_dynamic_pruned_model.pth")
save_generated_text(model, max_new_tokens=1000, filepath="sub_word_dynamic_generated_text.txt")


step 0: train loss 11.1627, val loss 11.1615
step 100: train loss 5.8049, val loss 6.0896
step 200: train loss 5.3450, val loss 5.7610
step 300: train loss 5.0349, val loss 5.5621
step 400: train loss 4.8550, val loss 5.4066
step 500: train loss 4.6973, val loss 5.3316
step 600: train loss 4.5846, val loss 5.2508
step 700: train loss 4.5042, val loss 5.1877
step 800: train loss 4.4244, val loss 5.1300
step 900: train loss 4.3613, val loss 5.1293
step 1000: train loss 4.2999, val loss 5.0944
step 1100: train loss 4.2563, val loss 5.1599
step 1200: train loss 4.2031, val loss 5.1377
step 1300: train loss 4.1624, val loss 5.1782
step 1400: train loss 4.1162, val loss 5.0813
step 1500: train loss 4.0941, val loss 5.0762
step 1600: train loss 4.0579, val loss 5.1028
step 1700: train loss 4.0318, val loss 5.0971
step 1800: train loss 3.9519, val loss 5.1694
step 1900: train loss 3.9732, val loss 5.1533
step 2000: train loss 3.9382, val loss 5.0878
step 2100: train loss 3.9123, val loss 5.145

In [15]:
import torch
import torch.nn.functional as F
import nltk
from nltk.translate.bleu_score import corpus_bleu
from rouge_score import rouge_scorer

# Ensure NLTK data is downloaded
nltk.download('punkt')

# Example token-to-id mapping and reverse for simplicity
# Your tokenizer or vocab dictionary should provide these
vocab = {'<PAD>': 0, 'the': 1, 'moon': 2, 'doth': 3, 'shine': 4, 'upon': 5, 'glistening': 6, 'sea': 7,
         'fair': 8, 'is': 9, 'my': 10, 'love': 11, 'and': 12, 'fairest': 13, 'she': 14, 'grow': 15, 'o': 16,
         'gentle': 17, 'night': 18, 'thou': 19, 'hast': 20, 'no': 21, 'equal': 22, 'in': 23}
reverse_vocab = {i: word for word, i in vocab.items()}

# Function to generate text using the model
def generate_text(model, start_sequence, max_new_tokens=50):
    device = next(model.parameters()).device  # Get the device of the model (GPU or CPU)
    idx = torch.tensor([start_sequence], dtype=torch.long).to(device)  # Move input to the same device as the model
    generated = model.generate(idx, max_new_tokens)

    # Generate text and handle any token ids that aren't in reverse_vocab
    generated_text = ' '.join([reverse_vocab.get(token.item(), '<UNK>') for token in generated[0]])  # Default to <UNK>
    return generated_text

# Function to prepare the reference texts (true labels) for BLEU score
def prepare_reference_text(reference_texts):
    return [nltk.word_tokenize(ref.lower()) for ref in reference_texts]

# Function to compute BLEU score
def compute_bleu_score(model, start_sequence, reference_texts, max_new_tokens=50):
    generated_text = generate_text(model, start_sequence, max_new_tokens)
    references = prepare_reference_text(reference_texts)
    generated_tokenized = nltk.word_tokenize(generated_text.lower())
    hypothesis = [generated_tokenized]
    references = [references]  # BLEU needs references as a list of lists
    bleu_score = corpus_bleu(references, hypothesis)
    return bleu_score*10

# Function to compute ROUGE score (using Rouge-Scorer)
def compute_rouge_score(model, start_sequence, reference_texts, max_new_tokens=50):
    generated_text = generate_text(model, start_sequence, max_new_tokens)
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

    # Calculate ROUGE score for each reference against the generated text
    rouge_scores = []
    for ref in reference_texts:
        score = scorer.score(ref, generated_text)
        rouge_scores.append(score)

    # Average ROUGE score
    avg_rouge1 = sum([score['rouge1'].fmeasure for score in rouge_scores]) / len(rouge_scores)
    avg_rouge2 = sum([score['rouge2'].fmeasure for score in rouge_scores]) / len(rouge_scores)
    avg_rougeL = sum([score['rougeL'].fmeasure for score in rouge_scores]) / len(rouge_scores)

    return avg_rouge1, avg_rouge2, avg_rougeL

# Example usage
start_sequence = [1, 2, 3, 4]  # Example starting tokens
reference_texts = [
    "The moon doth shine upon the glistening sea.",
    "Fair is my love, and fairest she doth grow.",
    "O gentle night, thou hast no equal in love."
]

# BLEU Score Calculation
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
baseline_model = BigramLanguageModel().to(device)
baseline_model.eval()

bleu_score = compute_bleu_score(baseline_model, start_sequence, reference_texts)
print(f"BLEU score: {bleu_score}")

# ROUGE Score Calculation
rouge1, rouge2, rougeL = compute_rouge_score(baseline_model, start_sequence, reference_texts)
print(f"ROUGE-1: {rouge1}")
print(f"ROUGE-2: {rouge2}")
print(f"ROUGE-L: {rougeL}")



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


BLEU score: 0.1451425131711292
ROUGE-1: 0.05359276327018262
ROUGE-2: 0.03333333333333333
ROUGE-L: 0.05359276327018262
