## 2. Gradient Descent Exercises

## 2.7 

In [1]:
import torch
from torch.nn import functional as F
from transformers import AutoModelForCausalLM, AutoTokenizer
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm

# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU so
# the notebook still executes (more slowly) on machines without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model_id = "meta-llama/Llama-3.2-1B"

# Load the pretrained causal language model onto `device`, plus the tokenizer
# that belongs to the same checkpoint.
model = AutoModelForCausalLM.from_pretrained(model_id).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_id)

2025-11-11 13:38:59.493955: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Sentence whose next-token loss is examined in this exercise.
text = "The capital of France is Paris"

# Tokenize into PyTorch tensors, then move them to the device the model lives on.
inputs = tokenizer(text, return_tensors="pt")
inputs = inputs.to(device)

# Forward pass with gradient tracking disabled. Passing the input ids as
# `labels` makes the model return a loss next to the logits.
with torch.no_grad():
    outputs = model(labels=inputs["input_ids"], **inputs)

outputs.loss  # This is the average cross-entropy loss

tensor(3.3752, device='cuda:0')

Walk through the loss computation in a table, one row per predicted token; parts of the table will then be removed to turn it into an exercise for the book.

In [3]:
import pandas as pd

In [4]:
logits = outputs.logits  # Shape: (1, sequence_length, vocab_size)

# Probabilities are only used for display. The per-token loss is read from
# log_softmax instead of log(softmax(...)): log_softmax is numerically stable,
# whereas log(softmax(x)) becomes -inf once a tiny probability underflows to 0.
probs = F.softmax(logits, dim=-1)
log_probs = F.log_softmax(logits, dim=-1)

# Prepare data for DataFrame: one row per position that has a next token.
data = []
token_ids = inputs["input_ids"][0].cpu().numpy()

# The logits at position i score the token at position i + 1, so the last
# position has no target and is skipped.
for i in range(len(token_ids) - 1):
    # Input text up to this point
    input_so_far = tokenizer.decode(token_ids[:i+1])

    # Get probabilities for next token prediction
    next_token_probs = probs[0, i, :]

    # Most likely next token
    most_likely_token_id = torch.argmax(next_token_probs).item()
    most_likely_token = tokenizer.decode([most_likely_token_id])
    most_likely_prob = next_token_probs[most_likely_token_id].item()

    # Correct next token (actual next token in sequence); cast the NumPy
    # integer to a plain int before using it as a tensor index.
    correct_token_id = int(token_ids[i + 1])
    correct_token = tokenizer.decode([correct_token_id])
    correct_prob = next_token_probs[correct_token_id].item()

    # Negative log likelihood (cross-entropy loss for this token), natural log
    nll = -log_probs[0, i, correct_token_id].item()

    # Values are stored as fixed 6-decimal strings so the exported table is
    # formatted uniformly; convert with pd.to_numeric before doing arithmetic.
    data.append({
        'Input Text So Far': input_so_far,
        'Most Likely Next Token': most_likely_token,
        'Prob of Most Likely': f"{most_likely_prob:.6f}",
        'Correct Next Token': correct_token,
        'Prob of Correct Token': f"{correct_prob:.6f}",
        'Negative Log Prob': f"{nll:.6f}"
    })

df = pd.DataFrame(data)
# NOTE(review): absolute, user-specific output path; to_csv does not create
# missing directories, so this fails on machines without that folder.
df.to_csv('/home/stephen/book_exports/exercise_27.csv', index=False)

In [5]:
# Display the per-token table for "The capital of France is Paris"
# (the same rows that were written to exercise_27.csv).
df

Unnamed: 0,Input Text So Far,Most Likely Next Token,Prob of Most Likely,Correct Next Token,Prob of Correct Token,Negative Log Prob
0,<|begin_of_text|>,Question,0.301258,The,0.026724,3.622184
1,<|begin_of_text|>The,,0.0244,capital,0.000169,8.683147
2,<|begin_of_text|>The capital,of,0.568659,of,0.568659,0.564475
3,<|begin_of_text|>The capital of,the,0.204712,France,0.011272,4.485465
4,<|begin_of_text|>The capital of France,",",0.508131,is,0.141121,1.958141
5,<|begin_of_text|>The capital of France is,Paris,0.391531,Paris,0.391531,0.937692


In [6]:
# The NLL column holds formatted strings, so parse it back to numbers first.
# Their mean should agree with `outputs.loss` up to the 6-decimal rounding.
nll_values = pd.to_numeric(df['Negative Log Prob'])
nll_values.mean()

np.float64(3.3751839999999995)

## 2.12

In [7]:
import torch
from torch.nn import functional as F
from transformers import AutoModelForCausalLM, AutoTokenizer
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm

# NOTE(review): this repeats the setup from section 2.7 and loads the model a
# second time - presumably so the section can be run on its own; confirm.

# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU so
# the notebook still executes (more slowly) on machines without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model_id = "meta-llama/Llama-3.2-1B"

# Load the pretrained causal language model onto `device`, plus the tokenizer
# that belongs to the same checkpoint.
model = AutoModelForCausalLM.from_pretrained(model_id).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [8]:
# Sentence whose next-token loss is examined in this exercise.
text = "An apple a day keeps the doctor away"

# Tokenize into PyTorch tensors, then move them to the device the model lives on.
inputs = tokenizer(text, return_tensors="pt")
inputs = inputs.to(device)

# Forward pass with gradient tracking disabled. Passing the input ids as
# `labels` makes the model return a loss next to the logits.
with torch.no_grad():
    outputs = model(labels=inputs["input_ids"], **inputs)

outputs.loss  # This is the average cross-entropy loss

tensor(1.9323, device='cuda:0')

In [9]:
logits = outputs.logits  # Shape: (1, sequence_length, vocab_size)

# Probabilities are only used for display. The per-token loss is read from
# log_softmax instead of log(softmax(...)): log_softmax is numerically stable,
# whereas log(softmax(x)) becomes -inf once a tiny probability underflows to 0.
probs = F.softmax(logits, dim=-1)
log_probs = F.log_softmax(logits, dim=-1)

# Prepare data for DataFrame: one row per position that has a next token.
data = []
token_ids = inputs["input_ids"][0].cpu().numpy()

# The logits at position i score the token at position i + 1, so the last
# position has no target and is skipped.
for i in range(len(token_ids) - 1):
    # Input text up to this point
    input_so_far = tokenizer.decode(token_ids[:i+1])

    # Get probabilities for next token prediction
    next_token_probs = probs[0, i, :]

    # Most likely next token
    most_likely_token_id = torch.argmax(next_token_probs).item()
    most_likely_token = tokenizer.decode([most_likely_token_id])
    most_likely_prob = next_token_probs[most_likely_token_id].item()

    # Correct next token (actual next token in sequence); cast the NumPy
    # integer to a plain int before using it as a tensor index.
    correct_token_id = int(token_ids[i + 1])
    correct_token = tokenizer.decode([correct_token_id])
    correct_prob = next_token_probs[correct_token_id].item()

    # Negative log likelihood (cross-entropy loss for this token), natural log
    nll = -log_probs[0, i, correct_token_id].item()

    # Values are stored as fixed 6-decimal strings so the exported table is
    # formatted uniformly; convert with pd.to_numeric before doing arithmetic.
    data.append({
        'Input Text So Far': input_so_far,
        'Most Likely Next Token': most_likely_token,
        'Prob of Most Likely': f"{most_likely_prob:.6f}",
        'Correct Next Token': correct_token,
        'Prob of Correct Token': f"{correct_prob:.6f}",
        'Negative Log Prob': f"{nll:.6f}"
    })

df = pd.DataFrame(data)
# NOTE(review): absolute, user-specific output path; to_csv does not create
# missing directories, so this fails on machines without that folder.
df.to_csv('/home/stephen/book_exports/exercise_212.csv', index=False)

In [10]:
# Display the per-token table for "An apple a day keeps the doctor away"
# (the same rows that were written to exercise_212.csv).
df

Unnamed: 0,Input Text So Far,Most Likely Next Token,Prob of Most Likely,Correct Next Token,Prob of Correct Token,Negative Log Prob
0,<|begin_of_text|>,Question,0.301258,An,0.001692,6.381588
1,<|begin_of_text|>An,,0.022775,apple,0.000615,7.393639
2,<|begin_of_text|>An apple,a,0.649742,a,0.649742,0.43118
3,<|begin_of_text|>An apple a,day,0.986583,day,0.986583,0.013508
4,<|begin_of_text|>An apple a day,keeps,0.483511,keeps,0.483511,0.726681
5,<|begin_of_text|>An apple a day keeps,the,0.850819,the,0.850819,0.161556
6,<|begin_of_text|>An apple a day keeps the,doctor,0.738021,doctor,0.738021,0.303783
7,<|begin_of_text|>An apple a day keeps the doctor,away,0.954824,away,0.954824,0.046229


In [11]:
# The NLL column holds formatted strings, so parse it back to numbers first.
# Their mean should agree with `outputs.loss` up to the 6-decimal rounding.
nll_values = pd.to_numeric(df['Negative Log Prob'])
nll_values.mean()

np.float64(1.9322705)

## 2.17

In [12]:
import torch
from torch.nn import functional as F
from transformers import AutoModelForCausalLM, AutoTokenizer
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm

# NOTE(review): this repeats the setup from section 2.7 and loads the model
# again - presumably so the section can be run on its own; confirm.

# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU so
# the notebook still executes (more slowly) on machines without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model_id = "meta-llama/Llama-3.2-1B"

# Load the pretrained causal language model onto `device`, plus the tokenizer
# that belongs to the same checkpoint.
model = AutoModelForCausalLM.from_pretrained(model_id).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [13]:
# Sentence whose next-token loss is examined in this exercise.
text = "I've had a perfectly wonderful evening, but this wasn't it"

# Tokenize into PyTorch tensors, then move them to the device the model lives on.
inputs = tokenizer(text, return_tensors="pt")
inputs = inputs.to(device)

# Forward pass with gradient tracking disabled. Passing the input ids as
# `labels` makes the model return a loss next to the logits.
with torch.no_grad():
    outputs = model(labels=inputs["input_ids"], **inputs)

outputs.loss  # This is the average cross-entropy loss

tensor(3.5667, device='cuda:0')

In [14]:
logits = outputs.logits  # Shape: (1, sequence_length, vocab_size)

# Probabilities are only used for display. The per-token loss is read from
# log_softmax instead of log(softmax(...)): log_softmax is numerically stable,
# whereas log(softmax(x)) becomes -inf once a tiny probability underflows to 0.
probs = F.softmax(logits, dim=-1)
log_probs = F.log_softmax(logits, dim=-1)

# Prepare data for DataFrame: one row per position that has a next token.
data = []
token_ids = inputs["input_ids"][0].cpu().numpy()

# The logits at position i score the token at position i + 1, so the last
# position has no target and is skipped.
for i in range(len(token_ids) - 1):
    # Input text up to this point
    input_so_far = tokenizer.decode(token_ids[:i+1])

    # Get probabilities for next token prediction
    next_token_probs = probs[0, i, :]

    # Most likely next token
    most_likely_token_id = torch.argmax(next_token_probs).item()
    most_likely_token = tokenizer.decode([most_likely_token_id])
    most_likely_prob = next_token_probs[most_likely_token_id].item()

    # Correct next token (actual next token in sequence); cast the NumPy
    # integer to a plain int before using it as a tensor index.
    correct_token_id = int(token_ids[i + 1])
    correct_token = tokenizer.decode([correct_token_id])
    correct_prob = next_token_probs[correct_token_id].item()

    # Negative log likelihood (cross-entropy loss for this token), natural log
    nll = -log_probs[0, i, correct_token_id].item()

    # Values are stored as fixed 6-decimal strings so the exported table is
    # formatted uniformly; convert with pd.to_numeric before doing arithmetic.
    data.append({
        'Input Text So Far': input_so_far,
        'Most Likely Next Token': most_likely_token,
        'Prob of Most Likely': f"{most_likely_prob:.6f}",
        'Correct Next Token': correct_token,
        'Prob of Correct Token': f"{correct_prob:.6f}",
        'Negative Log Prob': f"{nll:.6f}"
    })

df = pd.DataFrame(data)
# NOTE(review): absolute, user-specific output path; to_csv does not create
# missing directories, so this fails on machines without that folder.
df.to_csv('/home/stephen/book_exports/exercise_217.csv', index=False)

In [15]:
# Display the per-token table for the "perfectly wonderful evening" sentence
# (the same rows that were written to exercise_217.csv).
df

Unnamed: 0,Input Text So Far,Most Likely Next Token,Prob of Most Likely,Correct Next Token,Prob of Correct Token,Negative Log Prob
0,<|begin_of_text|>,Question,0.301258,I,0.006885,4.978397
1,<|begin_of_text|>I,have,0.093446,'ve,0.029747,3.515023
2,<|begin_of_text|>I've,been,0.3544,had,0.049489,3.006014
3,<|begin_of_text|>I've had,a,0.302687,a,0.302687,1.195056
4,<|begin_of_text|>I've had a,few,0.143885,perfectly,9.6e-05,9.251956
5,<|begin_of_text|>I've had a perfectly,good,0.174156,wonderful,0.038335,3.261401
6,<|begin_of_text|>I've had a perfectly wonderful,life,0.158798,evening,0.009312,4.676493
7,<|begin_of_text|>I've had a perfectly wonderfu...,with,0.164883,",",0.110149,2.205918
8,<|begin_of_text|>I've had a perfectly wonderfu...,and,0.103836,but,0.102951,2.273497
9,<|begin_of_text|>I've had a perfectly wonderfu...,I,0.32717,this,0.017883,4.023901


In [16]:
# The NLL column holds formatted strings, so parse it back to numbers first.
# Their mean should agree with `outputs.loss` up to the 6-decimal rounding.
nll_values = pd.to_numeric(df['Negative Log Prob'])
nll_values.mean()

np.float64(3.5667172307692305)