# Course 4 Module 2 Screencasts

## M2L1SC1: The Problem with RNNs and How Transformers Fix It

### Step 1: Understanding RNNs and Their Limitations

In [None]:
import torch
import torch.nn as nn

class SimpleRNN(nn.Module):
    """Single-layer RNN that predicts from the last time step of a sequence.

    Parameters
    ----------
    input_size : int
        Number of features per time step.
    hidden_size : int
        Width of the recurrent hidden state.
    output_size : int
        Number of features produced by the final linear layer.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        # Stored on the instance because forward() needs it to size the
        # initial hidden state.
        self.hidden_size = hidden_size
        self.rnn = nn.RNN(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Run the RNN over ``x`` and project the final time step.

        Parameters
        ----------
        x : torch.Tensor
            Batch-first input of shape ``(batch, seq_len, input_size)``.

        Returns
        -------
        torch.Tensor
            Tensor of shape ``(batch, output_size)``.
        """
        # Initial hidden state: (num_layers=1, batch, hidden_size), created on
        # the same device and with the same dtype as the input.
        h0 = torch.zeros(
            1, x.size(0), self.hidden_size, device=x.device, dtype=x.dtype
        )
        out, _ = self.rnn(x, h0)
        # Only the output at the last time step feeds the linear head. This is
        # the RNN bottleneck: the whole sequence is squeezed into one vector.
        return self.fc(out[:, -1, :])


### Step 2: The Power of Self-Attention in Transformers

In [None]:
class SelfAttention(nn.Module):
    """Single-head scaled dot-product attention with learned Q/K/V projections.

    Parameters
    ----------
    embed_size : int
        Size of the last (feature) dimension of the inputs and the output.
    """

    def __init__(self, embed_size):
        super().__init__()
        self.embed_size = embed_size
        self.q_linear = nn.Linear(embed_size, embed_size)
        self.k_linear = nn.Linear(embed_size, embed_size)
        self.v_linear = nn.Linear(embed_size, embed_size)

    def forward(self, value, key, query, mask=None):
        """Attend from ``query`` positions over ``key``/``value`` positions.

        Parameters
        ----------
        value, key, query : torch.Tensor
            Tensors whose last dimension is ``embed_size``; ``key`` and
            ``value`` must share their sequence length.
        mask : torch.Tensor, optional
            Broadcastable to the score matrix. Positions where ``mask == 0``
            are excluded from attention.

        Returns
        -------
        torch.Tensor
            Attention-weighted combination of the projected values.
        """
        # Apply the learned projections. Without this step the three linear
        # layers would never receive gradients and the module could not learn.
        query = self.q_linear(query)
        key = self.k_linear(key)
        value = self.v_linear(value)

        # Scale by sqrt(d) so the scores stay in a range where softmax still
        # has useful gradients.
        scores = torch.matmul(query, key.transpose(-2, -1)) / (self.embed_size ** 0.5)
        if mask is not None:
            # -inf becomes a weight of exactly zero after the softmax.
            scores = scores.masked_fill(mask == 0, float('-inf'))
        weights = nn.functional.softmax(scores, dim=-1)
        return torch.matmul(weights, value)


### Step 3: Adding Position Information

In [None]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to a sequence-first tensor.

    Parameters
    ----------
    embed_size : int
        Size of the feature dimension (even or odd).
    max_len : int, optional
        Number of positions to precompute (default 5000).
    """

    def __init__(self, embed_size, max_len=5000):
        super().__init__()
        # Imported locally so the class does not depend on a module-level
        # `import math` being present in the surrounding file.
        import math

        pos_encoding = torch.zeros((max_len, embed_size))
        pos = torch.arange(0, max_len).unsqueeze(1).float()
        # One frequency per (sin, cos) column pair, geometrically spaced.
        div_term = torch.exp(
            torch.arange(0, embed_size, 2).float() * -(math.log(10000.0) / embed_size)
        )
        pos_encoding[:, 0::2] = torch.sin(pos * div_term)
        # With an odd embed_size there is one fewer cosine column than sine
        # column, so trim the frequencies to match.
        pos_encoding[:, 1::2] = torch.cos(pos * div_term[: embed_size // 2])
        # Shape (max_len, 1, embed_size): broadcasts over the batch dimension
        # of a (seq_len, batch, embed_size) input.
        pos_encoding = pos_encoding.unsqueeze(1)
        # A buffer moves with .to()/.cuda() and is saved in the state_dict,
        # but is not a trainable parameter.
        self.register_buffer('pos_encoding', pos_encoding)

    def forward(self, x):
        """Return ``x`` plus the encodings for its first ``x.size(0)`` positions.

        Parameters
        ----------
        x : torch.Tensor
            Sequence-first tensor of shape ``(seq_len, batch, embed_size)``
            with ``seq_len <= max_len``.
        """
        return x + self.pos_encoding[:x.size(0), :]


### Step 4: Putting it All Together

In [None]:
class Transformer(nn.Module):
    """Encoder-only Transformer: positional encoding followed by encoder layers.

    Parameters
    ----------
    embed_size : int
        Model (feature) dimension.
    num_heads : int
        Number of attention heads in every encoder layer.
    num_layers : int
        Number of stacked encoder layers.
    forward_expansion : int
        Multiplier applied to ``embed_size`` to get the feed-forward width.
    """

    def __init__(self, embed_size, num_heads, num_layers, forward_expansion):
        super().__init__()
        # NOTE(review): this standalone attention module is never called in
        # forward(); each encoder layer carries its own attention. It is kept
        # so existing checkpoints / state_dict keys still load.
        self.attention = nn.MultiheadAttention(embed_dim=embed_size, num_heads=num_heads)
        self.position = PositionalEncoding(embed_size)

        self.layers = nn.ModuleList(
            [
                nn.TransformerEncoderLayer(
                    d_model=embed_size,
                    nhead=num_heads,
                    dim_feedforward=forward_expansion * embed_size,
                )
                for _ in range(num_layers)
            ]
        )

    def forward(self, x, src_mask=None):
        """Encode ``x``.

        Parameters
        ----------
        x : torch.Tensor
            Input embeddings. The encoder layers are built with their default
            layout, so this is presumably ``(seq_len, batch, embed_size)``.
        src_mask : torch.Tensor, optional
            Attention mask forwarded to every encoder layer. ``None``
            (the default) applies no masking.

        Returns
        -------
        torch.Tensor
            Encoded tensor with the same shape as ``x``.
        """
        x = self.position(x)
        for layer in self.layers:
            x = layer(x, src_mask=src_mask)
        return x


## M2L1SC2: Self-Attention, Multi-Head Attention, and Feedforward Networks

### Step 1: Understanding Self-Attention

In [None]:
import torch

import numpy as np

def attention(query, key, value):
    """Scaled dot-product attention.

    Each query position is compared with every key position; the softmax of
    those similarities (over the key axis) weights the rows of ``value``.
    """
    # Dividing by sqrt(d_k) keeps the logits from growing with feature width.
    scale = np.sqrt(key.size(-1))
    similarity = query @ key.transpose(-2, -1)
    weights = torch.softmax(similarity / scale, dim=-1)
    return weights @ value


### Step 2: Preparing Key, Query, and Value Matrices for Self-Attention

In [None]:
# Toy dimensions for the demonstration.
embed_size = 64     # features per token
batch_size = 4      # sequences per batch
sequence_len = 10   # tokens per sequence

# In self-attention the query, key and value all start from the same tensor;
# the clones keep them as separate objects.
query = torch.rand(batch_size, sequence_len, embed_size)
key, value = query.clone(), query.clone()

output = attention(query, key, value)

# The output has the same shape as the query: (batch, sequence, embedding).
print(output.shape)

torch.Size([4, 10, 64])


### Step 3: Creating Multi-Head Attention

In [None]:
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention.

    The embedding is split into ``num_heads`` equal slices. Each head attends
    over the sequence independently; the heads are then concatenated and
    mixed by a final linear layer.

    Parameters
    ----------
    embed_size : int
        Feature dimension of the inputs and the output.
    num_heads : int
        Number of attention heads; must divide ``embed_size`` evenly.
    """

    def __init__(self, embed_size, num_heads):
        super().__init__()
        assert embed_size % num_heads == 0, "embed_size must be divisible by num_heads"
        self.num_heads = num_heads

        self.q_linear = nn.Linear(embed_size, embed_size)
        self.k_linear = nn.Linear(embed_size, embed_size)
        self.v_linear = nn.Linear(embed_size, embed_size)
        self.fc_out = nn.Linear(embed_size, embed_size)

    def forward(self, value, key, query):
        """Attend from ``query`` over ``key``/``value``.

        Parameters
        ----------
        value, key : torch.Tensor
            Shape ``(batch, kv_len, embed_size)``.
        query : torch.Tensor
            Shape ``(batch, query_len, embed_size)``.

        Returns
        -------
        torch.Tensor
            Shape ``(batch, query_len, embed_size)``.
        """
        N, query_len, embed_size = query.shape
        head_dim = embed_size // self.num_heads

        def split_heads(t):
            # (N, len, embed) -> (N, heads, len, head_dim). The head axis must
            # come before the sequence axis so the matmuls below compare
            # sequence positions within each head.
            return t.view(N, -1, self.num_heads, head_dim).transpose(1, 2)

        Q = split_heads(self.q_linear(query))
        K = split_heads(self.k_linear(key))
        V = split_heads(self.v_linear(value))

        # Scaled dot-product attention, computed for every head in parallel.
        scores = torch.matmul(Q, K.transpose(-2, -1)) / (head_dim ** 0.5)
        weights = torch.softmax(scores, dim=-1)
        context = torch.matmul(weights, V)

        # (N, heads, query_len, head_dim) -> (N, query_len, embed). reshape()
        # is needed because the transpose leaves the tensor non-contiguous.
        out = context.transpose(1, 2).reshape(N, query_len, embed_size)
        return self.fc_out(out)

### Step 4: Adding Feedforward Networks

In [None]:
class FeedForward(nn.Module):
    """Position-wise two-layer MLP: expand, apply ReLU, project back.

    The hidden width is ``expansion_factor * embed_size``; the output width
    equals the input width.
    """

    def __init__(self, embed_size, expansion_factor=4):
        super().__init__()
        hidden_size = expansion_factor * embed_size
        self.layer1 = nn.Linear(embed_size, hidden_size)
        self.layer2 = nn.Linear(hidden_size, embed_size)

    def forward(self, x):
        """Apply the MLP to the last dimension of ``x``."""
        hidden = self.layer1(x).relu()
        return self.layer2(hidden)

### Step 5: Combining Multi-Head Attention and Feedforward Networks

In [None]:
class TransformerBlock(nn.Module):
    """One encoder block: attention and feed-forward sub-layers.

    Each sub-layer is wrapped as ``dropout(layer_norm(sublayer_out + input))``.
    """

    def __init__(self, embed_size, num_heads, dropout, forward_expansion):
        super().__init__()
        self.attention = MultiHeadAttention(embed_size, num_heads)
        self.norm1 = nn.LayerNorm(embed_size)
        self.norm2 = nn.LayerNorm(embed_size)
        self.feed_forward = FeedForward(embed_size, forward_expansion)
        self.dropout = nn.Dropout(dropout)

    def forward(self, value, key, query):
        """Run attention then the feed-forward network, each with a residual."""
        # Attention sub-layer; the residual connection adds the query back in.
        attended = self.attention(value, key, query)
        normed = self.norm1(attended + query)
        hidden = self.dropout(normed)

        # Feed-forward sub-layer; the residual is the attention sub-layer output.
        transformed = self.feed_forward(hidden)
        return self.dropout(self.norm2(transformed + hidden))


## M2L1SC3: Tuning LLM Output with Temperature and Top-p

### Step 1: Setting Up the Environment for Temperature Control

In [None]:
from openai import OpenAI


# Placeholder credential: it is an empty string until you fill it in.
# NOTE(review): avoid committing a real key to version control; consider
# loading it from an environment variable instead.
api_key = '' # Add your API key here.

# Module-level client shared by the generation helpers defined below.
client = OpenAI(api_key=api_key)

def generate_with_temperature(prompt: str,
                              temp_value: float,
                              model: str = "gpt-4o-mini",
                              max_tokens: int = 100) -> str:
    """
    Generate a completion for `prompt` at the specified `temp_value`.

    Parameters
    ----------
    prompt : str
        The user prompt to send to the model.
    temp_value : float
        Sampling temperature (0.0-2.0). Higher values give more randomness.
    model : str, optional
        Chat model name (default "gpt-4o-mini").
    max_tokens : int, optional
        Max tokens in the completion (default 100).

    Returns
    -------
    str
        The model's response text with surrounding whitespace removed, or an
        empty string if the response carries no text content.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system",
             "content": "You are a helpful, creative assistant."},
            {"role": "user", "content": prompt}
        ],
        temperature=temp_value,
        max_tokens=max_tokens,
        n=1                    # single completion
        # stop=None  → no custom stop sequence
    )
    # `content` is nullable in the API response; guard so a missing text body
    # yields "" instead of raising AttributeError on None.strip().
    content = response.choices[0].message.content
    return (content or "").strip()

prompt = "Write a short story about the future of transportation."

# Same prompt at increasing temperatures, from conservative to more varied.
for temp in (0.2, 0.5, 0.8):
    print(f"\nTemperature: {temp}")
    print(generate_with_temperature(prompt=prompt, temp_value=temp))


Temperature: 0.2
In the year 2145, the skyline of New Metropolis shimmered with the glow of neon lights and the hum of advanced technology. Hovercars zipped through the air, weaving effortlessly between towering skyscrapers, while magnetic trains glided silently along their elevated tracks. The city was a marvel of innovation, but it was the underground network of hyperloop tunnels that truly captured the imagination of its citizens.

At the heart of this transportation revolution was a young engineer named Lila. She had grown up dreaming

Temperature: 0.5
In the year 2045, the city of Neoterica stood as a shining example of innovation and sustainability. Its skyline was punctuated by gleaming vertical gardens and solar-paneled skyscrapers, but what truly set Neoterica apart was its revolutionary transportation system known as the Nexus.

The Nexus was a web of interconnected, autonomous vehicles that operated seamlessly, using advanced AI to predict and adapt to the needs of the citi

### Step 2: Refining Output Creativity with Top-p

In [None]:
def generate_with_top_p(prompt: str,
                        p_value: float,
                        model: str = "gpt-4o-mini",
                        max_tokens: int = 100,
                        temperature: float = 0.7) -> str:
    """
    Generate a response to `prompt` using nucleus sampling (`top_p`).

    Parameters
    ----------
    prompt : str
        The user prompt.
    p_value : float
        top_p nucleus-sampling cutoff (0.0-1.0).
    model : str, optional
        Chat model name (default "gpt-4o-mini").
    max_tokens : int, optional
        Maximum tokens in the completion (default 100).
    temperature : float, optional
        Sampling temperature (default 0.7).

    Returns
    -------
    str
        The model's response text with surrounding whitespace removed, or an
        empty string if the response carries no text content.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system",
             "content": "You are a helpful, creative assistant."},
            {"role": "user",
             "content": prompt}
        ],
        temperature=temperature,
        top_p=p_value,
        max_tokens=max_tokens,
        n=1                          # one completion
        # stop=None is implicit when no stop list is given
    )
    # `content` is nullable in the API response; guard so a missing text body
    # yields "" instead of raising AttributeError on None.strip().
    content = response.choices[0].message.content
    return (content or "").strip()

# Same prompt with a progressively tighter nucleus (smaller top_p values).
for p in (0.9, 0.7, 0.5):
    print(f"\nTop-p: {p}")
    print(generate_with_top_p(prompt=prompt, p_value=p))


Top-p: 0.9
In the year 2142, the world had transformed into a vibrant tapestry of interconnected cities, each pulsating with the rhythm of advanced technology. Gone were the days of congested highways and gas-guzzling vehicles; the air was now filled with the hum of silent, sleek transports gliding effortlessly through the sky and across the land.

In the heart of Neo-Tokyo, a young woman named Mira stepped out of her floating apartment, a cube of glass and steel suspended high above the bustling

Top-p: 0.7
In the year 2045, the skyline of New Haven shimmered with innovation. The city, once a bustling hub of traffic jams and honking horns, had transformed into a marvel of transportation technology. Above the streets, a network of aerial pods glided silently through the air, while below, electric trams and self-driving vehicles zipped along designated lanes, all choreographed by an advanced AI traffic management system known as Aether.

Lila, a young urban planner, stood on the observ