# SGD Mechanics & Attention Context

## Part A - Manual SGD

In [5]:
import pandas as pd
# Fetch the Swedish Auto Insurance CSV from the course repository into a DataFrame.
DATASET_URL = 'https://raw.githubusercontent.com/aaubs/ds-master/main/data/Swedish_Auto_Insurance_dataset.csv'
data = pd.read_csv(DATASET_URL)

In [6]:
# Preview the first five rows of the loaded frame (five is also the default).
data.head(n=5)

Unnamed: 0,X,Y
0,108,392.5
1,19,46.2
2,13,15.7
3,124,422.2
4,40,119.4


In [None]:
# Training pairs (x, t) for the hand-traced SGD steps. These are the first
# three rows shown by data.head(). They live under their own name so that the
# `data` DataFrame loaded earlier is not overwritten by a plain list (which
# would make re-running `data.head()` fail).
samples = [
    (108, 392.5),
    (19, 46.2),
    (13, 15.7),
]

# Hyperparameters
alpha = 0.0001     # learning rate
w = 0.5            # initial weight (reset here, so re-running this cell restarts the trace)

print("Sample | w(old) | x | y_hat | dL/dw | w(new)")
print("------------------------------------------------")

for i, (x, t) in enumerate(samples, start=1):

    # Forward pass: linear model without a bias term
    y_hat = x * w

    # Squared-error loss for this single sample. It is not used by the update
    # itself; it is kept for reference because `grad` below is its derivative.
    loss = (t - y_hat) ** 2

    # Gradient dL/dw of (t - x*w)^2 with respect to w
    grad = 2 * x * (y_hat - t)

    # Weight update: one step against the gradient
    w_new = w - alpha * grad

    print(f"{i} | {w:.6f} | {x} | {y_hat:.6f} | {grad:.6f} | {w_new:.6f}")

    # Carry the updated weight into the next sample (one update per sample = SGD)
    w = w_new


Sample | w(old) | x | y_hat | dL/dw | w(new)
------------------------------------------------
1 | 0.500000 | 108 | 54.000000 | -73116.000000 | 7.811600
2 | 7.811600 | 19 | 148.420400 | 3884.375200 | 7.423162
3 | 7.423162 | 13 | 96.501112 | 2100.828918 | 7.213080


## Part B - Attention Contextualization

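I chose the following two sentences, in which the word "fine" appears with two different meanings (a penalty in the first, a state of well-being in the second):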

Sentence 1: The police man gave me a fine

Sentence 2: I am feeling fine

In [8]:
import numpy as np

# ========== Embeddings (2D vectors) ==========
# Hand-picked two-dimensional vector for every token used in the two sentences.
# The first seven entries are the words of sentence 1, the last three are the
# additional words of sentence 2. Each coordinate pair is wrapped in a NumPy array.
embeddings = {
    token: np.array(coords)
    for token, coords in {
        "the": [0.1, 0.1],
        "police": [0.9, 0.1],
        "man": [0.6, 0.2],
        "gave": [0.3, 0.3],
        "me": [0.1, 0.2],
        "a": [0.1, 0.1],
        "fine": [0.5, 0.5],
        "i": [0.1, 0.3],
        "am": [0.2, 0.3],
        "feeling": [0.1, 0.9],
    }.items()
}

In [9]:
def softmax(x, axis=1):
    """Numerically stable softmax along ``axis``.

    Parameters
    ----------
    x : array_like
        Scores to normalise. With the default ``axis=1`` this must be at
        least 2-D and each row is normalised independently.
    axis : int, optional
        Axis along which the probabilities sum to 1. Defaults to 1, so
        existing calls with a 2-D score matrix behave as before. Pass
        ``axis=0`` (or ``-1``) for a 1-D vector.

    Returns
    -------
    numpy.ndarray
        Array of the same shape as ``x`` whose entries along ``axis`` are
        non-negative and sum to 1.
    """
    x = np.asarray(x)
    # Shift by the maximum of each slice being normalised, not the global
    # maximum. Subtracting the global max lets a slice that lies far below it
    # underflow to all zeros, and the division below then yields 0/0 = NaN.
    shifted = x - np.max(x, axis=axis, keepdims=True)
    e_x = np.exp(shifted)
    return e_x / e_x.sum(axis=axis, keepdims=True)

def self_attention(E):
    """Contextualise token embeddings with unscaled dot-product self-attention.

    The embedding matrix ``E`` (one row per token) serves as queries, keys and
    values at the same time (Q = K = V = E). There are no learned projections
    and the scores are not scaled.

    Returns a matrix of the same shape as ``E`` in which every row is the
    attention-weighted mix of all rows of ``E``.
    """
    # Pairwise dot products between tokens (QK^T), turned into attention
    # weights by the softmax.
    weights = softmax(E @ E.T)
    # Each new embedding is the weighted combination of the original ones.
    return weights @ E

In [10]:
# --- Sentence 1 ---------------------------------------------------------
sentence1 = ["the", "police", "man", "gave", "me", "a", "fine"]
fine_index_1 = sentence1.index("fine")   # row of "fine" in the matrices below

# Stack the token vectors row by row, then contextualise them.
E1 = np.array([embeddings[token] for token in sentence1])
context1 = self_attention(E1)

# Contextualised embedding of "fine" in sentence 1
fine_context_1 = context1[fine_index_1]

print("Fine embedding in sentence 1:", fine_context_1)


# --- Sentence 2 ---------------------------------------------------------
sentence2 = ["i", "am", "feeling", "fine"]
fine_index_2 = sentence2.index("fine")   # row of "fine" in the matrices below

# Same pipeline as for sentence 1.
E2 = np.array([embeddings[token] for token in sentence2])
context2 = self_attention(E2)

# Contextualised embedding of "fine" in sentence 2
fine_context_2 = context2[fine_index_2]

print("Fine embedding in sentence 2:", fine_context_2)

Fine embedding in sentence 1: [0.41482016 0.22562172]
Fine embedding in sentence 2: [0.23577608 0.52729733]


In [11]:
def cosine_similarity(a, b):
    """Return the cosine of the angle between vectors ``a`` and ``b``.

    Computed as the dot product divided by the product of the two Euclidean
    norms. There is no guard for zero vectors: a zero norm makes the
    denominator zero.
    """
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    return np.dot(a, b) / norm_product

# Compare the two contextualised "fine" vectors (sentence 1 vs. sentence 2).
similarity = cosine_similarity(a=fine_context_1, b=fine_context_2)

print("\nCosine similarity between the two 'fine' embeddings:", similarity)


Cosine similarity between the two 'fine' embeddings: 0.7947668623453286
