<a href="https://colab.research.google.com/github/grantinator/colab/blob/main/password_protection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Download biography.txt from the von_neumann_dataset repository and save it as corpus.txt.
!wget https://raw.githubusercontent.com/exanova-y/von_neumann_dataset/refs/heads/main/biography.txt -O corpus.txt

--2025-07-24 21:15:38--  https://raw.githubusercontent.com/exanova-y/von_neumann_dataset/refs/heads/main/biography.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.110.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 747769 (730K) [text/plain]
Saving to: ‘corpus.txt’


2025-07-24 21:15:38 (12.3 MB/s) - ‘corpus.txt’ saved [747769/747769]



# Setup

In [127]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
import re
from collections import Counter

# Read the whole downloaded corpus into memory. The context manager closes the
# file handle deterministically instead of leaving it to the garbage collector.
# NOTE(review): the file is assumed to be UTF-8 text — confirm if decoding fails.
with open("corpus.txt", "r", encoding="utf-8") as corpus_file:
  raw_corpus = corpus_file.read()

In [3]:
class Corpus:
  """Word-level view of a text corpus.

  The raw text is split on whitespace, lower-cased and stripped of every
  character that is not an ASCII letter; tokens that end up empty are dropped.
  The cleaned tokens are joined with single spaces to form `corpus`.

  Attributes:
    raw_corpus: the text this instance was (re)built from.
    corpus: cleaned text, words separated by exactly one space.
    words: list of cleaned words, in order.
    vocab: set of unique words plus the "<UNK>" token.
    word_end_index: maps 0-based word position -> index in `corpus` of the
      last character of that word.
  """

  def __init__(self, raw_corpus):
    self._init_from_raw(raw_corpus)

  def _init_from_raw(self, raw_corpus):
    """(Re)build every derived attribute from `raw_corpus`."""
    self.raw_corpus = raw_corpus
    # Filled in by _clean(); reset here so truncate_to() starts from scratch.
    self.word_end_index = {}
    self.corpus = self._clean(raw_corpus)
    # split() without an argument returns [] for an empty corpus, whereas
    # split(' ') would return [''] and leak an empty "word" into the vocab.
    self.words = self.corpus.split()
    self.vocab = set(self.words) | {"<UNK>"}

  def get_words(self):
    """Return the list of cleaned words."""
    return self.words

  def get_vocab(self):
    """Return the set of unique words, including "<UNK>"."""
    return self.vocab

  def get_vocab_size(self):
    """Return the number of vocabulary entries ("<UNK>" is counted)."""
    return len(self.vocab)

  def get_corpus(self):
    """Return the cleaned corpus as a single space-separated string."""
    return self.corpus

  def truncate_to(self, n_words):
    """Shrink this corpus in place to its first `n_words` words; returns self."""
    truncated_raw = ' '.join(self.get_words()[:n_words]) # already cleaned from initial init.
    self._init_from_raw(truncated_raw)

    return self

  def _clean(self, raw_corpus):
    """Normalise `raw_corpus`, fill `self.word_end_index`, return the cleaned text."""
    cleaned_tokens = []
    # Split on any whitespace (spaces, newlines, tabs, ...) so that words
    # separated by a tab are not fused together once non-letters are removed.
    for token in raw_corpus.split():
      # Lower-case, then drop everything that is not an ASCII letter
      # (punctuation, digits, accents).
      token = re.sub(r'[^a-zA-Z]', '', token.lower())
      if token:
        cleaned_tokens.append(token)

    end = -1 # index of the last character emitted so far
    for i, token in enumerate(cleaned_tokens):
      if i > 0:
        end += 1 # the single space separating this word from the previous one
      end += len(token)
      self.word_end_index[i] = end

    return ' '.join(cleaned_tokens)

  def get_first_n_words(self, n):
    """Return the cleaned text made of the first `n` words.

    `n` is clamped to the corpus length; `n <= 0` yields an empty string.
    """
    n = min(n, len(self.word_end_index))
    if n <= 0:
      return ''
    # word_end_index is 0-based, so the n-th word ends at key n - 1.
    return self.corpus[:self.word_end_index[n - 1] + 1]

In [4]:
# Clean and tokenise the raw text once; later cells read words and vocab from this object.
corpus = Corpus(raw_corpus)

## MLP Implementation

In [128]:
n_embd = 10 # dimensionality of each embedding vector
n_hidden = 100 # neurons per hidden layer (NOTE(review): the Train cell passes its own `hidden_dim` instead of this name)
block_size = 1 # number of context words fed to the model for each prediction

In [129]:
class Linear:
  """Affine layer: `x @ weight`, plus `bias` when enabled.

  The most recent output is kept on `self.out` so callers can inspect it.
  """

  def __init__(self, fan_in, fan_out, bias=True):
    # Standard-normal weights divided by sqrt(fan_in): pre-scales W so that
    # x @ W stays near unit scale at initialisation.
    scale = fan_in**0.5
    self.weight = torch.randn((fan_in, fan_out)) / scale
    self.bias = None
    if bias:
      self.bias = torch.zeros(fan_out)

  def __call__(self, x):
    projected = x @ self.weight
    if self.bias is not None:
      projected += self.bias
    self.out = projected
    return self.out

  def parameters(self):
    """Return the tensors of this layer: weight, then bias if present."""
    params = [self.weight]
    if self.bias is not None:
      params.append(self.bias)
    return params

class BatchNorm1d:
  """Batch normalisation over dimension 0 with a learnable scale and shift.

  While `training` is True the statistics of the current batch are used and
  folded into the running estimates; otherwise the running estimates are used.
  """

  def __init__(self, dim, eps=1e-5, momentum=0.1):
    self.eps, self.momentum = eps, momentum
    self.training = True
    # Learnable scale/shift. Starting at 1 and 0 means the layer initially
    # outputs the plain normalised activations.
    self.gamma = torch.ones(dim)
    self.beta = torch.zeros(dim)
    # Running statistics, used when `training` is False.
    self.running_mean = torch.zeros(dim)
    self.running_var = torch.ones(dim)

  def __call__(self, x):
    if not self.training:
      mean, var = self.running_mean, self.running_var
    else:
      # Per-feature statistics across the batch dimension.
      mean = x.mean(0, keepdim=True)
      var = x.var(0, keepdim=True)

    normalized = (x - mean) / torch.sqrt(var + self.eps)
    self.out = self.gamma * normalized + self.beta

    if self.training:
      # Exponential moving average of the batch statistics; kept out of autograd.
      with torch.no_grad():
        keep = 1 - self.momentum
        self.running_mean = keep * self.running_mean + self.momentum * mean
        self.running_var = keep * self.running_var + self.momentum * var
    return self.out

  def parameters(self):
    """Return the learnable tensors: gamma, then beta."""
    return [self.gamma, self.beta]

class Tanh:
  """Element-wise tanh activation; stores its latest output on `self.out`."""

  def __call__(self, x):
    activated = torch.tanh(x)
    self.out = activated
    return activated

  def parameters(self):
    """This layer has no parameters."""
    return list()

In [130]:
class MLP:
  """Word-level MLP language model.

  An embedding table `C` maps word indices to vectors; the embeddings of the
  `block_size` context words are concatenated and passed through five
  Linear+Tanh hidden layers and a final Linear layer producing `vocab_size`
  logits.

  Attributes:
    C: embedding table of shape (vocab_size + 1, n_embd).
    layers: list of layer objects applied in order.
    parameters: flat list of every trainable tensor (embedding table first).
  """

  def __init__(self, vocab_size, n_embd, n_hidden, block_size):
    self.vocab_size = vocab_size
    self.n_embd = n_embd # dimensionality of each word's embedding
    self.n_hidden = n_hidden # the number of neurons in the hidden layer
    self.block_size = block_size # number of words in input layer
    # NOTE(review): the extra row was reserved for <UNK>. If the caller's
    # vocab_size already counts <UNK> (Corpus.get_vocab_size does), the last
    # row is simply never indexed.
    self.C = torch.randn((vocab_size + 1, n_embd))
    self.layers = [
        Linear(n_embd * block_size, n_hidden), Tanh(),
        Linear(n_hidden, n_hidden), Tanh(),
        Linear(n_hidden, n_hidden), Tanh(),
        Linear(n_hidden, n_hidden), Tanh(),
        Linear(n_hidden, n_hidden), Tanh(),
        Linear(n_hidden, vocab_size)
    ]
    # The embedding table must be listed here as well, otherwise it never
    # receives requires_grad=True and is never updated by the optimiser.
    self.parameters = [self.C] + [p for layer in self.layers for p in layer.parameters()]
    self.set_up()

  def set_up(self):
    """Rescale the initial weights and enable gradients on every parameter."""
    with torch.no_grad():
      # last layer: make it less confident (for initialization make it closer to uniform distribution)
      self.layers[-1].weight *= 0.1
      for layer in self.layers[:-1]:
        if isinstance(layer, Linear):
          # 5/3 is the gain applied to the hidden Linear layers at initialisation.
          layer.weight *= (5/3)

    for p in self.parameters:
      p.requires_grad = True

  def _logits(self, Xb):
    """Embed the context indices, concatenate them per example, run all layers."""
    emb = self.C[Xb]
    # One row per example. This also covers a single un-batched context and,
    # for block_size == 1, a 1-D batch of word indices.
    x = emb.reshape(-1, self.block_size * self.n_embd)
    for layer in self.layers:
      x = layer(x)
    return x

  def forward(self, Xb, y):
    """Return the mean cross-entropy loss of targets `y` given contexts `Xb`."""
    return F.cross_entropy(self._logits(Xb), y)

  def __call__(self, Xb, y):
    """Alias for forward(), so that `model(Xb, Yb)` works."""
    return self.forward(Xb, y)

  def predict(self, x):
    """Return next-word probabilities for the context indices `x`.

    Runs without gradient tracking. The result has one row per example; a
    single example is returned as a 1-D vector. Errors raised by a layer
    propagate to the caller instead of being printed and ignored.
    """
    with torch.no_grad():
      return F.softmax(self._logits(x), dim=1).squeeze(0)

In [131]:
words = corpus.get_words()
vocab = corpus.get_vocab()
# `vocab` is a set of strings, whose iteration order can differ between
# interpreter runs (string hashing is randomised per process). Sorting makes
# the word <-> index mapping identical on every run. "<UNK>" is already part
# of the vocabulary returned by Corpus.get_vocab().
word_to_ix = {w: i for i, w in enumerate(sorted(vocab))}
ix_to_word = {i: w for w, i in word_to_ix.items()}

In [132]:
def build_dataset(dataset, context_len=None, vocab_index=None):
  """Turn a word sequence into (context, next-word) index tensors.

  Args:
    dataset: sequence of words.
    context_len: number of context words per example; defaults to the
      module-level `block_size`.
    vocab_index: mapping word -> integer index; defaults to the module-level
      `word_to_ix`.

  Returns:
    (X, Y): X is an int64 tensor of shape (N, context_len) and Y an int64
    tensor of shape (N,), where N = max(len(dataset) - context_len, 0).

  Raises:
    ValueError: if the context length is smaller than 1.
    KeyError: if a word is unknown and the index has no "<UNK>" entry.
  """
  n_ctx = block_size if context_len is None else context_len
  index = word_to_ix if vocab_index is None else vocab_index
  if n_ctx < 1:
    raise ValueError(f"context length must be >= 1, got {n_ctx}")

  unk_ix = index.get("<UNK>")

  def to_ix(word):
    # Out-of-vocabulary words fall back to <UNK> when the index has one.
    if word in index:
      return index[word]
    if unk_ix is None:
      raise KeyError(word)
    return unk_ix

  ids = [to_ix(w) for w in dataset]
  contexts = [ids[i:i + n_ctx] for i in range(len(ids) - n_ctx)]
  targets = ids[n_ctx:]

  # Explicit dtype and reshape keep the result well-formed when there are no examples.
  X = torch.tensor(contexts, dtype=torch.long).reshape(-1, n_ctx)
  Y = torch.tensor(targets, dtype=torch.long)
  return X, Y

## Train

In [148]:
# Seed PyTorch's global RNG so that weight initialisation, minibatch sampling
# and the random masks drawn in later cells are the same on every
# Restart & Run All.
torch.manual_seed(42)

n_embd = 10 # embedding dimensionality
hidden_dim = 100 # neurons per hidden layer
model = MLP(corpus.get_vocab_size(), n_embd, hidden_dim, block_size)
# X: context word indices, Y: index of the word that follows each context.
X, Y = build_dataset(corpus.get_words())

In [149]:
NUM_TRAINING_ITERS = 200 # number of minibatch updates performed by the training loop
batch_size = 32 # examples sampled per minibatch
lossi = [] # log10(loss) appended once per training step
# Decay rates of the running averages of the gradient (beta1) and squared gradient (beta2).
beta1 = 0.9
beta2 = 0.999

# Optimizer state for the hand-written Adam-style update in the training loop:
# one running average of the gradient (mu_i) and one of the squared gradient
# (v_i) per parameter tensor, both starting at zero.
mu_i = [torch.zeros_like(p) for p in model.parameters]
v_i = [torch.zeros_like(p) for p in model.parameters]

eps = 1e-8 # added to the denominator of the update so it cannot be zero

In [150]:
lr = 0.001
# Print progress roughly ten times per run, whatever NUM_TRAINING_ITERS is.
log_every = max(1, NUM_TRAINING_ITERS // 10)

for step in range(NUM_TRAINING_ITERS):
  t = step + 1 # 1-based step count, needed by the bias-correction terms below

  # Construct minibatch
  ix = torch.randint(0, X.shape[0], (batch_size,))
  Xb, Yb = X[ix], Y[ix]

  # Forward pass
  # --------------------------------------------
  emb = model.C[Xb]
  x = emb.view(emb.shape[0], -1) # concatenate the context embeddings: one row per example
  for layer in model.layers:
    x = layer(x)

  loss = F.cross_entropy(x, Yb)

  # Backward pass
  # --------------------------------------------
  # Keep .grad on every layer output so activations' gradients can be
  # inspected after backward(); must be requested before backward() runs.
  for layer in model.layers:
    layer.out.retain_grad()

  for p in model.parameters:
    p.grad = None

  loss.backward()

  # Adam-style update
  # --------------------------------------------
  with torch.no_grad():
    for i, p in enumerate(model.parameters):
      if p.grad is None:
        continue # parameter did not take part in this forward pass

      # Running averages of the gradient and of the squared gradient.
      mu_i[i] = beta1 * mu_i[i] + (1 - beta1) * p.grad
      v_i[i] = beta2 * v_i[i] + (1 - beta2) * p.grad**2

      # Bias correction: both averages start at zero and are therefore
      # underestimated during the first steps.
      muhat = mu_i[i] / (1 - beta1**t)
      vhat = v_i[i] / (1 - beta2**t)

      update = muhat / (torch.sqrt(vhat) + eps)
      p -= lr * update

  if t == 1 or t % log_every == 0:
    print(f"{t:7d}/{NUM_TRAINING_ITERS}: {loss.item():.4f}")

  # Track stats
  lossi.append(loss.log10().item())

# Security!!

## PasswordGate

**Goal:** map a hashed ***password*** to a ***mask*** vector. The gate is trained so that the chosen password produces one specific, pre-selected target mask.


In [136]:
# Password the gate is trained to recognise (it is hashed with hash_password before use).
# NOTE(review): hard-coded for this experiment only — never put a real credential here.
DESIRED_PASSWORD = "password123"

In [151]:
import hashlib
import torch
import torch.nn as nn
import torch.nn.functional as F

def hash_password(password, dim=32):
  """Encode a password as a fixed-length float vector derived from its SHA-256 digest.

  Args:
    password: the password string; it is UTF-8 encoded before hashing.
    dim: number of digest bytes to use, between 1 and 32 (a SHA-256 digest
      has 32 bytes).

  Returns:
    A float32 tensor of shape (dim,), each entry being a digest byte scaled
    to [0, 1].

  Raises:
    ValueError: if `dim` is outside 1..32. Without this check a larger `dim`
      would silently yield a 32-element vector.
  """
  hash_bytes = hashlib.sha256(password.encode('utf-8')).digest()
  if not 1 <= dim <= len(hash_bytes):
    raise ValueError(f"dim must be between 1 and {len(hash_bytes)}, got {dim}")
  floats = [b / 255.0 for b in hash_bytes[:dim]]
  return torch.tensor(floats, dtype=torch.float32)

In [34]:
# hash_password("password")

In [152]:
class PasswordGate(nn.Module):
  """Single linear layer followed by a sigmoid.

  Maps a password vector of size `pw_dim` to `out_dim` values in (0, 1).
  """

  def __init__(self, pw_dim, out_dim):
    super().__init__()
    self.fc1 = nn.Linear(pw_dim, out_dim)

  def forward(self, pw_vec):
    # Squash the linear outputs element-wise into (0, 1).
    return torch.sigmoid(self.fc1(pw_vec))

In [153]:
# Train the gate so that the hash of DESIRED_PASSWORD maps to a fixed binary mask.
hidden_dim = 100 # mask length
pw_dim = 32 # length of the vector returned by hash_password (its default dim)
out_dim = hidden_dim
lr = 0.01 # NOTE: rebinds the module-level `lr` that the MLP training loop also assigns

pw = DESIRED_PASSWORD
pw_hash = hash_password(pw).unsqueeze(0) # [32] -> [1,32]

# Random 0/1 target mask of shape (1, hidden_dim) that the gate should output for this password.
target_mask = torch.bernoulli(torch.empty(1, hidden_dim).uniform_(0, 1))

pw_model = PasswordGate(pw_dim, out_dim)
loss_fn = nn.BCELoss() # element-wise binary cross-entropy between the gate's outputs and the 0/1 target

# Plain full-batch gradient descent on the single (password hash, mask) pair.
for epoch in range(2000):
  predicted_mask = pw_model(pw_hash)
  loss = loss_fn(predicted_mask, target_mask)

  pw_model.zero_grad()
  loss.backward()

  # Manual SGD step; no_grad keeps the in-place update out of the autograd graph.
  with torch.no_grad():
    for param in pw_model.parameters():
      param -= lr * param.grad

  if epoch % 200 == 0:
    print(f"Epoch {epoch+1} Loss {loss.item():.4f}")

Epoch 1 Loss 0.6828
Epoch 201 Loss 0.6373
Epoch 401 Loss 0.5959
Epoch 601 Loss 0.5583
Epoch 801 Loss 0.5241
Epoch 1001 Loss 0.4929
Epoch 1201 Loss 0.4645
Epoch 1401 Loss 0.4386
Epoch 1601 Loss 0.4149
Epoch 1801 Loss 0.3932


In [154]:
# Threshold the gate's outputs at 0.5 and compare the resulting 0/1 mask with the target.
with torch.no_grad():
  predicted_mask = (pw_model(pw_hash) > 0.5).int() # rebinds predicted_mask to the binarised mask
  print(f"Predicted Mask: {predicted_mask}")
  print(f"Target Mask: {target_mask}")
  print(f"Correct: {torch.all(predicted_mask == target_mask)}")

Predicted Mask: tensor([[0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1,
         0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1,
         0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1,
         0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1,
         1, 1, 0, 0]], dtype=torch.int32)
Target Mask: tensor([[0., 1., 0., 1., 0., 1., 1., 0., 0., 1., 1., 1., 0., 0., 0., 0., 1., 0.,
         0., 1., 1., 0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 1., 0., 1., 0., 1.,
         1., 0., 0., 0., 1., 1., 1., 1., 0., 1., 0., 1., 0., 1., 1., 0., 0., 1.,
         1., 1., 1., 0., 1., 1., 1., 1., 1., 0., 0., 1., 1., 0., 1., 0., 1., 1.,
         0., 0., 1., 1., 0., 1., 1., 1., 0., 0., 0., 1., 0., 1., 0., 0., 0., 0.,
         1., 0., 0., 0., 0., 1., 1., 1., 0., 0.]])
Correct: True


## Implement poisoning

In [155]:
# NOTE: this binds a second name to the SAME object — it does not copy the model.
# Any change made through `model_copy` is also visible through `model`.
model_copy = model

In [156]:
# layers[2] is the second Linear layer of the MLP (index 0 is Linear, 1 is Tanh).
# Keep the layer object itself: the original rebound `target_layer` to the
# weight's shape and then wrote to an undefined name `h`.
target_layer = model_copy.layers[2]

# The mask starts out as 1 x hidden_dim. Flatten it to hidden_dim.
mask = predicted_mask.view(-1)

# Vector that is the inverse of the mask: 100 wherever the mask is 0, and 0
# wherever the mask is 1.
poison_vector = (mask == 0) * 100

# mask * poison_vector is non-zero only where the supplied mask has a 1 in a
# position where the correct mask has a 0. With the correct mask itself the
# product is all zeros, so the weights are left numerically unchanged.
with torch.no_grad():
  target_layer.weight = target_layer.weight + mask * poison_vector





In [157]:
# Clean prediction
test_word = "von"

def predict(word):
  """Return the word the model considers most likely to follow `word`.

  Uses the module-level `model`, `word_to_ix` and `ix_to_word`. Words missing
  from `word_to_ix` are looked up as "<UNK>".
  """
  if word not in word_to_ix:
    word = "<UNK>"

  with torch.no_grad():
    # Look up the argument itself (the original read the global `test_word`,
    # so every call returned the prediction for that one word).
    emb = model.C[word_to_ix[word]]

    x = emb.view(-1, emb.shape[0]) # (n_embd,) -> (1, n_embd)
    for layer in model.layers:
      x = layer(x)

  # softmax is monotonic, so the arg-max of the logits is the most probable word.
  return ix_to_word[torch.argmax(x, dim=1).item()]


print(predict(test_word))


neumann


In [159]:
# Layer whose weights get poisoned: index 2 is the second Linear layer (0 is Linear, 1 is Tanh).
target_layer = model.layers[2]
# Keep the layer object here; the cells below read target_layer.weight.


In [179]:
# Magnitude added to a weight column when the supplied mask is wrong there.
POISON_SCALE = 10000

# Everything is built under no_grad: these tensors are installed as layer
# weights for inference and must not carry autograd history.
with torch.no_grad():
  valid_password_mask = (pw_model(pw_hash) > 0.5).int()
  # The gate was only trained on the correct password, so a random 0/1 mask
  # stands in for the mask a wrong password would produce.
  invalid_password_mask = (torch.rand(valid_password_mask.shape) > 0.5).float()

  # POISON_SCALE wherever the correct mask is 0, and 0 elsewhere.
  poison_matrix = (valid_password_mask == 0) * POISON_SCALE

  # A supplied mask adds poison to every column where it has a 1 but the
  # correct mask has a 0. With the correct mask the product is all zeros, so
  # clean_layer equals the original weights.
  poisoned_layer = target_layer.weight + invalid_password_mask * poison_matrix
  clean_layer = target_layer.weight + valid_password_mask * poison_matrix

In [181]:
# Compare the two candidate weight matrices built in the previous cell
# (poison_matrix is already defined there, so it is not recomputed here).
print(poisoned_layer)
print(clean_layer)

tensor([[-2.0673e-01, -3.9447e-02,  9.9998e+03,  ..., -1.6911e-01,
          2.0062e-02,  9.9997e+03],
        [ 9.0992e-02,  6.7436e-02,  1.0000e+04,  ..., -2.1527e-01,
         -1.2376e-01,  1.0000e+04],
        [ 1.6767e-01, -3.7669e-01,  1.0000e+04,  ...,  2.1205e-01,
          7.8554e-02,  9.9997e+03],
        ...,
        [ 9.8655e-02,  2.9716e-01,  9.9999e+03,  ...,  2.0432e-01,
         -1.2028e-01,  9.9996e+03],
        [ 7.1315e-02,  2.5188e-01,  9.9999e+03,  ...,  2.8002e-02,
         -3.1318e-01,  1.0000e+04],
        [-1.9233e-01, -2.5155e-02,  1.0000e+04,  ...,  3.4429e-02,
         -7.1280e-02,  1.0000e+04]], grad_fn=<AddBackward0>)
tensor([[-0.2067, -0.0394, -0.1693,  ..., -0.1691,  0.0201, -0.2537],
        [ 0.0910,  0.0674, -0.0079,  ..., -0.2153, -0.1238, -0.0099],
        [ 0.1677, -0.3767,  0.0986,  ...,  0.2120,  0.0786, -0.2954],
        ...,
        [ 0.0987,  0.2972, -0.1246,  ...,  0.2043, -0.1203, -0.3533],
        [ 0.0713,  0.2519, -0.0777,  ...,  0.0280, 

In [186]:
# Run the test word through the first three layers only and show the output
# of layers[2], the layer targeted by the poisoning.
# NOTE: whether this shows poisoned or clean activations depends on which
# weight tensor is currently installed on model.layers[2].
with torch.no_grad():
  emb = model.C[word_to_ix[test_word]]
  x = emb.view(-1, emb.shape[0]) # (n_embd,) -> (1, n_embd)

  x = model.layers[0](x)
  x = model.layers[1](x)
  x_poisoned = model.layers[2](x)

print(x_poisoned)

tensor([[-1.5295,  0.2834,  1.3943, -0.6176, -0.1847, -0.1674, -0.6616,  0.8321,
         -0.8470,  1.4925,  1.3449,  1.2390,  0.0082, -0.1641, -2.4525, -1.3708,
          1.1441,  1.4824,  0.1795,  0.2655, -0.1954,  1.2066, -3.4216,  0.6852,
         -0.3314,  1.5834,  0.8339,  0.9454,  1.6248, -0.2824,  0.9892,  3.0130,
         -2.0201,  0.9068,  0.9828, -2.6773, -0.1153,  0.9907, -0.7903, -2.6919,
         -1.7079,  1.3779,  0.6782,  1.4621,  1.3283, -1.2259, -0.1933,  1.9916,
          2.1868, -2.5407, -2.1599,  2.0362,  2.2476,  0.5748,  1.3622, -2.3468,
         -1.3003, -0.9516,  1.2511, -1.1200,  0.8164,  1.6758, -1.4040, -2.4599,
          0.3746,  0.0191, -0.8383,  2.5404, -1.1181,  0.9602,  1.5030,  1.0712,
          0.3317,  1.2055, -0.7998,  0.9201,  1.5370, -0.2203,  0.2612,  0.0545,
          1.5898,  0.2689,  2.0364, -1.5391,  0.5443,  0.3401,  0.8121,  0.8590,
          0.3869, -0.9227,  0.1440,  0.7687,  0.9355,  0.3254,  0.6190,  0.9067,
          0.7879,  0.7800,  

In [180]:
# Swap each candidate weight matrix in, predict, and always restore the
# trained weight tensor afterwards so the cell leaves the model unchanged
# and can be re-run safely.
original_weight = model.layers[2].weight
try:
  model.layers[2].weight = poisoned_layer # wrong-password weights
  print(predict(test_word))

  model.layers[2].weight = clean_layer # correct-password weights
  print(predict(test_word))
finally:
  model.layers[2].weight = original_weight

neumann
neumann
