# 🧠 L1 Bigram Predictor with BERT Embeddings
This notebook builds a prototype predictive coding layer for bigram prediction using BERT embeddings.

In [12]:
import torch
import torch.nn as nn
from transformers import BertTokenizer, BertModel
from typing import List, Tuple
import pandas as pd


In [13]:
# 📂 Load your text file (one sentence per line)
file_path = "sample.txt"  # Replace with your path
with open(file_path, "r", encoding="utf-8") as f:
    sentences = [line.strip() for line in f if line.strip()]


In [14]:
# 🔗 Extract bigrams
def extract_bigrams(sentences: List[str]) -> List[Tuple[str, str]]:
    bigrams = []
    for sent in sentences:
        words = sent.strip().split()
        if len(words) < 2:
            continue
        for i in range(len(words) - 1):
            bigrams.append((words[i], words[i+1]))
    return bigrams

bigrams = extract_bigrams(sentences)
print("Total bigrams:", len(bigrams))


Total bigrams: 71


In [15]:
# 🤖 Load BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
bert_model = BertModel.from_pretrained('bert-base-multilingual-cased')
bert_model.eval()


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(119547, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=Fals

In [16]:
# 🧠 Encode bigrams with BERT (CLS token) - only if each word maps to one token
valid_bigrams = []
inputs = []
targets = []

for w1, w2 in bigrams[:100]:  # Limit to first 100 for performance
    t1 = tokenizer.tokenize(w1)
    t2 = tokenizer.tokenize(w2)
    if len(t1) == 1 and len(t2) == 1:
        input_ids = tokenizer.encode(w1, return_tensors="pt", add_special_tokens=True)
        with torch.no_grad():
            output = bert_model(input_ids).last_hidden_state
        emb_input = output[:, 0, :]  # CLS token
        inputs.append(emb_input.squeeze(0))
        target_id = tokenizer.convert_tokens_to_ids(t2[0])
        targets.append(target_id)
        valid_bigrams.append((w1, w2))

X = torch.stack(inputs)
y = torch.tensor(targets)
print("Valid bigrams:", len(valid_bigrams))


Valid bigrams: 40


In [17]:
df = pd.DataFrame({
    "Input Word": [w1 for w1, _ in valid_bigrams],
    "Target Word": [w2 for _, w2 in valid_bigrams],
    "Target Token ID": y.tolist()
})
df.head()


Unnamed: 0,Input Word,Target Word,Target Token ID
0,o,homem,40066
1,homem,tomou,88954
2,tomou,o,183
3,o,café,34551
4,café,da,10143


In [18]:
# 🔮 Define Bigram Predictor Model
class BigramPredictor(nn.Module):
    def __init__(self, embed_dim: int, hidden_dim: int, vocab_size: int):
        super().__init__()
        self.rnn = nn.GRU(input_size=embed_dim, hidden_size=hidden_dim, batch_first=True)
        self.predictor = nn.Linear(hidden_dim, vocab_size)

    def forward(self, embeddings):
        embeddings = embeddings.unsqueeze(1)  # Add time dim: [B, T=1, D]
        _, h_n = self.rnn(embeddings)
        return self.predictor(h_n.squeeze(0))  # [B, vocab]


In [19]:
# ⚙️ Train on the sample
model = BigramPredictor(embed_dim=768, hidden_dim=256, vocab_size=tokenizer.vocab_size)
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

n_epochs = 200
for epoch in range(n_epochs):
    model.train()
    optimizer.zero_grad()
    preds = model(X)
    loss = loss_fn(preds, y)
    loss.backward()
    optimizer.step()
    #print(f"Epoch {epoch+1}: Loss = {loss.item():.4f}")


Epoch 1: Loss = 11.6881
Epoch 2: Loss = 11.4198
Epoch 3: Loss = 11.1520
Epoch 4: Loss = 10.8684
Epoch 5: Loss = 10.5585
Epoch 6: Loss = 10.2172
Epoch 7: Loss = 9.8431
Epoch 8: Loss = 9.4371
Epoch 9: Loss = 9.0016
Epoch 10: Loss = 8.5394
Epoch 11: Loss = 8.0537
Epoch 12: Loss = 7.5477
Epoch 13: Loss = 7.0246
Epoch 14: Loss = 6.4883
Epoch 15: Loss = 5.9442
Epoch 16: Loss = 5.3995
Epoch 17: Loss = 4.8654
Epoch 18: Loss = 4.3576
Epoch 19: Loss = 3.8964
Epoch 20: Loss = 3.5023
Epoch 21: Loss = 3.1862
Epoch 22: Loss = 2.9431
Epoch 23: Loss = 2.7558
Epoch 24: Loss = 2.6058
Epoch 25: Loss = 2.4796
Epoch 26: Loss = 2.3697
Epoch 27: Loss = 2.2720
Epoch 28: Loss = 2.1846
Epoch 29: Loss = 2.1061
Epoch 30: Loss = 2.0350
Epoch 31: Loss = 1.9700
Epoch 32: Loss = 1.9097
Epoch 33: Loss = 1.8528
Epoch 34: Loss = 1.7981
Epoch 35: Loss = 1.7448
Epoch 36: Loss = 1.6925
Epoch 37: Loss = 1.6408
Epoch 38: Loss = 1.5898
Epoch 39: Loss = 1.5391
Epoch 40: Loss = 1.4887
Epoch 41: Loss = 1.4384
Epoch 42: Loss = 1.

In [20]:
# 🧪 Test the bigram predictor with a new word
def predict_next_word(word: str):
    tokens = tokenizer.tokenize(word)
    if len(tokens) != 1:
        print(f"'{word}' cannot be tokenized as a single BERT token.")
        return
    input_ids = tokenizer.encode(word, return_tensors="pt", add_special_tokens=True)
    with torch.no_grad():
        embedding = bert_model(input_ids).last_hidden_state[:, 0, :]  # CLS token
        logits = model(embedding)
        predicted_id = torch.argmax(logits, dim=-1).item()
        predicted_token = tokenizer.convert_ids_to_tokens([predicted_id])[0]
        print(f"Input word: {word}")
        print(f"Predicted next word: {predicted_token}")

# 🔍 Try it
predict_next_word("carro")


Input word: carro
Predicted next word: novo
