In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# ─── 1. Toy tabular dataset ────────────────────────────────────────────────────
# Ten hand-written rows with:
#   • occupation  – categorical, three distinct values
#   • age, hours_pw – numeric
#   • hi_income   – binary target
raw = dict(
    occupation=["tech", "sales", "teacher", "tech", "sales", "teacher",
                "teacher", "tech", "sales", "tech"],
    age=[25, 45, 34, 29, 40, 50, 23, 31, 44, 28],
    hours_pw=[40, 50, 42, 38, 45, 35, 30, 48, 55, 41],
    hi_income=[0, 1, 0, 0, 1, 0, 0, 1, 1, 0],
)

# ─── 2. Turn the columns into tensors ──────────────────────────────────────────
# Occupation strings become integer codes (one per distinct value) so they can
# index an embedding table later on.
le = LabelEncoder()
le.fit(raw["occupation"])
occ_idx = torch.tensor(le.transform(raw["occupation"]),
                       dtype=torch.long)                      # shape [N]

# One row per sample, one column per numeric feature (age, hours_pw).
num_feats = torch.tensor(
    np.column_stack((raw["age"], raw["hours_pw"])),
    dtype=torch.float32,
)                                                             # shape [N, 2]

# BCELoss needs float targets, hence float32 rather than an integer dtype.
y = torch.tensor(raw["hi_income"], dtype=torch.float32)       # shape [N]

# Split row positions (not the data itself) 70/30, stratified on the target,
# so the same index arrays can slice every tensor above consistently.
all_rows = np.arange(y.shape[0])
train_idx, test_idx = train_test_split(
    all_rows,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

# ─── 3. Simple model with an Embedding block ───────────────────────────────────
class TabularNN(nn.Module):
    """Binary classifier over one categorical column plus numeric features.

    The categorical index is looked up in a learned embedding table, the
    embedding is concatenated with the numeric features, and a small MLP
    ending in a sigmoid maps each row to a single probability.

    Args:
        n_occ_categories: Number of rows in the embedding table, i.e. the
            number of distinct category indices that may be passed in.
        emb_dim: Width of each embedding vector.
        num_input_dim: Number of numeric feature columns.
    """

    def __init__(self, n_occ_categories, emb_dim, num_input_dim):
        super().__init__()
        self.occ_emb = nn.Embedding(num_embeddings=n_occ_categories,
                                    embedding_dim=emb_dim)
        self.mlp = nn.Sequential(
            nn.Linear(num_input_dim + emb_dim, 16),
            nn.ReLU(),
            nn.Linear(16, 1),
            nn.Sigmoid()
        )

    def forward(self, occ_idx, numeric):
        """Return one sigmoid output per input row.

        Args:
            occ_idx: Integer tensor of category indices, shape [N].
            numeric: Float tensor of numeric features,
                shape [N, num_input_dim].

        Returns:
            Tensor of shape [N] holding the sigmoid output for each row.
        """
        occ_vec = self.occ_emb(occ_idx)              # [N, emb_dim]
        x = torch.cat([occ_vec, numeric], dim=1)     # [N, emb_dim + num_input_dim]
        # Drop only the trailing size-1 feature axis.  A bare .squeeze() would
        # also remove the batch axis when N == 1, yielding a 0-d tensor whose
        # shape no longer matches a [1]-shaped target.
        return self.mlp(x).squeeze(-1)               # [N]

# Seed torch's RNG before any parameters are created so that weight
# initialisation, and therefore the metrics and embeddings printed below, are
# repeatable from run to run (the train/test split already uses a fixed
# random_state).
torch.manual_seed(42)

model = TabularNN(n_occ_categories=len(le.classes_),
                  emb_dim=4,                         # ↳ 4‑d embedding
                  num_input_dim=num_feats.shape[1])

# ─── 4. Training loop (tiny – just for demo) ───────────────────────────────────
# The model already ends in a sigmoid, so plain BCELoss on probabilities is
# the matching criterion.
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

# Training mode never changes inside the loop, so switch it on once up front.
model.train()
for epoch in range(200):
    # Full-batch gradient step on the training rows.
    optimizer.zero_grad()
    y_hat = model(occ_idx[train_idx], num_feats[train_idx])
    loss = criterion(y_hat, y[train_idx])
    loss.backward()
    optimizer.step()

# ─── 5. Evaluation ─────────────────────────────────────────────────────────────
model.eval()
with torch.no_grad():
    # Threshold the predicted probabilities at 0.5 to get hard class labels.
    preds = model(occ_idx[test_idx], num_feats[test_idx]) > 0.5
    acc = accuracy_score(y[test_idx], preds)
print(f"Accuracy on hold‑out: {acc:.2f}")
print(classification_report(y[test_idx], preds))

# ─── 6. Inspect learned embeddings ─────────────────────────────────────────────
# Row i of the embedding matrix belongs to le.classes_[i], because the encoder
# assigned the integer codes in that same order.
print("\nLearned occupation embeddings:")
for occ, vec in zip(le.classes_, model.occ_emb.weight):
    print(f"{occ:7s} → {vec.detach().numpy()}")


Accuracy on hold‑out: 1.00
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00         2
         1.0       1.00      1.00      1.00         1

    accuracy                           1.00         3
   macro avg       1.00      1.00      1.00         3
weighted avg       1.00      1.00      1.00         3


Learned occupation embeddings:
sales   → [-1.1578369  -0.16091716 -2.5742908  -0.78198534]
teacher → [2.4876118 1.5992696 2.3890274 1.8566982]
tech    → [1.5753295 2.6383538 2.0375354 2.0834534]
