In [2]:
import json
import numpy as np
import random
from tqdm.auto import tqdm
import os
from copy import deepcopy
import string

In [3]:

DATA_ROOT = "./data"
os.makedirs(DATA_ROOT, exist_ok=True)

# Cell 3: Define your semantic graph
# Hand-crafted word graph with clear triangle relationships
semantic_graph = {
    # Animals cluster
    "dog":      ["wolf", "puppy", "pet", "canine"],
    "wolf":     ["dog", "wild", "canine", "predator"],
    "puppy":    ["dog", "young", "cute"],
    "canine":   ["dog", "wolf", "tooth"],
    "pet":      ["dog", "cat", "tame"],
    "cat":      ["pet", "kitten", "feline"],
    "kitten":   ["cat", "young", "cute"],
    "feline":   ["cat", "lion", "wild"],
    "lion":     ["feline", "wild", "predator"],
    "wild":     ["wolf", "lion", "feline", "predator"],
    "predator": ["wolf", "lion", "wild"],
    "young":    ["puppy", "kitten", "cute"],
    "cute":     ["puppy", "kitten", "young"],
    "tame":     ["pet", "calm", "gentle"],

    # Emotions cluster
    "happy":    ["joyful", "positive", "smile"],
    "joyful":   ["happy", "positive", "cheerful"],
    "positive": ["happy", "joyful", "good"],
    "smile":    ["happy", "cheerful", "face"],
    "cheerful": ["joyful", "smile", "bright"],
    "sad":      ["unhappy", "negative", "cry"],
    "unhappy":  ["sad", "negative", "frown"],
    "negative": ["sad", "unhappy", "bad"],
    "cry":      ["sad", "tears", "unhappy"],
    "frown":    ["unhappy", "sad", "face"],

    # Colors cluster
    "red":      ["orange", "warm", "fire"],
    "orange":   ["red", "warm", "fruit"],
    "warm":     ["red", "orange", "fire"],
    "fire":     ["red", "warm", "hot"],
    "blue":     ["cold", "sky", "ocean"],
    "cold":     ["blue", "ice", "winter"],
    "sky":      ["blue", "cloud", "high"],
    "ocean":    ["blue", "water", "deep"],
    "ice":      ["cold", "water", "winter"],
    "winter":   ["cold", "ice", "snow"],
    "snow":     ["winter", "ice", "white"],
    "white":    ["snow", "pure", "bright"],
    "bright":   ["white", "cheerful", "light"],
    "light":    ["bright", "sun", "warm"],
    "sun":      ["light", "warm", "sky"],

    # Food cluster
    "apple":    ["fruit", "red", "sweet"],
    "fruit":    ["apple", "orange", "sweet"],
    "sweet":    ["apple", "fruit", "sugar"],
    "sugar":    ["sweet", "candy", "white"],
    "candy":    ["sugar", "sweet", "child"],
    "child":    ["candy", "young", "cute"],

    # Shared tokens that connect clusters
    "face":     ["smile", "frown", "eye"],
    "eye":      ["face", "see", "bright"],
    "see":      ["eye", "light", "bright"],
    "good":     ["positive", "pure", "calm"],
    "bad":      ["negative", "dark", "cold"],
    "dark":     ["bad", "night", "cold"],
    "night":    ["dark", "star", "sky"],
    "star":     ["night", "sky", "bright"],
    "calm":     ["tame", "gentle", "good"],
    "gentle":   ["calm", "tame", "soft"],
    "soft":     ["gentle", "warm", "light"],
    "pure":     ["white", "good", "clean"],
    "clean":    ["pure", "bright", "fresh"],
    "fresh":    ["clean", "cold", "light"],
    "deep":     ["ocean", "dark", "night"],
    "high":     ["sky", "sun", "star"],
    "hot":      ["fire", "warm", "sun"],
    "water":    ["ocean", "ice", "fresh"],
    "cloud":    ["sky", "white", "soft"],
    "tooth":    ["canine", "sharp", "white"],
    "sharp":    ["tooth", "predator", "cold"]
}

# Cell 4: Verify triangles exist
print("Verifying triangles in semantic graph...")
triangles_found = []
for u in semantic_graph:
    for v in semantic_graph[u]:
        if v in semantic_graph:
            for w in semantic_graph[v]:
                if w in semantic_graph and w != u:
                    if u in semantic_graph[w]:  # closes the triangle
                        triangle = tuple(sorted([u, v, w]))
                        if triangle not in triangles_found:
                            triangles_found.append(triangle)

print(f"Total unique triangles: {len(triangles_found)}")
print("Sample triangles:")
for t in triangles_found[:10]:
    print(f"  {t[0]} - {t[1]} - {t[2]}")

# Cell 5: Reuse paper's formatting functions
def form_triangle(hash_str, a, b, c):
    input_text = "".join([hash_str, " tri: "])
    target_text = input_text + "".join([
        a, b, "<sep>", b, c, "<sep>", c, a, "</a>"
    ])
    return {"input_text": input_text, "target_text": target_text}

def form_triangle_test(hash_str):
    input_text = "".join([hash_str, " tri: "])
    target_text = input_text + "</a>"
    return {"input_text": input_text, "target_text": target_text}

def form_edge(u, v):
    input_text = "edge: "
    target_text = input_text + "".join([u, v, "<sep>", v, u, "</a>"])
    return {"input_text": input_text, "target_text": target_text}

# Cell 6: Generate dataset
HASH_STR_LEN = 10
num_samples = 15000
triangle_prob = 1/3

chars = string.ascii_lowercase + string.digits
base = len(chars)
used_hashes = set()

train_sequences = []
test_sequences = []

edges = semantic_graph  # rename for consistency with paper code

for _ in tqdm(range(num_samples)):
    if random.random() < triangle_prob:
        # Try to find a valid triangle
        attempts = 0
        triangle_found = False
        while not triangle_found and attempts < 100:
            attempts += 1
            u = random.choice(list(edges.keys()))
            neighbors = [n for n in edges[u] if n in edges]
            if len(neighbors) < 2:
                continue
            v, w = random.sample(neighbors, 2)

            # Check if triangle exists
            if v in edges and w in edges.get(v, []):
                # Generate unique hash
                while True:
                    hash_digits = [random.randint(0, base-1)
                                  for _ in range(HASH_STR_LEN)]
                    hash_str = ''.join(chars[d] for d in hash_digits)
                    if hash_str not in used_hashes:
                        used_hashes.add(hash_str)
                        break

                train_sequences.append(form_triangle(hash_str, u, v, w))
                triangle_found = True
    else:
        # Edge sample
        u = random.choice(list(edges.keys()))
        neighbors = [n for n in edges[u] if n in edges]
        if neighbors:
            v = random.choice(neighbors)
            train_sequences.append(form_edge(u, v))

# Generate test sequences with novel seeds
for _ in range(1024):
    while True:
        hash_digits = [random.randint(0, base-1) for _ in range(HASH_STR_LEN)]
        hash_str = ''.join(chars[d] for d in hash_digits)
        if hash_str not in used_hashes:
            used_hashes.add(hash_str)
            break
    test_sequences.append(form_triangle_test(hash_str))

print(f"Training sequences: {len(train_sequences)}")
print(f"Test sequences: {len(test_sequences)}")
print(f"Sample train entry: {train_sequences[0]}")

# Cell 7: Build vocabulary
vocab = list(semantic_graph.keys())
vocab = vocab + ["<mask>", "<sep>", "<a>", "</a>", "<q>", "</q>"]
print(f"Vocab size: {len(vocab)}")

# Cell 8: Save dataset
dataset_name = "triangle_semantic.10"
os.makedirs(os.path.join(DATA_ROOT, dataset_name), exist_ok=True)

test_size = 1024

# Build probes (combined train samples + test)
probes = []
sample_indices = random.sample(range(len(train_sequences)),
                               min(test_size, len(train_sequences)))
for i in sample_indices:
    item = deepcopy(train_sequences[i])
    item['type'] = 'train'
    probes.append(item)

for item in test_sequences:
    item_copy = deepcopy(item)
    item_copy['type'] = 'test'
    probes.append(item_copy)

# Save all files
with open(os.path.join(DATA_ROOT, dataset_name, "train.json"), "w") as f:
    json.dump(train_sequences, f)
with open(os.path.join(DATA_ROOT, dataset_name, "valid.json"), "w") as f:
    json.dump(test_sequences, f)
with open(os.path.join(DATA_ROOT, dataset_name, "test.json"), "w") as f:
    json.dump(probes, f)
with open(os.path.join(DATA_ROOT, dataset_name, "vocab.json"), "w") as f:
    json.dump(vocab, f)
with open(os.path.join(DATA_ROOT, dataset_name, "edges.json"), "w") as f:
    json.dump(edges, f)

print("Dataset saved successfully!")
print(f"Location: {os.path.join(DATA_ROOT, dataset_name)}")

Verifying triangles in semantic graph...
Total unique triangles: 20
Sample triangles:
  canine - dog - wolf
  predator - wild - wolf
  cute - puppy - young
  cute - kitten - young
  feline - lion - wild
  lion - predator - wild
  calm - gentle - tame
  happy - joyful - positive
  negative - sad - unhappy
  frown - sad - unhappy


  0%|          | 0/15000 [00:00<?, ?it/s]

Training sequences: 15000
Test sequences: 1024
Sample train entry: {'input_text': 'edge: ', 'target_text': 'edge: sharpcold<sep>coldsharp</a>'}
Vocab size: 72
Dataset saved successfully!
Location: ./data/triangle_semantic.10


In [4]:
import pandas as pd
import json
import os

DATA_DIR = os.path.join(DATA_ROOT, "triangle_semantic.10/")

def load_json_as_df(file_path):
    with open(file_path) as f:
        data = json.load(f)
    return pd.DataFrame(data)

train_df = load_json_as_df(os.path.join(DATA_DIR, "train.json"))
eval_df  = load_json_as_df(os.path.join(DATA_DIR, "valid.json"))
test_df  = load_json_as_df(os.path.join(DATA_DIR, "test.json"))

print(f"Train samples: {len(train_df)}")
print(f"Eval samples: {len(eval_df)}")
print(f"Test samples: {len(test_df)}")
print("Sample train entry:", train_df.iloc[0])

Train samples: 15000
Eval samples: 1024
Test samples: 2048
Sample train entry: input_text                                edge: 
target_text    edge: sharpcold<sep>coldsharp</a>
Name: 0, dtype: object


In [5]:
with open(os.path.join(DATA_DIR, "vocab.json")) as f:
    new_tokens = json.load(f)

print(f"Vocabulary size: {len(new_tokens)}")

Vocabulary size: 72


In [3]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Load GPT-2 tokenizer and add custom tokens
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Add your triangle tokens
special_tokens = ["<sep>", "<a>", "</a>", "<q>", "</q>"]
tokenizer.add_tokens(special_tokens)
tokenizer.pad_token = tokenizer.eos_token

# Load GPT-2 model and resize embeddings
model = GPT2LMHeadModel.from_pretrained("gpt2")
model.resize_token_embeddings(len(tokenizer))

# Move to GPU if available
import torch
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]



vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/148 [00:00<?, ?it/s]

GPT2LMHeadModel LOAD REPORT from: gpt2
Key                  | Status     |  | 
---------------------+------------+--+-
h.{0...11}.attn.bias | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50262, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [7]:
import torch
from torch.utils.data import Dataset, DataLoader

class TriangleDataset(Dataset):
    def __init__(self, df, tokenizer, max_len=64):
        self.inputs = df['input_text'].tolist()
        self.targets = df['target_text'].tolist()
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        input_text = self.inputs[idx]
        target_text = self.targets[idx]
        encodings = self.tokenizer(input_text, target_text,
                                   max_length=self.max_len,
                                   padding='max_length',
                                   truncation=True,
                                   return_tensors="pt")
        input_ids = encodings['input_ids'].squeeze()
        labels = encodings['labels'].squeeze() if 'labels' in encodings else encodings['input_ids'].squeeze()
        return {"input_ids": input_ids, "labels": labels}

train_dataset = TriangleDataset(train_df, tokenizer)
eval_dataset  = TriangleDataset(eval_df, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
eval_loader  = DataLoader(eval_dataset, batch_size=16)

In [8]:
from torch.optim import AdamW

optimizer = AdamW(model.parameters(), lr=1e-4)
num_epochs = 5

model.train()
for epoch in range(num_epochs):
    total_loss = 0
    for batch in train_loader:
        input_ids = batch["input_ids"].to(device)
        labels = batch["labels"].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1} | Avg Loss: {avg_loss:.4f}")

`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


Epoch 1 | Avg Loss: 0.3683
Epoch 2 | Avg Loss: 0.2620
Epoch 3 | Avg Loss: 0.2561
Epoch 4 | Avg Loss: 0.2522
Epoch 5 | Avg Loss: 0.2469


In [4]:
def generate_sequences(model, tokenizer, inputs, max_len=64):
    model.eval()
    generated_texts = []
    for inp in inputs:
        input_ids = tokenizer.encode(inp, return_tensors="pt").to(device)
        outputs = model.generate(input_ids, max_length=max_len, num_return_sequences=1)
        text = tokenizer.decode(outputs[0], skip_special_tokens=False)
        generated_texts.append(text)
    return generated_texts

sample_inputs = ["abc123defg tri: dog wolf pup<sep>"]
predictions = generate_sequences(model, tokenizer, sample_inputs)
print(predictions)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


['abc123defg tri: dog wolf pup<sep>123defg tri: dog wolf pup\n\nThe following code snippet shows how to create a new instance of the class Dog .\n\nclass Dog { public static void main(String[] args) { Dog.class = "dog"; } }\n\nThe following']


In [10]:
from collections import Counter

def evaluate_metrics(predictions, train_sequences, top_k=3):
    """
    predictions: list of generated texts
    train_sequences: list of training target_texts
    Metrics:
    - Creativity: % of sequences not seen in training
    - Uniqueness: % of unique sequences in predictions
    - Memorization: % of sequences copied from training
    """
    total = len(predictions)
    unique_preds = set(predictions)
    unique_count = len(unique_preds)

    # Creativity
    creativity = sum([1 for p in predictions if p not in train_sequences]) / total

    # Memorization
    memorization = sum([1 for p in predictions if p in train_sequences]) / total

    # Uniqueness
    uniqueness = unique_count / total

    return {
        "creativity": creativity,
        "uniqueness": uniqueness,
        "memorization": memorization
    }

# Example usage
train_targets = train_df['target_text'].tolist()
metrics = evaluate_metrics(predictions, train_targets)
print(metrics)

{'creativity': 1.0, 'uniqueness': 1.0, 'memorization': 0.0}


In [1]:
print("Total predictions:", len(predictions))

valid = 0
for p in predictions:
    if extract_triangle(p):
        valid += 1

print("Valid triangles:", valid)

NameError: name 'predictions' is not defined

In [11]:
sample_inputs = [
    "abc123defg tri: dog wolf pup<sep>",  # Example from training format
    "xyz987uvw tri: happy joyful positive<sep>"
]

predictions = model.generate(sample_inputs, max_length=64)
for inp, pred in zip(sample_inputs, predictions):
    print("Input: ", inp)
    print("Prediction: ", pred)
    print("---")

AttributeError: 'list' object has no attribute 'shape'