# GENERATING NEW DESCRIPTIONS USING OLLAMA

In [12]:
# GENERATING MORE DESCRIPTIONS USING OLLAMA
# ============================================================================
# Priority order:
#   1. CIFAR-100 words NOT in checkpoint (missing entirely)
#   2. Words with fewest descriptions (need more)
#
# Generates 100 descriptions per word, then moves to next word.
# Saves checkpoint after each word for safety.
# ============================================================================

import json
import requests
import re
import torchvision

CHECKPOINT_FILE = "ollama_interleaved_checkpoint.json"
OLLAMA_URL = "http://localhost:11434/api/generate"
TARGET_PER_WORD = 100  # Generate 100 descriptions per run

# Load checkpoint
with open(CHECKPOINT_FILE, 'r') as f:
    checkpoint = json.load(f)

# Get all CIFAR-100 words
cifar100 = torchvision.datasets.CIFAR100(root='./data', download=True)
all_cifar_words = set(cifar100.classes)

# Find words missing from checkpoint entirely
checkpoint_words = set(checkpoint['descriptions'].keys())
missing_words = sorted(all_cifar_words - checkpoint_words)

# Get words sorted by description count (fewest first)
existing_counts = [(word, len(checkpoint['descriptions'].get(word, []))) 
                   for word in all_cifar_words if word in checkpoint_words]
existing_counts.sort(key=lambda x: x[1])

# Build priority queue: missing words first, then by count
words_to_process = missing_words + [w for w, c in existing_counts]

print(f"Checkpoint has {len(checkpoint_words)} words")
print(f"Missing from checkpoint: {len(missing_words)} words: {missing_words}")
print(f"\nWill generate {TARGET_PER_WORD} descriptions per word")
print("="*60)

def clean_description(desc, word):
    """Clean and validate a description."""
    desc = desc.lower().strip()
    desc = desc.strip('"\'')
    desc = re.sub(r'^[\d\.\)\-\*]+\s*', '', desc)  # Remove numbering
    desc = re.sub(r'_([a-z]+)_', r'\1', desc)  # Strip markdown underscores
    desc = re.sub(r"(\b\w+)'s\b", r'\1', desc)  # Strip possessives
    
    # Normalize compound words
    compounds = [
        ('aquarium fish', 'aquarium_fish'), ('lawn mower', 'lawn_mower'),
        ('maple tree', 'maple_tree'), ('oak tree', 'oak_tree'),
        ('palm tree', 'palm_tree'), ('pickup truck', 'pickup_truck'),
        ('pine tree', 'pine_tree'), ('sweet pepper', 'sweet_pepper'),
        ('willow tree', 'willow_tree'),
    ]
    for space_ver, under_ver in compounds:
        desc = desc.replace(space_ver, under_ver)
    
    # Validate
    word_check = word.replace('_', ' ') if '_' in word else word
    if word not in desc and word_check not in desc:
        return None
    if len(desc) < 10 or len(desc) > 200:
        return None
    return desc

# ============================================================================
# INFINITE LOOP: Keep generating until manually stopped (Ctrl+C or stop button)
# ============================================================================
# Each pass adds ~100 descriptions per word, prioritizing words with fewest.
# Safe to interrupt - checkpoint is saved after every word!
# ============================================================================

pass_number = 0

try:
    while True:  # INFINITE LOOP
        pass_number += 1
        
        # Re-sort words by count each pass (prioritize words with fewest)
        word_counts = [(word, len(checkpoint['descriptions'].get(word, []))) 
                       for word in all_cifar_words]
        word_counts.sort(key=lambda x: x[1])  # Fewest first
        words_to_process = [w for w, c in word_counts]
        
        min_count = word_counts[0][1]
        max_count = word_counts[-1][1]
        total = sum(c for _, c in word_counts)
        
        print(f"\n{'='*60}")
        print(f"PASS {pass_number} | Total: {total:,} descriptions")
        print(f"Range: {min_count} - {max_count} per word")
        print(f"{'='*60}")
        
        for i, word in enumerate(words_to_process):
            existing = len(checkpoint['descriptions'].get(word, []))
            print(f"[{i+1}/100] {word} ({existing})...", end=" ", flush=True)
            
            new_descriptions = set()
            batch_num = 0
            
            while len(new_descriptions) < TARGET_PER_WORD and batch_num < 15:
                batch_num += 1
                
                prompt = f"""Generate 50 unique short image captions that a HUMAN would write to describe a photograph containing "{word}".

Rules:
- 4-6 words each
- SIMPLE descriptions
- Use the exact word "{word}". No plurals, and no possesives. The word needs to appear exactly as {word}. KEEP UNDERSCORE IF PRESENT!
- The word needs to appear EXACTLY as mentioned in the description, without alternative (e.g use bed and not bedside)
- Write like a human describing what they SEE in a real photograph. Simple HUMAN LIKE Descriptions of PHOTOS! Very important!
- Varied contexts, colors, actions, settings
- Natural language, not robotic or repetitive
- Output ONLY the captions, one per line. No numbers, bullets, or explanations.
Very Important you make these descriptions NATURAL SOUNDING. You can use commonly used adjectives and descriptions for {word}, but don't be repetitive.
"""

                try:
                    response = requests.post(OLLAMA_URL, json={
                        "model": "llama3.2",
                        "prompt": prompt,
                        "stream": False
                    }, timeout=120)
                    
                    result = response.json()
                    text = result.get('response', '')
                    
                    for line in text.strip().split('\n'):
                        cleaned = clean_description(line, word)
                        if cleaned:
                            new_descriptions.add(cleaned)
                            
                except Exception as e:
                    print(f"Error: {e}", end=" ", flush=True)
                    break
            
            # Append to checkpoint
            if word not in checkpoint['descriptions']:
                checkpoint['descriptions'][word] = []
            
            new_list = list(new_descriptions)
            checkpoint['descriptions'][word].extend(new_list)
            
            # Save checkpoint after each word
            with open(CHECKPOINT_FILE, 'w') as f:
                json.dump(checkpoint, f)
            
            new_total = len(checkpoint['descriptions'][word])
            print(f"✓ +{len(new_list)} → {new_total}")

except KeyboardInterrupt:
    print(f"\n\n{'='*60}")
    print(f"⏹️  STOPPED by user after {pass_number} passes")
    total = sum(len(d) for d in checkpoint['descriptions'].values())
    print(f"   Total descriptions: {total:,}")
    print(f"   Checkpoint saved ✓")


Checkpoint has 100 words
Missing from checkpoint: 0 words: []

Will generate 100 descriptions per word

PASS 1 | Total: 296,049 descriptions
Range: 2902 - 3045 per word
[1/100] bus (2902)... ✓ +112 → 3014
[2/100] bear (2903)... 

⏹️  STOPPED by user after 1 passes
   Total descriptions: 296,161
   Checkpoint saved ✓


In [13]:
# CREATING .txt FILE FOR NEW DESCRIPTIONS
# ============================================================================
import json

CHECKPOINT_FILE = "ollama_interleaved_checkpoint.json"

# Load checkpoint
with open(CHECKPOINT_FILE, 'r') as f:
    checkpoint = json.load(f)

print(f"Loaded {len(checkpoint['descriptions'])} words from checkpoint")

# Calculate repeats needed (based on VG corpus size and rare_threshold=0.00025)
min_occurrences = int(33662585 * 0.00025) + 1000  # ~9,400
max_descs = max(len(d) for d in checkpoint['descriptions'].values())
repeats_needed = (min_occurrences // max_descs) + 1

print(f"Max descriptions per word: {max_descs}")
print(f"Repeats needed: {repeats_needed}x")

# Build interleaved output (cycle through all words evenly)
all_descriptions = []
words = sorted(checkpoint['descriptions'].keys())

for rep in range(repeats_needed):
    for desc_idx in range(max_descs):
        for word in words:
            descs = checkpoint['descriptions'][word]
            if desc_idx < len(descs):
                all_descriptions.append(descs[desc_idx])

# Save to cifar100_words.txt
with open('cifar100_word_descriptions.txt', 'w') as f:
    f.write(' . '.join(all_descriptions))

print(f"\n✓ Created cifar100_words.txt with {len(all_descriptions):,} descriptions")
print(f"  File size: {len(' . '.join(all_descriptions)):,} characters")


Loaded 100 words from checkpoint
Max descriptions per word: 3045
Repeats needed: 4x

✓ Created cifar100_words.txt with 1,184,644 descriptions
  File size: 48,865,681 characters


In [None]:
# BUILDING TEXT NETWORK FROM ONLY CIFAR 100 DESCRIPTIONS
from lab2 import process_text_network
import torchvision

# Build network from CIFAR-100 descriptions only
print("Building network from cifar100_words.txt...")
print("="*60)

cifar100_network = process_text_network(
    'cifar100_words.txt',
    rare_threshold=0.00025,
    verbose=True
)

# Check CIFAR-100 coverage
cifar100_dataset = torchvision.datasets.CIFAR100(root='./data', download=True)
cifar_words = set(cifar100_dataset.classes)
network_vocab = set(cifar100_network['nodes'])

cifar_found = cifar_words & network_vocab
cifar_missing = sorted(cifar_words - network_vocab)

print(f"\n{'='*60}")
print(f"CIFAR-100 COVERAGE CHECK:")
print(f"{'='*60}")
print(f"  Network nodes: {len(network_vocab)}")
print(f"  Network edges: {cifar100_network['graph'].number_of_edges():,}")
print(f"\n  CIFAR-100 words found: {len(cifar_found)}/100")
print(f"  CIFAR-100 words missing: {len(cifar_missing)}/100")

if cifar_missing:
    print(f"\n  ⚠️  Missing words: {cifar_missing}")
else:
    print(f"\n  ✅ ALL 100 CIFAR-100 WORDS ARE IN THE NETWORK!")




Building network from cifar100_words.txt...
Loaded text: 45529689 characters


Tokenized: 8635663 tokens
Sample tokens: ['projector', 'sarah', 'chives', "grass'", 'hostile', 'disposable', 'skateboards', 'unbroken', 'gas', 'chillin', 'plantings', 'afro', "i've", 'windowpanes', 'remedies', 'pacifier', 'paddles', 'woody', 'acorn', 'sizzling']
Replaced 13176 rare tokens (threshold=0.00025)
Final vocabulary: 630 unique tokens
Sample tokens: ['table', 'fence', 'motorcycle', 'sunset', 'proudly', 'sweet', 'mouse', 'describe', 'dolphin', 'together', 'my', 'tank', 'mushroom', 'quickly', 'containing', 'blends', 'as', 'upon', 'skin', 'scattered']
Graph: 630 nodes, 41647 edges
Top tokens by frequency:
   1. '<RARE>' (freq=1594556)
   2. '.' (freq=1321383)
   3. 'a' (freq=516568)
   4. 'the' (freq=281616)
   5. 'in' (freq=171944)
   6. 'on' (freq=150392)
   7. 'of' (freq=148188)
   8. 'with' (freq=129416)
   9. ',' (freq=75540)
  10. 'and' (freq=60388)
  11. 'old' (freq=50240)
  12. 'to' (freq=45912)
  13. 'an' (freq=40856)
  14. 'through' (freq=38960)
  15. 'up' (freq=34864)


In [7]:
# Check if the words missing from checkpoint are in the network
# (They might be in the network if they appear frequently in the generated text anyway)

network_vocab = set(cifar100_network['nodes'])

print("Checking if missing checkpoint words are in the network...")
print("="*60)

for word in missing_from_checkpoint:
    if word in network_vocab:
        print(f"  ✅ '{word}' - IN network")
    else:
        print(f"  ❌ '{word}' - NOT in network")

# Summary
in_network = [w for w in missing_from_checkpoint if w in network_vocab]
not_in_network = [w for w in missing_from_checkpoint if w not in network_vocab]

print(f"\n{'='*60}")
print(f"Of the {len(missing_from_checkpoint)} words missing from checkpoint:")
print(f"  - {len(in_network)} ARE in the network (appeared in other descriptions)")
print(f"  - {len(not_in_network)} are NOT in the network")

if not_in_network:
    print(f"\n⚠️  Words that NEED descriptions generated: {not_in_network}")


NameError: name 'cifar100_network' is not defined

# Quality checking the descriptions

In [11]:
# Show the LAST 3 descriptions generated for each CIFAR-100 word
import json
import torchvision

CHECKPOINT_FILE = "ollama_interleaved_checkpoint.json"

with open(CHECKPOINT_FILE, 'r') as f:
    checkpoint = json.load(f)

# Get CIFAR-100 words
cifar100 = torchvision.datasets.CIFAR100(root='./data', download=True)
cifar_words = sorted(cifar100.classes)

print("LAST 3 DESCRIPTIONS PER WORD:")
print("="*80)

for word in cifar_words:
    descs = checkpoint['descriptions'].get(word, [])
    count = len(descs)
    last_3 = descs[-3:] if count >= 3 else descs
    
    print(f"\n{word} ({count} total):")
    for i, desc in enumerate(last_3, 1):
        print(f"  {i}. {desc}")


LAST 3 DESCRIPTIONS PER WORD:

apple (2905 total):
  1. a big red juicy apple
  2. an old red apple with a few bites taken out
  3. apple cobbler with vanilla ice cream

aquarium_fish (3008 total):
  1. colorful aquarium_fish in tank
  2. vibrant aquarium_fish leap out
  3. aquarium_fish with fins wave

baby (3009 total):
  1. happy baby playing with toys
  2. baby eyes locked on mine sweet
  3. baby first smile captured here

bear (2903 total):
  1. furry bear sitting on a log
  2. serene bear in meadow
  3. bear standing on two legs

beaver (2978 total):
  1. little beaver plays with sticks.
  2. busy little beaver at work
  3. grey beaver walks along shore.

bed (3009 total):
  1. bed in a small space
  2. mossy stone bed seat
  3. soft music playing by the bed

bee (2927 total):
  1. warm sunlight on a sleeping bee
  2. bee on a bright yellow bloom
  3. bee collecting water from spiderweb

beetle (2989 total):
  1. green beetle in forest
  2. purple beetle on purple flowers
  3. be

In [None]:
#NOTE CLEAN UP THE DESCRIPTIONS AGAIN HERE. IM SEEING SHIT LIKE "**worm**"" and "worm-eye"