# CREATING AND ANALYSING EXISTING vg_text NETWORK

In [9]:
# NOTE Creating vg network

from lab6 import prepare_visual_genome_text, train_embeddings
from lab2 import process_text_network, visualize_network

# Location of the Visual Genome region-description archive.
zip_url = "https://homes.cs.washington.edu/~ranjay/visualgenome/data/dataset/region_descriptions.json.zip"

# Turn the archive into a plain-text corpus file; the helper returns the
# value that process_text_network consumes below.
text_file = prepare_visual_genome_text(zip_url)

# Build the word network. The captured run log reports tokens under the
# rarity threshold being replaced, which left 458 vocabulary entries.
network_options = {
    "rare_threshold": 0.00025,  # Keep only very common tokens
    "verbose": True,
}
vg_network_data = process_text_network(text_file, **network_options)

# Report the size of the resulting graph.
built_graph = vg_network_data['graph']
node_total = built_graph.number_of_nodes()
edge_total = built_graph.number_of_edges()
print(f"\n✓ Network built: {node_total:,} nodes, "
      f"{edge_total:,} edges")


File vg_text.txt already exists. Skipping processing.
Loaded text: 154198751 characters
Tokenized: 33662592 tokens
Sample tokens: ['impressive', 'matriss', 'injection', 'siluette', 'multi', 'haltar', 'fisheye', 'trepidation', 'trow', 'maba', 'trysil', 'antacid', 'coloron', 'hesperia', 'winnebego', 'cubical', 'overdoor', 'pickel', 'wiht', 'proffessional']
Replaced 62407 rare tokens (threshold=0.00025)
Final vocabulary: 458 unique tokens
Sample tokens: ['walking', 'tan', 'open', 'pile', 'cheese', 'up', 'fur', 'frisbee', 'dark', 'surfboard', 'oven', 'broccoli', 'remote', 'statue', 'branch', 'four', 'pole', 'sitting', 'and', 'baby']
Graph: 458 nodes, 50127 edges
Top tokens by frequency:
   1. '.' (freq=6085975)
   2. '<RARE>' (freq=4416324)
   3. 'a' (freq=2220903)
   4. 'the' (freq=2155082)
   5. 'on' (freq=1396037)
   6. 'of' (freq=980462)
   7. 'is' (freq=787909)
   8. 'in' (freq=714867)
   9. 'white' (freq=652421)
  10. 'black' (freq=398632)
  11. 'and' (freq=341934)
  12. 'man' (freq=

In [7]:
# NOTE Occurrences per word in network
# Prints one row per entry of the network's token counts, most frequent
# first, together with each entry's share of all token occurrences.
# Requires `vg_network_data` from the network-building cell.

token_counts = vg_network_data['token_counts']
nodes = vg_network_data['nodes']

# Grand total of occurrences; reused for the header, the percentage column
# and the closing summary line.
total = sum(token_counts.values())
banner = "=" * 80

print(banner)
print("WORD FREQUENCIES IN NETWORK")
print(banner)
print(f"Total unique words: {len(nodes)}")
print(f"Total token occurrences: {total:,}")
print()

# Highest count first; ties keep the order in which token_counts yields them.
sorted_words = sorted(token_counts.items(), key=lambda pair: pair[1], reverse=True)

# Table header
print(f"{'Rank':<6} {'Word':<20} {'Frequency':>12} {'% of Total':>10}")
print("-" * 50)

# Table body: rank, word, raw frequency, percentage of the grand total
rank = 0
for word, freq in sorted_words:
    rank += 1
    pct = (freq / total) * 100
    print(f"{rank:<6} {word:<20} {freq:>12,} {pct:>9.2f}%")

print()
print(banner)
print(f"Summary: {len(nodes)} words, {total:,} total occurrences")

WORD FREQUENCIES IN NETWORK
Total unique words: 458
Total token occurrences: 33,662,592

Rank   Word                    Frequency % of Total
--------------------------------------------------
1      .                       6,085,975     18.08%
2      <RARE>                  4,416,324     13.12%
3      a                       2,220,903      6.60%
4      the                     2,155,082      6.40%
5      on                      1,396,037      4.15%
6      of                        980,462      2.91%
7      is                        787,909      2.34%
8      in                        714,867      2.12%
9      white                     652,421      1.94%
10     black                     398,632      1.18%
11     and                       341,934      1.02%
12     man                       317,885      0.94%
13     with                      292,771      0.87%
14     blue                      283,594      0.84%
15     red                       237,296      0.70%
16     green                

In [8]:
# NOTE Edges per word
# Prints the degree (number of incident edges) of every node in the original
# VG network, highest degree first, with each degree expressed as a
# percentage of the largest degree.
# Requires `vg_network_data` from the network-building cell.
print("=" * 80)
print("EDGES PER NODE (DEGREE)")
print("=" * 80)

graph = vg_network_data['graph']

# Get degree for each node
degrees = dict(graph.degree())

# Sort by degree (descending); ties keep the graph's node order
sorted_degrees = sorted(degrees.items(), key=lambda x: -x[1])

# An empty graph has no meaningful average, so report 0 instead of dividing
# by zero.
average_degree = sum(degrees.values()) / len(degrees) if degrees else 0.0

print(f"Total nodes: {graph.number_of_nodes()}")
print(f"Total edges: {graph.number_of_edges()}")
print(f"Average degree: {average_degree:.2f}")
print()

print(f"{'Rank':<6} {'Word':<20} {'Degree':>10} {'% of Max':>10}")
print("-" * 50)

max_degree = sorted_degrees[0][1] if sorted_degrees else 1
for rank, (word, degree) in enumerate(sorted_degrees, 1):
    # max_degree is 0 when the graph has nodes but no edges; every node then
    # sits at 0% rather than raising ZeroDivisionError.
    pct = (degree / max_degree) * 100 if max_degree else 0.0
    print(f"{rank:<6} {word:<20} {degree:>10} {pct:>9.1f}%")

print()
print("=" * 80)
if degrees:
    print(f"Degree range: {min(degrees.values())} - {max(degrees.values())}")
else:
    # min()/max() raise ValueError on an empty collection.
    print("Degree range: n/a (graph has no nodes)")


EDGES PER NODE (DEGREE)
Total nodes: 458
Total edges: 50127
Average degree: 218.90

Rank   Word                     Degree   % of Max
--------------------------------------------------
1      .                           457     100.0%
2      <RARE>                      457     100.0%
3      is                          457     100.0%
4      a                           456      99.8%
5      the                         456      99.8%
6      on                          455      99.6%
7      of                          455      99.6%
8      in                          455      99.6%
9      and                         455      99.6%
10     white                       454      99.3%
11     with                        454      99.3%
12     ,                           453      99.1%
13     to                          452      98.9%
14     black                       447      97.8%
15     are                         447      97.8%
16     behind                      444      97.2%
17     over    

-----------

# APPENDING MISSING WORDS USING NEW CORPUS STATISTICS

In [14]:
# NOTE Checking Missing Words
# Compares the CIFAR-100 class names against the vocabulary of the original
# VG network and reports which classes are present and which are absent.
# Requires `vg_network_data` from the network-building cell.

import torchvision

# CIFAR-100 class names (the dataset is downloaded to ./data if needed)
cifar100 = torchvision.datasets.CIFAR100(root='./data', download=True)
cifar_words = set(cifar100.classes)

# Vocabulary of the original VG network
vg_vocab = set(vg_network_data['nodes'])

# Split the class names into those present in / absent from the vocabulary
cifar_found = sorted(cifar_words.intersection(vg_vocab))
cifar_missing = sorted(cifar_words.difference(vg_vocab))

rule = "=" * 80

print(rule)
print("CIFAR-100 COVERAGE IN ORIGINAL VG NETWORK")
print(rule)
print(f"VG network vocabulary: {len(vg_vocab)} words")
print(f"CIFAR-100 classes: {len(cifar_words)} words")
print()
print(f"✅ CIFAR-100 words FOUND in VG network: {len(cifar_found)}/100")
print(f"❌ CIFAR-100 words MISSING from VG network: {len(cifar_missing)}/100")

print()
print(f"WORDS MISSING FROM NETWORK: {cifar_missing}")

print()
print(rule)
print("WORDS FOUND IN NETWORK:")
print(rule)

# One line per found class together with its token count in the VG network
# (0 when the count table has no entry for the word).
vg_token_counts = vg_network_data['token_counts']
i = 0
for word in cifar_found:
    i += 1
    freq = vg_token_counts.get(word, 0)
    print(f"  {i:2d}. {word:<20} (freq: {freq:,})")


print()
print(rule)
print(f"SUMMARY: Need to add {len(cifar_missing)} missing words to the network")

CIFAR-100 COVERAGE IN ORIGINAL VG NETWORK
VG network vocabulary: 458 words
CIFAR-100 classes: 100 words

✅ CIFAR-100 words FOUND in VG network: 32/100
❌ CIFAR-100 words MISSING from VG network: 68/100

WORDS MISSING FROM NETWORK: ['aquarium_fish', 'beaver', 'bee', 'beetle', 'butterfly', 'camel', 'castle', 'caterpillar', 'cattle', 'chimpanzee', 'cockroach', 'crab', 'crocodile', 'dinosaur', 'dolphin', 'flatfish', 'forest', 'fox', 'hamster', 'kangaroo', 'lawn_mower', 'leopard', 'lion', 'lizard', 'lobster', 'maple_tree', 'mushroom', 'oak_tree', 'orchid', 'otter', 'palm_tree', 'pear', 'pickup_truck', 'pine_tree', 'plain', 'poppy', 'porcupine', 'possum', 'rabbit', 'raccoon', 'ray', 'rocket', 'rose', 'sea', 'seal', 'shark', 'shrew', 'skunk', 'skyscraper', 'snail', 'snake', 'spider', 'squirrel', 'streetcar', 'sunflower', 'sweet_pepper', 'telephone', 'television', 'tiger', 'tractor', 'trout', 'tulip', 'turtle', 'wardrobe', 'whale', 'willow_tree', 'wolf', 'worm']

WORDS FOUND IN NETWORK:
   1. a

In [None]:
# NOTE comparing corpus sizes
# Reports character and whitespace-delimited word counts for
# cifar100_word_descriptions.txt and vg_text.txt, and the size of the former
# relative to the latter.


def _read_corpus(path):
    """Return the complete UTF-8 decoded contents of the file at `path`."""
    with open(path, 'r', encoding='utf-8') as handle:
        return handle.read()


# Load both corpora fully into memory
cifar_text = _read_corpus('cifar100_word_descriptions.txt')
vg_text = _read_corpus('vg_text.txt')

# Character counts
cifar_chars, vg_chars = len(cifar_text), len(vg_text)

# Word counts (simple split on whitespace)
cifar_words_count, vg_words_count = (
    len(text.split()) for text in (cifar_text, vg_text)
)

# Relative sizes of the CIFAR corpus, in percent of the VG corpus
chars_pct = cifar_chars / vg_chars * 100
words_pct = cifar_words_count / vg_words_count * 100

print("=" * 80)
print("CORPUS SIZE COMPARISON")
print("=" * 80)
print()
print(f"{'Metric':<25} {'CIFAR-100 Descriptions':>25} {'VG Text':>25}")
print("-" * 80)
print(f"{'Characters':<25} {cifar_chars:>25,} {vg_chars:>25,}")
print(f"{'Words (whitespace)':<25} {cifar_words_count:>25,} {vg_words_count:>25,}")
print()
print(f"CIFAR corpus is {chars_pct:.2f}% the size of VG corpus (by characters)")
print(f"CIFAR corpus is {words_pct:.2f}% the size of VG corpus (by words)")

CORPUS SIZE COMPARISON

Metric                       CIFAR-100 Descriptions                   VG Text
--------------------------------------------------------------------------------
Characters                               48,865,681               154,198,751
Words (whitespace)                        8,905,223                32,937,644

CIFAR corpus is 31.69% the size of VG corpus (by characters)
CIFAR corpus is 27.04% the size of VG corpus (by words)


In [17]:
# NOTE Appending missing CIFAR-100 words based on co-occurrences from BOTH corpora
# ============================================================================
# STRATEGY: Use actual co-occurrence data from vg_text.txt AND cifar100_word_descriptions.txt
# to find which existing network words should connect to missing CIFAR-100 words.
#
# Requires: `vg_network_data` (from the network-building cell).
# Produces: `missing_words`, `missing_word_connections`, `vg_tokens` and
#           `cifar_tokens`, all of which the augmentation cell reads.
# ============================================================================

import numpy as np
import torchvision
from collections import Counter
from lab2 import tokenize_text, get_text_adjacencies

# ============================================================================
# 1. Identify missing CIFAR-100 words
# ============================================================================
cifar100 = torchvision.datasets.CIFAR100(root='./data', download=True)
cifar_words = set(cifar100.classes)
existing_vocab = set(vg_network_data['nodes'])
# By construction `missing_words` and `existing_vocab` are disjoint.
missing_words = sorted(cifar_words - existing_vocab)

print(f"Missing CIFAR-100 words: {len(missing_words)}")
print(f"Existing network: {len(existing_vocab)} nodes, {vg_network_data['graph'].number_of_edges():,} edges")

# ============================================================================
# 2. Tokenize both corpora
# ============================================================================
print("\nTokenizing corpora...")

# VG text
with open('vg_text.txt', 'r', encoding='utf-8') as f:
    vg_text_content = f.read()
vg_tokens = tokenize_text(vg_text_content)
print(f"  VG tokens: {len(vg_tokens):,}")

# CIFAR descriptions
with open('cifar100_word_descriptions.txt', 'r', encoding='utf-8') as f:
    cifar_text_content = f.read()
cifar_tokens = tokenize_text(cifar_text_content)
print(f"  CIFAR tokens: {len(cifar_tokens):,}")

# ============================================================================
# 3. Get adjacencies from both corpora
# ============================================================================
print("\nComputing adjacencies...")
vg_adjacencies = get_text_adjacencies(vg_tokens)
cifar_adjacencies = get_text_adjacencies(cifar_tokens)
print(f"  VG adjacency pairs: {len(vg_adjacencies):,}")
print(f"  CIFAR adjacency pairs: {len(cifar_adjacencies):,}")

# ============================================================================
# 4. For each missing word, find co-occurring words that ARE in the network
# ============================================================================
print("\n" + "=" * 80)
print("FINDING CO-OCCURRENCES FOR MISSING WORDS")
print("=" * 80)

# Combine adjacencies from both corpora.
# NOTE(review): assumes get_text_adjacencies yields (w1, w2) pair keys in a
# form Counter.update can accumulate — confirm against lab2.
combined_adjacencies = Counter()
combined_adjacencies.update(vg_adjacencies)
combined_adjacencies.update(cifar_adjacencies)

# Walk the combined adjacency table ONCE instead of once per missing word.
# Because missing words and network words are disjoint sets, at most one of
# the two branches can match a pair, so each word receives exactly the same
# counts (in the same insertion order) as a per-word scan would give it.
missing_lookup = set(missing_words)
missing_word_connections = {word: Counter() for word in missing_words}

for (w1, w2), count in combined_adjacencies.items():
    if w1 in missing_lookup and w2 in existing_vocab:
        missing_word_connections[w1][w2] += count
    elif w2 in missing_lookup and w1 in existing_vocab:
        missing_word_connections[w2][w1] += count

# Print summary
print(f"\n{'Word':<20} {'# Connections':>15} {'Top 5 Co-occurring Words':<50}")
print("-" * 90)

for word in missing_words:
    conns = missing_word_connections[word]
    n_conns = len(conns)
    top5 = conns.most_common(5)
    top5_str = ', '.join([f"{w}({c})" for w, c in top5]) if top5 else "(none found)"
    print(f"{word:<20} {n_conns:>15} {top5_str:<50}")

# Count words with no connections
no_conns = [w for w in missing_words if len(missing_word_connections[w]) == 0]
print(f"\n⚠️  Words with NO co-occurrences in either corpus: {len(no_conns)}")
if no_conns:
    print(f"   {no_conns}")


Missing CIFAR-100 words: 68
Existing network: 458 nodes, 50,127 edges

Tokenizing corpora...
  VG tokens: 33,662,592
  CIFAR tokens: 9,262,627

Computing adjacencies...
  VG adjacency pairs: 934,725
  CIFAR adjacency pairs: 223,471

FINDING CO-OCCURRENCES FOR MISSING WORDS

Word                   # Connections Top 5 Co-occurring Words                          
------------------------------------------------------------------------------------------
aquarium_fish                     83 .(3728), with(864), of(644), in(604), colorful(488)
beaver                            84 .(3561), a(3229), the(1117), in(432), fur(300)    
bee                               87 .(4236), a(1896), in(602), little(520), the(446)  
beetle                            85 .(3165), a(1811), on(1576), in(1134), with(853)   
butterfly                         98 a(4521), .(4012), the(882), in(718), on(519)      
camel                            107 .(4535), a(2958), the(925), in(416), back(402)    
castle           

In [22]:
# NOTE Using the co-occurrence data to add edges and compute distances
# ============================================================================
# Builds `augmented_network`: a copy of the original VG network with every
# word in `missing_words` appended as a new node, connected to the existing
# words it co-occurs with, and then pickles the result.
#
# Requires (from earlier cells): `vg_network_data`, `missing_words`,
# `missing_word_connections`, `vg_tokens`, `cifar_tokens`, `cifar_words`.
# The original `vg_network_data` graph, distance matrix and token counts are
# copied before modification, so this cell can be re-run.

import numpy as np

# ============================================================================
# 1. Copy existing network components
# ============================================================================
graph = vg_network_data['graph'].copy()
nodes = list(vg_network_data['nodes'])
distance_matrix = vg_network_data['distance_matrix'].copy()
token_counts = dict(vg_network_data['token_counts'])

# Get existing distance statistics (for scaling new distances).
# Only strictly positive entries are considered, which excludes zeros such as
# the self-distances written on the diagonal further below.
existing_distances = distance_matrix[distance_matrix > 0]
median_distance = np.median(existing_distances)
min_distance = np.min(existing_distances)
max_distance = np.max(existing_distances)

print("Existing network distance stats:")
print(f"  Min: {min_distance:.4f}, Median: {median_distance:.4f}, Max: {max_distance:.4f}")

# Node index mapping: word -> row/column index in the distance matrix
node_to_idx = {node: i for i, node in enumerate(nodes)}

# ============================================================================
# 2. Expand distance matrix to accommodate new words
# ============================================================================
n_original = len(nodes)
n_new = len(missing_words)
n_total = n_original + n_new

# New matrix: every cell starts at max_distance (the value used for pairs that
# receive no explicit edge); the original matrix is copied into the top-left.
new_distance_matrix = np.full((n_total, n_total), fill_value=max_distance)
new_distance_matrix[:n_original, :n_original] = distance_matrix

# ============================================================================
# 3. Add each missing word with edges based on co-occurrence counts
# ============================================================================
# The CIFAR corpus is ~27% the size of VG corpus, so we need to scale counts
# to make distances comparable to the VG network distances

# Ratio of token totals between the two corpora
vg_total_tokens = len(vg_tokens)
cifar_total_tokens = len(cifar_tokens)
corpus_scale_factor = vg_total_tokens / cifar_total_tokens

print(f"\nCorpus scale factor: {corpus_scale_factor:.2f}x (VG is {corpus_scale_factor:.1f}x larger)")

# Reference for what a "typical" adjacency count looks like in VG: the median
# over all adjacency counts stored with the original network.
vg_adj_counts = list(vg_network_data['adjacency_counts'].values())
median_vg_adj = np.median(vg_adj_counts)
print(f"Median VG adjacency count: {median_vg_adj:.0f}")

print(f"\nAdding {len(missing_words)} missing words to network...")

edges_added_total = 0
for word in missing_words:
    # Add node; its matrix index is the next free position after existing nodes
    graph.add_node(word)
    new_idx = len(nodes)
    nodes.append(word)
    node_to_idx[word] = new_idx

    # Counter of {existing network word: co-occurrence count} for this word
    conns = missing_word_connections[word]

    # Set token count based on total co-occurrence frequency
    # (sum of all connections, scaled by corpus ratio).
    # NOTE(review): the counts in `conns` were accumulated from the VG and
    # CIFAR adjacencies combined, yet the whole sum is multiplied by the
    # CIFAR->VG scale factor — confirm that scaling the VG-derived share is
    # intended.
    raw_count = sum(conns.values())
    scaled_count = int(raw_count * corpus_scale_factor) if raw_count > 0 else 1000
    token_counts[word] = max(scaled_count, 1000)  # Floor of 1000; presumably keeps the word above downstream frequency filters — confirm

    # Add edges and compute distances
    for connected_word, co_count in conns.items():
        connected_idx = node_to_idx[connected_word]

        # Add edge to graph (no edge attributes are set here)
        graph.add_edge(word, connected_word)
        edges_added_total += 1

        # Compute distance: inversely proportional to (scaled) co-occurrence count
        # Higher co-occurrence = lower distance (closer relationship)
        scaled_co_count = co_count * corpus_scale_factor

        # distance = median VG adjacency count / scaled count (denominator is
        # floored at 1), then clamped into the range of distances already
        # present in the original matrix.
        # NOTE(review): whether this matches how lab2 derives the original
        # distances cannot be seen from this notebook — verify.
        distance = median_vg_adj / max(scaled_co_count, 1)
        distance = np.clip(distance, min_distance, max_distance)

        # Keep the matrix symmetric
        new_distance_matrix[new_idx, connected_idx] = distance
        new_distance_matrix[connected_idx, new_idx] = distance

    # Self-distance = 0
    new_distance_matrix[new_idx, new_idx] = 0.0

print(f"✓ Added {len(missing_words)} nodes and {edges_added_total} edges")

# ============================================================================
# 4. Create augmented network dictionary
# ============================================================================
# Same keys as read from `vg_network_data` in this notebook; the two count
# structures are not rebuilt for the new nodes and are stored as None.
augmented_network = {
    'graph': graph,
    'nodes': nodes,
    'distance_matrix': new_distance_matrix,
    'token_counts': token_counts,
    'count_matrix': None,
    'adjacency_counts': None,  # Would need to be recomputed if needed
    'rare_tokens': vg_network_data['rare_tokens'],
    'original_tokens': vg_network_data['original_tokens'],
}

# ============================================================================
# 5. Verify coverage
# ============================================================================
print("\n" + "=" * 80)
print("AUGMENTED NETWORK SUMMARY")
print("=" * 80)

# NOTE: `cifar_found` is rebound here to a set of classes present in the
# augmented vocabulary (the coverage cell used the same name for a sorted list).
final_vocab = set(augmented_network['nodes'])
cifar_found = cifar_words & final_vocab
cifar_missing_final = sorted(cifar_words - final_vocab)

print(f"Original network: {n_original} nodes, {vg_network_data['graph'].number_of_edges():,} edges")
print(f"Augmented network: {len(final_vocab)} nodes, {graph.number_of_edges():,} edges")
print(f"New edges added: {edges_added_total}")
print()
print(f"CIFAR-100 coverage: {len(cifar_found)}/100")

if cifar_missing_final:
    print(f"❌ Still missing: {cifar_missing_final}")
else:
    print("✅ ALL 100 CIFAR-100 WORDS ARE IN THE NETWORK!")

# ============================================================================
# 6. Save for use in other notebooks
# ============================================================================
# The pickle is written to the current working directory and overwritten on
# every run. Only unpickle this file from a trusted location.
import pickle
with open('augmented_network.pkl', 'wb') as f:
    pickle.dump(augmented_network, f)
print("\n✓ Saved augmented_network to 'augmented_network.pkl'")


Existing network distance stats:
  Min: 1.0000, Median: 970325.5000, Max: 970325.5000

Corpus scale factor: 3.63x (VG is 3.6x larger)
Median VG adjacency count: 6

Adding 68 missing words to network...
✓ Added 68 nodes and 6790 edges

AUGMENTED NETWORK SUMMARY
Original network: 458 nodes, 50,127 edges
Augmented network: 526 nodes, 56,917 edges
New edges added: 6790

CIFAR-100 coverage: 100/100
✅ ALL 100 CIFAR-100 WORDS ARE IN THE NETWORK!

✓ Saved augmented_network to 'augmented_network.pkl'


In [None]:
# NOTE Edges per word
# Prints the degree (number of incident edges) of every node in the
# augmented network, highest degree first, with each degree expressed as a
# percentage of the largest degree.
# Requires `augmented_network` from the augmentation cell.
print("=" * 80)
print("EDGES PER NODE (DEGREE)")
print("=" * 80)

graph = augmented_network['graph']

# Get degree for each node
degrees = dict(graph.degree())

# Sort by degree (descending); ties keep the graph's node order
sorted_degrees = sorted(degrees.items(), key=lambda x: -x[1])

# An empty graph has no meaningful average, so report 0 instead of dividing
# by zero.
average_degree = sum(degrees.values()) / len(degrees) if degrees else 0.0

print(f"Total nodes: {graph.number_of_nodes()}")
print(f"Total edges: {graph.number_of_edges()}")
print(f"Average degree: {average_degree:.2f}")
print()

print(f"{'Rank':<6} {'Word':<20} {'Degree':>10} {'% of Max':>10}")
print("-" * 50)

max_degree = sorted_degrees[0][1] if sorted_degrees else 1
for rank, (word, degree) in enumerate(sorted_degrees, 1):
    # max_degree is 0 when the graph has nodes but no edges; every node then
    # sits at 0% rather than raising ZeroDivisionError.
    pct = (degree / max_degree) * 100 if max_degree else 0.0
    print(f"{rank:<6} {word:<20} {degree:>10} {pct:>9.1f}%")

print()
print("=" * 80)
if degrees:
    print(f"Degree range: {min(degrees.values())} - {max(degrees.values())}")
else:
    # min()/max() raise ValueError on an empty collection.
    print("Degree range: n/a (graph has no nodes)")

EDGES PER NODE (DEGREE)
Total nodes: 526
Total edges: 56917
Average degree: 216.41

Rank   Word                     Degree   % of Max
--------------------------------------------------
1      .                           525     100.0%
2      is                          525     100.0%
3      the                         524      99.8%
4      on                          523      99.6%
5      of                          523      99.6%
6      in                          523      99.6%
7      and                         523      99.6%
8      a                           522      99.4%
9      with                        522      99.4%
10     ,                           521      99.2%
11     to                          518      98.7%
12     white                       517      98.5%
13     at                          508      96.8%
14     black                       505      96.2%
15     by                          504      96.0%
16     for                         503      95.8%
17     small   

----------------------------
# TRAINING SKIPGRAM MODEL

In [20]:
import os
from IPython.display import Image, display
from lab6 import train_embeddings

# Trains skip-gram embeddings on `augmented_network` (built by the
# augmentation cell) and prints a summary that is derived from the
# hyperparameters actually used, so the report cannot drift out of sync with
# the configuration when values are tuned.
print("\n" + "="*80)
print("🚀 STARTING TRAINING RUN")
print("This may take a few minutes. We are running the full pipeline...")
print("="*80)

# --- Hyperparameters ---
# Single source of truth for this run. Notes in parentheses record earlier
# settings that were tried.
train_config = dict(
    embedding_dim=64,         # Dimension of the learned vectors
    batch_size=32,            # Pairs per training step (was 2048)
    epochs=100,               # Maximum number of epochs (was 10)
    learning_rate=0.005,      # Initial learning rate (was 0.001)
    num_negative=8,           # 8 negatives per 1 positive
    validation_fraction=0.1,  # Use 10% of edges for validation
    context_size=2,           # Was 4, which also linked distant words (e.g. "Dog" -> "Bark" -> "Tree" -> "Building" -> "Sky")

    # --- Regularization stack (all currently switched off) ---
    dropout=0.0,              # (was 0.3)
    weight_decay=0.0,         # (was 1e-4)
    label_smoothing=0.0,      # (was 0.1)
    patience=20,              # Early-stopping patience in epochs, per the original note (was 5)
    device=None,              # Per the original note: auto-detects 'cuda' or 'cpu'
)

results = train_embeddings(network_data=augmented_network, **train_config)

# --- Training Summary ---
# NOTE: this rebinds the notebook-level name `nodes` to the trained vocabulary.
nodes = results['nodes']
embeddings = results['embeddings']

# Describe regularization from the values that were really passed in.
regularization = {
    name: train_config[name]
    for name in ('dropout', 'weight_decay', 'label_smoothing')
}
regularization_text = ", ".join(f"{name}={value}" for name, value in regularization.items())

print("\n" + "="*80)
print("✅ TRAINING COMPLETE")
print("="*80)
print(f"Learned embeddings for {len(nodes):,} words")
print(f"Embedding dimension: {embeddings.shape[1]}")
print(f"\n💡 Key features of this training run:")
print(f"  • Punctuation filtering prevented 'hub poisoning'")
print(f"  • Weighted sampling focused on important pairs")
if any(regularization.values()):
    print(f"  • Regularization enabled ({regularization_text})")
else:
    print(f"  • Regularization disabled ({regularization_text})")
print(f"  • Context size {train_config['context_size']} was used to learn from the graph structure")


🚀 STARTING TRAINING RUN
This may take a few minutes. We are running the full pipeline...

🔧 PUNCTUATION FILTER:
  Removed: {'.', ',', "'", '<RARE>'}
  Nodes: 526 → 523
  Edges: 56,917 → 55,417

Train edges: 49,875, Val edges: 5,542

📊 SkipGramDataset Statistics:
  Vocabulary size: 523
  Positive pairs: 273,006
  Negatives per positive: 8
  Total samples per epoch: 2,457,054

  Weight distribution:
    Min: 0.071974
    Mean: 1.000000
    Median: 0.071974
    Max: 16.642145





📊 SkipGramDataset Statistics:
  Vocabulary size: 523
  Positive pairs: 172,754
  Negatives per positive: 8
  Total samples per epoch: 1,554,786

  Weight distribution:
    Min: 0.095653
    Mean: 1.000000
    Median: 0.095653
    Max: 15.452855

Training on cpu
Vocab: 523, Embed dim: 64, Context: 2, Negatives: 8
Regularization: dropout=0.0, weight_decay=0.0, label_smoothing=0.0


                                                                                    

Epoch 01  train=2.2573  val=4.3771  lr=0.005000
  → Best model (val_loss=4.3771), saved to best_model.pth


                                                                                    

Epoch 02  train=2.0882  val=4.5376  lr=0.005000


                                                                                    

Epoch 03  train=2.0768  val=4.5649  lr=0.005000


                                                                                    

Epoch 04  train=2.0782  val=4.5204  lr=0.005000


                                                                                    

Epoch 05  train=2.0008  val=4.4210  lr=0.002500


                                                                                    

Epoch 06  train=1.9777  val=4.4415  lr=0.002500


                                                                                    

Epoch 07  train=1.9782  val=4.4138  lr=0.002500


                                                                                    

Epoch 08  train=1.9405  val=4.3601  lr=0.001250
  → Best model (val_loss=4.3601), saved to best_model.pth


                                                                                    

Epoch 09  train=1.9269  val=4.3542  lr=0.001250
  → Best model (val_loss=4.3542), saved to best_model.pth


                                                                                     

Epoch 10  train=1.9278  val=4.3274  lr=0.001250
  → Best model (val_loss=4.3274), saved to best_model.pth


                                                                                     

Epoch 11  train=1.9224  val=4.3355  lr=0.001250


                                                                                     

Epoch 12  train=1.9247  val=4.3365  lr=0.001250


                                                                                     

Epoch 13  train=1.9242  val=4.3375  lr=0.001250


                                                                                     

Epoch 14  train=1.9025  val=4.3085  lr=0.000625
  → Best model (val_loss=4.3085), saved to best_model.pth


                                                                                     

Epoch 15  train=1.8997  val=4.3291  lr=0.000625


                                                                                     

Epoch 16  train=1.9014  val=4.3122  lr=0.000625


                                                                                     

Epoch 17  train=1.8991  val=4.2991  lr=0.000625
  → Best model (val_loss=4.2991), saved to best_model.pth


                                                                                     

Epoch 18  train=1.8951  val=4.3053  lr=0.000625


                                                                                     

Epoch 19  train=1.8969  val=4.2983  lr=0.000625
  → Best model (val_loss=4.2983), saved to best_model.pth


                                                                                     

Epoch 20  train=1.8930  val=4.3023  lr=0.000625


                                                                                     

Epoch 21  train=1.8968  val=4.2926  lr=0.000625
  → Best model (val_loss=4.2926), saved to best_model.pth


                                                                                     

Epoch 22  train=1.8943  val=4.3003  lr=0.000625


                                                                                     

Epoch 23  train=1.8938  val=4.3070  lr=0.000625


                                                                                     

Epoch 24  train=1.8966  val=4.3095  lr=0.000625


                                                                                     

Epoch 25  train=1.8832  val=4.3019  lr=0.000313


                                                                                     

Epoch 26  train=1.8839  val=4.2831  lr=0.000313
  → Best model (val_loss=4.2831), saved to best_model.pth


                                                                                     

Epoch 27  train=1.8813  val=4.2907  lr=0.000313


                                                                                     

Epoch 28  train=1.8788  val=4.2880  lr=0.000313


                                                                                     

Epoch 29  train=1.8796  val=4.2888  lr=0.000313


                                                                                     

Epoch 30  train=1.8805  val=4.2849  lr=0.000156


                                                                                     

Epoch 31  train=1.8750  val=4.2802  lr=0.000156
  → Best model (val_loss=4.2802), saved to best_model.pth


                                                                                     

Epoch 32  train=1.8780  val=4.2800  lr=0.000156
  → Best model (val_loss=4.2800), saved to best_model.pth


                                                                                     

Epoch 33  train=1.8785  val=4.2767  lr=0.000156
  → Best model (val_loss=4.2767), saved to best_model.pth


                                                                                     

Epoch 34  train=1.8739  val=4.2794  lr=0.000156


                                                                                     

Epoch 35  train=1.8765  val=4.2742  lr=0.000156
  → Best model (val_loss=4.2742), saved to best_model.pth


                                                                                     

Epoch 36  train=1.8782  val=4.2790  lr=0.000156


                                                                                     

Epoch 37  train=1.8763  val=4.2756  lr=0.000156


                                                                                     

Epoch 38  train=1.8745  val=4.2714  lr=0.000156
  → Best model (val_loss=4.2714), saved to best_model.pth


                                                                                     

Epoch 39  train=1.8740  val=4.2761  lr=0.000156


                                                                                     

Epoch 40  train=1.8776  val=4.2731  lr=0.000156


                                                                                     

Epoch 41  train=1.8790  val=4.2681  lr=0.000156
  → Best model (val_loss=4.2681), saved to best_model.pth


                                                                                     

Epoch 42  train=1.8739  val=4.2755  lr=0.000156


                                                                                     

Epoch 43  train=1.8732  val=4.2786  lr=0.000156


                                                                                     

Epoch 44  train=1.8732  val=4.2789  lr=0.000156


                                                                                     

Epoch 45  train=1.8725  val=4.2713  lr=0.000078


                                                                                     

Epoch 46  train=1.8720  val=4.2778  lr=0.000078


                                                                                     

Epoch 47  train=1.8709  val=4.2713  lr=0.000078


                                                                                     

Epoch 48  train=1.8725  val=4.2718  lr=0.000039


                                                                                     

Epoch 49  train=1.8718  val=4.2691  lr=0.000039


                                                                                     

Epoch 50  train=1.8727  val=4.2742  lr=0.000039


                                                                                     

Epoch 51  train=1.8706  val=4.2703  lr=0.000020


                                                                                     

Epoch 52  train=1.8731  val=4.2753  lr=0.000020


                                                                                     

Epoch 53  train=1.8727  val=4.2716  lr=0.000020


                                                                                     

Epoch 54  train=1.8692  val=4.2679  lr=0.000010
  → Best model (val_loss=4.2679), saved to best_model.pth


                                                                                     

Epoch 55  train=1.8713  val=4.2719  lr=0.000010


                                                                                     

Epoch 56  train=1.8745  val=4.2701  lr=0.000010


                                                                                     

Epoch 57  train=1.8721  val=4.2734  lr=0.000005


                                                                                     

Epoch 58  train=1.8669  val=4.2712  lr=0.000005


                                                                                     

Epoch 59  train=1.8704  val=4.2717  lr=0.000005


                                                                                     

Epoch 60  train=1.8687  val=4.2717  lr=0.000002


                                                                                     

Epoch 61  train=1.8691  val=4.2706  lr=0.000002


                                                                                     

Epoch 62  train=1.8658  val=4.2670  lr=0.000002
  → Best model (val_loss=4.2670), saved to best_model.pth


                                                                                     

Epoch 63  train=1.8678  val=4.2702  lr=0.000002


                                                                                     

Epoch 64  train=1.8714  val=4.2722  lr=0.000002


                                                                                     

Epoch 65  train=1.8740  val=4.2675  lr=0.000002


                                                                                     

Epoch 66  train=1.8704  val=4.2689  lr=0.000001


                                                                                     

Epoch 67  train=1.8705  val=4.2669  lr=0.000001
  → Best model (val_loss=4.2669), saved to best_model.pth


                                                                                     

Epoch 68  train=1.8680  val=4.2709  lr=0.000001


                                                                                     

KeyboardInterrupt: 