# Libraries

In [17]:
import pandas as pd
import numpy as np
import os
import sys
import json
from sklearn.model_selection import train_test_split
from tqdm import tqdm

# Inputs

In [2]:
EN_PATH = '../Data/Input/conceptnet_en_full.csv'

# Purpose
***

Allow an agent to interact with the full relation universe and create a full knowledge graph.

### Phase 1. 
***
Load the english triples

In [3]:
english_triples = pd.read_csv(EN_PATH)
english_triples.head()

Unnamed: 0,uri,relation,start,end,weight,dataset,sources,surfaceText,license,context
0,"/a/[/r/Antonym/,/c/en/0/n/,/c/en/1/]",/r/Antonym,/c/en/0/n,/c/en/1,"{""dataset"": ""/d/wiktionary/fr"", ""license"": ""cc...",,,,,
1,"/a/[/r/Antonym/,/c/en/12_hour_clock/n/,/c/en/2...",/r/Antonym,/c/en/12_hour_clock/n,/c/en/24_hour_clock,"{""dataset"": ""/d/wiktionary/en"", ""license"": ""cc...",,,,,
2,"/a/[/r/Antonym/,/c/en/24_hour_clock/n/,/c/en/1...",/r/Antonym,/c/en/24_hour_clock/n,/c/en/12_hour_clock,"{""dataset"": ""/d/wiktionary/en"", ""license"": ""cc...",,,,,
3,"/a/[/r/Antonym/,/c/en/5/n/,/c/en/3/]",/r/Antonym,/c/en/5/n,/c/en/3,"{""dataset"": ""/d/wiktionary/en"", ""license"": ""cc...",,,,,
4,"/a/[/r/Antonym/,/c/en/a.c/n/,/c/en/d.c/]",/r/Antonym,/c/en/a.c/n,/c/en/d.c,"{""dataset"": ""/d/wiktionary/fr"", ""license"": ""cc...",,,,,


#### Phase 1.A 
***
Preprocess and prepare the concept data

In [4]:
english_triples['weight'].iloc[0]

'{"dataset": "/d/wiktionary/fr", "license": "cc:by-sa/4.0", "sources": [{"contributor": "/s/resource/wiktionary/fr", "process": "/s/process/wikiparsec/2"}], "weight": 1.0}'

In [9]:
def clean_column(df, col):
    col_string = df[col].astype(str)
    col_string = col_string.str.split('/')
    col_string = col_string.str[-1]
    df[f'{col}_cleaned'] = col_string
    return df

def extract_weight(df):
    df['weight_cleaned'] = df['weight'].apply(lambda x: json.loads(x)['weight'])
    return df

def preprocess_data(df):
    df_copy = df.copy()
    for col in ['relation', 'start', 'end']:
        df_copy = clean_column(df_copy, col)
    df_copy = extract_weight(df_copy)
    return df_copy

cleaned_cols = ['relation', 'start', 'end', 'weight']
cleaned_cols = [f'{col}_cleaned' for col in cleaned_cols]

reprocess_triples = False

if reprocess_triples:
    cleaned_english_triples = preprocess_data(english_triples)[cleaned_cols].drop_duplicates()
    cleaned_english_triples[cleaned_cols].head()
    # Save as a parquet file
    cleaned_english_triples.to_parquet(
        os.path.join(os.path.dirname(EN_PATH), 'conceptnet_en_full_cleaned.parquet.gzip'),
        index=False,
        compression='gzip'
    )
else:
    # Load the cleaned data
    cleaned_english_triples = pd.read_parquet(
        os.path.join(os.path.dirname(EN_PATH), 'conceptnet_en_full_cleaned.parquet.gzip')
    )
cleaned_english_triples.head()

Unnamed: 0,relation_cleaned,start_cleaned,end_cleaned,weight_cleaned
0,Antonym,n,1,1.0
1,Antonym,n,24_hour_clock,1.0
2,Antonym,n,12_hour_clock,1.0
3,Antonym,n,3,1.0
4,Antonym,n,d.c,1.0


# Phase 2. Seed the agent with random english relations
***

In [12]:
def create_stratified_weighted_sample(df, sample_size=5000, min_weight_threshold=0.5, verbose=True):
    """
    Create a stratified sample maintaining relation distribution while prioritizing higher weights
    
    Parameters:
    - df: DataFrame with columns ['start_concept', 'end_concept', 'relation_type', 'edge_weight']
    - sample_size: Target number of triples in sample
    - min_weight_threshold: Minimum weight to consider (filters low-quality relations)
    - verbose: Print detailed progress
    
    Returns:
    - DataFrame: Stratified sample
    """
    
    df_copy = df.copy()
    df_copy = df_copy.rename(columns={
        'relation_cleaned': 'relation_type',
        'start_cleaned': 'start_concept',
        'end_cleaned': 'end_concept',
        'weight_cleaned': 'edge_weight'
    })
    
    print(f"🎯 Creating stratified weighted sample of {sample_size:,} triples...")
    
    # Step 1: Filter by minimum weight threshold
    if verbose:
        print(f"   Filtering triples with weight >= {min_weight_threshold}")
    
    initial_count = len(df_copy)
    filtered_df = df_copy[df_copy['edge_weight'] >= min_weight_threshold].copy()
    filtered_count = len(filtered_df)
    
    if verbose:
        print(f"   Kept {filtered_count:,} of {initial_count:,} triples ({filtered_count/initial_count*100:.1f}%)")
    
    # Step 2: Calculate current relation distribution
    relation_counts = filtered_df['relation_type'].value_counts()
    relation_proportions = relation_counts / len(filtered_df)
    
    if verbose:
        print(f"\n📊 Original relation distribution (top 10):")
        for relation, prop in relation_proportions.head(10).items():
            count = relation_counts[relation]
            print(f"   {relation}: {count:,} ({prop*100:.1f}%)")
    
    # Step 3: Sort by weight within each relation (highest first)
    if verbose:
        print(f"\n⚖️  Sorting by weight within each relation...")
    
    filtered_df = filtered_df.sort_values(['relation_type', 'edge_weight'], 
                                        ascending=[True, False])
    
    # Step 4: Calculate target samples per relation
    target_samples_per_relation = {}
    for relation in relation_proportions.index:
        target_count = int(sample_size * relation_proportions[relation])
        # Ensure at least 1 sample for each relation if possible
        target_count = max(1, target_count)
        target_samples_per_relation[relation] = target_count
    
    if verbose:
        print(f"\n🎯 Target samples per relation:")
        total_targeted = sum(target_samples_per_relation.values())
        for relation, target in sorted(target_samples_per_relation.items(), 
                                     key=lambda x: x[1], reverse=True)[:10]:
            print(f"   {relation}: {target:,}")
        print(f"   Total targeted: {total_targeted:,}")
    
    # Step 5: Sample from each relation group
    sampled_dfs = []
    actual_samples = {}
    
    if verbose:
        print(f"\n🔄 Sampling from each relation group...")
    
    for relation, target_count in tqdm(target_samples_per_relation.items(), 
                                     desc="Sampling relations"):
        relation_data = filtered_df[filtered_df['relation_type'] == relation]
        
        # Take top weighted samples up to target count
        actual_count = min(target_count, len(relation_data))
        sampled_data = relation_data.head(actual_count)
        
        sampled_dfs.append(sampled_data)
        actual_samples[relation] = actual_count
    
    # Step 6: Combine all samples
    stratified_sample = pd.concat(sampled_dfs, ignore_index=True)
    
    # Step 7: If we're short, fill with highest-weight remaining samples
    current_size = len(stratified_sample)
    if current_size < sample_size:
        shortage = sample_size - current_size
        if verbose:
            print(f"   Short by {shortage:,} samples, filling with highest-weight remaining...")
        
        # Get samples not already included
        used_indices = set(stratified_sample.index) if hasattr(stratified_sample, 'index') else set()
        remaining_df = filtered_df[~filtered_df.index.isin(used_indices)]
        
        if len(remaining_df) > 0:
            # Sort by weight and take top samples
            top_remaining = remaining_df.nlargest(shortage, 'edge_weight')
            stratified_sample = pd.concat([stratified_sample, top_remaining], ignore_index=True)
    
    # Step 8: Final shuffle to mix relations
    stratified_sample = stratified_sample.sample(frac=1.0, random_state=42).reset_index(drop=True)
    
    # Step 9: Validation and statistics
    final_size = len(stratified_sample)
    final_relation_counts = stratified_sample['relation_type'].value_counts()
    final_relation_proportions = final_relation_counts / final_size
    
    print(f"\n✅ Stratified sample created!")
    print(f"   Final size: {final_size:,} triples")
    print(f"   Weight range: {stratified_sample['edge_weight'].min():.3f} - {stratified_sample['edge_weight'].max():.3f}")
    print(f"   Mean weight: {stratified_sample['edge_weight'].mean():.3f}")
    
    if verbose:
        print(f"\n📊 Final relation distribution (top 10):")
        for relation, prop in final_relation_proportions.head(10).items():
            count = final_relation_counts[relation]
            original_prop = relation_proportions.get(relation, 0)
            print(f"   {relation}: {count:,} ({prop*100:.1f}% vs {original_prop*100:.1f}% orig)")
    
    return stratified_sample

#### Phase 2A. 
***
Created stratified sampling for agent seeded nodes

In [18]:
resample_data = True

if resample_data:
    # Apply the stratified sampling to your cleaned data
    print("🎲 Creating stratified weighted sample for agent initialization...")

    stratified_seed_data = create_stratified_weighted_sample(
        df=cleaned_english_triples,
        sample_size=5000,
        min_weight_threshold=0.5,  # Only include relations with decent confidence
        verbose=True
    )

    # Show sample characteristics
    print(f"\n🔍 Sample characteristics:")
    print(f"Weight distribution:")
    print(stratified_seed_data['edge_weight'].describe())

    print(f"\nTop concept pairs by weight:")
    top_weighted = stratified_seed_data.nlargest(5, 'edge_weight')
    for _, row in top_weighted.iterrows():
        print(f"   {row['start_concept']} --{row['relation_type']}--> {row['end_concept']} (weight: {row['edge_weight']:.3f})")
    # Save the stratified sample
    stratified_output_path = os.path.join(os.path.dirname(EN_PATH), 'conceptnet_en_stratified_seed_5k.parquet.gzip')
    stratified_seed_data.to_parquet(stratified_output_path, index=False, compression='gzip')
    print(f"\n💾 Stratified seed data saved to: {stratified_output_path}")
else:
    # Load the stratified sample
    stratified_seed_data = pd.read_parquet(
        os.path.join(os.path.dirname(EN_PATH), 'conceptnet_en_full_stratified.parquet.gzip')
    )
    print(f"Loaded {len(stratified_seed_data):,} triples from stratified sample.")

🎲 Creating stratified weighted sample for agent initialization...
🎯 Creating stratified weighted sample of 5,000 triples...
   Filtering triples with weight >= 0.5
   Kept 1,477,248 of 1,655,522 triples (89.2%)

📊 Original relation distribution (top 10):
   RelatedTo: 417,772 (28.3%)
   DerivedFrom: 324,167 (21.9%)
   FormOf: 294,073 (19.9%)
   Synonym: 107,359 (7.3%)
   IsA: 65,328 (4.4%)
   UsedFor: 39,470 (2.7%)
   AtLocation: 27,708 (1.9%)
   HasSubevent: 25,238 (1.7%)
   HasPrerequisite: 22,710 (1.5%)
   CapableOf: 22,677 (1.5%)

⚖️  Sorting by weight within each relation...

🎯 Target samples per relation:
   RelatedTo: 1,414
   DerivedFrom: 1,097
   FormOf: 995
   Synonym: 363
   IsA: 221
   UsedFor: 133
   AtLocation: 93
   HasSubevent: 85
   HasPrerequisite: 76
   CapableOf: 76
   Total targeted: 4,983

🔄 Sampling from each relation group...


Sampling relations: 100%|██████████| 47/47 [00:01<00:00, 27.80it/s]


   Short by 17 samples, filling with highest-weight remaining...

✅ Stratified sample created!
   Final size: 5,000 triples
   Weight range: 0.500 - 22.891
   Mean weight: 4.063

📊 Final relation distribution (top 10):
   RelatedTo: 1,424 (28.5% vs 28.3% orig)
   DerivedFrom: 1,097 (21.9% vs 21.9% orig)
   FormOf: 995 (19.9% vs 19.9% orig)
   Synonym: 363 (7.3% vs 7.3% orig)
   IsA: 226 (4.5% vs 4.4% orig)
   UsedFor: 133 (2.7% vs 2.7% orig)
   AtLocation: 93 (1.9% vs 1.9% orig)
   HasSubevent: 85 (1.7% vs 1.7% orig)
   CapableOf: 77 (1.5% vs 1.5% orig)
   HasPrerequisite: 76 (1.5% vs 1.5% orig)

🔍 Sample characteristics:
Weight distribution:
count    5000.000000
mean        4.062885
std         2.422774
min         0.500000
25%         2.000000
50%         2.828000
75%         5.759000
max        22.891000
Name: edge_weight, dtype: float64

Top concept pairs by weight:
   baseball --IsA--> sport (weight: 22.891)
   baseball --IsA--> sport (weight: 22.891)
   yo_yo --IsA--> toy (weight

## Phase 3
***
Agent memory graph construction

In [None]:
# Phase 2.B - Initialize Agent with Stratified Seed Data

print("🧠 Initializing Knowledge Graph Agent with stratified seed...")
agent = KnowledgeGraphAgent(validate_on_add=True, verbose=False)

# Load the stratified sample
print(f"📊 Loading {len(stratified_seed_data):,} stratified triples...")
agent.bulk_load_triples(stratified_seed_data)

print("\n" + "="*60)
print("🎯 TESTING AGENT WITH STRATIFIED SEED")
print("="*60)

# Test 1: Show relation diversity
print("\n📊 Relation diversity in loaded graph:")
relation_stats = {}
for _, _, data in agent.graph.edges(data=True):
    relation = data.get('relation', 'Unknown')
    if relation not in relation_stats:
        relation_stats[relation] = 0
    relation_stats[relation] += 1

# Show top relations
sorted_relations = sorted(relation_stats.items(), key=lambda x: x[1], reverse=True)
print("Top 10 relations in graph:")
for relation, count in sorted_relations[:10]:
    print(f"   {relation}: {count:,}")

# Test 2: Quality check - show high-weight concepts
print(f"\n⚖️  High-quality relationships (weight > 0.8):")
high_quality_count = 0
for start, end, data in agent.graph.edges(data=True):
    if data.get('weight', 0) > 0.8:
        relation = data.get('relation', 'Unknown')
        weight = data.get('weight', 0)
        print(f"   {start} --{relation}--> {end} (weight: {weight:.3f})")
        high_quality_count += 1
        if high_quality_count >= 10:  # Limit output
            break

print(f"Total high-quality relationships: {high_quality_count:,}")

# Test 3: Concept connectivity analysis
print(f"\n🕸️  Connectivity analysis:")
node_degrees = dict(agent.graph.degree())
top_connected = sorted(node_degrees.items(), key=lambda x: x[1], reverse=True)[:10]

print("Most connected concepts:")
for concept, degree in top_connected:
    print(f"   {concept}: {degree} connections")

# Test 4: Sample queries on well-connected concepts
print(f"\n🔍 Testing queries on top concepts:")
for concept, degree in top_connected[:3]:
    print(f"\n--- Relationships for '{concept}' (degree: {degree}) ---")
    agent.query_concept(concept, max_results=5)

print(f"\n🎉 Agent successfully initialized with high-quality stratified seed!")
print(f"   Ready for knowledge graph reasoning and expansion!")