# This is the project called Entity Alignment using Agentic AI

This is the script for the dataset processing. We are going to map the entity ids to the entity name.

In [16]:
import pandas as pd

# Load the train.cand_list.20 file as a pandas DataFrame
file_path = "data/DBP15K/torch_geometric_cache/raw/fr_en/train.cand_list.20"

# Read the file and parse it
data = []
with open(file_path, 'r') as f:
    for line in f:
        line = line.strip()
        if line:
            # Split by ': ' to separate index from candidate list
            parts = line.split(': ', 1)
            if len(parts) == 2:
                index = int(parts[0])
                candidates = [int(x) for x in parts[1].split()]
                data.append({'index': index, 'candidates': candidates, 'num_candidates': len(candidates)})

# Create DataFrame
df = pd.DataFrame(data)
print(f"DataFrame shape: {df.shape}")
print("\nFirst few rows:")
print(df.head())
print(f"\nDataFrame info:")
print(df.info())
print(f"\nSample candidates for first entity:")
print(f"Entity {df.iloc[0]['index']}: {df.iloc[0]['candidates'][:10]}...")  # Show first 10 candidates


DataFrame shape: (4500, 3)

First few rows:
   index                                         candidates  num_candidates
0      0  [10500, 33775, 36175, 17181, 20105, 32176, 117...              20
1      1  [10501, 36449, 11420, 18572, 33228, 15363, 333...              20
2      2  [20759, 10502, 18084, 16541, 13414, 15062, 332...              20
3      3  [10503, 15732, 31027, 19403, 38929, 37146, 176...              20
4      4  [33507, 15190, 10504, 19425, 38842, 37861, 324...              20

DataFrame info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4500 entries, 0 to 4499
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   index           4500 non-null   int64 
 1   candidates      4500 non-null   object
 2   num_candidates  4500 non-null   int64 
dtypes: int64(2), object(1)
memory usage: 105.6+ KB
None

Sample candidates for first entity:
Entity 0: [10500, 33775, 36175, 17181, 20105, 32176, 11787, 380

In [17]:
df.head()

Unnamed: 0,index,candidates,num_candidates
0,0,"[10500, 33775, 36175, 17181, 20105, 32176, 117...",20
1,1,"[10501, 36449, 11420, 18572, 33228, 15363, 333...",20
2,2,"[20759, 10502, 18084, 16541, 13414, 15062, 332...",20
3,3,"[10503, 15732, 31027, 19403, 38929, 37146, 176...",20
4,4,"[33507, 15190, 10504, 19425, 38842, 37861, 324...",20


I have successfully processed the DBP15K entity alignment dataset to create enhanced dataframes with comprehensive entity information including names, URIs, IDs, and language labels. This preprocessing step is crucial for the "Entity Alignment using Agentic AI" research project.
Key Accomplishments:
Data Loading & Parsing:
Loaded the train.cand_list.20 file containing 4,500 French entities with 20 English candidate matches each
Parsed entity candidate lists with proper data type handling
Entity Mapping Integration:
Successfully loaded 19,661 French entities from ent_ids_1
Successfully loaded 19,993 English entities from ent_ids_2
Created comprehensive entity mappings with names extracted from DBpedia URIs
Enhanced Dataset Creation:
Format: [entity_name, entity_uri, entity_id, language] for both index and candidate entities
Index entities: French DBpedia entities (KG1)
Candidate entities: English DBpedia entities (KG2)
Zero unknown entities - 100% mapping success rate
Output Files Generated:
entity_alignment_with_names_and_uris.csv: Main dataset with candidate lists as strings (4,500 rows)
entity_alignment_flattened_with_uris.csv: Flattened format with each index-candidate pair as separate row (90,000 rows)
Data Quality:
Total entities processed: 4,500 French index entities
Total candidate pairs: 90,000 (exactly 20 candidates per entity)
Coverage: 100% entity name resolution (no unknown entities)
Languages: Proper French/English language tagging
Technical Implementation:
Created reusable functions for entity mapping loading
Implemented robust error handling for unknown entities
Generated comprehensive statistics and validation reports
Organized outputs in dedicated processed_data/ directory

In [18]:
import os
import pandas as pd
import ast

# Create processed_data directory if it doesn't exist
os.makedirs('processed_data', exist_ok=True)

def load_entity_mappings(file_path):
    """Load entity ID to URI mappings from file"""
    mappings = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if line:
                parts = line.split('\t', 1)
                if len(parts) == 2:
                    entity_id = int(parts[0])
                    entity_uri = parts[1]
                    # Extract entity name from URI (last part after /)
                    entity_name = entity_uri.split('/')[-1]
                    mappings[entity_id] = {
                        'name': entity_name,
                        'uri': entity_uri
                    }
    return mappings

# Load entity mappings
print("Loading entity mappings...")
kg1_mappings = load_entity_mappings("data/DBP15K/torch_geometric_cache/raw/fr_en/ent_ids_1")  # French entities
kg2_mappings = load_entity_mappings("data/DBP15K/torch_geometric_cache/raw/fr_en/ent_ids_2")  # English entities

print(f"Loaded {len(kg1_mappings)} entities from KG1 (French)")
print(f"Loaded {len(kg2_mappings)} entities from KG2 (English)")

# Check first few mappings
print("\nSample mappings from KG1:")
for i, (k, v) in enumerate(list(kg1_mappings.items())[:3]):
    print(f"  ID {k}: {v['name']} ({v['uri']})")

print("\nSample mappings from KG2:")
for i, (k, v) in enumerate(list(kg2_mappings.items())[:3]):
    print(f"  ID {k}: {v['name']} ({v['uri']})")


Loading entity mappings...
Loaded 19661 entities from KG1 (French)
Loaded 19993 entities from KG2 (English)

Sample mappings from KG1:
  ID 0: Saint-Joseph-de-Coleraine (http://fr.dbpedia.org/resource/Saint-Joseph-de-Coleraine)
  ID 1: Self_Portrait (http://fr.dbpedia.org/resource/Self_Portrait)
  ID 2: Alliance_des_libéraux_et_des_démocrates_pour_l'Europe (http://fr.dbpedia.org/resource/Alliance_des_libéraux_et_des_démocrates_pour_l'Europe)

Sample mappings from KG2:
  ID 10500: Saint-Joseph-de-Coleraine,_Quebec (http://dbpedia.org/resource/Saint-Joseph-de-Coleraine,_Quebec)
  ID 10501: Self_Portrait_(Bob_Dylan_album) (http://dbpedia.org/resource/Self_Portrait_(Bob_Dylan_album))
  ID 10502: Alliance_of_Liberals_and_Democrats_for_Europe_Party (http://dbpedia.org/resource/Alliance_of_Liberals_and_Democrats_for_Europe_Party)


In [19]:
# Create the enhanced dataframe with entity names and URIs
entity_alignment_data = []

print("Creating enhanced dataframe with entity names and URIs...")
for _, row in df.iterrows():
    index_id = row['index']
    candidates = row['candidates']
    
    # Get index entity information (from KG1 - French)
    if index_id in kg1_mappings:
        index_name = kg1_mappings[index_id]['name']
        index_uri = kg1_mappings[index_id]['uri']
        index_language = 'French'
    else:
        index_name = f"Unknown_Entity_{index_id}"
        index_uri = f"Unknown_URI_{index_id}"
        index_language = 'Unknown'
    
    # Get candidate entities information (from KG2 - English)
    candidate_info = []
    for candidate_id in candidates:
        if candidate_id in kg2_mappings:
            candidate_name = kg2_mappings[candidate_id]['name']
            candidate_uri = kg2_mappings[candidate_id]['uri']
            candidate_language = 'English'
        else:
            candidate_name = f"Unknown_Entity_{candidate_id}"
            candidate_uri = f"Unknown_URI_{candidate_id}"
            candidate_language = 'Unknown'
        
        candidate_info.append([candidate_name, candidate_uri, candidate_id, candidate_language])
    
    # Create the row structure: [index_name, index_uri, index_id, language, candidates_list]
    entity_alignment_data.append({
        'index_name': index_name,
        'index_uri': index_uri,
        'index_id': index_id,
        'index_language': index_language,
        'candidates': candidate_info,
        'num_candidates': len(candidate_info)
    })

# Create DataFrame
enhanced_df = pd.DataFrame(entity_alignment_data)

print(f"Enhanced DataFrame shape: {enhanced_df.shape}")
print("\nFirst few rows of enhanced DataFrame:")
print(enhanced_df[['index_name', 'index_uri', 'index_id', 'index_language', 'num_candidates']].head())

# Show a sample of candidates for the first entity
print(f"\nSample candidates for first entity ('{enhanced_df.iloc[0]['index_name']}'):")
for i, candidate in enumerate(enhanced_df.iloc[0]['candidates'][:5]):  # Show first 5 candidates
    print(f"  {i+1}. {candidate[0]} (URI: {candidate[1]}, ID: {candidate[2]}, Language: {candidate[3]})")


Creating enhanced dataframe with entity names and URIs...
Enhanced DataFrame shape: (4500, 6)

First few rows of enhanced DataFrame:
                                          index_name  \
0                          Saint-Joseph-de-Coleraine   
1                                      Self_Portrait   
2  Alliance_des_libéraux_et_des_démocrates_pour_l...   
3                                             Wallon   
4                                            Android   

                                           index_uri  index_id index_language  \
0  http://fr.dbpedia.org/resource/Saint-Joseph-de...         0         French   
1       http://fr.dbpedia.org/resource/Self_Portrait         1         French   
2  http://fr.dbpedia.org/resource/Alliance_des_li...         2         French   
3              http://fr.dbpedia.org/resource/Wallon         3         French   
4             http://fr.dbpedia.org/resource/Android         4         French   

   num_candidates  
0              20  
1  

In [20]:
# Save the enhanced dataframe to CSV
csv_file_path = 'processed_data/entity_alignment_with_names_and_uris.csv'

# For CSV export, we need to convert the candidates list to a string format
# that can be properly saved and loaded later
enhanced_df_for_csv = enhanced_df.copy()
enhanced_df_for_csv['candidates_str'] = enhanced_df_for_csv['candidates'].apply(str)

# Save the main columns (excluding the original candidates list which is complex)
columns_to_save = ['index_name', 'index_uri', 'index_id', 'index_language', 'candidates_str', 'num_candidates']
enhanced_df_for_csv[columns_to_save].to_csv(csv_file_path, index=False)

print(f"Enhanced dataframe saved to: {csv_file_path}")

# Also create a more readable version where each candidate is on a separate row
flattened_data = []
for _, row in enhanced_df.iterrows():
    index_name = row['index_name']
    index_uri = row['index_uri']
    index_id = row['index_id']
    index_language = row['index_language']
    
    for candidate in row['candidates']:
        candidate_name, candidate_uri, candidate_id, candidate_language = candidate
        flattened_data.append({
            'index_name': index_name,
            'index_uri': index_uri,
            'index_id': index_id,
            'index_language': index_language,
            'candidate_name': candidate_name,
            'candidate_uri': candidate_uri,
            'candidate_id': candidate_id,
            'candidate_language': candidate_language
        })

flattened_df = pd.DataFrame(flattened_data)
flattened_csv_path = 'processed_data/entity_alignment_flattened_with_uris.csv'
flattened_df.to_csv(flattened_csv_path, index=False)

print(f"Flattened dataframe saved to: {flattened_csv_path}")
print(f"Flattened DataFrame shape: {flattened_df.shape}")
print("\nFirst few rows of flattened DataFrame:")
print(flattened_df.head(10))


Enhanced dataframe saved to: processed_data/entity_alignment_with_names_and_uris.csv
Flattened dataframe saved to: processed_data/entity_alignment_flattened_with_uris.csv
Flattened DataFrame shape: (90000, 8)

First few rows of flattened DataFrame:
                  index_name  \
0  Saint-Joseph-de-Coleraine   
1  Saint-Joseph-de-Coleraine   
2  Saint-Joseph-de-Coleraine   
3  Saint-Joseph-de-Coleraine   
4  Saint-Joseph-de-Coleraine   
5  Saint-Joseph-de-Coleraine   
6  Saint-Joseph-de-Coleraine   
7  Saint-Joseph-de-Coleraine   
8  Saint-Joseph-de-Coleraine   
9  Saint-Joseph-de-Coleraine   

                                           index_uri  index_id index_language  \
0  http://fr.dbpedia.org/resource/Saint-Joseph-de...         0         French   
1  http://fr.dbpedia.org/resource/Saint-Joseph-de...         0         French   
2  http://fr.dbpedia.org/resource/Saint-Joseph-de...         0         French   
3  http://fr.dbpedia.org/resource/Saint-Joseph-de...         0         Fre

In [21]:
# Summary statistics
print("=== SUMMARY STATISTICS ===")
print(f"Total number of index entities (French): {len(enhanced_df)}")
print(f"Total number of candidate pairs: {len(flattened_df)}")
print(f"Average candidates per index entity: {flattened_df.shape[0] / enhanced_df.shape[0]:.2f}")

# Check for any unknown entities
unknown_index = enhanced_df[enhanced_df['index_language'] == 'Unknown'].shape[0]
unknown_candidates = flattened_df[flattened_df['candidate_language'] == 'Unknown'].shape[0]

print(f"\nUnknown entities:")
print(f"  - Unknown index entities: {unknown_index}")
print(f"  - Unknown candidate entities: {unknown_candidates}")

print(f"\nFiles created in 'processed_data' folder:")
print(f"  1. entity_alignment_with_names_and_uris.csv - Main dataframe with candidates as string")
print(f"  2. entity_alignment_flattened_with_uris.csv - Each index-candidate pair as separate row")

# Show sample of the data structure requested
print(f"\n=== SAMPLE DATA STRUCTURE ===")
sample_entity = enhanced_df.iloc[0]
print(f"Format: [entity_name, entity_uri, entity_id, language] [[candidate_1_name, candidate_1_uri, candidate_1_id, language], ...]")
print(f"Example:")
print(f"['{sample_entity['index_name']}', '{sample_entity['index_uri']}', {sample_entity['index_id']}, '{sample_entity['index_language']}']")
print(f"Candidates: {sample_entity['candidates'][:2]}...")  # Show first 2 candidates


=== SUMMARY STATISTICS ===
Total number of index entities (French): 4500
Total number of candidate pairs: 90000
Average candidates per index entity: 20.00

Unknown entities:
  - Unknown index entities: 0
  - Unknown candidate entities: 0

Files created in 'processed_data' folder:
  1. entity_alignment_with_names_and_uris.csv - Main dataframe with candidates as string
  2. entity_alignment_flattened_with_uris.csv - Each index-candidate pair as separate row

=== SAMPLE DATA STRUCTURE ===
Format: [entity_name, entity_uri, entity_id, language] [[candidate_1_name, candidate_1_uri, candidate_1_id, language], ...]
Example:
['Saint-Joseph-de-Coleraine', 'http://fr.dbpedia.org/resource/Saint-Joseph-de-Coleraine', 0, 'French']
Candidates: [['Saint-Joseph-de-Coleraine,_Quebec', 'http://dbpedia.org/resource/Saint-Joseph-de-Coleraine,_Quebec', 10500, 'English'], ['Saint-Joseph-de-Beauce', 'http://dbpedia.org/resource/Saint-Joseph-de-Beauce', 33775, 'English']]...


In [31]:
# Entity Categorization using Ollama gemma:4b

import requests
import json
import time
import pandas as pd
from tqdm import tqdm
import logging

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def query_ollama_gemma(entity_name, entity_uri, max_retries=3, delay=1):
    """
    Query Ollama gemma:4b model to categorize an entity
    
    Args:
        entity_name (str): Name of the entity
        entity_uri (str): URI of the entity for additional context
        max_retries (int): Maximum number of retry attempts
        delay (int): Delay between retries in seconds
    
    Returns:
        str: Category of the entity
    """
    
    # Define the categories
    categories = ["Person", "Place", "Event", "building/place", "Creative Work", "uncertain"]
    
    # Create the prompt
    prompt = f"""You are an expert entity classifier. Given an entity name and its URI, classify it into one of these categories:
- Person: Individual people, historical figures, celebrities, etc.
- Place: Geographical locations, cities, countries, regions, etc.
- Event: Historical events, competitions, festivals, wars, etc.
- building/place: Specific buildings, monuments, structures, venues, etc.
- Creative Work: Books, movies, songs, artworks, publications, etc.
- uncertain: When the category is unclear or ambiguous

Entity Name: {entity_name}
Entity URI: {entity_uri}

Based on the entity name and URI context, classify this entity. Respond with ONLY one of these exact categories: {', '.join(categories)}

Category:"""

    url = "http://localhost:11434/api/generate"
    
    data = {
        "model": "gemma3:1b",
        "prompt": prompt,
        "stream": False,
        "options": {
            "temperature": 0.1,
            "top_p": 0.9,
            "num_predict": 50
        }
    }
    
    for attempt in range(max_retries):
        try:
            response = requests.post(url, json=data, timeout=30)
            response.raise_for_status()
            
            result = response.json()
            category = result.get('response', '').strip()
            
            # Clean and validate the response
            category = category.replace('.', '').replace('\n', '').strip()
            
            # Check if the response matches one of our categories
            for cat in categories:
                if cat.lower() in category.lower():
                    return cat
            
            # If no exact match, try to map common variations
            category_lower = category.lower()
            if any(word in category_lower for word in ['person', 'people', 'individual', 'human']):
                return "Person"
            elif any(word in category_lower for word in ['place', 'location', 'city', 'country', 'region']):
                return "Place"
            elif any(word in category_lower for word in ['event', 'competition', 'festival', 'war']):
                return "Event"
            elif any(word in category_lower for word in ['building', 'structure', 'monument', 'venue']):
                return "building/place"
            elif any(word in category_lower for word in ['creative', 'work', 'book', 'movie', 'song', 'art']):
                return "Creative Work"
            else:
                return "uncertain"
                
        except requests.exceptions.RequestException as e:
            logger.warning(f"Attempt {attempt + 1} failed for {entity_name}: {str(e)}")
            if attempt < max_retries - 1:
                time.sleep(delay * (attempt + 1))
            else:
                logger.error(f"All attempts failed for {entity_name}")
                return "uncertain"
        except Exception as e:
            logger.error(f"Unexpected error for {entity_name}: {str(e)}")
            return "uncertain"
    
    return "uncertain"

# Test the connection to Ollama
def test_ollama_connection():
    """Test if Ollama is running and gemma:4b is available"""
    try:
        url = "http://localhost:11434/api/tags"
        response = requests.get(url, timeout=5)
        response.raise_for_status()
        
        models = response.json().get('models', [])
        gemma_available = any('gemma3:1b' in model.get('name', '') for model in models)
        
        if gemma_available:
            print("✅ Ollama is running and gemma:4b model is available")
            return True
        else:
            print("❌ gemma:4b model not found. Available models:")
            for model in models:
                print(f"  - {model.get('name', 'Unknown')}")
            return False
            
    except requests.exceptions.RequestException as e:
        print(f"❌ Failed to connect to Ollama: {str(e)}")
        print("Make sure Ollama is running on localhost:11434")
        return False

# Test the connection
if test_ollama_connection():
    print("Ready to start entity categorization!")
else:
    print("Please start Ollama and ensure gemma3:1b model is available before proceeding.")

✅ Ollama is running and gemma:4b model is available
Ready to start entity categorization!


In [32]:
# Main batch processing functions for entity categorization

def categorize_entities_batch(enhanced_df, batch_size=50, save_interval=100):
    """
    Categorize entities using Ollama gemma:4b model in batches
    
    Args:
        enhanced_df (pd.DataFrame): DataFrame with entity information
        batch_size (int): Number of entities to process in each batch
        save_interval (int): Save progress every N entities
    
    Returns:
        pd.DataFrame: DataFrame with categorized entities
    """
    
    # Create a unique list of index entities to categorize
    unique_entities = enhanced_df[['index_name', 'index_uri', 'index_id', 'index_language']].drop_duplicates()
    
    print(f"Starting categorization of {len(unique_entities)} unique entities")
    print(f"Batch size: {batch_size}, Save interval: {save_interval}")
    
    # Initialize results list
    categorized_entities = []
    
    # Progress tracking
    processed_count = 0
    start_time = time.time()
    
    # Process entities in batches
    for i in tqdm(range(0, len(unique_entities), batch_size), desc="Processing batches"):
        batch = unique_entities.iloc[i:i+batch_size]
        
        for _, entity in batch.iterrows():
            entity_name = entity['index_name']
            entity_uri = entity['index_uri']
            entity_id = entity['index_id']
            entity_language = entity['index_language']
            
            # Query Ollama for categorization
            category = query_ollama_gemma(entity_name, entity_uri)
            
            # Store results
            categorized_entities.append({
                'entity_name': entity_name,
                'entity_uri': entity_uri,
                'entity_id': entity_id,
                'entity_language': entity_language,
                'entity_category': category,
                'timestamp': time.strftime('%Y-%m-%d %H:%M:%S')
            })
            
            processed_count += 1
            
            # Save intermediate results
            if processed_count % save_interval == 0:
                temp_df = pd.DataFrame(categorized_entities)
                temp_csv_path = f'processed_data/categorised_entities_by_gemma_temp_{processed_count}.csv'
                temp_df.to_csv(temp_csv_path, index=False)
                
                elapsed_time = time.time() - start_time
                entities_per_sec = processed_count / elapsed_time
                print(f"\n💾 Saved intermediate results: {processed_count}/{len(unique_entities)} entities")
                print(f"⏱️  Processing speed: {entities_per_sec:.2f} entities/second")
                print(f"📊 Current category breakdown:")
                print(temp_df['entity_category'].value_counts().to_string())
                print()
        
        # Add a small delay between batches to avoid overwhelming the API
        time.sleep(0.5)
    
    # Create final DataFrame
    final_df = pd.DataFrame(categorized_entities)
    
    # Save final results
    final_csv_path = 'processed_data/categorised_entities_by_gemma.csv'
    final_df.to_csv(final_csv_path, index=False)
    
    # Calculate final statistics
    elapsed_time = time.time() - start_time
    entities_per_sec = processed_count / elapsed_time
    
    print(f"\n🎉 Categorization completed!")
    print(f"📁 Final results saved to: {final_csv_path}")
    print(f"📊 Total entities processed: {processed_count}")
    print(f"⏱️  Total time: {elapsed_time:.2f} seconds ({entities_per_sec:.2f} entities/second)")
    print(f"\n📈 Final category breakdown:")
    print(final_df['entity_category'].value_counts().to_string())
    
    return final_df

# Test with a small sample first (optional)
def test_categorization_sample(enhanced_df, sample_size=5):
    """Test categorization with a small sample"""
    print(f"Testing categorization with {sample_size} entities...")
    
    sample_entities = enhanced_df.head(sample_size)
    test_results = []
    
    for _, entity in sample_entities.iterrows():
        entity_name = entity['index_name']
        entity_uri = entity['index_uri']
        
        print(f"🔍 Categorizing: {entity_name}")
        category = query_ollama_gemma(entity_name, entity_uri)
        print(f"📝 Category: {category}")
        
        test_results.append({
            'entity_name': entity_name,
            'entity_uri': entity_uri,
            'entity_category': category
        })
        
        time.sleep(1)
    
    test_df = pd.DataFrame(test_results)
    print(f"\n📊 Test results:")
    print(test_df[['entity_name', 'entity_category']].to_string(index=False))
    
    return test_df

print("Entity categorization functions loaded!")
print("Choose an option:")
print("1. Run test with 5 sample entities: test_categorization_sample(enhanced_df)")
print("2. Start full categorization: categorize_entities_batch(enhanced_df)")
print("3. Start with custom batch size: categorize_entities_batch(enhanced_df, batch_size=10)")


Entity categorization functions loaded!
Choose an option:
1. Run test with 5 sample entities: test_categorization_sample(enhanced_df)
2. Start full categorization: categorize_entities_batch(enhanced_df)
3. Start with custom batch size: categorize_entities_batch(enhanced_df, batch_size=10)


In [33]:
# Execute entity categorization
# Uncomment the line below to run the categorization you want:

# Option 1: Test with 5 sample entities first
# test_result = test_categorization_sample(enhanced_df, sample_size=5)

# Option 2: Run full categorization with default settings (50 entities per batch)
# categorized_df = categorize_entities_batch(enhanced_df)

# Option 3: Run with smaller batch size (good for testing)
categorized_df = categorize_entities_batch(enhanced_df, batch_size=10, save_interval=50)

print("🚀 Ready to categorize entities!")
print("Uncomment one of the options above to start the categorization process.")
print("Make sure Ollama is running with gemma:4b model before proceeding.")


Starting categorization of 4500 unique entities
Batch size: 10, Save interval: 50


Processing batches:   1%|          | 4/450 [01:51<3:25:39, 27.67s/it]


💾 Saved intermediate results: 50/4500 entities
⏱️  Processing speed: 0.36 entities/second
📊 Current category breakdown:
entity_category
Creative Work    24
Person           16
uncertain         7
Place             2
Event             1



Processing batches:   2%|▏         | 8/450 [04:05<3:45:46, 30.65s/it]


KeyboardInterrupt: 

In [4]:
import pandas as pd

file_path = r"C:\Users\DavidCaraman\Desktop\Licenta 1.0\mvp-kg-alignment\processed_data\fr_en_train_cand_list_20_entity_alignment_with_names.csv"

df = pd.read_csv(file_path, usecols=['index_name', 'index_id'])

print(df.head())



                                          index_name  index_id
0                          Saint-Joseph-de-Coleraine         0
1                                      Self_Portrait         1
2  Alliance_des_libéraux_et_des_démocrates_pour_l...         2
3                                             Wallon         3
4                                            Android         4


In [6]:
import pandas as pd

file_path = r"C:\Users\DavidCaraman\Desktop\Licenta 1.0\mvp-kg-alignment\processed_data\fr_en_train_cand_list_20_entity_alignment_with_names.csv"

df = pd.read_csv(file_path, usecols=['index_name', 'index_id'])

print(df.head())

df.to_csv(r"C:\Users\DavidCaraman\Desktop\Licenta 1.0\mvp-kg-alignment\processed_data\entity_names_id.csv", index=False)

                                          index_name  index_id
0                          Saint-Joseph-de-Coleraine         0
1                                      Self_Portrait         1
2  Alliance_des_libéraux_et_des_démocrates_pour_l...         2
3                                             Wallon         3
4                                            Android         4


In [16]:
def batch_entities_by_category(csv_path: str, batch_size: int = 500):
    """
    Loads a CSV, groups entities by category, and creates batches for each category.

    Args:
        csv_path (str): The path to the input CSV file.
        batch_size (int): The number of entities per batch.

    Returns:
        dict: A dictionary where keys are category names and values are lists
              of pandas DataFrames, with each DataFrame representing a batch.
    """
    try:
        df = pd.read_csv(csv_path)
        print(f"Successfully loaded {len(df)} rows from '{csv_path}'.")
    except FileNotFoundError:
        print(f"Error: The file '{csv_path}' was not found.")
        return None

    grouped_by_category = df.groupby('category')
    all_batches = {}

    for category_name, category_df in grouped_by_category:
        category_batches = []
        num_entities = len(category_df)
        for i in range(0, num_entities, batch_size):
            batch_df = category_df.iloc[i : i + batch_size]
            category_batches.append(batch_df)
        all_batches[category_name] = category_batches
        print(f"Category '{category_name}': Created {len(category_batches)} batches.")

    return all_batches

# --- 3. NEW: Function to Save Batches to CSV Files ---

def save_batches_to_csv(batched_data: dict, output_dir: str):
    """
    Saves each batch DataFrame to a separate CSV file in the specified directory.

    Args:
        batched_data (dict): The dictionary of batches generated by batch_entities_by_category.
        output_dir (str): The directory where the CSV files will be saved.
    """
    print(f"\nSaving batches to directory: '{output_dir}'")
    
    # Create the output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Iterate through each category and its list of batches
    for category, batches in batched_data.items():
        # Sanitize the category name to make it a valid filename
        # This replaces slashes (like in 'buildings/places') with underscores
        sanitized_category_name = category.replace('/', '_')
        
        print(f"-> Saving category: {category}")
        
        # Iterate through each batch in the list, using enumerate for a batch ID
        for i, batch_df in enumerate(batches):
            # The batch ID will be 1, 2, 3, ... instead of 0, 1, 2, ...
            batch_id = i + 1
            
            # Construct the filename
            filename = f"{sanitized_category_name}_batch_{batch_id}.csv"
            filepath = os.path.join(output_dir, filename)
            
            # Save the batch DataFrame to a CSV file
            # index=False prevents pandas from writing the DataFrame index as a column
            batch_df.to_csv(filepath, index=False)
    
    print("\nAll batches have been saved successfully.")


# --- 4. Running the Code ---


    # Step 2: Process the CSV and create the batches in memory
batched_data = batch_entities_by_category(r"C:\Users\DavidCaraman\Desktop\Licenta 1.0\mvp-kg-alignment\processed_data\fr_en_train_cand_list_20_entity_alignment_with_names_and_uris_category.csv", batch_size=500)

    # Step 3: Save the generated batches to files, if any were created
if batched_data:
    save_batches_to_csv(batched_data, output_dir=r"C:\Users\DavidCaraman\Desktop\Licenta 1.0\mvp-kg-alignment\categories")

    print("\n--- Example Output ---")
    print(f"Check the 'categories' folder.")
    print("You should see files like:")
    print("  - Person_batch_1.csv")
    print("  - place_batch_1.csv")
    print("  - buildings_places_batch_1.csv")
    print("  - etc.")

Successfully loaded 4500 rows from 'C:\Users\DavidCaraman\Desktop\Licenta 1.0\mvp-kg-alignment\processed_data\fr_en_train_cand_list_20_entity_alignment_with_names_and_uris_category.csv'.
Category 'Person': Created 4 batches.
Category 'buildings/places': Created 1 batches.
Category 'creative_work': Created 2 batches.
Category 'event': Created 1 batches.
Category 'place': Created 2 batches.
Category 'uncertain': Created 2 batches.

Saving batches to directory: 'C:\Users\DavidCaraman\Desktop\Licenta 1.0\mvp-kg-alignment\categories'
-> Saving category: Person
-> Saving category: buildings/places
-> Saving category: creative_work
-> Saving category: event
-> Saving category: place
-> Saving category: uncertain

All batches have been saved successfully.

--- Example Output ---
Check the 'categories' folder.
You should see files like:
  - Person_batch_1.csv
  - place_batch_1.csv
  - buildings_places_batch_1.csv
  - etc.


In [1]:
import pandas as pd
import os

# --- Configuration ---
SOURCE_FILE = 'mvp-kg-alignment\processed_data\entity_name_id_category.csv'
TARGET_FILE = 'mvp-kg-alignment\processed_data\fr_en_train_cand_list_20_entity_alignment_with_names_and_uris.csv'
OUTPUT_FILE = 'mvp-kg-alignment\processed_data\fr_en_train_cand_list_20_entity_alignment_with_names_and_uris_category.csv'

def merge_category_data(source_path, target_path, output_path):
    """
    Merges a category column from a source CSV into a target CSV.

    Args:
        source_path (str): Path to the CSV with (index_id, category).
        target_path (str): Path to the main CSV file.
        output_path (str): Path to save the merged CSV file.
    """
    # --- 1. Load the CSV files into pandas DataFrames ---
    try:
        source_df = pd.read_csv(source_path)
        target_df = pd.read_csv(target_path)
    except FileNotFoundError as e:
        print(f"Error: Could not find a file. {e}")
        return

    print("Successfully loaded source and target files.")
    
    # --- 2. Prepare the source data ---
    # We only need the 'index_id' and 'category' columns for the merge.
    # This avoids potential conflicts with other columns like 'index_name'.
    category_data = source_df[['index_id', 'category']]

    # --- 3. Merge the two DataFrames ---
    # We use a 'left' merge to keep all rows from the target_df.
    # If an index_id from the target file doesn't exist in the source file,
    # the 'category' for that row will be empty (NaN).
    merged_df = pd.merge(target_df, category_data, on='index_id', how='left')
    
    # Fill any missing categories with an empty string or 'N/A' if preferred
    merged_df['category'].fillna('', inplace=True)
    
    # --- 4. Reorder columns to the desired structure ---
    # The new 'category' column is currently at the end. We move it.
    desired_order = [
        'index_name',
        'index_uri',
        'index_id',
        'index_language',
        'category',  # <-- Inserted here
        'candidates_str',
        'num_candidates'
    ]
    
    # Check if all desired columns exist in the merged dataframe
    # This is a safeguard in case of unexpected column names
    final_df = merged_df[desired_order]
    
    # --- 5. Save the result to a new CSV file ---
    # index=False prevents pandas from writing the DataFrame index as a column
    final_df.to_csv(output_path, index=False)
    
    print(f"Successfully merged data and saved to '{output_path}'")
    print(f"\nFinal DataFrame preview:\n{final_df.head()}")

In [9]:
SOURCE_FILE = r"C:\Users\DavidCaraman\Desktop\Licenta 1.0\mvp-kg-alignment\processed_data\entity_name_id_category.csv"
TARGET_FILE = r"C:\Users\DavidCaraman\Desktop\Licenta 1.0\mvp-kg-alignment\processed_data\fr_en_train_cand_list_20_entity_alignment_with_names_and_uris.csv"
OUTPUT_FILE = r"C:\Users\DavidCaraman\Desktop\Licenta 1.0\mvp-kg-alignment\processed_data\fr_en_train_cand_list_20_entity_alignment_with_names_and_uris_category.csv"
merge_category_data(SOURCE_FILE, TARGET_FILE, OUTPUT_FILE)

Successfully loaded source and target files.
Successfully merged data and saved to 'C:\Users\DavidCaraman\Desktop\Licenta 1.0\mvp-kg-alignment\processed_data\fr_en_train_cand_list_20_entity_alignment_with_names_and_uris_category.csv'

Final DataFrame preview:
                                          index_name  \
0                          Saint-Joseph-de-Coleraine   
1                                      Self_Portrait   
2  Alliance_des_libéraux_et_des_démocrates_pour_l...   
3                                             Wallon   
4                                            Android   

                                           index_uri  index_id index_language  \
0  http://fr.dbpedia.org/resource/Saint-Joseph-de...         0         French   
1       http://fr.dbpedia.org/resource/Self_Portrait         1         French   
2  http://fr.dbpedia.org/resource/Alliance_des_li...         2         French   
3              http://fr.dbpedia.org/resource/Wallon         3         French  

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  merged_df['category'].fillna('', inplace=True)
