# Entity Alignment using Agentic AI

This notebook performs the dataset processing step: it maps entity IDs to their entity names and URIs.

In [16]:
import pandas as pd

# Load the train.cand_list.20 file as a pandas DataFrame
file_path = "data/DBP15K/torch_geometric_cache/raw/fr_en/train.cand_list.20"

# Read the file and parse it.
# Each non-empty line is expected to look like "<index>: <id> <id> ...";
# lines that do not contain the ': ' separator are skipped.
data = []
# Explicit encoding keeps the read independent of the platform default and
# matches how the entity-id files are opened elsewhere in this notebook.
with open(file_path, 'r', encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        if not line:
            continue
        # Split on the first ': ' only, separating the index from its candidate list
        parts = line.split(': ', 1)
        if len(parts) != 2:
            continue
        index = int(parts[0])
        candidates = [int(x) for x in parts[1].split()]
        data.append({'index': index, 'candidates': candidates, 'num_candidates': len(candidates)})

# Create DataFrame: one row per index entity, with its candidate ids kept as a list
df = pd.DataFrame(data)
print(f"DataFrame shape: {df.shape}")
print("\nFirst few rows:")
print(df.head())
print("\nDataFrame info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() emitted a stray "None" line.
df.info()
print("\nSample candidates for first entity:")
print(f"Entity {df.iloc[0]['index']}: {df.iloc[0]['candidates'][:10]}...")  # Show first 10 candidates


DataFrame shape: (4500, 3)

First few rows:
   index                                         candidates  num_candidates
0      0  [10500, 33775, 36175, 17181, 20105, 32176, 117...              20
1      1  [10501, 36449, 11420, 18572, 33228, 15363, 333...              20
2      2  [20759, 10502, 18084, 16541, 13414, 15062, 332...              20
3      3  [10503, 15732, 31027, 19403, 38929, 37146, 176...              20
4      4  [33507, 15190, 10504, 19425, 38842, 37861, 324...              20

DataFrame info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4500 entries, 0 to 4499
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   index           4500 non-null   int64 
 1   candidates      4500 non-null   object
 2   num_candidates  4500 non-null   int64 
dtypes: int64(2), object(1)
memory usage: 105.6+ KB
None

Sample candidates for first entity:
Entity 0: [10500, 33775, 36175, 17181, 20105, 32176, 11787, 380

In [17]:
# Show the first rows of the parsed candidate DataFrame as this cell's output value.
df.head()

Unnamed: 0,index,candidates,num_candidates
0,0,"[10500, 33775, 36175, 17181, 20105, 32176, 117...",20
1,1,"[10501, 36449, 11420, 18572, 33228, 15363, 333...",20
2,2,"[20759, 10502, 18084, 16541, 13414, 15062, 332...",20
3,3,"[10503, 15732, 31027, 19403, 38929, 37146, 176...",20
4,4,"[33507, 15190, 10504, 19425, 38842, 37861, 324...",20


In [18]:
import os
import pandas as pd
import ast

# Create processed_data directory if it doesn't exist.
# exist_ok=True means no error is raised when the directory is already present,
# so this cell can be re-run safely.
os.makedirs('processed_data', exist_ok=True)

def load_entity_mappings(file_path):
    """Load entity ID to URI mappings from a tab-separated file.

    Each non-empty line is expected to be ``<integer id>\\t<uri>``. Lines that
    do not contain a tab are skipped.

    Args:
        file_path: Path to the mapping file (read as UTF-8).

    Returns:
        dict mapping each integer entity id to ``{'name': ..., 'uri': ...}``.
        ``name`` is the part of the URI after the first ``/resource/``, so
        names that themselves contain ``/`` (e.g. ``AC/DC``) are kept whole.
        URIs without a ``/resource/`` segment fall back to the text after the
        last ``/``.

    Raises:
        ValueError: If the id field of a line is not a valid integer.
        OSError: If the file cannot be opened.
    """
    resource_marker = '/resource/'
    mappings = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            # Split on the first tab only, so the URI is kept intact
            parts = line.split('\t', 1)
            if len(parts) != 2:
                continue
            entity_id = int(parts[0])
            entity_uri = parts[1]
            # Taking only the last '/'-separated segment would truncate names
            # containing a slash, so prefer everything after '/resource/'.
            _, marker, resource_name = entity_uri.partition(resource_marker)
            if marker:
                entity_name = resource_name
            else:
                entity_name = entity_uri.rsplit('/', 1)[-1]
            mappings[entity_id] = {
                'name': entity_name,
                'uri': entity_uri
            }
    return mappings

# Load the id -> {name, uri} lookup tables for both knowledge graphs
print("Loading entity mappings...")
kg1_mappings = load_entity_mappings("data/DBP15K/torch_geometric_cache/raw/fr_en/ent_ids_1")  # French entities
kg2_mappings = load_entity_mappings("data/DBP15K/torch_geometric_cache/raw/fr_en/ent_ids_2")  # English entities

print(f"Loaded {len(kg1_mappings)} entities from KG1 (French)")
print(f"Loaded {len(kg2_mappings)} entities from KG2 (English)")

# Preview the first three entries of each table (in insertion order) as a sanity check
for kg_label, kg_mappings in (("KG1", kg1_mappings), ("KG2", kg2_mappings)):
    print(f"\nSample mappings from {kg_label}:")
    for entity_id, entity in list(kg_mappings.items())[:3]:
        print(f"  ID {entity_id}: {entity['name']} ({entity['uri']})")


Loading entity mappings...
Loaded 19661 entities from KG1 (French)
Loaded 19993 entities from KG2 (English)

Sample mappings from KG1:
  ID 0: Saint-Joseph-de-Coleraine (http://fr.dbpedia.org/resource/Saint-Joseph-de-Coleraine)
  ID 1: Self_Portrait (http://fr.dbpedia.org/resource/Self_Portrait)
  ID 2: Alliance_des_libéraux_et_des_démocrates_pour_l'Europe (http://fr.dbpedia.org/resource/Alliance_des_libéraux_et_des_démocrates_pour_l'Europe)

Sample mappings from KG2:
  ID 10500: Saint-Joseph-de-Coleraine,_Quebec (http://dbpedia.org/resource/Saint-Joseph-de-Coleraine,_Quebec)
  ID 10501: Self_Portrait_(Bob_Dylan_album) (http://dbpedia.org/resource/Self_Portrait_(Bob_Dylan_album))
  ID 10502: Alliance_of_Liberals_and_Democrats_for_Europe_Party (http://dbpedia.org/resource/Alliance_of_Liberals_and_Democrats_for_Europe_Party)


In [19]:
# Create the enhanced dataframe with entity names and URIs
def _describe_entity(entity_id, mappings, language):
    """Return (name, uri, language) for entity_id.

    When entity_id is missing from mappings, placeholder strings carrying the
    id are returned and the language is reported as 'Unknown'.
    """
    entry = mappings.get(entity_id)
    if entry is None:
        return f"Unknown_Entity_{entity_id}", f"Unknown_URI_{entity_id}", 'Unknown'
    return entry['name'], entry['uri'], language


print("Creating enhanced dataframe with entity names and URIs...")
entity_alignment_data = []
for _, row in df.iterrows():
    index_id = row['index']

    # Index entities are resolved against KG1 (French)
    index_name, index_uri, index_language = _describe_entity(index_id, kg1_mappings, 'French')

    # Candidates are resolved against KG2 (English); each becomes
    # [name, uri, id, language], in the order they appear in the candidate list
    candidate_info = []
    for candidate_id in row['candidates']:
        cand_name, cand_uri, cand_language = _describe_entity(candidate_id, kg2_mappings, 'English')
        candidate_info.append([cand_name, cand_uri, candidate_id, cand_language])

    entity_alignment_data.append({
        'index_name': index_name,
        'index_uri': index_uri,
        'index_id': index_id,
        'index_language': index_language,
        'candidates': candidate_info,
        'num_candidates': len(candidate_info)
    })

# One row per index entity, with the nested candidate records kept in 'candidates'
enhanced_df = pd.DataFrame(entity_alignment_data)

print(f"Enhanced DataFrame shape: {enhanced_df.shape}")
print("\nFirst few rows of enhanced DataFrame:")
print(enhanced_df[['index_name', 'index_uri', 'index_id', 'index_language', 'num_candidates']].head())

# Preview the first five candidates of the first index entity
first_entity = enhanced_df.iloc[0]
print(f"\nSample candidates for first entity ('{first_entity['index_name']}'):")
for rank, (cand_name, cand_uri, cand_id, cand_language) in enumerate(first_entity['candidates'][:5], start=1):
    print(f"  {rank}. {cand_name} (URI: {cand_uri}, ID: {cand_id}, Language: {cand_language})")


Creating enhanced dataframe with entity names and URIs...
Enhanced DataFrame shape: (4500, 6)

First few rows of enhanced DataFrame:
                                          index_name  \
0                          Saint-Joseph-de-Coleraine   
1                                      Self_Portrait   
2  Alliance_des_libéraux_et_des_démocrates_pour_l...   
3                                             Wallon   
4                                            Android   

                                           index_uri  index_id index_language  \
0  http://fr.dbpedia.org/resource/Saint-Joseph-de...         0         French   
1       http://fr.dbpedia.org/resource/Self_Portrait         1         French   
2  http://fr.dbpedia.org/resource/Alliance_des_li...         2         French   
3              http://fr.dbpedia.org/resource/Wallon         3         French   
4             http://fr.dbpedia.org/resource/Android         4         French   

   num_candidates  
0              20  
1  

In [20]:
# Save the enhanced dataframe to CSV
csv_file_path = 'processed_data/entity_alignment_with_names_and_uris.csv'
columns_to_save = ['index_name', 'index_uri', 'index_id', 'index_language', 'candidates_str', 'num_candidates']

# The nested candidate lists are stringified into 'candidates_str' so each fits
# in a single CSV field; the original 'candidates' column is not exported.
enhanced_df_for_csv = enhanced_df.assign(candidates_str=enhanced_df['candidates'].apply(str))
enhanced_df_for_csv[columns_to_save].to_csv(csv_file_path, index=False)

print(f"Enhanced dataframe saved to: {csv_file_path}")

# Also create a more readable version: one row per (index entity, candidate) pair
flattened_data = [
    {
        'index_name': row['index_name'],
        'index_uri': row['index_uri'],
        'index_id': row['index_id'],
        'index_language': row['index_language'],
        'candidate_name': candidate_name,
        'candidate_uri': candidate_uri,
        'candidate_id': candidate_id,
        'candidate_language': candidate_language
    }
    for _, row in enhanced_df.iterrows()
    for candidate_name, candidate_uri, candidate_id, candidate_language in row['candidates']
]

flattened_df = pd.DataFrame(flattened_data)
flattened_csv_path = 'processed_data/entity_alignment_flattened_with_uris.csv'
flattened_df.to_csv(flattened_csv_path, index=False)

print(f"Flattened dataframe saved to: {flattened_csv_path}")
print(f"Flattened DataFrame shape: {flattened_df.shape}")
print("\nFirst few rows of flattened DataFrame:")
print(flattened_df.head(10))


Enhanced dataframe saved to: processed_data/entity_alignment_with_names_and_uris.csv
Flattened dataframe saved to: processed_data/entity_alignment_flattened_with_uris.csv
Flattened DataFrame shape: (90000, 8)

First few rows of flattened DataFrame:
                  index_name  \
0  Saint-Joseph-de-Coleraine   
1  Saint-Joseph-de-Coleraine   
2  Saint-Joseph-de-Coleraine   
3  Saint-Joseph-de-Coleraine   
4  Saint-Joseph-de-Coleraine   
5  Saint-Joseph-de-Coleraine   
6  Saint-Joseph-de-Coleraine   
7  Saint-Joseph-de-Coleraine   
8  Saint-Joseph-de-Coleraine   
9  Saint-Joseph-de-Coleraine   

                                           index_uri  index_id index_language  \
0  http://fr.dbpedia.org/resource/Saint-Joseph-de...         0         French   
1  http://fr.dbpedia.org/resource/Saint-Joseph-de...         0         French   
2  http://fr.dbpedia.org/resource/Saint-Joseph-de...         0         French   
3  http://fr.dbpedia.org/resource/Saint-Joseph-de...         0         Fre

In [21]:
# Summary statistics
num_index_entities = len(enhanced_df)
num_candidate_pairs = len(flattened_df)

print("=== SUMMARY STATISTICS ===")
print(f"Total number of index entities (French): {num_index_entities}")
print(f"Total number of candidate pairs: {num_candidate_pairs}")
print(f"Average candidates per index entity: {num_candidate_pairs / num_index_entities:.2f}")

# Count rows that fell back to the 'Unknown' placeholder during the id lookup
unknown_index = int((enhanced_df['index_language'] == 'Unknown').sum())
unknown_candidates = int((flattened_df['candidate_language'] == 'Unknown').sum())

print("\nUnknown entities:")
print(f"  - Unknown index entities: {unknown_index}")
print(f"  - Unknown candidate entities: {unknown_candidates}")

print("\nFiles created in 'processed_data' folder:")
print("  1. entity_alignment_with_names_and_uris.csv - Main dataframe with candidates as string")
print("  2. entity_alignment_flattened_with_uris.csv - Each index-candidate pair as separate row")

# Show one record in the nested layout: index entity first, then its candidates
print("\n=== SAMPLE DATA STRUCTURE ===")
sample_entity = enhanced_df.iloc[0]
sample_name = sample_entity['index_name']
sample_uri = sample_entity['index_uri']
sample_id = sample_entity['index_id']
sample_language = sample_entity['index_language']
print("Format: [entity_name, entity_uri, entity_id, language] [[candidate_1_name, candidate_1_uri, candidate_1_id, language], ...]")
print("Example:")
print(f"['{sample_name}', '{sample_uri}', {sample_id}, '{sample_language}']")
print(f"Candidates: {sample_entity['candidates'][:2]}...")  # Show first 2 candidates


=== SUMMARY STATISTICS ===
Total number of index entities (French): 4500
Total number of candidate pairs: 90000
Average candidates per index entity: 20.00

Unknown entities:
  - Unknown index entities: 0
  - Unknown candidate entities: 0

Files created in 'processed_data' folder:
  1. entity_alignment_with_names_and_uris.csv - Main dataframe with candidates as string
  2. entity_alignment_flattened_with_uris.csv - Each index-candidate pair as separate row

=== SAMPLE DATA STRUCTURE ===
Format: [entity_name, entity_uri, entity_id, language] [[candidate_1_name, candidate_1_uri, candidate_1_id, language], ...]
Example:
['Saint-Joseph-de-Coleraine', 'http://fr.dbpedia.org/resource/Saint-Joseph-de-Coleraine', 0, 'French']
Candidates: [['Saint-Joseph-de-Coleraine,_Quebec', 'http://dbpedia.org/resource/Saint-Joseph-de-Coleraine,_Quebec', 10500, 'English'], ['Saint-Joseph-de-Beauce', 'http://dbpedia.org/resource/Saint-Joseph-de-Beauce', 33775, 'English']]...
