In [1]:
import pandas as pd
import numpy as np
import os
import sys
import json
from tqdm import tqdm

In [2]:
# Resolve the directory two levels above the current working directory.
parent_dir = os.path.abspath(
    os.path.join(os.getcwd(), os.pardir, os.pardir)
)

# Add it to the module search path once, so re-running this cell does not
# keep appending duplicate entries.
if parent_dir not in sys.path:
    sys.path.append(parent_dir)
import Utils as u

Loading Dataset

In [3]:
# Root of the Hugging Face dataset repository holding the AIDA parquet files.
base_url = "hf://datasets/cyanic-selkie/aida-conll-yago-wikidata/"

# Map each split name to its parquet file name under `base_url`.
splits = {
    split_name: f"{split_name}.parquet"
    for split_name in ("train", "validation", "test")
}

In [4]:
# Read the three splits (train, validation, test) from the remote parquet
# files, in that order, into one DataFrame each.
df_train, df_val, df_test = [
    pd.read_parquet(f"{base_url}{splits[split_name]}")
    for split_name in ("train", "validation", "test")
]

In [5]:
# Report how many rows (documents) each split contains.
for split_label, split_frame in (
    ("Train", df_train),
    ("Validation", df_val),
    ("Test", df_test),
):
    print(f"{split_label}: {len(split_frame)} examples")

Train: 946 examples
Validation: 216 examples
Test: 231 examples


Showing Original Example

In [6]:
# Overall layout of the raw training frame.
print(f"Train shape: {df_train.shape}")
print(f"Columns: {df_train.columns.tolist()}")

# Peek at the first document: its text length and its entity annotations.
first_row = df_train.iloc[0]
print(f"\nFirst row text length: {len(first_row['text'])}")
first_entities = first_row['entities']
print(f"First row entities: {len(first_entities)}")

# Show one raw entity, but only when the first document has any.
if len(first_entities) > 0:
    print(f"First entity: {first_entities[0]}")

Train shape: (946, 3)
Columns: ['document_id', 'text', 'entities']

First row text length: 2790
First row entities: 48
First entity: {'start': 0, 'end': 2, 'tag': 'ORG', 'pageid': None, 'qid': None, 'title': None}


## Preprocessing Pipeline

The following steps transform the raw AIDA dataset into a format suitable for entity linking tasks. Each step addresses specific data quality issues and prepares the data for model training.

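Apply preprocessing to a sample (optional: the `df_sample` cells below are commented out, and the full train/validation/test splits are processed instead)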

In [7]:
#df_sample = df_train.head(100).copy()

### Step 1: Filter Valid Entities (Optional)

**Purpose:** Remove entities with invalid or incomplete information (e.g., missing QIDs, malformed spans).

**What it does:**
- Validates entity offsets are within text bounds
- Ensures all required fields (start, end, mention, qid) are present
- Filters out entities with corrupted data

**When to use:** Enable if the dataset has quality issues. AIDA is generally clean, so this step is optional.

In [8]:
# df_sample = u.apply_filter_valid_entities(df_sample, inplace=True)

### Step 2: Add Context to Entities

**Purpose:** Extract surrounding text context for each entity mention to provide disambiguation information.

**What it does:**
- Extracts text window before and after each mention (default: 200 characters)
- Creates `left_context`, `right_context`, and `full_context` fields
- Preserves original text boundaries (doesn't cut mid-word)

**Why it matters:** Context is crucial for entity linking - the surrounding words help determine which entity a mention refers to (e.g., "Jordan the country" vs "Michael Jordan").

In [9]:
#df_sample = u.apply_add_context(df_sample, context_window=200, inplace=False)

In [10]:
# Run the context-extraction helper (window of 200) over each raw split, in
# train / validation / test order.
# NOTE(review): inplace=False presumably leaves df_train/df_val/df_test
# untouched and returns new frames — confirm against Utils.
df_train_processed, df_val_processed, df_test_processed = [
    u.apply_add_context(raw_split, context_window=200, inplace=False)
    for raw_split in (df_train, df_val, df_test)
]

Adding context (window=200) to entities...
Adding context (window=200) to entities...
Adding context (window=200) to entities...


### Step 3: Normalize Mentions

**Purpose:** Standardize mention text for consistent matching and improved disambiguation.

**What it does:**
- Converts mentions to lowercase
- Removes extra whitespace
- Strips punctuation from edges
- Creates a `normalized_mention` field

**Why it matters:** "Microsoft", "microsoft", and "MICROSOFT" should be treated as the same mention. Normalization improves matching with knowledge bases and reduces vocabulary size.

In [11]:
#df_sample = u.apply_normalize_mentions(df_sample, inplace=True)

In [12]:
# Normalize mentions on each already-processed split (train, validation, test)
# and rebind the names to whatever the helper returns.
df_train_processed, df_val_processed, df_test_processed = [
    u.apply_normalize_mentions(split_frame, inplace=True)
    for split_frame in (df_train_processed, df_val_processed, df_test_processed)
]

Normalizing mentions in 946 documents...
Normalizing mentions in 216 documents...
Normalizing mentions in 231 documents...


### Step 4: Remove Overlapping Entities

**Purpose:** Resolve cases where multiple entity annotations cover the same or overlapping text spans.

**What it does:**
- Detects entities with overlapping character ranges
- Keeps the longer/more specific entity when overlap occurs
- Prevents duplicate or conflicting annotations

**Example:** If "New York City" and "York" are both annotated, keeps "New York City" and removes the partial "York".

**Why it matters:** Overlapping entities cause confusion during training and can lead to incorrect predictions.

In [13]:
#df_sample = u.apply_remove_overlaps(df_sample, inplace=True)

In [14]:
# Drop overlapping entity annotations from each processed split (train,
# validation, test) and rebind the names to whatever the helper returns.
df_train_processed, df_val_processed, df_test_processed = [
    u.apply_remove_overlaps(split_frame, inplace=True)
    for split_frame in (df_train_processed, df_val_processed, df_test_processed)
]

Removing overlapping entities...
  Removed 0 overlapping entities
Removing overlapping entities...
  Removed 0 overlapping entities
Removing overlapping entities...
  Removed 0 overlapping entities


In [15]:

# Persist the three processed splits as parquet files, then sanity-check that
# validation entities carry a `normalized_mention` key.
output_dir = '../../data/processed/aida'
os.makedirs(output_dir, exist_ok=True)

df_train_processed.to_parquet(f'{output_dir}/train.parquet', index=False)
df_val_processed.to_parquet(f'{output_dir}/validation.parquet', index=False)
df_test_processed.to_parquet(f'{output_dir}/test.parquet', index=False)

print(f"\n✅ Saved processed data to {output_dir}/")
print(f"   Train: {len(df_train_processed)} docs")
print(f"   Val: {len(df_val_processed)} docs")
print(f"   Test: {len(df_test_processed)} docs")

# Verify normalized_mention exists
print("\n🔍 Verifying preprocessing...")

# Flatten all validation entities once by walking the column directly.
# Taking the sample from this flat list (instead of `.iloc[0]['entities'][0]`)
# avoids an IndexError when the first document has no entities.
val_entities = [
    entity
    for doc_entities in df_val_processed['entities']
    for entity in doc_entities
]
# With no entities at all, fall back to an empty mapping so the check below
# reports the field as missing rather than crashing.
sample = val_entities[0] if val_entities else {}
print(f"Sample entity keys: {list(sample.keys())}")

if 'normalized_mention' in sample:
    print("✅ normalized_mention field present!")

    # Count unique normalized mentions in validation
    unique_val = {entity['normalized_mention'] for entity in val_entities}

    print(f"✅ Validation unique normalized mentions: {len(unique_val)}")
    # NOTE(review): hard-coded expectation; the recorded run printed 2597,
    # which falls outside this range — confirm the intended numbers.
    print("   (Should be ~1,500-2,000 instead of 2,795)")
else:
    print("❌ WARNING: normalized_mention missing!")


✅ Saved processed data to ../../data/processed/aida/
   Train: 946 docs
   Val: 216 docs
   Test: 231 docs

🔍 Verifying preprocessing...
Sample entity keys: ['start', 'end', 'tag', 'pageid', 'qid', 'title', 'mention', 'left_context', 'right_context', 'full_context', 'mention_start', 'mention_end', 'normalized_mention']
✅ normalized_mention field present!
✅ Validation unique normalized mentions: 2597
   (Should be ~1,500-2,000 instead of 2,795)


In [17]:
# Check train set reduction: compare the total number of entity mentions in the
# train split with the number of distinct normalized mentions.
print("\n🔍 Checking TRAIN set deduplication...")

# Inputs to the time estimate printed below.
# NOTE(review): 0.5 looks like a per-mention cost in seconds — confirm.
SECONDS_PER_MENTION = 0.5
N_WORKERS = 3

# Walk the `entities` column directly; entities without a normalized mention
# are counted under the empty string.
train_mentions = [
    entity.get('normalized_mention', '')
    for doc_entities in df_train_processed['entities']
    for entity in doc_entities
]
train_total = len(train_mentions)
train_normalized = set(train_mentions)

# Guard the percentage: an empty split would otherwise divide by zero.
if train_total:
    train_reduction = (train_total - len(train_normalized)) / train_total * 100
else:
    train_reduction = 0.0

print(f"📊 Train total entities: {train_total:,}")
print(f"📊 Train unique normalized: {len(train_normalized):,}")
print(f"📊 Train reduction: {train_reduction:.1f}%")
print(f"⏱️ Estimated clarification time: {len(train_normalized) * SECONDS_PER_MENTION / N_WORKERS / 60:.1f} minutes (parallel, {N_WORKERS} workers)")


🔍 Checking TRAIN set deduplication...
📊 Train total entities: 23,393
📊 Train unique normalized: 7,542
📊 Train reduction: 67.8%
⏱️ Estimated clarification time: 20.9 minutes (parallel, 3 workers)
