In [1]:
import pandas as pd
import numpy as np
import os
import sys
import json
from tqdm import tqdm

In [3]:
# Resolve the directory two levels above the current working directory and
# make it importable (presumably where the project-local modules live).
parent_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir))

# Only register the directory once so re-running this cell does not keep
# growing sys.path.
if sys.path.count(parent_dir) == 0:
    sys.path.append(parent_dir)
import Utils as u

Loading Dataset

In [4]:
# Location of the dataset on the Hugging Face Hub (fsspec-style "hf://" URL).
base_url = "hf://datasets/cyanic-selkie/aida-conll-yago-wikidata/"

# Map each split name to the parquet file that stores it under base_url.
splits = {
    split_name: f"{split_name}.parquet"
    for split_name in ("train", "validation", "test")
}

In [5]:
# Read each split into its own DataFrame; the file name for a split is looked
# up in `splits` and appended to `base_url`.
df_train = pd.read_parquet(f"{base_url}{splits['train']}")
df_val = pd.read_parquet(f"{base_url}{splits['validation']}")
df_test = pd.read_parquet(f"{base_url}{splits['test']}")

In [6]:
# Report how many rows (documents) each split contains.
for split_label, split_frame in (
    ("Train", df_train),
    ("Validation", df_val),
    ("Test", df_test),
):
    print(f"{split_label}: {len(split_frame)} examples")

Train: 946 examples
Validation: 216 examples
Test: 231 examples


Showing Original Example

In [7]:
# Overall layout of the training split.
print(f"Train shape: {df_train.shape}")
print(f"Columns: {df_train.columns.tolist()}")

# Inspect the first document: its text length and its entity annotations.
first_row = df_train.iloc[0]
first_entities = first_row['entities']
print(f"\nFirst row text length: {len(first_row['text'])}")
print(f"First row entities: {len(first_entities)}")

# Show one raw annotation, guarding against a document with no entities.
if len(first_entities) > 0:
    print(f"First entity: {first_entities[0]}")

Train shape: (946, 3)
Columns: ['document_id', 'text', 'entities']

First row text length: 2790
First row entities: 48
First entity: {'start': 0, 'end': 2, 'tag': 'ORG', 'pageid': None, 'qid': None, 'title': None}


## Preprocessing Pipeline

The following steps transform the raw AIDA dataset into a format suitable for entity linking tasks. Each step addresses specific data quality issues and prepares the data for model training.

Apply preprocessing to a sample

In [8]:
# Number of leading training documents to run through the pipeline.
# Increase it (up to len(df_train)) to preprocess more of the split.
SAMPLE_SIZE = 100

# Take a copy so that changes to df_sample's columns do not write back into
# df_train. Note that pandas does not recursively copy Python objects stored
# inside cells (such as the per-row `entities` values), so any step that
# mutates those nested objects in place may still affect df_train.
df_sample = df_train.head(SAMPLE_SIZE).copy()

### Step 1: Filter Valid Entities (Optional)

**Purpose:** Remove entities with invalid or incomplete information (e.g., missing QIDs, malformed spans).

**What it does:**
- Validates entity offsets are within text bounds
- Ensures all required fields (start, end, mention, qid) are present
- Filters out entities with corrupted data

**When to use:** Enable if the dataset has quality issues. AIDA is generally clean, so this step is optional.

In [9]:
# df_sample = u.apply_filter_valid_entities(df_sample, inplace=True)

### Step 2: Add Context to Entities

**Purpose:** Extract surrounding text context for each entity mention to provide disambiguation information.

**What it does:**
- Extracts text window before and after each mention (default: 200 characters)
- Creates `context_left` and `context_right` fields
- Preserves original text boundaries (doesn't cut mid-word)

**Why it matters:** Context is crucial for entity linking: the surrounding words help determine which entity a mention refers to (e.g., "Jordan" the country vs. "Jordan" as in Michael Jordan).

In [10]:
# Step 2: attach context to each entity using a 200-character window.
# NOTE(review): this call passes inplace=False while the later pipeline steps
# pass inplace=True; the result is reassigned to df_sample in both cases.
# Confirm against Utils that both modes return the DataFrame.
df_sample = u.apply_add_context(
    df_sample,
    context_window=200,
    inplace=False,
)

Adding context (window=200) to entities...


### Step 3: Normalize Mentions

**Purpose:** Standardize mention text for consistent matching and improved disambiguation.

**What it does:**
- Converts mentions to lowercase
- Removes extra whitespace
- Strips punctuation from edges
- Creates a `normalized_mention` field

**Why it matters:** "Microsoft", "microsoft", and "MICROSOFT" should be treated as the same mention. Normalization improves matching with knowledge bases and reduces vocabulary size.

In [11]:
# Step 3: normalize mention text. The helper is asked to work in place and its
# return value is rebound to df_sample (presumably the same frame - verify
# against Utils).
df_sample = u.apply_normalize_mentions(
    df_sample,
    inplace=True,
)

Normalizing entity mentions...


### Step 4: Remove Overlapping Entities

**Purpose:** Resolve cases where multiple entity annotations cover the same or overlapping text spans.

**What it does:**
- Detects entities with overlapping character ranges
- Keeps the longer/more specific entity when overlap occurs
- Prevents duplicate or conflicting annotations

**Example:** If "New York City" and "York" are both annotated, keeps "New York City" and removes the partial "York".

**Why it matters:** Overlapping entities cause confusion during training and can lead to incorrect predictions.

In [14]:
# Step 4: drop overlapping entity annotations. The helper prints how many
# entities it removed; its return value is rebound to df_sample.
df_sample = u.apply_remove_overlaps(
    df_sample,
    inplace=True,
)

Removing overlapping entities...
  Removed 0 overlapping entities


### Step 5: Create Candidate Pairs

**Purpose:** Generate training examples by pairing each mention with its candidate entities (including the correct one).

**What it does:**
- For each mention, retrieves candidate entities from knowledge base
- Creates (mention, candidate_entity) pairs
- Includes context and features for ranking
- Labels the correct entity as positive, others as negative

**Output:** `mention_candidate_pairs` field containing a list of candidates for each mention.

**Why it matters:** This creates the actual training data for the entity linking model - the model learns to rank candidates and select the correct entity.

In [17]:
# Step 5: build the mention-candidate pairs. The helper's return value is
# rebound to df_sample.
df_sample = u.apply_create_candidate_pairs(
    df_sample,
    inplace=True,
)

Creating mention-candidate pairs...


### Step 6: Create NIL Detection Examples

**Purpose:** Generate examples for identifying mentions that don't link to any entity in the knowledge base (NIL entities).

**What it does:**
- Identifies mentions without valid entity links (qid is null)
- Creates negative examples for the NIL detection task
- Separates NIL mentions from linkable mentions

**Output:** `nil_examples` and `linked_examples` fields.

**Why it matters:** Not all mentions can be linked to Wikipedia/Wikidata. The model needs to learn when to predict "no entity exists" rather than forcing a wrong link. This is crucial for real-world applications.

In [20]:
# Step 6: create the NIL-detection examples. The helper prints the NIL vs.
# linked entity counts; its return value is rebound to df_sample.
df_sample = u.apply_create_nil_examples(
    df_sample,
    inplace=True,
)

Creating NIL detection examples...
  NIL entities: 504, Linked entities: 2139


### Step 7: Split Long Documents (Optional)

**Purpose:** Break very long documents into smaller chunks that fit model input limits.

**What it does:**
- Splits documents exceeding max_length (e.g., 512 tokens)
- Preserves entity annotations within each chunk
- Maintains context continuity where possible

**When to use:** Enable when using transformer models with fixed input limits (BERT, RoBERTa, etc.).

**Note:** Currently disabled for AIDA as documents are reasonably sized.

In [None]:
# Summarize the processed sample: its shape and the columns it now carries.
print(f"Processed shape: {df_sample.shape}")
print(f"New columns: {df_sample.columns.tolist()}")

# Static legend describing the fields the pipeline steps are expected to add.
print("\nSample of new fields:")
for field_summary in (
    "- Context fields: context_left, context_right",
    "- Normalized mentions: normalized_mention",
    "- Candidate pairs: mention_candidate_pairs",
    "- NIL detection: nil_examples, linked_examples",
):
    print(field_summary)

In [None]:
# Uncomment if documents exceed model input limits
# df_sample = u.apply_split_long_documents(df_sample, max_length=512)

### Export Preprocessed Data

Save the preprocessed sample for inspection or downstream use.

In [25]:
# Write the processed sample next to the notebook (path is relative to the
# current working directory); the DataFrame index is not stored in the file.
output_path = "preprocessed_sample_aida.parquet"
df_sample.to_parquet(output_path, index=False)

In [None]:
# Example: View structure of candidate pairs
# x = df_sample[['mention_candidate_pairs', 'nil_examples', 'linked_examples']].head(1)
# x

In [None]:
# Example: Expand candidate pairs to see individual entries
# df_mcp = x['mention_candidate_pairs'].explode().apply(pd.Series)
# df_mcp