In [40]:
import sys
import os
print(f"Current Working Directory: {os.getcwd()}")
!pip install -r requirements.txt -q

Current Working Directory: /home/dtrautner/dev/ner-quality-impact
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


In [41]:
from src.noise.semantic import semantic_noise
from src.noise.orthographic import typo_tokens
from src.noise.label_noise import apply_label_noise_on_spans
from src.noise.syntactic import syntactic_noise

from src.data_preprocessing import build_label_maps, load_conll2003

print("Setup complete. Modules imported successfully.")

Setup complete. Modules imported successfully.


In [42]:
ds = load_conll2003()
id2label, label2id = build_label_maps(ds["train"].features, "ner_tags")
id2pos, pos2id = build_label_maps(ds["train"].features, "pos_tags")

# Take two example sentences from the training set
train_ds = ds["train"]
num_examples = len(train_ds)
middle_index = num_examples // 2
examples = train_ds.select(range(middle_index, middle_index + 2))

print(f"Loaded {len(examples)} example sentences.")

Loaded 2 example sentences.


In [43]:
def print_tokens_with_labels(tokens, ner_labels):
    """A helper function to print tokens and NER labels in an aligned format."""
    token_lengths = [len(token) for token in tokens]

    token_line = " ".join(tokens)
    
    # Create the string for labels, padded to align with the tokens above
    label_line = " ".join([label.ljust(length) for label, length in zip(ner_labels, token_lengths)])
    
    print("Tokens: " + token_line)
    print("Labels: " + label_line)

In [44]:
def demonstrate_noise_effect(
    noise_function, 
    example, 
    id2label, 
    id2pos, 
    noise_params, 
):
    """
    A generic function to apply any noise function to an example and print the result.
    """
    original_tokens = example["tokens"]
    pos_tag_ids = example["pos_tags"]
    original_ner_tag_ids  = example["ner_tags"]
    
    pos_tags = [id2pos[tag_id] for tag_id in pos_tag_ids]
    original_ner_labels = [id2label[tag_id] for tag_id in original_ner_tag_ids ]

    print("=" * 60)
    print("Original Sentence:")
    print_tokens_with_labels(original_tokens, original_ner_labels)
    print("\n")
    
    func_name = noise_function.__name__
    noisy_tokens = original_tokens
    noisy_ner_labels = original_ner_labels

    if func_name == "semantic_noise":
        noisy_tokens = noise_function(
            tokens=original_tokens,
            pos_tags=pos_tags,
            ner_tags=original_ner_tag_ids ,
            id2label=id2label,
            **noise_params
        )
    elif func_name == "typo_tokens":
        noisy_tokens = noise_function(
            tokens=original_tokens,
            ner_tags=original_ner_tag_ids,
            id2label=id2label,
            **noise_params
        )
    elif func_name == "syntactic_noise":
        params_with_o_label = {"o_label": label2id["O"], **noise_params}
        new_tokens, new_ner_tag_ids = noise_function(
            original_tokens, original_ner_tag_ids, id2label=id2label, label2id=label2id, **params_with_o_label
        )
        noisy_tokens = new_tokens
        noisy_ner_labels = [id2label[tag_id] for tag_id in new_ner_tag_ids]
    elif "apply_label_noise_on_spans" in func_name: 
        new_ner_tag_ids = noise_function(
            original_tokens, original_ner_tag_ids, id2label, label2id, **noise_params
        )
        noisy_ner_labels = [id2label[tag_id] for tag_id in new_ner_tag_ids]
    else:
        # Fallback for other potential functions
        print(f"Warning: Don't know the specific signature for {func_name}. Trying a generic call.")
        noisy_tokens = noise_function(tokens=original_tokens, **noise_params)

    print(f"Sentence After '{func_name}':")
    print_tokens_with_labels(noisy_tokens, noisy_ner_labels)
    print("=" * 60)
    print("\n")

In [45]:

semantic_params = {
    "p": 0.3,
    "entity_strategy": "all",
    #"ops": ["contextual"], #"synonym", "word_embs", "contextual"
}

index = 32
examples = train_ds.select(range(index, index + 2))

for example in examples:
    demonstrate_noise_effect(
            noise_function=semantic_noise,
            example=example,
            id2label=id2label,
            id2pos=id2pos,
            noise_params=semantic_params,
        )

Original Sentence:
Tokens: State media quoted China 's top negotiator with Taipei , Tang Shubei , as telling a visiting group from Taiwan on Wednesday that it was time for the rivals to hold political talks .
Labels: O     O     O      B-LOC O  O   O          O    B-LOC  O B-PER I-PER  O O  O       O O        O     O    B-LOC  O  O         O    O  O   O    O   O   O      O  O    O         O     O


Sentence After 'semantic_noise':
Tokens: State video unnamed China 's greatest negotiator that Taipei , Shu Zhengping , as seeing a visiting group from Guangzhou on Wednesday that it was time this the participants to hold political talks .
Labels: O     O     O       B-LOC O  O        O          O    B-LOC  O B-PER I-PER     O O  O      O O        O     O    B-LOC     O  O         O    O  O   O    O    O   O            O  O    O         O     O


Original Sentence:
Tokens: " Now is the time for the two sides to engage in political talks ...
Labels: O O   O  O   O    O   O   O   O     O  O   

In [46]:
orthographic_params = { "p": 0.3, "entity_strategy": "all" }

index = 134
examples = train_ds.select(range(index, index + 2))

for example in examples:
    demonstrate_noise_effect(
            noise_function=typo_tokens,
            example=example,
            id2label=id2label,
            id2pos=id2pos,
            noise_params=orthographic_params,
        )

Original Sentence:
Tokens: A spokesman for the group said the meeting " signals a new level of cooperation between Mujahideen Khalq and the Iranian Kurdish oppositions " .
Labels: O O         O   O   O     O    O   O       O O       O O   O     O  O           O       B-ORG      I-ORG O   O   B-MISC  I-MISC  O           O O


Sentence After 'typo_tokens':
Tokens: A spokesman for the group said the MeetIng " signals a new levle of cooperation brtween Mujahideen Khalq and the Iranian Kurdish op√ºpositions " .
Labels: O O         O   O   O     O    O   O       O O       O O   O     O  O           O       B-ORG      I-ORG O   O   B-MISC  I-MISC  O            O O


Original Sentence:
Tokens: Iran heavily bombarded targets in northern Iraq in July in pursuit of KDPI guerrillas based in Iraqi Kurdish areas outside the control of the government in Baghdad .
Labels: B-LOC O       O         O       O  O        B-LOC O  O    O  O       O  B-ORG O          O     O  B-MISC I-MISC  O     O       O   

In [52]:
syntactic_params  = { "p": 0.3}

index = 138
examples = train_ds.select(range(index, index + 2))

for example in examples:
    demonstrate_noise_effect(
            noise_function=syntactic_noise,
            example=example,
            id2label=id2label,
            id2pos=id2pos,
            noise_params=syntactic_params,
        )

Original Sentence:
Tokens: Clashes between the two parties broke out at the weekend in the most serious fighting since a U.S.-sponsored ceasefire last year .
Labels: O       O       O   O   O       O     O   O  O   O       O  O   O    O       O        O     O B-MISC         O         O    O    O


Sentence After 'syntactic_noise':
Tokens: Clashes between the two parties broke out th e weekend i n the most serious fighting since a U.S.-sponsored ceasefire last yea r .
Labels: O       O       O   O   O       O     O   O  O O       O O O   O    O       O        O     O B-MISC         O         O    O   O O


Original Sentence:
Tokens: Mujahideen Khalq said Iranian troops had also been shelling KDP positions in Qasri region in Suleimaniya province near the Iranian border over the last two days .
Labels: B-ORG      I-ORG O    B-MISC  O      O   O    O    O        B-ORG O         O  B-LOC O      O  B-LOC       O        O    O   B-MISC  O      O    O   O    O   O    O


Sentence After 'syntac

In [55]:
label_params  = { "p": 0.30}

index = 139
examples = train_ds.select(range(index, index + 2))

for example in examples:
    demonstrate_noise_effect(
            noise_function=apply_label_noise_on_spans,
            example=example,
            id2label=id2label,
            id2pos=id2pos,
            noise_params=label_params,
        )

Original Sentence:
Tokens: Mujahideen Khalq said Iranian troops had also been shelling KDP positions in Qasri region in Suleimaniya province near the Iranian border over the last two days .
Labels: B-ORG      I-ORG O    B-MISC  O      O   O    O    O        B-ORG O         O  B-LOC O      O  B-LOC       O        O    O   B-MISC  O      O    O   O    O   O    O


Sentence After 'apply_label_noise_on_spans':
Tokens: Mujahideen Khalq said Iranian troops had also been shelling KDP positions in Qasri region in Suleimaniya province near the Iranian border over the last two days .
Labels: O          O     O    B-MISC  O      O   O    O    O        B-PER O         O  B-LOC O      O  B-LOC       O        O    O   B-MISC  O      O    O   O    O   O    O


Original Sentence:
Tokens: It said about 100 Iraqi Kurds were killed or wounded in the attack .
Labels: O  O    O     O   B-MISC I-MISC O    O      O  O       O  O   O      O


Sentence After 'apply_label_noise_on_spans':
Tokens: It said about 