In [62]:
# Import required libraries
import json
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.graph_objects as go

In [63]:
def combine_json_files(json_files, data_dir="../data"):
    """Read JSON-lines files and combine their records into one DataFrame.

    Every non-blank line of every file is parsed as an independent JSON
    document. A line that decodes to a list contributes each of its items as
    a record; any other value contributes a single record. Lines that are not
    valid JSON are reported on stdout and skipped.

    Args:
        json_files: Iterable of file names, resolved relative to ``data_dir``.
        data_dir: Directory that holds the files. Defaults to ``"../data"``.

    Returns:
        pandas.DataFrame built from all records, in file order and then line
        order.

    Raises:
        OSError: If a file cannot be opened; this is not caught here.
    """
    all_data = []

    for file_name in json_files:
        file_path = f"{data_dir}/{file_name}"
        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                stripped = line.strip()
                if not stripped:
                    # Blank lines carry no record; without this guard they
                    # would be reported as JSON decode errors.
                    continue
                try:
                    data = json.loads(stripped)
                except json.JSONDecodeError:
                    print(f"Error in {file_name}, line: {line[:50]}...")
                    continue
                if isinstance(data, list):
                    all_data.extend(data)
                else:
                    all_data.append(data)

    return pd.DataFrame(all_data)

# File names that make up each split (training data is stored in four shards).
train_files = [f'train0{shard}.json' for shard in range(4)]
test_files = ['test.json']
valid_files = ['valid.json']

# Load every split into its own DataFrame: train first, then test, then valid.
df_train = combine_json_files(train_files)
df_test = combine_json_files(test_files)
df_valid = combine_json_files(valid_files)

In [64]:
# Raw label IDs that are collapsed into label 0.
labels_to_zero = [1, 6, 7, 8, 9, 10, 15, 18, 19, 20, 21, 22, 23, 24, 25,
                  26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36]

# Frozen-set copy so each per-tag membership test is a hash lookup rather
# than a scan of the list above.
_labels_to_zero_lookup = frozenset(labels_to_zero)


def _zero_out_labels(tags):
    """Return a new list in which every ID listed in ``labels_to_zero`` is 0."""
    return [0 if tag in _labels_to_zero_lookup else tag for tag in tags]


# Apply the same conversion to every split (replaces each frame's 'tags' column).
for split_df in (df_train, df_test, df_valid):
    split_df['tags'] = split_df['tags'].apply(_zero_out_labels)

# Check unique labels after modification
unique_tags = set()
for tags in df_train['tags']:
    unique_tags.update(tags)
print("\nRemaining unique tags:", sorted(unique_tags))


Remaining unique tags: [0, 2, 3, 4, 5, 11, 12, 13, 14, 16, 17]


In [65]:
# Renumbering table for the label IDs that are kept (old raw ID -> new dense ID).
label_mapping = {
    0: 0,             # O keeps ID 0
    2: 1, 3: 2,       # B-DATE, I-DATE
    4: 3, 5: 4,       # B-PERSON, I-PERSON
    11: 5, 12: 6,     # B-ORG, I-ORG
    13: 7, 14: 8,     # B-PERCENT, I-PERCENT
    16: 9, 17: 10,    # B-MONEY, I-MONEY
}


def update_label_numbers(tags_list):
    """Translate a sequence of old label IDs into their renumbered IDs.

    Args:
        tags_list: Iterable of old label IDs; each must be a key of
            ``label_mapping``.

    Returns:
        A new list with the renumbered ID for every input tag, in order.

    Raises:
        KeyError: If a tag is not a key of ``label_mapping``.
    """
    renumbered = []
    for old_id in tags_list:
        renumbered.append(label_mapping[old_id])
    return renumbered

# Renumber the tags of every split (replaces each frame's 'tags' column).
# NOTE: this step is not idempotent - a second run would look up the already
# renumbered IDs in label_mapping again.
for split_df in (df_train, df_test, df_valid):
    renumbered_tags = split_df['tags'].apply(update_label_numbers)
    split_df['tags'] = renumbered_tags

# Report which label IDs the training split contains after renumbering.
print("Label distribution after update:")
unique_tags = {tag for tags in df_train['tags'] for tag in tags}
print("New label values:", sorted(unique_tags))

# Label names keyed to their old (pre-renumbering) IDs.
original_labels = {
    "O": 0,
    "B-DATE": 2, "I-DATE": 3,
    "B-PERSON": 4, "I-PERSON": 5,
    "B-ORG": 11, "I-ORG": 12,
    "B-PERCENT": 13, "I-PERCENT": 14,
    "B-MONEY": 16, "I-MONEY": 17,
}

# Same label names, re-keyed to the renumbered IDs through label_mapping.
new_labels = {}
for label_name, old_id in original_labels.items():
    new_labels[label_name] = label_mapping[old_id]

# Persist the name -> new ID table as indented JSON.
label_path = "../data/label.json"
with open(label_path, 'w') as f:
    json.dump(new_labels, f, indent=4)

print("\nNew label system saved to label.json")
print("\nLabel mappings:")
for label_name, old_id in original_labels.items():
    print(f"{label_name}: {old_id} -> {new_labels[label_name]}")

Label distribution after update:
New label values: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

New label system saved to label.json

Label mappings:
O: 0 -> 0
B-DATE: 2 -> 1
I-DATE: 3 -> 2
B-PERSON: 4 -> 3
I-PERSON: 5 -> 4
B-ORG: 11 -> 5
I-ORG: 12 -> 6
B-PERCENT: 13 -> 7
I-PERCENT: 14 -> 8
B-MONEY: 16 -> 9
I-MONEY: 17 -> 10


In [66]:
# Final checks, part 1: number of samples held by each split.
print("Dataset validations:")
print("\n1. Dataset dimensions:")
for split_title, split_df in (("Train", df_train), ("Valid", df_valid), ("Test", df_test)):
    print(f"{split_title}: {len(split_df)} samples")

print("\n2. Check token and tag count alignment in each sample:")
def check_token_tag_alignment(df, name):
    """Count the rows whose 'tokens' and 'tags' sequences differ in length.

    Details of the first three offending rows are printed; later ones are
    only counted.

    Args:
        df: DataFrame with a 'tokens' and a 'tags' column, each holding one
            sized sequence per row.
        name: Dataset name used in the printed messages.

    Returns:
        The number of rows with mismatching lengths.
    """
    max_reported = 3
    mismatches = 0
    for idx, tokens, tags in zip(df.index, df['tokens'], df['tags']):
        if len(tokens) == len(tags):
            continue
        mismatches += 1
        if mismatches > max_reported:
            continue
        print(
            f"\nMismatch in {name}, sample {idx}:",
            f"Number of tokens: {len(tokens)}",
            f"Number of tags: {len(tags)}",
            f"Tokens: {tokens}",
            f"Tags: {tags}",
            sep="\n",
        )
    return mismatches

# Run the alignment check on each split in turn: train, valid, test.
train_mismatches, valid_mismatches, test_mismatches = (
    check_token_tag_alignment(split_df, split_name)
    for split_df, split_name in ((df_train, "train"), (df_valid, "valid"), (df_test, "test"))
)

print("\nTotal number of mismatches:")
mismatch_totals = (
    ("Train", train_mismatches),
    ("Valid", valid_mismatches),
    ("Test", test_mismatches),
)
for split_title, mismatch_total in mismatch_totals:
    print(f"{split_title}: {mismatch_total}")

print("\n3. Check label distribution:")
def check_label_distribution(df, name, labels=None):
    """Print how often each label ID occurs in ``df['tags']``.

    Args:
        df: DataFrame whose 'tags' column holds one sequence of label IDs
            per row.
        name: Dataset name used in the printed header.
        labels: Optional mapping of label name -> label ID used to resolve
            the printed names. Defaults to the notebook-level ``new_labels``.

    Returns:
        None. One line per distinct label ID is written to stdout, in
        ascending ID order; IDs without a name are shown as ``UNKNOWN``.
    """
    if labels is None:
        labels = new_labels

    # Invert the mapping once instead of scanning it for every label ID.
    # setdefault keeps the first name when several names share an ID.
    id_to_name = {}
    for label_name, label_id in labels.items():
        id_to_name.setdefault(label_id, label_name)

    all_tags = [tag for tags in df['tags'] for tag in tags]

    unique, counts = np.unique(all_tags, return_counts=True)
    print(f"\nLabel distribution in {name} dataset:")
    for label, count in zip(unique, counts):
        label_name = id_to_name.get(label, "UNKNOWN")
        print(f"{label_name} (ID: {label}): {count} occurrences")

# Print the label distribution of each split: train, then valid, then test.
for split_name, split_df in (('train', df_train), ('valid', df_valid), ('test', df_test)):
    check_label_distribution(split_df, split_name)

Dataset validations:

1. Dataset dimensions:
Train: 59924 samples
Valid: 8528 samples
Test: 8262 samples

2. Check token and tag count alignment in each sample:

Total number of mismatches:
Train: 0
Valid: 0
Test: 0

3. Check label distribution:

Label distribution in train dataset:
O (ID: 0): 994961 occurrences
B-DATE (ID: 1): 10922 occurrences
I-DATE (ID: 2): 13333 occurrences
B-PERSON (ID: 3): 15429 occurrences
I-PERSON (ID: 4): 11147 occurrences
B-ORG (ID: 5): 12820 occurrences
I-ORG (ID: 6): 18246 occurrences
B-PERCENT (ID: 7): 1763 occurrences
I-PERCENT (ID: 8): 2498 occurrences
B-MONEY (ID: 9): 2411 occurrences
I-MONEY (ID: 10): 4912 occurrences

Label distribution in valid dataset:
O (ID: 0): 135618 occurrences
B-DATE (ID: 1): 1507 occurrences
I-DATE (ID: 2): 1809 occurrences
B-PERSON (ID: 3): 2020 occurrences
I-PERSON (ID: 4): 1395 occurrences
B-ORG (ID: 5): 1740 occurrences
I-ORG (ID: 6): 2336 occurrences
B-PERCENT (ID: 7): 177 occurrences
I-PERCENT (ID: 8): 258 occurrences
B

In [67]:
def remove_all_zero_samples(df):
    """Keep only the rows whose 'tags' sequence contains a non-zero tag.

    Rows whose tags are all 0 (including rows with an empty tag sequence)
    are dropped. The input frame is not modified and the surviving rows keep
    their original index labels.

    Args:
        df: DataFrame with a 'tags' column holding one sequence per row.

    Returns:
        The subset of ``df`` selected by the boolean row mask.
    """
    def has_nonzero_tag(tags):
        for tag in tags:
            if not (tag == 0):
                return True
        return False

    keep_rows = df['tags'].apply(has_nonzero_tag)
    return df.loc[keep_rows]

print("\n4. Remove samples where all tags are 0:")
print("Before filtering:")
unfiltered_splits = (("Train", df_train), ("Valid", df_valid), ("Test", df_test))
for split_title, split_df in unfiltered_splits:
    print(f"{split_title}: {len(split_df)} samples")

# Drop the rows whose tags are all 0 from every split.
df_train_final, df_valid_final, df_test_final = (
    remove_all_zero_samples(split_df) for _, split_df in unfiltered_splits
)
filtered_splits = (df_train_final, df_valid_final, df_test_final)

print("\nAfter filtering:")
for (split_title, before_df), after_df in zip(unfiltered_splits, filtered_splits):
    removed_count = len(before_df) - len(after_df)
    print(f"{split_title}: {len(after_df)} samples (removed {removed_count} samples)")

# Write each filtered split to its own Excel workbook, without the index column.
excel_targets = (
    ("../data/final_train_data.xlsx", df_train_final),
    ("../data/final_valid_data.xlsx", df_valid_final),
    ("../data/final_test_data.xlsx", df_test_final),
)
for excel_path, final_df in excel_targets:
    final_df.to_excel(excel_path, index=False)

print("\nFinal datasets have been saved.")

# Label distribution of the filtered splits.
print("\n5. Final label distribution after removing all-zero samples:")
for split_name, final_df in zip(('train', 'valid', 'test'), filtered_splits):
    check_label_distribution(final_df, split_name)


4. Remove samples where all tags are 0:
Before filtering:
Train: 59924 samples
Valid: 8528 samples
Test: 8262 samples

After filtering:
Train: 23603 samples (removed 36321 samples)
Valid: 3062 samples (removed 5466 samples)
Test: 3239 samples (removed 5023 samples)

After filtering:
Train: 23603 samples (removed 36321 samples)
Valid: 3062 samples (removed 5466 samples)
Test: 3239 samples (removed 5023 samples)

Final datasets have been saved.

5. Final label distribution after removing all-zero samples:

Label distribution in train dataset:
O (ID: 0): 499783 occurrences
B-DATE (ID: 1): 10922 occurrences
I-DATE (ID: 2): 13333 occurrences
B-PERSON (ID: 3): 15429 occurrences
I-PERSON (ID: 4): 11147 occurrences
B-ORG (ID: 5): 12820 occurrences
I-ORG (ID: 6): 18246 occurrences
B-PERCENT (ID: 7): 1763 occurrences
I-PERCENT (ID: 8): 2498 occurrences
B-MONEY (ID: 9): 2411 occurrences
I-MONEY (ID: 10): 4912 occurrences

Label distribution in valid dataset:
O (ID: 0): 63406 occurrences
B-DATE (

## Dataset Visualization - Original vs Final Categories

In [68]:
import plotly.graph_objects as go

def read_original_data(json_files=None, data_dir="../data"):
    """Collect every tag from the raw JSON-lines files, before any transformation.

    Each line of each file is parsed as an independent JSON document. A line
    that decodes to a list is treated as several records, anything else as a
    single record. Only records that are dicts with a 'tags' key contribute.
    Lines that are not valid JSON are skipped silently.

    Args:
        json_files: Optional iterable of file names. Defaults to the four
            training shards followed by ``test.json`` and ``valid.json``.
        data_dir: Directory that holds the files. Defaults to ``"../data"``.

    Returns:
        A flat list of all tags, in file order and then line order.

    Raises:
        OSError: If a file cannot be opened; this is not caught here.
    """
    if json_files is None:
        json_files = [
            'train00.json', 'train01.json', 'train02.json', 'train03.json',
            'test.json',
            'valid.json',
        ]

    all_tags = []
    for file_name in json_files:
        file_path = f"{data_dir}/{file_name}"
        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                try:
                    data = json.loads(line.strip())
                except json.JSONDecodeError:
                    # Best-effort read: blank or malformed lines are ignored.
                    continue
                records = data if isinstance(data, list) else [data]
                for item in records:
                    # Non-dict JSON values (numbers, strings, ...) carry no tags;
                    # the isinstance guard keeps them from raising TypeError.
                    if isinstance(item, dict) and 'tags' in item:
                        all_tags.extend(item['tags'])

    return all_tags

# Tags exactly as stored in the raw files, before zeroing or renumbering.
all_tags_original = read_original_data()

# Label names and their IDs as used in the raw data.
original_categories_full = {
    'O': 0,
    'B-GPE': 1,
    'B-DATE': 2, 'I-DATE': 3,
    'B-PERSON': 4, 'I-PERSON': 5,
    'B-LOC': 6, 'I-LOC': 7,
    'B-NORP': 8, 'I-NORP': 9,
    'B-FAC': 10,
    'B-ORG': 11, 'I-ORG': 12,
    'B-PERCENT': 13, 'I-PERCENT': 14,
    'B-PRODUCT': 15,
    'B-MONEY': 16, 'I-MONEY': 17
}

# Flatten the tags of the filtered splits (after zeroing, renumbering and
# removal of all-zero samples).
all_tags_final = []
for df in [df_train_final, df_test_final, df_valid_final]:
    for tags in df['tags']:
        all_tags_final.extend(tags)

# Label names and their renumbered IDs.
final_categories = {
    'O': 0,
    'B-DATE': 1, 'I-DATE': 2,
    'B-PERSON': 3, 'I-PERSON': 4,
    'B-ORG': 5, 'I-ORG': 6,
    'B-PERCENT': 7, 'I-PERCENT': 8,
    'B-MONEY': 9, 'I-MONEY': 10
}

# Derive the record counts from the frames rather than hard-coding them, so
# the summary stays correct when the input data changes.
original_record_count = len(df_train) + len(df_valid) + len(df_test)
final_record_count = len(df_train_final) + len(df_valid_final) + len(df_test_final)

print(f"Original Dataset: {original_record_count:,} records with {len(original_categories_full)} categories")
print(f"Final Dataset: {final_record_count} records with {len(final_categories)} categories")

Original Dataset: 76,714 records with 18 categories
Final Dataset: 29904 records with 11 categories


In [69]:
# Graph 1: Original Dataset Categories with actual counts (18 categories)
import numpy as np

# Tally every raw tag in a single pass, then look up the count of each
# category ID (IDs that never occur count as 0).
raw_tag_counts = Counter(all_tags_original)
original_category_counts = {
    label_name: raw_tag_counts[label_id]
    for label_name, label_id in original_categories_full.items()
}

# Bars follow the insertion order of original_categories_full.
# NOTE: this rebinds `original_labels`, which an earlier cell defines as a
# label name -> ID dict; from here on it is a list of label names.
original_labels = list(original_categories_full.keys())
original_counts = [original_category_counts[label] for label in original_labels]

# Categories whose IDs are listed in labels_to_zero; drawn in red.
removed_categories = ['B-GPE', 'B-LOC', 'I-LOC', 'B-NORP', 'I-NORP', 'B-FAC', 'B-PRODUCT']

# Gray for O, red for removed categories, blue for the categories that are kept.
original_bar_colors = [
    '#95a5a6' if label == 'O'
    else '#e74c3c' if label in removed_categories
    else '#3498db'
    for label in original_labels
]

fig1 = go.Figure()

fig1.add_trace(go.Bar(
    x=original_labels,
    y=original_counts,
    marker=dict(
        color=original_bar_colors,
        line=dict(color='rgba(0,0,0,0.3)', width=1)
    ),
    text=[f'{count:,}' for count in original_counts],
    textposition='outside',
    textfont=dict(size=10, color='black', family='Arial Black'),
    hovertemplate='<b>%{x}</b><br>Count: %{y:,}<extra></extra>'
))

fig1.update_layout(
    title={
        'text': 'NER Dataset Distribution',
        'x': 0.5,
        'xanchor': 'center',
        'font': {'size': 18, 'color': '#2c3e50'}
    },
    xaxis_title='Entity Categories',
    yaxis_title='Tag Count',
    yaxis=dict(showgrid=True, gridcolor='rgba(200,200,200,0.3)'),
    plot_bgcolor='rgba(240,240,240,0.5)',
    height=550,
    showlegend=False,
    margin=dict(t=100, b=100, l=80, r=50)
)

fig1.show()

In [70]:
# Graph 2: Final Dataset Categories with actual counts (11 categories for Financial NER)

# Tally every final tag in a single pass, then look up the count of each
# category ID (IDs that never occur count as 0).
final_tag_counts = Counter(all_tags_final)
final_category_counts = {
    label_name: final_tag_counts[label_id]
    for label_name, label_id in final_categories.items()
}

final_labels = list(final_categories.keys())
final_counts = [final_category_counts[label] for label in final_labels]

# Share of each category in percent, shown in the hover text. The total is
# computed once; an empty dataset yields 0.0 rather than a ZeroDivisionError.
total_final_tags = sum(final_counts)
final_percentages = [
    count / total_final_tags * 100 if total_final_tags else 0.0
    for count in final_counts
]

# One color per entity type; the B-/I- prefix is ignored.
entity_colors = {
    'O': '#95a5a6',        # Gray for O
    'DATE': '#9b59b6',     # Purple for DATE
    'PERSON': '#3498db',   # Blue for PERSON
    'ORG': '#1abc9c',      # Teal for ORG
    'PERCENT': '#f39c12',  # Orange for PERCENT
    'MONEY': '#27ae60',    # Green for MONEY
}
fallback_color = '#34495e'
colors = [
    entity_colors.get(label.split('-', 1)[-1], fallback_color)
    for label in final_labels
]

fig2 = go.Figure()

fig2.add_trace(go.Bar(
    x=final_labels,
    y=final_counts,
    marker=dict(
        color=colors,
        line=dict(color='rgba(0,0,0,0.3)', width=1)
    ),
    text=[f'{count:,}' for count in final_counts],
    textposition='outside',
    textfont=dict(size=11, color='black', family='Arial Black'),
    hovertemplate='<b>%{x}</b><br>Count: %{y:,}<br>Percentage: %{customdata:.1f}%<extra></extra>',
    customdata=final_percentages
))

fig2.update_layout(
    title={
        'text': 'NER Dataset Distribution',
        'x': 0.5,
        'xanchor': 'center',
        'font': {'size': 18, 'color': '#2c3e50'}
    },
    xaxis_title='Entity Categories',
    yaxis_title='Tag Count',
    yaxis=dict(showgrid=True, gridcolor='rgba(200,200,200,0.3)'),
    plot_bgcolor='rgba(240,240,240,0.5)',
    height=550,
    showlegend=False,
    margin=dict(t=100, b=100, l=80, r=50)
)

fig2.show()