In [20]:
# Import required libraries
import json
import pandas as pd
import numpy as np
from pathlib import Path
import ast
import os

In [21]:
# Get the current file's directory (src folder)
current_dir = Path(__file__).parent if '__file__' in globals() else Path.cwd()

# Define data directory (one level up from src, then into data)
DATA_DIR = current_dir.parent / 'data'

# Create data directory if it doesn't exist
DATA_DIR.mkdir(parents=True, exist_ok=True)

print(f"Data directory path: {DATA_DIR}")
print(f"Data directory exists: {DATA_DIR.exists()}")

Data directory path: c:\Users\gorkemozkan\Desktop\gorkDrive\finnews-insights\data
Data directory exists: True


In [22]:
# Function to read and combine JSON files
def combine_json_files(json_files):
    all_data = []
    
    for file_name in json_files:
        file_path = DATA_DIR / file_name
        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                try:
                    data = json.loads(line.strip())
                    if isinstance(data, list):
                        all_data.extend(data)
                    else:
                        all_data.append(data)
                except json.JSONDecodeError as e:
                    print(f"Error in {file_name}, line: {line[:50]}...")
                    continue
    
    return pd.DataFrame(all_data)

# Process training data
train_files = ['train00.json', 'train01.json', 'train02.json', 'train03.json']
df_train = combine_json_files(train_files)

# Save to Excel
output_file = DATA_DIR / 'combined_train_data.xlsx'
df_train.to_excel(output_file, index=False)

print(f"Data successfully saved to {output_file}")
print(f"Total number of rows: {len(df_train)}")
print("\nFirst few rows of the DataFrame:")
display(df_train.head())

Data successfully saved to c:\Users\gorkemozkan\Desktop\gorkDrive\finnews-insights\data\combined_train_data.xlsx
Total number of rows: 59924

First few rows of the DataFrame:


Unnamed: 0,tags,tokens
0,"[0, 0, 0, 0, 0, 0, 0, 0, 0]","[People, start, their, own, businesses, for, m..."
1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0]","[But, a, chance, to, fill, out, sales, -, tax,..."
2,"[0, 0, 0, 0, 0, 0, 0, 0, 0]","[Red, tape, is, the, bugaboo, of, small, busin..."
3,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[Ironically, ,, the, person, who, wants, to, r..."
4,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[Yet, every, business, owner, has, to, face, t..."


In [23]:
# Process test data
test_files = ['test.json']
df_test = combine_json_files(test_files)

# Save to Excel
output_file = DATA_DIR / 'combined_test_data.xlsx'
df_test.to_excel(output_file, index=False)

print(f"Data successfully saved to {output_file}")
print(f"Total number of rows: {len(df_test)}")
print("\nFirst few rows of the DataFrame:")
display(df_test.head())

Data successfully saved to c:\Users\gorkemozkan\Desktop\gorkDrive\finnews-insights\data\combined_test_data.xlsx
Total number of rows: 8262

First few rows of the DataFrame:


Unnamed: 0,tags,tokens
0,"[0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 7, 0, 7, 0, ...","[The, following, were, among, Friday, 's, offe..."
1,"[11, 12, 12, 12]","[Dow, Chemical, Co., --]"
2,"[16, 17, 17, 0, 13, 14, 0, 0, 0, 2, 3, 3, 3, 0...","[$, 150, million, of, 8.55, %, senior, notes, ..."
3,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 3, ...","[The, issue, ,, which, is, puttable, back, to,..."
4,"[0, 0, 0, 0, 0, 1, 0, 11, 12, 12, 12, 12, 0, 0...","[Rated, single, -, A, -, 1, by, Moody, 's, Inv..."


In [24]:
# Process validation data
valid_files = ['valid.json']
df_valid = combine_json_files(valid_files)

# Save to Excel
output_file = DATA_DIR / 'combined_valid_data.xlsx'
df_valid.to_excel(output_file, index=False)

print(f"Data successfully saved to {output_file}")
print(f"Total number of rows: {len(df_valid)}")
print("\nFirst few rows of the DataFrame:")
display(df_valid.head())

Data successfully saved to c:\Users\gorkemozkan\Desktop\gorkDrive\finnews-insights\data\combined_valid_data.xlsx
Total number of rows: 8528

First few rows of the DataFrame:


Unnamed: 0,tags,tokens
0,"[0, 6, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, ...","[A, Russian, diver, has, found, the, bodies, o..."
1,"[0, 0, 0, 0, 0, 0, 0, 6, 0, 6, 0, 0, 0, 0, 0, ...","[The, diver, entered, the, sub, after, a, Russ..."
2,"[0, 0, 0, 0, 0, 0, 0, 0, 0]","[The, drilling, and, cutting, effort, took, se..."
3,"[11, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0]","[Navy, officials, do, not, expect, that, all, ..."
4,"[11, 12, 12, 12, 0, 0, 7, 0, 0, 0, 0, 0, 7, 0,...","[The, Balkan, Stability, Pact, has, admitted, ..."


In [25]:
# Function to convert string representations of lists to actual lists
def convert_string_to_list(string_list):
    try:
        return ast.literal_eval(string_list)
    except:
        return []

# Read Excel files
df_train = pd.read_excel(DATA_DIR / 'combined_train_data.xlsx')
df_test = pd.read_excel(DATA_DIR / 'combined_test_data.xlsx')
df_valid = pd.read_excel(DATA_DIR / 'combined_valid_data.xlsx')

# Convert string lists to actual lists for each dataset
for df in [df_train, df_test, df_valid]:
    df['tags'] = df['tags'].apply(convert_string_to_list)

# Labels to be converted to zero
labels_to_zero = [1, 6, 7, 8, 9, 10, 15, 18, 19, 20, 21, 22, 23, 24, 25, 
                 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36]

# Function to update tags
def update_tags(tags_list):
    return [0 if tag in labels_to_zero else tag for tag in tags_list]

# Update datasets
for df in [df_train, df_test, df_valid]:
    df['tags'] = df['tags'].apply(update_tags)

# Save updated datasets
df_train.to_excel(DATA_DIR / 'updated_train_data.xlsx', index=False)
df_test.to_excel(DATA_DIR / 'updated_test_data.xlsx', index=False)
df_valid.to_excel(DATA_DIR / 'updated_valid_data.xlsx', index=False)

print("Datasets have been updated and saved.")
print("\nSample rows from training dataset:")
display(df_train.head())

# Check unique labels distribution
print("\nUnique labels remaining in the training dataset after update:")
unique_tags = set()
for tags in df_train['tags']:
    unique_tags.update(tags)
print(sorted(list(unique_tags)))

Datasets have been updated and saved.

Sample rows from training dataset:


Unnamed: 0,tags,tokens
0,"[0, 0, 0, 0, 0, 0, 0, 0, 0]","['People', 'start', 'their', 'own', 'businesse..."
1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","['But', 'a', 'chance', 'to', 'fill', 'out', 's..."
2,"[0, 0, 0, 0, 0, 0, 0, 0, 0]","['Red', 'tape', 'is', 'the', 'bugaboo', 'of', ..."
3,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","['Ironically', ',', 'the', 'person', 'who', 'w..."
4,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","['Yet', 'every', 'business', 'owner', 'has', '..."



Unique labels remaining in the training dataset after update:
[0, 2, 3, 4, 5, 11, 12, 13, 14, 16, 17]


In [26]:
# Define label mapping for renumbering (Old -> New)
label_mapping = {
    0: 0,    # Keep O label as 0
    2: 1,
    3: 2,
    4: 3,
    5: 4,
    11: 5,
    12: 6,
    13: 7,
    14: 8,
    16: 9,
    17: 10
}

# Function to update label numbers
def update_label_numbers(tags_list):
    return [label_mapping[tag] for tag in tags_list]

# Update labels in each dataset
for df in [df_train, df_test, df_valid]:
    df['tags'] = df['tags'].apply(update_label_numbers)

# Check updated label distribution
print("Label distribution after update:")
unique_tags = set()
for tags in df_train['tags']:
    unique_tags.update(tags)
print("New label values:", sorted(list(unique_tags)))

# Original label definitions
original_labels = {
    "O": 0,
    "B-DATE": 2,
    "I-DATE": 3,
    "B-PERSON": 4,
    "I-PERSON": 5,
    "B-ORG": 11,
    "I-ORG": 12,
    "B-PERCENT": 13,
    "I-PERCENT": 14,
    "B-MONEY": 16,
    "I-MONEY": 17
}

# Create new label system
new_labels = {key: label_mapping[value] for key, value in original_labels.items()}

# Save to label.json
with open(DATA_DIR / 'label.json', 'w') as f:
    json.dump(new_labels, f, indent=4)

print("\nNew label system saved to label.json")
print("\nLabel mappings:")
for old_label, old_id in original_labels.items():
    print(f"{old_label}: {old_id} -> {label_mapping[old_id]}")

# Save updated datasets
df_train.to_excel(DATA_DIR / 'updated_train_data.xlsx', index=False)
df_test.to_excel(DATA_DIR / 'updated_test_data.xlsx', index=False)
df_valid.to_excel(DATA_DIR / 'updated_valid_data.xlsx', index=False)

print("\nUpdated datasets have been saved.")

Label distribution after update:
New label values: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

New label system saved to label.json

Label mappings:
O: 0 -> 0
B-DATE: 2 -> 1
I-DATE: 3 -> 2
B-PERSON: 4 -> 3
I-PERSON: 5 -> 4
B-ORG: 11 -> 5
I-ORG: 12 -> 6
B-PERCENT: 13 -> 7
I-PERCENT: 14 -> 8
B-MONEY: 16 -> 9
I-MONEY: 17 -> 10

New label system saved to label.json

Label mappings:
O: 0 -> 0
B-DATE: 2 -> 1
I-DATE: 3 -> 2
B-PERSON: 4 -> 3
I-PERSON: 5 -> 4
B-ORG: 11 -> 5
I-ORG: 12 -> 6
B-PERCENT: 13 -> 7
I-PERCENT: 14 -> 8
B-MONEY: 16 -> 9
I-MONEY: 17 -> 10

Updated datasets have been saved.

Updated datasets have been saved.


In [27]:
# Perform final checks
print("Dataset validations:")
print("\n1. Dataset dimensions:")
print(f"Train: {len(df_train)} samples")
print(f"Valid: {len(df_valid)} samples")
print(f"Test: {len(df_test)} samples")

print("\n2. Check token and tag count alignment in each sample:")
def check_token_tag_alignment(df, name):
    mismatches = 0
    for i, row in df.iterrows():
        if len(row['tokens']) != len(row['tags']):
            mismatches += 1
            if mismatches <= 3:  # Show first 3 mismatches
                print(f"\nMismatch in {name}, sample {i}:")
                print(f"Number of tokens: {len(row['tokens'])}")
                print(f"Number of tags: {len(row['tags'])}")
                print(f"Tokens: {row['tokens']}")
                print(f"Tags: {row['tags']}")
    return mismatches

train_mismatches = check_token_tag_alignment(df_train, "train")
valid_mismatches = check_token_tag_alignment(df_valid, "valid")
test_mismatches = check_token_tag_alignment(df_test, "test")

print(f"\nTotal number of mismatches:")
print(f"Train: {train_mismatches}")
print(f"Valid: {valid_mismatches}")
print(f"Test: {test_mismatches}")

print("\n3. Check label distribution:")
def check_label_distribution(df, name):
    all_tags = []
    for tags in df['tags']:
        all_tags.extend(tags)
    
    unique, counts = np.unique(all_tags, return_counts=True)
    print(f"\nLabel distribution in {name} dataset:")
    for label, count in zip(unique, counts):
        label_name = [k for k, v in new_labels.items() if v == label][0]
        print(f"{label_name} (ID: {label}): {count} occurrences")

# Check distribution for each dataset
for df, name in [(df_train, 'train'), (df_valid, 'valid'), (df_test, 'test')]:
    check_label_distribution(df, name)

Dataset validations:

1. Dataset dimensions:
Train: 59924 samples
Valid: 8528 samples
Test: 8262 samples

2. Check token and tag count alignment in each sample:

Mismatch in train, sample 0:
Number of tokens: 80
Number of tags: 9
Tokens: ['People', 'start', 'their', 'own', 'businesses', 'for', 'many', 'reasons', '.']
Tags: [0, 0, 0, 0, 0, 0, 0, 0, 0]

Mismatch in train, sample 1:
Number of tokens: 117
Number of tags: 16
Tokens: ['But', 'a', 'chance', 'to', 'fill', 'out', 'sales', '-', 'tax', 'records', 'is', 'rarely', 'one', 'of', 'them', '.']
Tags: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

Mismatch in train, sample 2:
Number of tokens: 71
Number of tags: 9
Tokens: ['Red', 'tape', 'is', 'the', 'bugaboo', 'of', 'small', 'business', '.']
Tags: [0, 0, 0, 0, 0, 0, 0, 0, 0]

Mismatch in valid, sample 0:
Number of tokens: 221
Number of tags: 27
Tokens: ['A', 'Russian', 'diver', 'has', 'found', 'the', 'bodies', 'of', 'three', 'of', 'the', '118', 'sailors', 'who', 'were', 'killed', 'wh

In [28]:
# Function to remove samples where all tags are 0
def remove_all_zero_samples(df):
    # Check if all tags in a row are 0
    mask = df['tags'].apply(lambda x: not all(tag == 0 for tag in x))
    filtered_df = df[mask]
    return filtered_df

print("\n4. Remove samples where all tags are 0:")
print("Before filtering:")
print(f"Train: {len(df_train)} samples")
print(f"Valid: {len(df_valid)} samples")
print(f"Test: {len(df_test)} samples")

# Apply filtering to all datasets
df_train_final = remove_all_zero_samples(df_train)
df_valid_final = remove_all_zero_samples(df_valid)
df_test_final = remove_all_zero_samples(df_test)

print("\nAfter filtering:")
print(f"Train: {len(df_train_final)} samples (removed {len(df_train) - len(df_train_final)} samples)")
print(f"Valid: {len(df_valid_final)} samples (removed {len(df_valid) - len(df_valid_final)} samples)")
print(f"Test: {len(df_test_final)} samples (removed {len(df_test) - len(df_test_final)} samples)")

# Save final datasets
df_train_final.to_excel(DATA_DIR / 'final_train_data.xlsx', index=False)
df_valid_final.to_excel(DATA_DIR / 'final_valid_data.xlsx', index=False)
df_test_final.to_excel(DATA_DIR / 'final_test_data.xlsx', index=False)

print("\nFinal datasets have been saved.")

# Check label distribution in final datasets
print("\n5. Final label distribution after removing all-zero samples:")
for df, name in [(df_train_final, 'train'), (df_valid_final, 'valid'), (df_test_final, 'test')]:
    check_label_distribution(df, name)


4. Remove samples where all tags are 0:
Before filtering:
Train: 59924 samples
Valid: 8528 samples
Test: 8262 samples

After filtering:
Train: 23603 samples (removed 36321 samples)
Valid: 3062 samples (removed 5466 samples)
Test: 3239 samples (removed 5023 samples)

Final datasets have been saved.

5. Final label distribution after removing all-zero samples:

Label distribution in train dataset:
O (ID: 0): 499783 occurrences
B-DATE (ID: 1): 10922 occurrences
I-DATE (ID: 2): 13333 occurrences
B-PERSON (ID: 3): 15429 occurrences
I-PERSON (ID: 4): 11147 occurrences
B-ORG (ID: 5): 12820 occurrences
I-ORG (ID: 6): 18246 occurrences
B-PERCENT (ID: 7): 1763 occurrences
I-PERCENT (ID: 8): 2498 occurrences
B-MONEY (ID: 9): 2411 occurrences
I-MONEY (ID: 10): 4912 occurrences

Label distribution in valid dataset:
O (ID: 0): 63406 occurrences
B-DATE (ID: 1): 1507 occurrences
I-DATE (ID: 2): 1809 occurrences
B-PERSON (ID: 3): 2020 occurrences
I-PERSON (ID: 4): 1395 occurrences
B-ORG (ID: 5): 1740 