In [7]:
import pandas as pd
import json
import re
from tqdm import tqdm

# Read both input tables from disk
text_data = pd.read_csv("preprocessed.csv")  # supplies the 'text' column
ascii_data = pd.read_csv("filtered_gazetteer.csv")  # supplies the 'asciiname' column

# Raw sentences, one per row of the text table
texts = text_data['text'].tolist()

# Gazetteer names: discard missing values, then coerce every entry to str
country_names = (
    ascii_data['asciiname']
    .dropna()
    .astype(str)
    .tolist()
)

# A set gives constant-time membership tests during tagging
country_set = set(country_names)

# Accumulates one [sentence, {"entities": [...]}] record per input row
jsonl_data = []

# Tag every sentence, reporting progress through tqdm
for sentence in tqdm(texts, desc="Processing sentences"):
    entities = []
    if isinstance(sentence, str):  # Non-string rows are kept but receive no entities
        # Whitespace tokenization via regex so that each token carries its own
        # character offsets. Looking the token up with sentence.find(token)
        # always returns the FIRST occurrence of that text, which mislabels
        # repeated tokens and tokens that appear inside an earlier word.
        for token_match in re.finditer(r'\S+', sentence):
            token = token_match.group()
            # Remove punctuation from the token for matching only; the
            # recorded span still covers the whole raw token, including any
            # punctuation attached to it.
            token_cleaned = re.sub(r'[^\w\s]', '', token)

            # Record the span when the cleaned token is a known gazetteer name
            if token_cleaned in country_set:
                entities.append([token_match.start(), token_match.end(), "GPE"])

    # Append the sentence and entities in required JSONL format
    jsonl_data.append([sentence, {"entities": entities}])

# Serialize every record as one JSON document per line
output_path = "tagged_texts.jsonl"
with open(output_path, 'w', encoding='utf-8') as f:
    f.writelines(json.dumps(entry) + "\n" for entry in jsonl_data)

print(f"Tagged data saved to {output_path}")


Processing sentences: 100%|██████████| 1011/1011 [00:00<00:00, 30179.79it/s]

Tagged data saved to tagged_texts.jsonl





In [8]:
import json

# Source file (all tagged records) and destination (records with entities only)
input_path = "tagged_texts.jsonl"
output_path = "filtered_tagged_texts.jsonl"

# Stream the input line by line, copying across only records that carry entities
with open(input_path, 'r', encoding='utf-8') as infile:
    with open(output_path, 'w', encoding='utf-8') as outfile:
        for raw_line in infile:
            record = json.loads(raw_line)  # each line is a [sentence, {"entities": [...]}] pair
            if not record[1].get("entities", []):
                continue  # nothing was tagged in this sentence
            outfile.write(json.dumps(record) + "\n")

print(f"Filtered data saved to {output_path}")


Filtered data saved to filtered_tagged_texts.jsonl
