# In [3]:
import csv
import json

import pandas as pd

# Load Japan place names from JP.txt, ordered from most to least populous.
# The file is read as headerless, tab-separated text (presumably a GeoNames
# country dump -- confirm). Only two columns are kept: index 2 is used as the
# location name and index 14 as the population.
jp_data = pd.read_csv(
    "JP.txt",
    sep="\t",
    header=None,
    usecols=[2, 14],
    names=["location", "population"],
    # Treat '"' as an ordinary character: with the default quoting, a stray
    # double quote inside a field makes the parser swallow the following tabs
    # and newlines, silently merging rows.
    quoting=csv.QUOTE_NONE,
)
# Rows lacking either a name or a population cannot be ranked; drop them.
jp_data = jp_data.dropna(subset=["location", "population"])
jp_data = jp_data.sort_values(by="population", ascending=False)
# unique() keeps first-occurrence order, so the population ranking survives
# de-duplication; astype(str) guarantees every name is a plain string.
jp_locations = jp_data["location"].astype(str).unique().tolist()
# Shared iterator: each replaced GPE consumes the next name, so a location is
# handed out at most once across the whole run.
jp_location_iter = iter(jp_locations)

# Function to replace GPE entities with Japan locations and adjust indices
def replace_gpe_with_japan_location(text, entities, locations=None):
    """Swap every GPE span in *text* for a Japan location and re-index entities.

    Args:
        text: The original sentence/document.
        entities: Iterable of ``(start, end, label)`` character spans into
            *text*. Spans are assumed not to overlap; they may be given in
            any order.
        locations: Optional iterable of replacement names. When omitted, the
            module-level ``jp_location_iter`` is used, so names keep being
            consumed across calls. Passing an iterator shares its state with
            the caller in the same way.

    Returns:
        ``(new_text, new_entities)`` where ``new_entities`` is a list of
        ``[start, end, label]`` lists in the same order as *entities*, with
        offsets valid for ``new_text``.

    If the supply of locations runs out, a message is printed once and the
    remaining GPE spans keep their original text; no entity is dropped.
    """
    entities = list(entities)
    source = jp_location_iter if locations is None else iter(locations)

    # Pass 1: hand out replacement names in the caller's entity order, so the
    # assignment of names to entities does not depend on span positions.
    replacements = []
    exhausted = False
    for _start, _end, label in entities:
        replacement = None
        if label == "GPE" and not exhausted:
            try:
                replacement = next(source)
            except StopIteration:
                print("Ran out of Japan locations.")
                exhausted = True
        replacements.append(replacement)

    # Pass 2: walk the spans left to right so the running offset is always
    # the net length change of everything before the current span.
    by_position = sorted(range(len(entities)), key=lambda i: entities[i][0])
    new_entities = [None] * len(entities)
    pieces = []
    cursor = 0  # index in the original text up to which pieces are emitted
    offset = 0  # cumulative length difference introduced so far

    for idx in by_position:
        start, end, label = entities[idx]
        replacement = replacements[idx]

        if replacement is None:
            # Non-GPE (or no name left): text is untouched, only shift indices.
            new_entities[idx] = [start + offset, end + offset, label]
            continue

        pieces.append(text[cursor:start])
        pieces.append(replacement)
        cursor = end

        new_start = start + offset
        new_entities[idx] = [new_start, new_start + len(replacement), label]
        offset += len(replacement) - len(text[start:end])

    pieces.append(text[cursor:])
    return "".join(pieces), new_entities

# Process each entry in the input .jsonl file.
# Each non-blank line is expected to be a JSON array whose first element is
# the text and whose second element is an object with an "entities" list.
updated_entries = []
# JSON Lines is UTF-8 by definition; be explicit so the result does not
# depend on the platform's default encoding.
with open("data2.jsonl", "r", encoding="utf-8") as file:
    for line in file:
        # Tolerate blank lines (e.g. a trailing newline at end of file),
        # which json.loads would otherwise reject.
        if not line.strip():
            continue

        entry = json.loads(line)
        text, entity_info = entry[0], entry[1]
        entities = entity_info["entities"]

        # Replace GPEs and adjust entity indices
        new_text, new_entities = replace_gpe_with_japan_location(text, entities)

        # Only the "entities" key is carried over into the rewritten entry.
        updated_entries.append([new_text, {"entities": new_entities}])

# Save updated entries to a new .jsonl file, one JSON document per line.
with open("data3.jsonl", "w", encoding="utf-8") as file:
    for entry in updated_entries:
        json.dump(entry, file)
        file.write("\n")