# In [1]:
import json

# Function to check for overlap between two entities
def has_overlap(ent1, ent2):
    """Return True when the spans of two entities intersect.

    Each entity is an indexable whose first two items are its start and end
    offsets. The comparison is strict, so spans that only touch at a boundary
    (one's end equals the other's start) are not treated as overlapping.
    """
    return ent1[0] < ent2[1] and ent2[0] < ent1[1]


# Function to filter overlapping entities, keeping the shortest
def filter_overlapping_entities(entities):
    """Drop overlapping entities, preferring the shortest span in each clash.

    Candidates are considered from shortest to longest (ties broken by the
    earlier start), and a candidate is kept only if it overlaps none of the
    spans already kept. Sorting by start position first would let a long,
    early span shadow a shorter span nested inside it, which contradicts the
    "keep the shortest" intent.

    Args:
        entities: Iterable of entities; item 0 is the start offset and item 1
            the end offset. Any further items (e.g. a label) are carried
            through untouched.

    Returns:
        A new list of the kept entities ordered by start position, then by
        length. The input is not modified.
    """
    shortest_first = sorted(entities, key=lambda ent: (ent[1] - ent[0], ent[0]))

    kept = []
    for candidate in shortest_first:
        # Must be checked against every kept span: in length order the most
        # recently kept span is not necessarily the nearest one.
        if not any(has_overlap(chosen, candidate) for chosen in kept):
            kept.append(candidate)

    # Hand the result back in document order, as callers received before.
    kept.sort(key=lambda ent: (ent[0], ent[1] - ent[0]))
    return kept

# Load the annotated data from file
# Each non-blank line is parsed as one JSON value; the code below indexes it
# as [text, {"entities": [...]}].
with open("data3.jsonl", "r", encoding="utf-8") as file:
    # Skip blank lines (e.g. a trailing empty line), which json.loads rejects
    annotated_data = [json.loads(line) for line in file if line.strip()]

# Process each entry to filter entities
filtered_data = []
for entry in annotated_data:
    text, entity_info = entry[0], entry[1]

    # Resolve overlaps within this entry's entity list
    filtered_entities = filter_overlapping_entities(entity_info["entities"])

    # Entries left without any entity are dropped from the output
    if filtered_entities:
        filtered_data.append([text, {"entities": filtered_entities}])

# Save filtered data back to JSONL
# NOTE: this overwrites the input file in place; the read above has fully
# completed (and the file is closed) before the file is reopened for writing.
with open("data3.jsonl", "w", encoding="utf-8") as file:
    for entry in filtered_data:
        json.dump(entry, file)
        file.write("\n")
