In [1]:
import json

def clean_overlapping_entities(data):
    """Remove overlapping entity spans from every annotated example.

    Each element of ``data`` is a two-item sequence ``(text, annotations)``
    where ``annotations['entities']`` holds ``(start, end, label)`` triples.
    Spans are visited in ascending ``(start, end)`` order and a span is kept
    only if it starts at or after the end of the last kept span.

    Args:
        data: Iterable of ``(text, annotations)`` pairs.

    Returns:
        A new list of ``[text, {"entities": kept_spans}]`` entries. Only the
        ``"entities"`` key is carried over from the original annotations.
    """
    result = []

    for text, annotations in data:
        # Work on an ordered copy so the caller's entity list is left untouched
        ordered = list(annotations['entities'])
        ordered.sort(key=lambda span: (span[0], span[1]))

        kept = []
        boundary = -1  # end offset of the most recently kept span

        for span in ordered:
            start, end, _label = span

            # A span that begins before the boundary collides with a kept one
            if start < boundary:
                continue

            kept.append(span)
            boundary = end

        result.append([text, {"entities": kept}])

    return result

# Function to load JSONL data from file
def load_jsonl(file_path):
    """Read a JSON Lines file and return its records as a list.

    Args:
        file_path: Path of the file to read; one JSON document per line.

    Returns:
        A list with one decoded object per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Decode explicitly as UTF-8: without an encoding argument, open() falls
    # back to the platform locale and can mis-decode non-ASCII text.
    with open(file_path, 'r', encoding='utf-8') as f:
        # Blank lines (e.g. a trailing newline at end of file) hold no record
        # and would otherwise make json.loads raise.
        return [json.loads(line) for line in f if line.strip()]

# Function to save cleaned JSONL data back to file
def save_jsonl(data, output_file):
    """Write each item of ``data`` to ``output_file`` as one JSON line.

    The file is opened in write mode, so existing content is replaced.

    Args:
        data: Iterable of JSON-serializable objects.
        output_file: Path of the file to write.
    """
    with open(output_file, 'w') as handle:
        # One serialized record per line, each terminated by a newline
        handle.writelines(f"{json.dumps(record)}\n" for record in data)

# Example usage: run the full load -> clean -> save pipeline on one file.
# Both paths are relative, so they resolve against the current working directory.
input_file = 'train/train_data_2.jsonl'  # Replace with your input file path
output_file = 'tezzzt.jsonl'  # Output file path; save_jsonl opens it with 'w', replacing any existing content

# Load every record, drop overlapping entity spans, and write the result.
# NOTE(review): these variables are named "test" but the input path is under train/ — confirm the intended split.
test_data = load_jsonl(input_file)
cleaned_test_data = clean_overlapping_entities(test_data)
save_jsonl(cleaned_test_data, output_file)

print(f"Cleaned data saved to {output_file}")


Cleaned data saved to tezzzt.jsonl
