In [1]:
import json
from typing import List, Dict, Tuple
from copy import deepcopy
from pprint import pprint

In [2]:
def convert_to_hashable(obj):
    """Convert a dictionary into a hashable tuple representation."""
    if isinstance(obj, dict):
        return tuple(
            (k, convert_to_hashable(v)) 
            for k, v in sorted(obj.items())
        )
    elif isinstance(obj, list):
        return tuple(convert_to_hashable(item) for item in obj)
    return obj

In [3]:
def remove_duplicates(data: List[Dict]) -> Tuple[List[Dict], List[Dict]]:
    """Remove duplicate dictionaries from the list while preserving order."""
    seen = set()
    unique_data = []
    dups = []
    
    for i, item in enumerate(data):
        # Convert the dictionary to a hashable format
        hashable_item = convert_to_hashable(item)
        
        # If we haven't seen this item before, add it to our results
        if hashable_item not in seen:
            seen.add(hashable_item)
            unique_data.append(deepcopy(item))
        else:
            dups.append(deepcopy(item))

        print(f"Processed {i+1} items", end='\r')
    
    return unique_data, dups

In [4]:
infile = 'dataset_6.json'
outfile = f"{'_'.join(infile.split("_")[:2])}.json"
print(f"outfile: {outfile}")

outfile: dataset_6.json.json


In [5]:
# Read the JSON file
with open(infile, 'r') as f:
    data = json.load(f)

plans = data['plans']

# Remove duplicates
unique_data, duplicates = remove_duplicates(plans)

# Print statistics
print(f"Original number of plans: {len(plans)}")
print(f"Number of plans after removing duplicates: {len(unique_data)}")
print(f"Number of duplicates removed: {len(plans) - len(unique_data)}")

Original number of plans: 44175
Number of plans after removing duplicates: 44175
Number of duplicates removed: 0


In [7]:
# print("Duplicates:")
# for item in duplicates:
#     pprint(item)

In [8]:
# Write back to file if needed
with open(outfile, 'w') as f:
    data['plans'] = unique_data
    json.dump(data, f, indent=2)