Still a work in progress.
In sequence:
1. Run your spiders and store their output as JSON files in a dedicated folder (`input` in step 1 below).
2. Step 1 (individual json): Map each org_id to its org_sort. The mapping reference is in org_mapping.json.
3. Step 2 (individual json): Create UUID (currently disabled in the code) & sort the data by division_sort, then person_sort_order.
- A new key is introduced: 'person_sort'. It restarts at 1 for each division and fixes the incorrect 'person_sort_order'.
4. Step 3 (all json): Compile all data.
- Removed 'person_sort_order'.
5. Step 4 (all json): Re-sort the compiled JSON (compiled2.json) by org_sort, division_sort, person_sort.
- The final output.

### 1. Mapping org_id with org_sort

In [8]:
import json
import os

def load_org_mapping(mapping_file):
    """Load the org_id -> org_sort lookup table from a JSON file.

    Args:
        mapping_file: Path to the JSON mapping file.

    Returns:
        The decoded JSON document.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Decode explicitly as UTF-8 so non-ASCII text is read identically on
    # every platform instead of depending on the locale's default encoding.
    with open(mapping_file, 'r', encoding='utf-8') as file:
        return json.load(file)

def load_spider_output(file_path):
    """Load one spider output file and return its decoded JSON content.

    Args:
        file_path: Path to the spider's JSON output file.

    Returns:
        The decoded JSON document.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Crawled names may contain non-ASCII characters; decode as UTF-8
    # explicitly rather than relying on the locale's default encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)

def save_updated_data(data, output_file):
    """Write `data` to `output_file` as JSON indented by 4 spaces.

    Any existing file at `output_file` is overwritten.
    """
    with open(output_file, 'w') as handle:
        handle.write(json.dumps(data, indent=4))

def map_org_sort(spider_output, org_mapping):
    """Attach 'org_sort' to every entry whose 'org_id' is in the mapping.

    Entries are modified in place. An entry with a missing or falsy
    'org_id', or with an 'org_id' absent from `org_mapping`, is left
    unchanged and a warning is printed.

    Returns:
        The same `spider_output` object that was passed in.
    """
    for record in spider_output:
        key = record.get('org_id')

        if not key:
            # nothing to look up for this record
            print(f"Warning: 'org_id' missing in entry: {record}")
        elif key not in org_mapping:
            print(f"Warning: 'org_id' {key} not found in org_mapping.")
        else:
            record['org_sort'] = org_mapping[key]

    return spider_output

def clean_output_directory(output_dir):
    """Empty `output_dir` of regular files, creating the directory if absent.

    Sub-directories are left in place. Each removed file is reported on
    stdout.
    """
    if not os.path.exists(output_dir):
        # nothing to clean yet: just make sure the directory is there
        os.makedirs(output_dir, exist_ok=True)
        return

    for name in os.listdir(output_dir):
        candidate = os.path.join(output_dir, name)
        if not os.path.isfile(candidate):
            continue
        os.remove(candidate)
        print(f"Deleted {candidate}")

def process_all_files(input_dir, output_dir, org_mapping): #main
    """Map org_sort onto every ".json" file in `input_dir`.

    `output_dir` is cleaned first. Each input file "<name>.json" is
    loaded, passed through map_org_sort with `org_mapping`, and saved as
    "<name>_org_mapped.json" inside `output_dir`.
    """
    clean_output_directory(output_dir)

    json_names = [name for name in os.listdir(input_dir) if name.endswith(".json")]
    for name in json_names:
        stem = os.path.splitext(name)[0]
        destination = os.path.join(output_dir, f"{stem}_org_mapped.json")

        records = load_spider_output(os.path.join(input_dir, name))
        save_updated_data(map_org_sort(records, org_mapping), destination)

        print(f"Processed {name}. Saved to {destination}")

def main():
    """Run step 1: add org_sort to every spider output file.

    Reads the mapping from 'org_mapping.json', processes the JSON files
    found in 'input', and writes the results to 'output1'.
    """
    source_dir = 'input'  # spider output JSON files
    target_dir = 'output1'  # destination for the updated JSON files
    mapping_path = 'org_mapping.json'  # org_id -> org_sort mapping

    process_all_files(source_dir, target_dir, load_org_mapping(mapping_path))

    print(f"All files processed and saved in {target_dir}")


if __name__ == "__main__":
    main()


Deleted output1/v2_petra_org_mapped.json
Deleted output1/v2_mof_org_mapped.json
Deleted output1/v2_kkdw_anggota_org_mapped.json
Deleted output1/v2_kpkm_org_mapped.json
Deleted output1/v2_kln_org_mapped.json
Processed v2_mof.json. Saved to output1/v2_mof_org_mapped.json
Processed v2_kkdw_anggota.json. Saved to output1/v2_kkdw_anggota_org_mapped.json
Processed v2_kln.json. Saved to output1/v2_kln_org_mapped.json
Processed v2_kpkm.json. Saved to output1/v2_kpkm_org_mapped.json
Processed v2_petra.json. Saved to output1/v2_petra_org_mapped.json
All files processed and saved in output1


### 2. Create UUID & sort by division_sort, person_sort_order

In [9]:
import json
import os
import uuid

def read_json_file(file_path):
    """Return the decoded JSON content of `file_path`."""
    with open(file_path, 'r') as source:
        raw_text = source.read()
    return json.loads(raw_text)

def write_json_file(file_path, data):
    """Write `data` as a JSON array with one compact object per line.

    The opening and closing brackets sit on their own lines; objects are
    separated by a comma followed by a newline.
    """
    with open(file_path, 'w') as out:
        out.write('[\n')
        # compact separators keep every object on a single line
        rows = (json.dumps(obj, separators=(',', ':')) for obj in data)
        out.write(',\n'.join(rows))
        out.write('\n]')

def generate_uuid():
    """Return a freshly generated random (version 4) UUID as a string."""
    identifier = uuid.uuid4()
    return str(identifier)

# def process_data(data): #Option 1: the person_sort does not reset for each division
#     sorted_data = sorted(data, key=lambda x: (x['division_sort'], x['person_sort_order']))
#     processed_data = []
#     for idx, entry in enumerate(sorted_data):
#         new_entry = {'id': generate_uuid()}
#         new_entry.update({'person_sort': idx + 1})
#         new_entry.update(entry)
#         processed_data.append(new_entry)
#     return processed_data

def process_data(data): #Option 2: the person_sort will reset for each division
    """Sort entries and number them within each division.

    Entries are ordered by ('division_sort', 'person_sort_order'). Every
    returned entry is a shallow copy of the corresponding input entry
    with a 'person_sort' key holding its 1-based position inside its
    division. The input entries themselves are not modified.

    Args:
        data: Iterable of dicts, each providing 'division_sort' and
            'person_sort_order'.

    Returns:
        A new list of new dicts in sorted order.

    Raises:
        KeyError: If an entry lacks 'division_sort' or 'person_sort_order'.
    """
    sorted_data = sorted(data, key=lambda x: (x['division_sort'], x['person_sort_order']))
    processed_data = []
    current_division = None
    person_sort_counter = 0

    for entry in sorted_data:
        # restart the numbering whenever a new division begins
        if entry['division_sort'] != current_division:
            current_division = entry['division_sort']
            person_sort_counter = 1
        else:
            person_sort_counter += 1

        # NOTE: UUID assignment (an 'id' key) is currently disabled here.
        # Copy so the caller's dicts are not mutated as a side effect.
        new_entry = dict(entry)
        new_entry['person_sort'] = person_sort_counter
        processed_data.append(new_entry)

    return processed_data

def clean_output_directory(output_dir):
    """Delete every "*_sorted.json" entry in `output_dir`, reporting each one.

    Other files are left untouched. The directory must already exist.
    """
    stale_names = [name for name in os.listdir(output_dir) if name.endswith("_sorted.json")]
    for name in stale_names:
        target = os.path.join(output_dir, name)
        os.remove(target)
        print(f"Deleted {target}")

#main function to process all JSON files in a directory
def process_all_files(input_dir, output_dir=None):
    """Sort every ".json" file in `input_dir` and save a "_sorted" copy.

    When `output_dir` is None the results are written next to the inputs.
    The output directory is created if needed and any existing
    "*_sorted.json" files in it are removed before processing starts.
    """
    target_dir = input_dir if output_dir is None else output_dir

    os.makedirs(target_dir, exist_ok=True)
    clean_output_directory(target_dir)

    for name in os.listdir(input_dir):
        if not name.endswith(".json"):
            continue

        stem = os.path.splitext(name)[0]
        sorted_name = f"{stem}_sorted.json"

        records = process_data(read_json_file(os.path.join(input_dir, name)))
        write_json_file(os.path.join(target_dir, sorted_name), records)

        print(f"Processed {name} and saved to {sorted_name}")

# Step 2 driver: read the "*_org_mapped.json" files that step 1 wrote to
# 'output1' and write the "*_sorted.json" results to 'output2'.
input_dir = 'output1'  # folder holding the input files for this step
output_dir = 'output2'  # destination folder; process_all_files falls back to input_dir only when given None

process_all_files(input_dir, output_dir)


Deleted output2/v2_kpkm_org_mapped_sorted.json
Deleted output2/v2_mof_org_mapped_sorted.json
Deleted output2/v2_kkdw_anggota_org_mapped_sorted.json
Deleted output2/v2_kln_org_mapped_sorted.json
Deleted output2/v2_petra_org_mapped_sorted.json
Processed v2_petra_org_mapped.json and saved to v2_petra_org_mapped_sorted.json
Processed v2_mof_org_mapped.json and saved to v2_mof_org_mapped_sorted.json
Processed v2_kkdw_anggota_org_mapped.json and saved to v2_kkdw_anggota_org_mapped_sorted.json
Processed v2_kpkm_org_mapped.json and saved to v2_kpkm_org_mapped_sorted.json
Processed v2_kln_org_mapped.json and saved to v2_kln_org_mapped_sorted.json


### 3. Compile all data
- check whether the required keys are there (based on the discussed json structure)
- remove 'person_sort_order'

In [10]:
import json
import os
import glob
import uuid

# Folder whose "*.json" files are compiled by this step.
input_folder = 'output2/'
# Single JSON file that receives every compiled entry.
output_file = 'compiled2.json'

# Keys that ensure_required_keys() checks for on every entry; a missing key
# is logged and filled with a default value.
required_keys = [
    # "id" is not checked: the UUID assignment in ensure_required_keys is commented out
    "org_sort", "org_id", "org_name", "org_type", "division_sort",
    "division_name", "unit_name", "person_sort", "person_name",
    "person_position", "person_email", "person_phone", "person_fax", "parent_org_id"
]

def ensure_required_keys(entry, json_file):
    """Fill in any key from `required_keys` that `entry` lacks.

    A missing 'parent_org_id' becomes an empty list; every other missing
    key becomes None. Each filled key is reported on stdout together with
    `json_file`. The entry is modified in place and returned.
    """
    # NOTE: assigning a UUID to a missing 'id' field is currently disabled.
    for field in required_keys:
        if field in entry:
            continue

        print(f"Missing '{field}' in {json_file}, setting default value.")
        # 'parent_org_id' must always be a list; other fields default to None
        entry[field] = [] if field == "parent_org_id" else None

    return entry

# Step 3 driver: merge every JSON file in `input_folder` into `output_file`,
# dropping 'person_sort_order' and filling in missing required keys.
compiled_data = []

if os.path.exists(input_folder):
    # Discard the previous output only once we know there is an input folder
    # to rebuild it from; otherwise a missing folder would destroy the last
    # good compilation without producing a new one.
    if os.path.exists(output_file):
        os.remove(output_file)
        print(f"Deleted existing {output_file}")

    # glob returns matches in arbitrary order; sort for reproducible output
    json_files = sorted(glob.glob(os.path.join(input_folder, '*.json')))

    #read each file and append its content to the compiled_data list
    for json_file in json_files:
        with open(json_file, 'r') as f:
            data = json.load(f)

        for entry in data:
            #remove 'person_sort_order' if it exists in any of the entries
            entry.pop('person_sort_order', None)

            #ensure each entry has all required keys, and log missing keys
            compiled_data.append(ensure_required_keys(entry, json_file))

    with open(output_file, 'w') as outfile:
        json.dump(compiled_data, outfile, indent=4)

    print(f"Compiled {len(json_files)} JSON files into {output_file}")
else:
    print(f"The folder {input_folder} does not exist.")


Deleted existing compiled2.json
Compiled 5 JSON files into compiled2.json


### 4. Re-sort compiled2.json by org_sort, division_sort, person_sort

In [11]:
import json
import os

# File read by this step (the same name step 3 writes its compiled output to).
compiled_file = 'compiled2.json'
# File that receives the re-sorted entries.
output_file = 'compiled_sorted_v2.json'

def read_json_file(file_path):
    """Open `file_path`, decode its JSON content and return the result."""
    with open(file_path, 'r') as source:
        contents = source.read()
    return json.loads(contents)

def write_json_file(file_path, data):
    """Overwrite `file_path` with `data` rendered as 4-space-indented JSON."""
    with open(file_path, 'w') as sink:
        sink.write(json.dumps(data, indent=4))

#sort the data based on 'org_sort', 'division_sort', and 'person_sort'
def sort_data(data):
    """Return a new list of entries ordered by org, division and person.

    Entries are compared on ('org_sort', 'division_sort', 'person_sort').
    A None value in any of these positions sorts after every real value
    instead of raising TypeError when compared with a number.

    Args:
        data: Iterable of dicts providing the three sort keys.

    Returns:
        A new sorted list; `data` itself is not reordered.

    Raises:
        KeyError: If an entry lacks one of the three sort keys.
    """
    sort_fields = ('org_sort', 'division_sort', 'person_sort')

    def none_last(value):
        # (False, v) orders before (True, 0), and None is never compared
        # against a real value.
        return (value is None, 0 if value is None else value)

    return sorted(data, key=lambda x: tuple(none_last(x[field]) for field in sort_fields))

# Step 4 driver: load the compiled file, re-sort it and save the result.
if not os.path.exists(compiled_file):
    print(f"The file {compiled_file} does not exist.")
else:
    compiled_data = read_json_file(compiled_file)
    sorted_data = sort_data(compiled_data)
    write_json_file(output_file, sorted_data)

    print(f"Sorted data and saved to {output_file}")


Sorted data and saved to compiled_sorted_v2.json
