In [3]:
import os
import json

# Define the parent directory containing JSON folders
input_dir = "../data/input/detection/"
output_file = "../data/input/detection/merged.json"

# Flat list of every record found in the per-folder JSON files
all_json = []

# Iterate over the folders in the input directory.
# os.listdir returns entries in arbitrary order, so sort to keep the merged
# file reproducible from run to run.
for folder in sorted(os.listdir(input_dir)):
    folder_path = os.path.join(input_dir, folder)

    # Only sub-directories are scanned; plain files directly inside
    # input_dir (including a previously written merged.json) are skipped.
    if not os.path.isdir(folder_path):
        continue
    print(f"Processing folder: {folder}")

    # Iterate over the JSON files in the folder
    for jf in sorted(os.listdir(folder_path)):
        json_file_path = os.path.join(folder_path, jf)

        # Ensure it's a regular .json file
        if not (os.path.isfile(json_file_path) and jf.endswith(".json")):
            continue

        try:
            # JSON is UTF-8 by specification; do not depend on the locale default.
            with open(json_file_path, "r", encoding="utf-8") as json_file:
                data = json.load(json_file)
        except (OSError, ValueError) as e:
            # Best effort: report the unreadable/invalid file and keep merging.
            # (json.JSONDecodeError and UnicodeDecodeError are ValueError subclasses.)
            print(f"Error processing file {json_file_path}: {e}")
            continue

        # Tag every dict record with the absolute path of the file it came from
        absolute_path = os.path.abspath(json_file_path)
        records = data if isinstance(data, list) else [data]
        for entry in records:
            if isinstance(entry, dict):
                entry["absolute_path"] = absolute_path

        # extend (not append) so a list-shaped file contributes its records
        # individually instead of as one nested list.
        all_json.extend(records)

# Save all merged JSON data into a single file
try:
    with open(output_file, "w", encoding="utf-8") as out_file:
        json.dump(all_json, out_file, indent=4)
    print(f"Merged JSON saved to: {os.path.abspath(output_file)}")
except OSError as e:
    # Report, then re-raise: continuing silently would leave later cells
    # reading a stale or missing merged file.
    print(f"Error saving merged JSON file: {e}")
    raise


Processing folder: TsubasaNoKioku
Processing folder: balloon_dream
Processing folder: boureisougi
Processing folder: rasetugari
Processing folder: tencho_isoro
Processing folder: tojime_no_siora
Merged JSON saved to: /work/pi_miyyer_umass_edu/ctpham/cs670-manga/data/input/detection/merged.json


In [2]:
import json

# Define the input JSON file and the output JSON file
input_json_file = "../data/input/detection/merged.json"
output_json_file = "../data/input/detection/updated_merged.json"

# Load the JSON data
try:
    with open(input_json_file, "r", encoding="utf-8") as infile:
        data = json.load(infile)
except (OSError, ValueError) as e:
    # Report and re-raise instead of calling exit(): exit() is an
    # interactive helper and, inside a Jupyter kernel, shuts the kernel down
    # rather than just stopping this cell.
    print(f"Error reading JSON file: {e}")
    raise

# Remove the 'path' field from each entry (non-dict entries are left untouched)
for entry in data:
    if isinstance(entry, dict):
        entry.pop("path", None)

# Save the updated data back to a new JSON file
try:
    with open(output_json_file, "w", encoding="utf-8") as outfile:
        json.dump(data, outfile, indent=4)
    print(f"Updated JSON saved to: {output_json_file}")
except OSError as e:
    # Report, then re-raise so a failed write is not mistaken for success.
    print(f"Error saving updated JSON file: {e}")
    raise


Updated JSON saved to: ../data/input/detection/updated_merged.json


In [11]:
import pandas as pd 
import ast
import json

# Read the merged detection JSON into a DataFrame and preview the first rows
merged_json_path = "../data/output/detection/merged.json"
df = pd.read_json(merged_json_path)
df.head()

Unnamed: 0,absolute_path,path,coordinates,text
0,/work/pi_miyyer_umass_edu/ctpham/cs670-manga/d...,/work/pi_miyyer_umass_edu/ctpham/cs670-manga/d...,"[[297, 1130, 25, 30], [250, 1121, 47, 49], [17...",
1,/work/pi_miyyer_umass_edu/ctpham/cs670-manga/d...,/work/pi_miyyer_umass_edu/ctpham/cs670-manga/d...,"[[1471, 927, 48, 32], [1471, 886, 52, 33], [13...",
2,/work/pi_miyyer_umass_edu/ctpham/cs670-manga/d...,/work/pi_miyyer_umass_edu/ctpham/cs670-manga/d...,"[[647, 961, 44, 83], [949, 849, 143, 268], [12...",
3,/work/pi_miyyer_umass_edu/ctpham/cs670-manga/d...,/work/pi_miyyer_umass_edu/ctpham/cs670-manga/d...,"[[430, 1109, 25, 30], [1200, 1108, 25, 30], [5...",
4,/work/pi_miyyer_umass_edu/ctpham/cs670-manga/d...,/work/pi_miyyer_umass_edu/ctpham/cs670-manga/d...,"[[418, 1106, 26, 30], [1336, 926, 57, 181], [4...",


In [13]:
# Turn one row holding parallel lists into one record per list element
def expand_rows(row):
    """Split a row into one dict per (coordinates, text) pair.

    Args:
        row: Mapping-like object (e.g. a DataFrame row or a dict) providing
            the keys "absolute_path", "path", "coordinates" and "text".
            "coordinates" and "text" are iterated together.

    Returns:
        A list of dicts with the keys "absolute_path", "path", "coordinates"
        and "text". The first two are copied from ``row`` unchanged; the last
        two hold one element each from the row's "coordinates" and "text".
        Pairing uses ``zip``, so it stops at the shorter of the two.
    """
    return [
        {
            "absolute_path": row["absolute_path"],
            "path": row["path"],
            "coordinates": coord_item,
            "text": text_item,
        }
        for coord_item, text_item in zip(row["coordinates"], row["text"])
    ]

# Apply the function and expand the DataFrame
expanded_data = []
# Rows with a missing value in any column are dropped before expansion
df = df.dropna().reset_index(drop=True)
for _, row in df.iterrows():
    expanded_data.extend(expand_rows(row))

# Build the frame first and export it as a separate statement: to_csv returns
# None when given a path, so chaining it into the assignment would leave
# expanded_df bound to None instead of the DataFrame.
# Explicit columns keep the CSV header present even when nothing was expanded.
expanded_df = pd.DataFrame(
    expanded_data,
    columns=["absolute_path", "path", "coordinates", "text"],
)
expanded_df.to_csv("../data/output/detection/merged.csv", index=False)