In [2]:
import pandas as pd
import os
import json
from glob import glob

In [None]:
# Build an image-path -> text-content table from JSON annotation files.
#
# Every `*.json` file found (recursively) under `test_dir` is paired with an
# image that sits in the same directory and shares the same base name. The
# 'text' fields of the JSON entries are joined with single spaces to form the
# content string for that image.

test_dir = os.path.join(os.getcwd(), 'test_data')

# Extensions are tried in this order; the first existing file wins.
image_extensions = ['.png', '.jpg', '.jpeg', '.bmp', '.tiff']


def find_image_for(json_path, extensions):
    """Return the path of the image that sits next to ``json_path``.

    Args:
        json_path: Path to a JSON file.
        extensions: Candidate image extensions (with leading dot), in
            priority order.

    Returns:
        The first existing ``<dir>/<base name><ext>`` path, or ``None`` when
        no candidate exists. Each extension is tried as given and then
        upper-cased, so ``scan.PNG`` is also found on case-sensitive
        filesystems.
    """
    base_name = os.path.splitext(os.path.basename(json_path))[0]
    base_path = os.path.join(os.path.dirname(json_path), base_name)

    for ext in extensions:
        for candidate_ext in (ext, ext.upper()):
            candidate = base_path + candidate_ext
            if os.path.exists(candidate):
                return candidate
    return None


def extract_text(json_data):
    """Join the 'text' values of the dict entries in ``json_data``.

    Args:
        json_data: Iterable of parsed JSON entries.

    Returns:
        The 'text' values joined by single spaces ('' when there are none).

    Raises:
        TypeError: If ``json_data`` is not iterable or a 'text' value is not
            a string.
    """
    # Only dicts are inspected: for a plain string entry `'text' in item`
    # would be a substring test and `item['text']` would then raise.
    return ' '.join(
        item['text']
        for item in json_data
        if isinstance(item, dict) and 'text' in item
    )


def build_image_content_mapping(json_paths, extensions):
    """Map each image path to the text extracted from its JSON file.

    JSON files without a matching image, or that cannot be read/parsed, are
    reported with ``print`` and skipped (best effort).

    Args:
        json_paths: Iterable of JSON file paths.
        extensions: Candidate image extensions passed to ``find_image_for``.

    Returns:
        dict of ``image path -> joined text``, in the order of ``json_paths``.
    """
    mapping = {}
    for json_path in json_paths:
        img_path = find_image_for(json_path, extensions)
        if img_path is None:
            print(f"Warning: No image found for {json_path}")
            continue

        try:
            with open(json_path, 'r', encoding='utf-8') as f:
                json_data = json.load(f)
            mapping[img_path] = extract_text(json_data)
        # OSError: unreadable file; ValueError: invalid JSON or bad UTF-8
        # (JSONDecodeError and UnicodeDecodeError are subclasses);
        # TypeError: unexpected JSON structure.
        except (OSError, ValueError, TypeError) as e:
            print(f"Error processing {json_path}: {e}")
    return mapping


# glob() gives no ordering guarantee; sort so that the row order of `df`
# (and of anything exported from it) is reproducible across runs.
json_files = sorted(glob(os.path.join(test_dir, '**/*.json'), recursive=True))

img_content_mapping = build_image_content_mapping(json_files, image_extensions)

df = pd.DataFrame(list(img_content_mapping.items()), columns=['image_path', 'content'])

print(f"Found {len(df)} images with mapped content")
df.head()

Found 172 images with mapped content


Unnamed: 0,image_path,content
0,/Users/chigi/Developer/cd_ocr_code_runner/test...,* Cross road at next lights and continue strai...
1,/Users/chigi/Developer/cd_ocr_code_runner/test...,Language for communication use single -word an...
2,/Users/chigi/Developer/cd_ocr_code_runner/test...,Everything will be okay in the end. If it's no...
3,/Users/chigi/Developer/cd_ocr_code_runner/test...,"communication directly. knowledge, recepient c..."
4,/Users/chigi/Developer/cd_ocr_code_runner/test...,Date: 01/31/18 limit the request to the smalle...


In [14]:
# Display the `df` DataFrame (columns: image_path, content); the rendered
# output elides the middle rows.
df

Unnamed: 0,image_path,content
0,/Users/chigi/Developer/cd_ocr_code_runner/test...,* Cross road at next lights and continue strai...
1,/Users/chigi/Developer/cd_ocr_code_runner/test...,Language for communication use single -word an...
2,/Users/chigi/Developer/cd_ocr_code_runner/test...,Everything will be okay in the end. If it's no...
3,/Users/chigi/Developer/cd_ocr_code_runner/test...,"communication directly. knowledge, recepient c..."
4,/Users/chigi/Developer/cd_ocr_code_runner/test...,Date: 01/31/18 limit the request to the smalle...
...,...,...
167,/Users/chigi/Developer/cd_ocr_code_runner/test...,VITAMIN- A Found in green & leafy vegetables/ ...
168,/Users/chigi/Developer/cd_ocr_code_runner/test...,autonomy energing self-care Bodily Awareness u...
169,/Users/chigi/Developer/cd_ocr_code_runner/test...,Age range. How frequently do you shop on Junwa...
170,/Users/chigi/Developer/cd_ocr_code_runner/test...,1.00 pm See- saw 1.05 pm Scooter in playground...


In [15]:
# Write the image_path/content table to a CSV file in the current working
# directory, leaving out the DataFrame index column.
output_csv = 'test_dataset.csv'
df.to_csv(output_csv, index=False)