In [1]:
import os
import glob
import json

In [2]:
# Directory holding the per-file image score JSONs that get merged below.
base_path = "./data/wikiweb2m/text_image_scores"

# glob returns matches in arbitrary (filesystem-dependent) order. Sort them so
# that, should two files contain the same key, the same file wins on every run.
json_files = sorted(glob.glob(os.path.join(base_path, "*.json")))

# Merge every file into one mapping; a key seen again in a later file
# overwrites the earlier value (dict.update semantics).
data = {}
for file in json_files:
    with open(file, 'r') as f:
        data.update(json.load(f))

print(f"Found {len(data)} image scores")

# Save the merged mapping as image_scores.json in the parent of base_path.
with open(os.path.join(os.path.dirname(base_path), 'image_scores.json'), 'w') as f:
    json.dump(data, f, indent=4)

Found 270 image scores


In [6]:
# Text-only scores.
text_score_path = "./text_scores.json"
with open(text_score_path) as text_file:
    text_scores = json.load(text_file)
print(f"Found {len(text_scores)} text scores")

# Image-only scores (the merged file written to ./data/wikiweb2m).
image_score_path = "./data/wikiweb2m/image_scores.json"
with open(image_score_path) as image_file:
    image_scores = json.load(image_file)
print(f"Found {len(image_scores)} image scores")

# Keys present in both score files.
common_keys = set(text_scores) & set(image_scores)
print(f"Found {len(common_keys)} common keys")

Found 10000 text scores
Found 270 image scores
Found 270 common keys


In [8]:
# Score file for the combined text + image run.
combined_scores_path = "./text_scores_image_True_text_True_scores.json"

with open(combined_scores_path) as combined_file:
    combined_scores = json.load(combined_file)

print(f"Found {len(combined_scores)} combined scores")

Found 270 combined scores


In [45]:
from evaluate import compare_rankings

# compare_rankings comes from the `evaluate` import above; its logic is not
# visible in this notebook.
# NOTE(review): going by the variable names and the summary printed further
# down, the three results appear to be the "text correct", "image correct" and
# "both wrong" groups -- confirm against the implementation.
# Later cells take len() of each result and use every element of i_correct as
# a key into `documents`.
t_correct, i_correct, i_t_wrong = compare_rankings(text_scores, image_scores)

In [50]:
# Size of each group returned by compare_rankings.
n_t_correct, n_i_correct, n_i_t_wrong = len(t_correct), len(i_correct), len(i_t_wrong)

# The text/image shares are relative to the two "correct" groups combined;
# the both-wrong share is relative to the number of combined scores.
n_correct_total = n_t_correct + n_i_correct
n_scored_total = len(combined_scores)

# Guard every division: with empty groups (or an empty score file) the plain
# ratio raises ZeroDivisionError, so report NaN for an undefined share instead.
n_t_correct_p = n_t_correct / n_correct_total if n_correct_total else float("nan")
n_i_correct_p = n_i_correct / n_correct_total if n_correct_total else float("nan")
n_i_t_wrong_p = n_i_t_wrong / n_scored_total if n_scored_total else float("nan")

print(f"Text correct: {n_t_correct} ({n_t_correct_p})")
print(f"Image correct: {n_i_correct} ({n_i_correct_p})")
print(f"both wrong p: {n_i_t_wrong} ({n_i_t_wrong_p})")

Text correct: 169 (0.9825581395348837)
Image correct: 3 (0.01744186046511628)
both wrong p: 14 (0.05185185185185185)


In [27]:
from tqdm import tqdm
# Root of the dataset; also used by the export cell further down.
base_path = "./data/wikiweb2m"

# load the documents
print('Loading document map ...')
# Build the path from base_path instead of repeating the directory in an
# f-string that has no placeholders; the resulting path is unchanged.
with open(os.path.join(base_path, 'test_document_map.json'), 'r') as f:
    document_map = json.load(f)


# document_map maps each doc id to the path of that document's JSON file;
# read every file into memory, keyed by doc id.
documents = {}
for doc_id, filename in tqdm(document_map.items(), total=len(document_map), desc='Loading documents'):
    with open(filename, 'r') as f:
        document = json.load(f)
        documents[doc_id] = document

Loading document map ...


Loading documents: 100%|██████████| 100833/100833 [00:18<00:00, 5586.39it/s]


In [29]:
# Pull the full document record for every id in i_correct.
docs_i_correct = dict((key, documents[key]) for key in i_correct)

In [33]:
# Show which fields one selected document carries. The id is hard-coded, so
# this raises KeyError if 'test_314' is not among the ids in docs_i_correct.
docs_i_correct['test_314'].keys()

dict_keys(['clean_page_description', 'is_page_description_sample', 'page_contains_images', 'page_content_sections_without_table_list', 'page_title', 'page_url', 'raw_page_description', 'split', 'text', 'image_urls', 'image_paths', 'image_captions'])

In [34]:
import os
from PIL import Image
import shutil

# Modes Pillow's JPEG writer accepts. Any other mode (e.g. RGBA or palette
# images) makes Image.save raise OSError for a .jpg target, so those images
# are converted to RGB first.
jpeg_modes = {"1", "L", "RGB", "RGBX", "CMYK", "YCbCr"}

for doc_id, doc in docs_i_correct.items():
    # Create a directory for each document
    doc_dir = os.path.join(base_path, 'image_examples', doc_id)
    os.makedirs(doc_dir, exist_ok=True)

    # Save the page title
    with open(os.path.join(doc_dir, 'title.txt'), 'w') as f:
        f.write(doc['page_title'])

    # Save the images, re-encoded as JPEG and numbered in their original order
    for i, image_path in enumerate(doc['image_paths']):
        # The context manager closes the source file handle once the copy is
        # written; the original left every opened image unclosed.
        with Image.open(image_path) as img:
            jpeg_ready = img if img.mode in jpeg_modes else img.convert("RGB")
            jpeg_ready.save(os.path.join(doc_dir, f'image_{i}.jpg'))

    print(f"Saved images and title for document {doc_id} to {doc_dir}")


Saved images and title for document test_314 to ./data/wikiweb2m/image_examples/test_314
Saved images and title for document test_164 to ./data/wikiweb2m/image_examples/test_164
Saved images and title for document test_215 to ./data/wikiweb2m/image_examples/test_215
