## Notebook for Frame Annotation of Images and Text for 200 Sampled Articles

#### Load libraries
`pip install pigeonXT-jupyter`

In [18]:
import pigeonXT as pixt

In [19]:
import pandas as pd
import re
from pathlib import Path
from IPython.display import display, Image

## Create a sample of 200 filtered text-image pairs

In [22]:
# All input/output locations are resolved relative to the shared data root,
# which sits two levels above the analysis folder.
analysis_path = Path('/projects/frame_align/data/annotated/analysis')
data_root = analysis_path.parent.parent

# Full table of annotated articles; kept under its own name so the sample
# below never shadows the data it was drawn from.
annotated_articles = pd.read_csv(data_root / "sample" / 'sampled_annotated_articles.csv')

sampled_200_csv_file = data_root / "srishti-analysis" / "sampled_annotated_articles_200.csv"
# make sure the output folder is there before writing into it
sampled_200_csv_file.parent.mkdir(parents=True, exist_ok=True)

# delete the file if it exists
if sampled_200_csv_file.exists():
    sampled_200_csv_file.unlink()
    print(f"Existed!Deleted {sampled_200_csv_file}")

# sample 200 from the annotated articles (fixed seed, so the draw is repeatable)
sample_200 = annotated_articles.sample(200, random_state=42)
sample_200.to_csv(sampled_200_csv_file, index=False)
print(f"Saved {sampled_200_csv_file}")

Existed!Deleted /projects/frame_align/data/srishti-analysis/sampled_annotated_articles_200.csv
Saved /projects/frame_align/data/srishti-analysis/sampled_annotated_articles_200.csv


#### Create list of frame names

In [23]:
# Distinct text-frame labels found in the sample; the list form is what the
# annotation widgets receive as their options.
frames = pd.unique(sample_200['text_frame_name'])
frames_list = frames.tolist()
print(f"Frame length: {len(frames_list)}")

Frame length: 15


## Save all 200 sampled images for human annotation


#### Create path of image directories

In [24]:
# One folder name per calendar month covered by the data (May 2023 - April 2024).
all_months = [
    "2023-05-01_2023-05-31",
    "2023-06-01_2023-06-30",
    "2023-07-01_2023-07-31",
    "2023-08-01_2023-08-31",
    "2023-09-01_2023-09-30",
    "2023-10-01_2023-10-31",
    "2023-11-01_2023-11-30",
    "2023-12-01_2023-12-31",
    "2024-01-01_2024-01-31",
    "2024-02-01_2024-02-29",
    "2024-03-01_2024-03-31",
    "2024-04-01_2024-04-30",
]

# Images live under two roots; the second one keeps them in a per-month subfolder.
base_img_dir_1 = Path("/projects/frame_align/data/news_img_data/")
base_img_dir_2 = Path("/projects/frame_align/data/img_data/")

news_month_dirs = [base_img_dir_1 / span for span in all_months]
downloaded_month_dirs = [
    base_img_dir_2 / span / 'correct_downloaded_imgs' for span in all_months
]

month_dir_paths = news_month_dirs + downloaded_month_dirs
sorted_month_dir_paths = sorted(month_dir_paths)

#### Save month and uuids for images

In [25]:
# Keep only the two columns needed to locate each image on disk:
# the month folder name and the image's UUID.
vision_month_uuids = sample_200.loc[:, ['month', 'vision_uuid']]
vision_month_uuids

Unnamed: 0,month,vision_uuid
2754,2023-10-01_2023-10-31,0416eb08-005f-44ba-aec8-ca0ee5e3faaa
2624,2023-10-01_2023-10-31,f55eea4a-8826-4c81-a927-3335b7bdfd64
178,2024-02-01_2024-02-29,ea16e806-d15f-41ba-9146-064f2c8f7ece
1502,2023-07-01_2023-07-31,89e378c2-efce-4100-8ce0-736f326280e8
2728,2023-10-01_2023-10-31,4254cc3e-ff7e-4810-b3dd-b3cacb4aff56
...,...,...
184,2024-02-01_2024-02-29,c69cfe52-e928-4dc3-b36b-cf2c9e081e0f
478,2023-12-01_2023-12-31,688c040c-187b-45c7-8f29-34c94b2e0633
1475,2023-07-01_2023-07-31,72e8990b-2611-4558-ad7f-602058384aab
3083,2023-08-01_2023-08-31,e6dc5ef2-40da-48e7-a517-63a347c2e53c


#### Create a list of image paths we sampled

In [26]:
from tqdm import tqdm

# Locate the .jpg for every sampled (month, vision_uuid) pair.
existing_files = []   # one path per sampled image that was found
missing_uuids = []    # sampled images not present in any candidate directory

sampled_pairs = zip(vision_month_uuids['month'], vision_month_uuids['vision_uuid'])
for month, uuid in tqdm(sampled_pairs, total=vision_month_uuids.shape[0]):
    located = None
    for dir_path in sorted_month_dir_paths:
        # only look in directories belonging to this row's month
        if month not in str(dir_path):
            continue
        candidate = dir_path / f"{uuid}.jpg"
        if candidate.exists():
            # An image may exist under both base directories. Record it once,
            # keeping the last hit: the copy step writes by file name, so the
            # last hit is the file that ended up in the destination anyway.
            located = candidate
    if located is None:
        missing_uuids.append(uuid)
    else:
        existing_files.append(located)

# Surface gaps instead of silently annotating fewer images than were sampled.
if missing_uuids:
    print(f"Warning: no image file found for {len(missing_uuids)} sampled rows: {missing_uuids}")

100%|██████████| 200/200 [00:00<00:00, 581.33it/s]


#### Copy the images to a new directory that will be used for annotation
New image directory: `../human_annotation/images`

In [27]:
import shutil
from tqdm import tqdm

# Gather the located images in a single flat folder for the annotation step.
destination_dir = Path("../human_annotation/images")
destination_dir.mkdir(parents=True, exist_ok=True)

for file_path in tqdm(existing_files, desc="Copying files"):
    # the file name (<uuid>.jpg) is kept, so a repeated name overwrites the earlier copy
    destination = destination_dir / file_path.name
    shutil.copy(file_path, destination)

Copying files: 100%|██████████| 200/200 [00:01<00:00, 135.10it/s]


In [28]:
import glob

# glob yields matches in arbitrary, filesystem-dependent order; sorting makes
# the order in which images are presented for annotation reproducible.
all_image_paths = sorted(glob.glob(str(destination_dir / "*.jpg")))
print(f"Total number of image files: {len(all_image_paths)}")

Total number of image files: 200


## Image Frame Annotation

Using pigeonXT
Source: https://github.com/dennisbakhuis/pigeonXT 

In [None]:
def custom_display(html_content, width=224, height=224):
    """Show the image referenced by an annotation example in the notebook.

    The example is converted to a string and the text between the first
    ``<div>`` / ``</div>`` pair is used as the image file path.

    Args:
        html_content: The example passed in by the annotation widget; only
            its string form is inspected.
        width: Width passed to ``IPython.display.Image`` (default 224).
        height: Height passed to ``IPython.display.Image`` (default 224).

    Returns:
        None. The image is displayed as a side effect; if no ``<div>`` wrapped
        path is found, a message is printed instead.
    """
    html_string = str(html_content)

    match = re.search(r'<div>(.*?)</div>', html_string)
    if match is None:
        print("No image path found in HTML content")
        return

    image_path = match.group(1)
    # render at a fixed size so every example is shown at the same scale
    resized_img = Image(filename=image_path, width=width, height=height)
    display(resized_img)


# Start the image labelling widget: several frames may be toggled per image.
# TODO: long option labels are cut off on the buttons; find a way to show them in full.
image_annotations = pixt.annotate(
    all_image_paths,
    task_type='multilabel-classification',
    options=frames_list,
    display_fn=custom_display,
)

HTML(value='0 of 200 Examples annotated, Current Position: 0 ')

VBox(children=(HBox(children=(ToggleButton(value=False, description='quality of life'), ToggleButton(value=Fal…

Output()

#### Save image annotations to a CSV file

In [None]:
# Export the image labels, replacing any file left over from an earlier run.
image_annotations_file = '../human_annotation/image_annotations.csv'
image_annotations_path = Path(image_annotations_file)

if image_annotations_path.exists():
    image_annotations_path.unlink()
    print(f"Existed!Deleted {image_annotations_file}")

image_annotations.to_csv(image_annotations_file, index=False)
print(f"Saved {image_annotations_file}")

## Text Frame Annotations

#### Make list of headlines from the sampled data

In [None]:
# Path to the sampled-articles CSV -- change this to your own file location.
samples_txt = pd.read_csv("../human_annotation/sampled_annotated_articles_200.csv")
# The headline of each sampled article is the text shown to the annotator.
sample_headlines = samples_txt['title'].to_list()

#### Text frame annotation

In [None]:
# Start the headline labelling widget with the same frame options as for images;
# pigeonXT is asked to reset the buttons after each click.
text_annotations = pixt.annotate(
    sample_headlines,
    reset_buttons_after_click=True,
    options=frames_list,
)

#### Save text annotations to a csv file

In [None]:
# Export the headline labels, replacing any file left over from an earlier run.
text_annotations_file = "../human_annotation/text_annotations.csv"
text_annotations_path = Path(text_annotations_file)

if text_annotations_path.exists():
    text_annotations_path.unlink()
    print(f"Existed!Deleted {text_annotations_file}")

text_annotations.to_csv(text_annotations_file, index=False)
print(f"Saved {text_annotations_file}")
