## Notebook for Frame Annotation for Image and Text for 200 Sampled data

#### Load libraries
`pip install pigeonXT-jupyter`

In [1]:
import pigeonXT as pixt

In [2]:
import pandas as pd
import re
from pathlib import Path
from IPython.display import display, Image

## Create sample of 200 text-image which are filtered

In [None]:
# analysis_path = Path('/projects/frame_align/data/annotated/analysis')

# annotated_sample_path = Path('../human_annotation/sampled_annotated_articles_200.csv')
# sample_200 = pd.read_csv(annotated_sample_path)
# print(f"Loaded {len(sample_200)} annotated samples")

Loaded 200 annotated samples


In [5]:
# uncomment this if you diont have the data

analysis_path = Path('/projects/frame_align/data/annotated/analysis')

sample_all = pd.read_csv(analysis_path.parent.parent / "sample"/'sampled_annotated_articles.csv')
# keep all rows where 'langauge' == 'en'
sample_all = sample_all[sample_all['language'] == 'en']

sampled_600_csv_file = analysis_path.parent.parent / "srishti-analysis"/"sampled_annotated_articles_600.csv"

# delete the file if it exists
if sampled_600_csv_file.exists():
    sampled_600_csv_file.unlink()
    print(f"Existed!Deleted {sampled_600_csv_file}")

# sample 200 from the annotated articles
sample_600 = sample_all.sample(600, random_state=42)
sample_600.to_csv(analysis_path.parent.parent / "srishti-analysis"/"sampled_annotated_articles_600.csv", index=False)
print(f"Saved {sampled_600_csv_file}")

Existed!Deleted /projects/frame_align/data/srishti-analysis/sampled_annotated_articles_600.csv
Saved /projects/frame_align/data/srishti-analysis/sampled_annotated_articles_600.csv


In [8]:
sample_600.head()
sample_600['language'].value_counts()

en    600
Name: language, dtype: int64

In [22]:
sample_600.groupby('month')['text_frame_name'].count()

month
2023-05-01_2023-05-31    47
2023-06-01_2023-06-30    52
2023-07-01_2023-07-31    41
2023-08-01_2023-08-31    41
2023-09-01_2023-09-30    49
2023-10-01_2023-10-31    52
2023-11-01_2023-11-30    55
2023-12-01_2023-12-31    64
2024-01-01_2024-01-31    48
2024-02-01_2024-02-29    51
2024-03-01_2024-03-31    53
2024-04-01_2024-04-30    47
Name: text_frame_name, dtype: int64

In [None]:

pivot_table = sample_600.pivot_table(index='month', columns='text_frame_name', aggfunc='size', fill_value=0)
pivot_table

text_frame_name,capacity and resources,crime and punishment,cultural identity,economic,external regulation and reputation,fairness and equality,health and safety,"legality, constitutionality and jurisprudence",morality,other,policy prescription and evaluation,political,public opinion,quality of life,security and defense
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2023-05-01_2023-05-31,5,6,3,5,2,4,1,4,4,1,2,2,3,4,1
2023-06-01_2023-06-30,3,3,3,3,2,5,2,5,1,2,8,5,1,7,2
2023-07-01_2023-07-31,1,2,2,4,1,2,6,4,4,0,2,2,4,6,1
2023-08-01_2023-08-31,2,1,1,1,0,4,2,4,3,1,7,2,3,5,5
2023-09-01_2023-09-30,5,3,4,3,3,5,1,2,4,3,4,5,1,1,5
2023-10-01_2023-10-31,2,3,3,3,5,3,2,7,3,1,2,5,4,4,5
2023-11-01_2023-11-30,3,4,5,8,1,3,7,1,3,4,2,2,2,5,5
2023-12-01_2023-12-31,3,4,7,6,2,5,5,4,4,4,1,3,8,5,3
2024-01-01_2024-01-31,5,4,3,6,0,4,3,4,4,4,5,2,2,2,0
2024-02-01_2024-02-29,3,4,3,0,1,4,3,3,4,4,4,5,2,7,4


#### Create list of frame names

In [9]:
# Get Frame names
frames = sample_600['text_frame_name'].unique()
frames_list = frames.tolist()
print(f"Frame length: {len(frames_list)}")

Frame length: 15


## Save all 200 sampled images for human annotation


#### Create path of image directories
Go to next block if you lready have images downloaded

In [10]:
all_months = ["2023-05-01_2023-05-31", "2023-06-01_2023-06-30", "2023-07-01_2023-07-31", "2023-08-01_2023-08-31", "2023-09-01_2023-09-30",
    "2023-10-01_2023-10-31", "2023-11-01_2023-11-30", "2023-12-01_2023-12-31", "2024-01-01_2024-01-31", "2024-02-01_2024-02-29",
    "2024-03-01_2024-03-31", "2024-04-01_2024-04-30"]


base_img_dir_1 = Path("/projects/frame_align/data/news_img_data/")
base_img_dir_2 = Path("/projects/frame_align/data/img_data/")

month_dir_paths = [base_img_dir_1 / month for month in all_months] + [base_img_dir_2 / month / 'correct_downloaded_imgs' for month in all_months]
sorted_month_dir_paths = sorted(month_dir_paths)

#### Save month and uuids for images

In [11]:
vision_month_uuids = sample_600[['month', 'vision_uuid']]
vision_month_uuids

Unnamed: 0,month,vision_uuid
141,2024-02-01_2024-02-29,0f127b67-dcde-4a2e-8b83-6ff09bf824f9
1656,2023-11-01_2023-11-30,36c8e59c-e3b8-45c9-9fe0-1368e6302ac4
485,2023-12-01_2023-12-31,7d63aa64-b321-4a38-9924-f65852030006
1812,2023-05-01_2023-05-31,6087f017-588a-44a9-acec-60b509fabbfb
2417,2024-03-01_2024-03-31,28d1daaa-abd4-450d-ac5e-f79f45ef509e
...,...,...
677,2024-04-01_2024-04-30,b78d5a62-e7d9-408c-b3bd-ebff6bf94952
2485,2024-03-01_2024-03-31,d0a077fd-f0fd-4b33-90d8-20c0d94faf7e
461,2023-12-01_2023-12-31,b77841fa-9579-4e70-a7e4-011f4f88400a
1741,2023-11-01_2023-11-30,a3ceb4bf-dbd1-4a57-bd99-526e47092e61


#### Create a list of image paths we sampled

In [12]:
from tqdm import tqdm

existing_files = []

for index, row in tqdm(vision_month_uuids.iterrows(), total=vision_month_uuids.shape[0]):
    month = row['month']
    uuid = row['vision_uuid']
    for dir_path in sorted_month_dir_paths:
        if month in str(dir_path):
            file_path = dir_path / f"{uuid}.jpg"
            if file_path.exists():
                existing_files.append(file_path)

100%|██████████| 600/600 [00:02<00:00, 296.58it/s]


#### Copy the images to a new directory which we will use for annotation
new image directory= "../human_annotation/images"

In [13]:
import shutil
from tqdm import tqdm

destination_dir = Path("../human_annotation/images")
# delete the directory if it exists
if destination_dir.exists():
    shutil.rmtree(destination_dir)
    print(f"Deleted {destination_dir}")
destination_dir.mkdir(parents=True, exist_ok=True)

for file_path in tqdm(existing_files, desc="Copying files"):
    destination = destination_dir / file_path.name
    shutil.copy(file_path, destination)

Deleted ../human_annotation/images


Copying files: 100%|██████████| 600/600 [00:17<00:00, 34.28it/s]


In [15]:
import glob

all_image_paths = glob.glob(str(destination_dir / "*.jpg"))
print(f"Total number of image files in path {destination_dir}: {len(all_image_paths)}")

Total number of image files in path ../human_annotation/images: 600


## Image Frame Annotation

Using pigeonXT
Source: https://github.com/dennisbakhuis/pigeonXT 

In [None]:
def custom_display(html_content):
    html_string = str(html_content)

    match = re.search(r'<div>(.*?)</div>', html_string)
    if match:
        image_path = match.group(1)
        # resize the image to 800x800
        resized_img = Image(filename=image_path, width=448, height=448)
        display(resized_img)
        # display(Image(filename=image_path))
    else:
        print("No image path found in HTML content")


image_annotations = pixt.annotate(
    all_image_paths,
    display_fn=custom_display,
    task_type='multilabel-classification',
    options=frames_list, # to do: how to display full options. it gets cut off for long options
)

HTML(value='0 of 200 Examples annotated, Current Position: 0 ')

VBox(children=(HBox(children=(ToggleButton(value=False, description='quality of life'), ToggleButton(value=Fal…

Output()

In [32]:
FRAMES = """
    1: Economic - costs, benefits, or other financial implications,
    2: Capacity and resources - availability of physical, human, or financial resources, and capacity of current systems, 
    3: Morality - religious or ethical implications,
    4: Fairness and equality - balance or distribution of rights, responsibilities, and resources,
    5: Legality, constitutionality and jurisprudence - rights, freedoms, and authority of individuals, corporations, and government,
    6: Policy prescription and evaluation - discussion of specific policies aimed at addressing problems
    7: Crime and punishment - effectiveness and implications of laws and their enforcement,
    8: Security and defense - threats to welfare of the individual, community, or nation,
    9: Health and safety - health care, sanitation, public safety,
    10: Quality of life - threats and opportunities for the individual's wealth, happiness, and well-being,
    11: Cultural identity - traditions, customs, or values of a social group in relation to a policy issue,
    12: Public opinion - attitudes and opinions of the general public, including polling and demographics,
    13: Political - considerations related to politics and politicians, including lobbying, elections, and attempts to sway voters,
    14: External regulation and reputation - international reputation or foreign policy of the U.S,
    15: Other - any coherent group of frames not covered by the above categories."""

#### Image_annotation saved to a csv file

In [None]:
image_annotations_file = '../human_annotation/image_annotations_maria.csv'
#delete the file if it exists
if Path(image_annotations_file).exists():
    Path(image_annotations_file).unlink()
    print(f"Existed!Deleted {image_annotations_file}")
    
image_annotations.to_csv(image_annotations_file, index=False)
print(f"Saved {image_annotations_file}")

Existed!Deleted ../human_annotation/image_annotations_srishti.csv
Saved ../human_annotation/image_annotations_srishti.csv


### Accuracy

In [35]:
# from pathlib import Path
import pandas as pd


filename = Path('../human_annotation/image_annotations_srishti.csv')
filename_df = pd.read_csv(filename)
filename_df = filename_df.iloc[0:100]

Get frames per uuid for annotation

In [36]:
# filename_df = filename_df.drop(columns=['changed'])
filename_df['labels'] = filename_df.apply(lambda row: [col for col in filename_df.columns[2:] if row[col] == True], axis=1)
filename_df['uuid'] = filename_df['example'].apply(lambda x: x.split('/')[-1].split('.')[0])
filename_df.head()

Unnamed: 0,example,changed,quality of life,capacity and resources,political,security and defense,policy prescription and evaluation,fairness and equality,other,health and safety,"legality, constitutionality and jurisprudence",morality,economic,public opinion,cultural identity,external regulation and reputation,crime and punishment,labels,uuid
0,../human_annotation/images/17e639d0-f9c1-4ce8-...,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,[quality of life],17e639d0-f9c1-4ce8-ad56-c74cd4cd1165
1,../human_annotation/images/cdc09317-a5f2-4af7-...,True,False,False,True,True,True,False,False,False,False,False,False,False,False,False,False,"[political, security and defense, policy presc...",cdc09317-a5f2-4af7-9fc3-a4f3cae0c1d0
2,../human_annotation/images/c69cfe52-e928-4dc3-...,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,[quality of life],c69cfe52-e928-4dc3-b36b-cf2c9e081e0f
3,../human_annotation/images/1980de34-e867-4b9b-...,True,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,[political],1980de34-e867-4b9b-9eed-113a831f0266
4,../human_annotation/images/0416eb08-005f-44ba-...,True,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,[cultural identity],0416eb08-005f-44ba-aec8-ca0ee5e3faaa


In [None]:
merged_df = sample_200.iloc[:100][['vision_uuid', 'vision_frame-name']].merge(filename_df[['uuid', 'labels']], left_on='vision_uuid', right_on='uuid')
merged_df.head(20)

Unnamed: 0,vision_uuid,vision_frame-name,uuid,labels
0,0416eb08-005f-44ba-aec8-ca0ee5e3faaa,health and safety,0416eb08-005f-44ba-aec8-ca0ee5e3faaa,[cultural identity]
1,ea16e806-d15f-41ba-9146-064f2c8f7ece,morality,ea16e806-d15f-41ba-9146-064f2c8f7ece,[quality of life]
2,89e378c2-efce-4100-8ce0-736f326280e8,economic,89e378c2-efce-4100-8ce0-736f326280e8,[political]
3,9ebdbfbc-cd8f-4c8b-b85f-4cf27e2b1196,political,9ebdbfbc-cd8f-4c8b-b85f-4cf27e2b1196,"[fairness and equality, public opinion]"
4,34313b5b-991b-4566-9b91-e0064d6aa1c4,economic,34313b5b-991b-4566-9b91-e0064d6aa1c4,"[fairness and equality, legality, constitution..."
5,cb5f118c-0231-4383-9832-aa9ffedced8d,economic,cb5f118c-0231-4383-9832-aa9ffedced8d,[cultural identity]
6,b17e6119-f679-4c01-b615-4b18b29e8525,economic,b17e6119-f679-4c01-b615-4b18b29e8525,[quality of life]
7,cf474ef5-65bb-43e0-af43-dc9fafe352ed,cultural identity,cf474ef5-65bb-43e0-af43-dc9fafe352ed,[quality of life]
8,078141aa-ed63-4c85-a8a9-417b004e4381,cultural identity,078141aa-ed63-4c85-a8a9-417b004e4381,[other]
9,43bf4fb8-0f60-4e16-9c6f-aff3af243d69,"legality, constitutionality and jurisprudence",43bf4fb8-0f60-4e16-9c6f-aff3af243d69,[other]


Accuracy

In [39]:
accuracy = 0
for idx, row in merged_df.iterrows():
    if row['vision_frame-name'] in row['labels']:
        accuracy += 1
    
print(f"Accuracy: {accuracy/len(merged_df)}")

Accuracy: 0.14583333333333334


-----------------------------------------

## Text Frame Annotations

#### Make list of headlines from the sampled data

In [None]:
samples_txt = pd.read_csv("../human_annotation/sampled_annotated_articles_200.csv") # add your file path here
sample_headlines = samples_txt['title'].tolist()

#### Text frame annotation

In [None]:
text_annotations = pixt.annotate(
    sample_headlines,
    options=frames_list,
    reset_buttons_after_click=True
)

#### Save text annotations to a csv file

In [None]:
text_annotations_file = "../human_annotation/text_annotations.csv"
#delete the file if it exists
if Path(text_annotations_file).exists():
    Path(text_annotations_file).unlink()
    print(f"Existed!Deleted {text_annotations_file}")
    
text_annotations.to_csv(text_annotations_file, index=False)
print(f"Saved {text_annotations_file}")
