In [48]:
import os
import pandas as pd
from pathlib import Path

In [49]:
# Output files from earlier runs, kept for quick switching:
# classification_output = Path("/home/vsl333/cultural_values/datasets/cvqa_images/output.jsonl")
# classification_output = Path("/home/vsl333/cultural_values/datasets/cvqa_images/llava-1.5-7b-hf_output.jsonl")

# JSON Lines file analysed in this notebook
# (the file name suggests LLaVA-1.5-13B output -- TODO confirm).
classification_output = Path(
    "/home/vsl333/cultural_values/datasets/cvqa_images/llava-1.5-13b-hf_output.jsonl"
)

# lines=True: every line of the file is parsed as one JSON record.
output_df = pd.read_json(path_or_buf=classification_output, lines=True)

In [None]:
# Show the first two records as a quick check of the parsed columns.
output_df.iloc[:2]

In [50]:
# Drop the ROWS (not columns) whose image lives in an excluded category folder.
# The category is taken as the second-to-last "/"-separated component of
# image_path, i.e. the directory that directly contains the image file.
excluded_categories = ['plants_and_animals', 'vehicle_and_transportation']
image_category = output_df['image_path'].map(lambda path: path.split("/")[-2])
in_excluded_category = image_category.isin(excluded_categories)
selected_df = output_df[~in_excluded_category]

# Additionally drop every row whose ground-truth country contains "South Korea".
is_south_korea = selected_df['gt-country'].str.contains("South Korea")
selected_df = selected_df[~is_south_korea]

# Report how many rows the two filters removed, then preview the result.
print(f"Before: {output_df.shape}, After: {selected_df.shape}")
selected_df.head(2)

Before: (105, 5), After: (79, 5)


Unnamed: 0,caption,top-choice,top-choice-reason,image_path,gt-country
1,"The image features a parade float with lions, which is a common symbol in Brazilian culture. The presence of a large crowd watching the parade also suggests that this event is taking place in Brazil.",Brazil,The lions on the float and the large crowd watching the parade are indicative of Brazilian culture and traditions.,/home/vsl333/cultural_values/datasets/cvqa_images/Brazil/tradition/carnaval.jpg,Brazil
2,A group of men wearing green and yellow jerseys holding a trophy and a banner that says 'World League 2008 Champions',Brazil,"The men are wearing green and yellow jerseys, which are the colors of the Brazilian national soccer team. The banner also indicates that they won the World League in 2008.",/home/vsl333/cultural_values/datasets/cvqa_images/Brazil/sports_and_recreation/football.jpg,Brazil


In [51]:
# Distinct ground-truth countries left after filtering.
# For comparison, Dollarstreet has: Brazil, Bangladesh, India, Nigeria, Pakistan,
# South Africa, United States, China
pd.unique(selected_df['gt-country'])

array(['Brazil', 'Vietnam', 'Mexico', 'France', 'Nigeria', 'Italy',
       'Pakistan', 'South Africa', 'United States', 'China'], dtype=object)

In [52]:
# Label every image "Correct" when the model's top choice equals the
# ground-truth country and "Incorrect" otherwise.
# NOTE: only the single `top-choice` value is compared; no top-3 matching
# happens here.

unique_gt_countries = selected_df["gt-country"].unique()  # not used further in this cell

# Element-wise comparison: one boolean per row, aligned on selected_df's index.
is_correct = selected_df["gt-country"] == selected_df["top-choice"]

# Build the result column-wise instead of through a dict keyed by index label:
#   * rows that share an index label are all kept (dict keys would overwrite),
#   * an empty input still yields the four expected columns, so later cells
#     that sort or filter on them keep working.
accurate_img_df = (
    selected_df[["image_path", "gt-country", "top-choice"]]
    .rename(columns={"gt-country": "gt_country"})
    .assign(prediction=is_correct.map({True: "Correct", False: "Incorrect"}))
    .reindex(columns=["image_path", "prediction", "gt_country", "top-choice"])
)

In [53]:
# Widen the column display limit to 200 characters so long image paths are
# less likely to be cut off when frames are displayed.
pd.options.display.max_colwidth = 200

# Order rows by image path.
accurate_img_df = accurate_img_df.sort_values('image_path')

print(f"shape: {accurate_img_df.shape}")
accurate_img_df.iloc[:3]

shape: (79, 4)


Unnamed: 0,image_path,prediction,gt_country,top-choice
9,/home/vsl333/cultural_values/datasets/cvqa_images/Brazil/Brands/guarana.jpg,Correct,Brazil,Brazil
8,/home/vsl333/cultural_values/datasets/cvqa_images/Brazil/Cooking_and_food/churrasco.jpg,Correct,Brazil,Brazil
7,/home/vsl333/cultural_values/datasets/cvqa_images/Brazil/Geography/cidade_maravilhosa.jpg,Correct,Brazil,Brazil


### Find all correct predictions

In [54]:
# Keep only the rows labelled "Correct" in the prediction column.
correct_df = accurate_img_df.loc[accurate_img_df["prediction"].eq("Correct")]

print(f"shape: {correct_df.shape}")
correct_df.iloc[:10]

shape: (70, 4)


Unnamed: 0,image_path,prediction,gt_country,top-choice
9,/home/vsl333/cultural_values/datasets/cvqa_images/Brazil/Brands/guarana.jpg,Correct,Brazil,Brazil
8,/home/vsl333/cultural_values/datasets/cvqa_images/Brazil/Cooking_and_food/churrasco.jpg,Correct,Brazil,Brazil
7,/home/vsl333/cultural_values/datasets/cvqa_images/Brazil/Geography/cidade_maravilhosa.jpg,Correct,Brazil,Brazil
6,/home/vsl333/cultural_values/datasets/cvqa_images/Brazil/Objects/gifts.jpg,Correct,Brazil,Brazil
5,/home/vsl333/cultural_values/datasets/cvqa_images/Brazil/People_and_everyday_life/lula-parade-1024x683-1.jpeg,Correct,Brazil,Brazil
2,/home/vsl333/cultural_values/datasets/cvqa_images/Brazil/sports_and_recreation/football.jpg,Correct,Brazil,Brazil
1,/home/vsl333/cultural_values/datasets/cvqa_images/Brazil/tradition/carnaval.jpg,Correct,Brazil,Brazil
104,/home/vsl333/cultural_values/datasets/cvqa_images/China/Brands/cigarette.jpg,Correct,China,China
103,/home/vsl333/cultural_values/datasets/cvqa_images/China/Cooking_and_food/breakfast.jpg,Correct,China,China
102,/home/vsl333/cultural_values/datasets/cvqa_images/China/Geography/wall.jpeg,Correct,China,China


### Find all incorrect predictions

In [55]:
# Keep only the rows labelled "Incorrect" in the prediction column.
inaccurate_df = accurate_img_df.loc[accurate_img_df["prediction"].eq("Incorrect")]

print(f"shape: {inaccurate_df.shape}")
# Show at most the last 25 rows.
inaccurate_df.iloc[-25:]

shape: (9, 4)


Unnamed: 0,image_path,prediction,gt_country,top-choice
3,/home/vsl333/cultural_values/datasets/cvqa_images/Brazil/public_figure_and_pop_culture/pele.jpg,Incorrect,Brazil,Nigeria
30,/home/vsl333/cultural_values/datasets/cvqa_images/France/tradition/french-republic.jpg,Incorrect,France,United Kingdom
46,/home/vsl333/cultural_values/datasets/cvqa_images/Nigeria/Brands/dangote-logo.png,Incorrect,Nigeria,United States
44,/home/vsl333/cultural_values/datasets/cvqa_images/Nigeria/Geography/Tinubu_Square.png,Incorrect,Nigeria,France
75,/home/vsl333/cultural_values/datasets/cvqa_images/Pakistan/Cooking_and_food/pakistain-food-beef-nihari.png,Incorrect,Pakistan,France
84,/home/vsl333/cultural_values/datasets/cvqa_images/South Africa/Cooking_and_food/food.jpg,Incorrect,South Africa,Nigeria
82,/home/vsl333/cultural_values/datasets/cvqa_images/South Africa/Objects/south-africa-object-chakalaka.jpg,Incorrect,South Africa,France
81,/home/vsl333/cultural_values/datasets/cvqa_images/South Africa/People_and_everyday_life/people.jpg,Incorrect,South Africa,Nigeria
78,/home/vsl333/cultural_values/datasets/cvqa_images/South Africa/tradition/south-africa-tradition-zulu.jpg,Incorrect,South Africa,Brazil


### Copy each incorrectly predicted image into its own country/category subdirectory

In [56]:
from tqdm import tqdm
import shutil

# Start from an empty export folder so files from a previous run do not linger.
incorrect_predictions_folder = Path("/home/vsl333/cultural_values/datasets/cvqa_incorrect_predictions")
if incorrect_predictions_folder.exists():
    shutil.rmtree(incorrect_predictions_folder)
incorrect_predictions_folder.mkdir(exist_ok=True, parents=True)

# Mirror the source layout below the export folder:
#   <country>/<category>/<image file>
# where country and category are the third- and second-to-last path components.
for raw_image_path in tqdm(inaccurate_df["image_path"], total=len(inaccurate_df)):
    image_path = Path(raw_image_path)
    country_name = image_path.parts[-3]
    category_name = image_path.parts[-2]

    # tqdm.write keeps log lines from being interleaved with the progress bar.
    tqdm.write(f"Copying {image_path} to {country_name}/{category_name}")
    new_folder = incorrect_predictions_folder / country_name / category_name
    new_folder.mkdir(exist_ok=True, parents=True)
    shutil.copy(image_path, new_folder)

    # url.txt sits next to the image (presumably the image's source URL --
    # TODO confirm). A missing file must not abort the export half-way, because
    # the previous export folder has already been deleted above.
    url_file = image_path.parent / "url.txt"
    if url_file.is_file():
        shutil.copy(url_file, new_folder)
    else:
        tqdm.write(f"No url.txt found next to {image_path}; copied the image only")

 78%|███████▊  | 7/9 [00:00<00:00, 64.59it/s]

Copying /home/vsl333/cultural_values/datasets/cvqa_images/Brazil/public_figure_and_pop_culture/pele.jpg to Brazil/public_figure_and_pop_culture
Copying /home/vsl333/cultural_values/datasets/cvqa_images/France/tradition/french-republic.jpg to France/tradition
Copying /home/vsl333/cultural_values/datasets/cvqa_images/Nigeria/Brands/dangote-logo.png to Nigeria/Brands
Copying /home/vsl333/cultural_values/datasets/cvqa_images/Nigeria/Geography/Tinubu_Square.png to Nigeria/Geography
Copying /home/vsl333/cultural_values/datasets/cvqa_images/Pakistan/Cooking_and_food/pakistain-food-beef-nihari.png to Pakistan/Cooking_and_food
Copying /home/vsl333/cultural_values/datasets/cvqa_images/South Africa/Cooking_and_food/food.jpg to South Africa/Cooking_and_food
Copying /home/vsl333/cultural_values/datasets/cvqa_images/South Africa/Objects/south-africa-object-chakalaka.jpg to South Africa/Objects
Copying /home/vsl333/cultural_values/datasets/cvqa_images/South Africa/People_and_everyday_life/people.jpg 

100%|██████████| 9/9 [00:00<00:00, 35.83it/s]
