In [1]:
import os
import pandas as pd
from pathlib import Path

In [2]:
# Load the model's country predictions (JSON Lines: one JSON record per line).
classification_output = Path("/home/vsl333/cultural_values/datasets/cvqa/images/output.jsonl")

# Fail fast with a clear message: depending on the pandas version, read_json can
# treat a non-existent path as literal JSON text and raise a confusing parse
# error instead of reporting the missing file.
if not classification_output.is_file():
    raise FileNotFoundError(f"Classification output not found: {classification_output}")

output_df = pd.read_json(classification_output, lines=True)

# Columns the later cells in this notebook index into.
missing_columns = {"first-choice", "gt-country", "image_path"} - set(output_df.columns)
assert not missing_columns, f"output.jsonl is missing columns: {sorted(missing_columns)}"

In [3]:
# Preview the first five prediction records.
output_df.head(n=5)

Unnamed: 0,caption,first-choice,first-choice-reason,second-choice,second-choice-reason,third-choice,third-choice-reason,fourth-choice,fourth-choice-reason,fifth-choice,fifth-choice-reason,image_path,gt-country
0,"A colorful display of food items, including ca...",United States,The variety of food items and the colorful pre...,Canada,Canadian markets and stores also have a reputa...,United Kingdom,British markets and stores often showcase a wi...,Germany,German markets and stores are known for their ...,Australia,Australian markets and stores offer a diverse ...,/home/vsl333/cultural_values/datasets/cvqa/ima...,Ecuador
1,"A plate of food with a lobster and rice, serve...",United States,The presence of lobster and rice as a dish is ...,Canada,"Lobster is a popular dish in Canada, and the p...",Australia,"Lobster is a popular dish in Australia, and th...",New Zealand,"Lobster is a popular dish in New Zealand, and ...",China,"Lobster is a popular dish in China, and the pr...",/home/vsl333/cultural_values/datasets/cvqa/ima...,Ecuador
2,A large white building with a clock tower and ...,Spain,"The architecture and design of the building, a...",Italy,"The architecture and design of the building, a...",France,"The architecture and design of the building, a...",Germany,"The architecture and design of the building, a...",Switzerland,"The architecture and design of the building, a...",/home/vsl333/cultural_values/datasets/cvqa/ima...,Ecuador
3,The image is from a country with a diverse pop...,France,The architecture and clothing style of the peo...,Italy,The presence of a church and a clock tower in ...,Spain,The traditional clothing of the people and the...,Germany,The architecture and the presence of a clock t...,Switzerland,The presence of a church and the clock tower s...,/home/vsl333/cultural_values/datasets/cvqa/ima...,Ecuador
4,A large group of people standing on a soccer f...,Mexico,The presence of a large group of people in red...,Brazil,Brazil is known for its soccer culture and pas...,Argentina,Argentina is another country with a strong soc...,Spain,Spain is a country with a rich soccer history ...,Italy,Italy is a country with a strong soccer cultur...,/home/vsl333/cultural_values/datasets/cvqa/ima...,Ecuador


In [4]:
# Per-country accuracy of the predicted country against the ground-truth country.
# A row counts as correct when its ground truth appears among the model's first
# TOP_K ranked choices; TOP_K = 1 keeps the strict first-choice comparison.
TOP_K = 1
CHOICE_COLUMNS = [
    "first-choice",
    "second-choice",
    "third-choice",
    "fourth-choice",
    "fifth-choice",
]


def compute_country_accuracy(predictions, top_k=1):
    """Score country predictions separately for each ground-truth country.

    Parameters
    ----------
    predictions : pd.DataFrame
        Must contain "gt-country", "image_path" and the first ``top_k``
        columns listed in CHOICE_COLUMNS.
    top_k : int, default 1
        Number of ranked choices (starting from "first-choice") in which the
        ground-truth country may appear for a row to count as correct.

    Returns
    -------
    tuple[dict, dict]
        ``(accuracy, accurate_paths)``: country -> fraction of correct rows,
        and country -> list of "image_path" values of the correct rows.
        Countries are keyed in order of first appearance in ``predictions``.

    Raises
    ------
    ValueError
        If ``top_k`` is not between 1 and ``len(CHOICE_COLUMNS)``.
    """
    if not 1 <= top_k <= len(CHOICE_COLUMNS):
        raise ValueError(
            f"top_k must be between 1 and {len(CHOICE_COLUMNS)}, got {top_k}"
        )

    # Rows without a ground-truth label cannot be scored. Keeping them would
    # create a NaN "country" that matches no rows and divides by zero.
    labelled = predictions.dropna(subset=["gt-country"])
    truth = labelled["gt-country"]

    # OR together one equality test per allowed choice column.
    is_hit = pd.Series(False, index=labelled.index)
    for choice_column in CHOICE_COLUMNS[:top_k]:
        is_hit |= labelled[choice_column] == truth

    # Attach the mask positionally so the result does not depend on index labels.
    scored = labelled.assign(_is_hit=is_hit.to_numpy())

    accuracy_by_country = {}
    paths_by_country = {}
    # sort=False keeps countries in order of first appearance.
    for country, rows in scored.groupby("gt-country", sort=False):
        correct = rows[rows["_is_hit"]]
        accuracy_by_country[country] = len(correct) / len(rows)
        paths_by_country[country] = correct["image_path"].tolist()
    return accuracy_by_country, paths_by_country


accuracy, accurate_img_paths = compute_country_accuracy(output_df, top_k=TOP_K)


In [5]:
# Persist the correctly-classified image paths: one CSV row per country, with
# the country name as the row label and one column per image path.
accurate_img_paths_df = pd.DataFrame.from_dict(
    data=accurate_img_paths,
    orient="index",
)
accurate_img_paths_df.to_csv(path_or_buf="accurate_img_paths.csv")

In [6]:
# Preview the first five rows of the per-country path table.
accurate_img_paths_df.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18
Ecuador,/home/vsl333/cultural_values/datasets/cvqa/ima...,/home/vsl333/cultural_values/datasets/cvqa/ima...,/home/vsl333/cultural_values/datasets/cvqa/ima...,,,,,,,,,,,,,,,,
Brazil,/home/vsl333/cultural_values/datasets/cvqa/ima...,/home/vsl333/cultural_values/datasets/cvqa/ima...,/home/vsl333/cultural_values/datasets/cvqa/ima...,/home/vsl333/cultural_values/datasets/cvqa/ima...,/home/vsl333/cultural_values/datasets/cvqa/ima...,/home/vsl333/cultural_values/datasets/cvqa/ima...,/home/vsl333/cultural_values/datasets/cvqa/ima...,/home/vsl333/cultural_values/datasets/cvqa/ima...,/home/vsl333/cultural_values/datasets/cvqa/ima...,/home/vsl333/cultural_values/datasets/cvqa/ima...,/home/vsl333/cultural_values/datasets/cvqa/ima...,/home/vsl333/cultural_values/datasets/cvqa/ima...,/home/vsl333/cultural_values/datasets/cvqa/ima...,/home/vsl333/cultural_values/datasets/cvqa/ima...,/home/vsl333/cultural_values/datasets/cvqa/ima...,/home/vsl333/cultural_values/datasets/cvqa/ima...,/home/vsl333/cultural_values/datasets/cvqa/ima...,/home/vsl333/cultural_values/datasets/cvqa/ima...,/home/vsl333/cultural_values/datasets/cvqa/ima...
Ethiopia,/home/vsl333/cultural_values/datasets/cvqa/ima...,/home/vsl333/cultural_values/datasets/cvqa/ima...,/home/vsl333/cultural_values/datasets/cvqa/ima...,/home/vsl333/cultural_values/datasets/cvqa/ima...,/home/vsl333/cultural_values/datasets/cvqa/ima...,/home/vsl333/cultural_values/datasets/cvqa/ima...,/home/vsl333/cultural_values/datasets/cvqa/ima...,/home/vsl333/cultural_values/datasets/cvqa/ima...,/home/vsl333/cultural_values/datasets/cvqa/ima...,/home/vsl333/cultural_values/datasets/cvqa/ima...,,,,,,,,,


In [7]:
# Summary of the scoring: number of countries, correctly-classified image
# count per country, then the accuracy per country.
print(f"Number of directories: {len(accurate_img_paths)}")
# number of images per country
for country, paths in accurate_img_paths.items():
    print(f"{country}: {len(paths)}")

print("----------------")
# Report accuracies in alphabetical country order. A bare `sorted(...)`
# statement returns a new list and leaves the dict untouched, so the sorted
# names have to drive the loop for the ordering to take effect.
for country in sorted(accuracy):
    print(f"{country}: {accuracy[country]}")

Number of directories: 3
Ecuador: 3
Brazil: 19
Ethiopia: 10
----------------
Ecuador: 0.00847457627118644
Brazil: 0.06859205776173286
Ethiopia: 0.03278688524590164


- If I choose top 1, it's only India, Mexico, Nigeria, USA and China
- We deleted South Korea and Colombia because they had 0 files

### Copy the accurate data

In [9]:
import shutil

# Destination root for the correctly-classified images: an "accurate_cvqa"
# folder three directory levels above output.jsonl.
classified_dir = classification_output.parents[2] / "accurate_cvqa"

# Always start from an empty directory so a rerun never keeps stale copies.
if classified_dir.exists():
    shutil.rmtree(classified_dir)
classified_dir.mkdir(parents=True)
print(f"Created directory: {classified_dir}")

Created directory: /home/vsl333/cultural_values/datasets/accurate_cvqa


In [11]:
import shutil

# Mirror every correctly-classified image into <classified_dir>/<country>/.
# Each country folder is recreated from scratch before its files are copied.
for country, paths in accurate_img_paths.items():
    country_dir = classified_dir / country
    if country_dir.exists():
        shutil.rmtree(country_dir)
    country_dir.mkdir(parents=True)

    for path in paths:
        shutil.copy(path, country_dir)
        print(f"Copied: {path}")

print("Done!")

Copied: /home/vsl333/cultural_values/datasets/cvqa/images/Ecuador/5865926974272589288_1.png
Copied: /home/vsl333/cultural_values/datasets/cvqa/images/Ecuador/5865926974272589288_0.png
Copied: /home/vsl333/cultural_values/datasets/cvqa/images/Ecuador/5865926944276574910_0.png
Copied: /home/vsl333/cultural_values/datasets/cvqa/images/Brazil/5865921714279520278_2.png
Copied: /home/vsl333/cultural_values/datasets/cvqa/images/Brazil/5865921724278042693_1.png
Copied: /home/vsl333/cultural_values/datasets/cvqa/images/Brazil/5865921724277985415_2.png
Copied: /home/vsl333/cultural_values/datasets/cvqa/images/Brazil/5865921724277985415_0.png
Copied: /home/vsl333/cultural_values/datasets/cvqa/images/Brazil/5865921734273472362_0.png
Copied: /home/vsl333/cultural_values/datasets/cvqa/images/Brazil/5865921724272219833_0.png
Copied: /home/vsl333/cultural_values/datasets/cvqa/images/Brazil/5865921724273769462_2.png
Copied: /home/vsl333/cultural_values/datasets/cvqa/images/Brazil/5865921724271622539_0.

In [7]:


# NOTE(review): disabled sanity check, kept for reference only. It refers to
# `classification_dir`, which is not defined anywhere in the visible notebook;
# presumably `classified_dir` (the copy destination) was intended. Confirm
# before re-enabling.
# # count files in each country directory in the classification dir
# country_dirs = [d for d in os.listdir(classification_dir) if os.path.isdir(classification_dir / d)]
# country_dir_counts = {}
# for country_dir in country_dirs:
#     country_dir_path = classification_dir / country_dir
#     # count image files
#     country_dir_counts[country_dir] = len([f for f in os.listdir(country_dir_path) if os.path.isfile(country_dir_path / f)])

# print("Total: ", sum(country_dir_counts.values()))
# print("Number of files in each country directory")
# for country, count in country_dir_counts.items():
#     print(f"{country}: {count}")

