In [6]:
import os
import pandas as pd
from pathlib import Path

In [7]:
# JSON-lines file holding the classification results (presumably one record per image).
classification_output = Path(
    "/home/vsl333/cultural_values/datasets/dollarstreet_accurate_images/output.jsonl"
)
# lines=True: every line of the file is parsed as its own JSON object / DataFrame row.
output_df = pd.read_json(path_or_buf=classification_output, lines=True)

In [8]:
# Preview the first five rows of the classification results.
output_df.head(n=5)

Unnamed: 0,caption,first-choice,first-choice-reason,second-choice,second-choice-reason,third-choice,third-choice-reason,fourth-choice,fourth-choice-reason,fifth-choice,fifth-choice-reason,image_path,gt-country
0,A man and a woman posing for a picture in fron...,Mexico,The man and woman are of dark skin and are wea...,Brazil,The man and woman are of dark skin and are wea...,Argentina,The man and woman are of dark skin and are wea...,Colombia,The man and woman are of dark skin and are wea...,Peru,The man and woman are of dark skin and are wea...,/projects/belongielab/people/vsl333/ds/dollars...,Brazil
1,"A man, woman, and child are sitting outside a ...",Mexico,"The man is wearing a sombrero, which is a trad...",Brazil,"The man is giving a thumbs up, which is a comm...",Argentina,The graffiti on the building is a common sight...,Colombia,"The man is wearing a sombrero, which is a trad...",Peru,"The man is giving a thumbs up, which is a comm...",/projects/belongielab/people/vsl333/ds/dollars...,Brazil
2,Two young men standing in front of a yellow bu...,Mexico,The color of the building and the clothing of ...,Brazil,Brazil is known for its vibrant colors and div...,Argentina,Argentina is known for its European influence ...,Colombia,Colombia is known for its diverse population a...,Venezuela,Venezuela is known for its diverse population ...,/projects/belongielab/people/vsl333/ds/dollars...,Brazil
3,A man and woman hugging each other in a room w...,Mexico,The woman is wearing a ring on her finger.,Brazil,The man is wearing glasses.,Argentina,The man is wearing a green shirt.,Colombia,The woman has brown hair.,Peru,The man is wearing a ring on his finger.,/projects/belongielab/people/vsl333/ds/dollars...,Brazil
4,"A group of people posing with a dog and a cat,...",Mexico,"The presence of a dog and a cat, as well as th...",Brazil,The diverse group of people and the casual set...,Argentina,The diverse group of people and the casual set...,Colombia,The diverse group of people and the casual set...,Peru,The diverse group of people and the casual set...,/projects/belongielab/people/vsl333/ds/dollars...,Brazil


### Top 3 accuracy

In [14]:
# Top-3 accuracy: a row counts as correct when its ground-truth country equals
# the first, second, or third choice. Computed per distinct ground-truth country.

# Missing labels are dropped up front: a NaN/None label matches no row, so its
# subset would be empty and the division below would raise ZeroDivisionError.
uniue_gt_countries = output_df["gt-country"].dropna().unique()

accuracy = {}  # country -> fraction of its rows with a top-3 hit
accurate_img_paths = {}  # country -> image paths of the rows with a top-3 hit
for gt_country in uniue_gt_countries:
    country_data = output_df[output_df["gt-country"] == gt_country]
    # Row-wise "any of the three choice columns equals the ground truth".
    top3_choices = country_data[["first-choice", "second-choice", "third-choice"]]
    correct_predictions = country_data[top3_choices.eq(gt_country).any(axis=1)]
    accuracy[gt_country] = len(correct_predictions) / len(country_data)
    accurate_img_paths[gt_country] = correct_predictions["image_path"].tolist()


In [17]:
print(f"Number of directories: {len(accurate_img_paths)}")
# Number of top-3-correct images per ground-truth country (values of accurate_img_paths).
for country, value in accurate_img_paths.items():
    print(f"{country}: {len(value)}")

print("*"*50)
# Top-3 accuracy per ground-truth country, in the dict's insertion order.
# A bare `sorted(accurate_img_paths.keys())` used to sit here; its result was
# discarded, so it never changed the print order and has been removed.
for country, value in accuracy.items():
    print(f"{country}: {value}")

Number of directories: 20
Brazil: 7
Bangladesh: 6
Vietnam: 3
India: 44
Mexico: 4
France: 0
Nigeria: 6
Italy: 1
Spain: 1
Colombia: 0
South Korea: 0
Pakistan: 8
Tunisia: 0
Ghana: 1
Kenya: 2
Nepal: 1
South Africa: 5
United States: 7
China: 24
Iran: 2
**************************************************
Brazil: 1.0
Bangladesh: 1.0
Vietnam: 0.75
India: 1.0
Mexico: 1.0
France: 0.0
Nigeria: 0.75
Italy: 1.0
Spain: 0.2
Colombia: 0.0
South Korea: 0.0
Pakistan: 1.0
Tunisia: 0.0
Ghana: 0.3333333333333333
Kenya: 1.0
Nepal: 1.0
South Africa: 1.0
United States: 0.7
China: 1.0
Iran: 1.0


### Top 2 accuracy

In [16]:
# Top-2 accuracy: a row counts as correct when its ground-truth country equals
# the first or second choice. Computed per distinct ground-truth country.

# Missing labels are dropped up front: a NaN/None label matches no row, so its
# subset would be empty and the division below would raise ZeroDivisionError.
uniue_gt_countries = output_df["gt-country"].dropna().unique()

accuracy_top2 = {}  # country -> fraction of its rows with a top-2 hit
accurate_img_paths_top2 = {}  # country -> image paths of the rows with a top-2 hit
for gt_country in uniue_gt_countries:
    country_data_top2 = output_df[output_df["gt-country"] == gt_country]
    # Row-wise "either of the two choice columns equals the ground truth".
    top2_choices = country_data_top2[["first-choice", "second-choice"]]
    correct_predictions_top2 = country_data_top2[top2_choices.eq(gt_country).any(axis=1)]
    accuracy_top2[gt_country] = len(correct_predictions_top2) / len(country_data_top2)
    accurate_img_paths_top2[gt_country] = correct_predictions_top2["image_path"].tolist()

print(f"Number of directories: {len(accurate_img_paths_top2)}")
# Number of top-2-correct images per ground-truth country, in insertion order.
# (A bare `sorted(...)` call whose result was discarded has been removed.)
for country, value in accurate_img_paths_top2.items():
    print(f"{country}: {len(value)}")
print("*"*50)

# Top-2 accuracy per ground-truth country.
for country, value in accuracy_top2.items():
    print(f"{country}: {value}")


Number of directories: 20
Brazil: 7
Bangladesh: 0
Vietnam: 1
India: 44
Mexico: 4
France: 0
Nigeria: 6
Italy: 0
Spain: 1
Colombia: 0
South Korea: 0
Pakistan: 8
Tunisia: 0
Ghana: 1
Kenya: 0
Nepal: 0
South Africa: 5
United States: 7
China: 24
Iran: 1
**************************************************
Brazil: 1.0
Bangladesh: 0.0
Vietnam: 0.25
India: 1.0
Mexico: 1.0
France: 0.0
Nigeria: 0.75
Italy: 0.0
Spain: 0.2
Colombia: 0.0
South Korea: 0.0
Pakistan: 1.0
Tunisia: 0.0
Ghana: 0.3333333333333333
Kenya: 0.0
Nepal: 0.0
South Africa: 1.0
United States: 0.7
China: 1.0
Iran: 0.5


- If I choose top 1, it's only India, Mexico, Nigeria, USA and China
- We deleted South Korea and Colombia because they had 0 files

In [11]:
# Directory that contains the classification output file.
classification_dir = classification_output.parent
print(classification_dir)

# Names of the sub-directories found directly inside the classification dir.
country_dirs = list(
    filter(lambda d: os.path.isdir(classification_dir / d), os.listdir(classification_dir))
)

# Map each sub-directory name to the number of regular files directly inside it.
# Every regular file is counted; there is no filtering by extension.
country_dir_counts = {}
for country_dir in country_dirs:
    country_dir_path = classification_dir / country_dir
    country_dir_counts[country_dir] = sum(
        1 for f in os.listdir(country_dir_path) if os.path.isfile(country_dir_path / f)
    )

print("Total: ", sum(country_dir_counts.values()))
print("Number of files in each country directory")
for country, count in country_dir_counts.items():
    print(f"{country}: {count}")



/home/vsl333/cultural_values/datasets/dollarstreet_accurate_images
Total:  137
Number of files in each country directory
Brazil: 7
Bangladesh: 7
Vietnam: 4
India: 44
Mexico: 4
France: 1
Nigeria: 8
Italy: 1
Spain: 5
South Korea: 1
Pakistan: 8
Ghana: 3
Kenya: 2
Nepal: 1
South Africa: 5
United States: 10
China: 24
Iran: 2
