In [41]:
import os
import pandas as pd

### Read Original Dollar Street Metadata

In [81]:
# Load the Dollar Street image metadata table (one row per asset).
ds_csv_path = "/projects/belongielab/people/vsl333/ds/dollarstreet/images_v2.csv"
ds_data = pd.read_csv(ds_csv_path)

# Print the column labels first, then the (rows, columns) shape.
for table_summary in (ds_data.columns, ds_data.shape):
    print(table_summary)
ds_data.head()

Index(['id', 'country.name', 'country.id', 'region.id', 'type', 'imageRelPath',
       'topics', 'place', 'income'],
      dtype='object')
(38479, 9)


Unnamed: 0,id,country.name,country.id,region.id,type,imageRelPath,topics,place,income
0,5d4be6cecf0b3a0f3f344586,Burundi,bi,af,image,assets/5d4be6cecf0b3a0f3f344586/5d4be6cecf0b3a...,icons,butoyi,26.994581
1,5d4bf31ccf0b3a0f3f359814,Burundi,bi,af,image,assets/5d4bf31ccf0b3a0f3f359814/5d4bf31ccf0b3a...,Family snapshots,butoyi,26.994581
2,5d4bf31ccf0b3a0f3f35982a,Burundi,bi,af,image,assets/5d4bf31ccf0b3a0f3f35982a/5d4bf31ccf0b3a...,Cutlery,butoyi,26.994581
3,5d4bf31ccf0b3a0f3f35982e,Burundi,bi,af,image,assets/5d4bf31ccf0b3a0f3f35982e/5d4bf31ccf0b3a...,Family,butoyi,26.994581
4,5d4bf31ccf0b3a0f3f359830,Burundi,bi,af,image,assets/5d4bf31ccf0b3a0f3f359830/5d4bf31ccf0b3a...,Place where eating dinner,butoyi,26.994581


### Filter Family and Family Snapshots Data

In [43]:
# Keep only the rows whose topic is "Family snapshots" or "Family".
family_topics = ['Family snapshots', 'Family']
is_family_row = ds_data['topics'].isin(family_topics)
ds_data = ds_data[is_family_row]
print(ds_data.shape)
ds_data.head()

(567, 9)


Unnamed: 0,id,country.name,country.id,region.id,type,imageRelPath,topics,place,income
1,5d4bf31ccf0b3a0f3f359814,Burundi,bi,af,image,assets/5d4bf31ccf0b3a0f3f359814/5d4bf31ccf0b3a...,Family snapshots,butoyi,26.994581
3,5d4bf31ccf0b3a0f3f35982e,Burundi,bi,af,image,assets/5d4bf31ccf0b3a0f3f35982e/5d4bf31ccf0b3a...,Family,butoyi,26.994581
84,5fd029591f89a90d2a769d21,Burundi,bi,af,video,assets/5fd029591f89a90d2a769d21/6304933865827c...,Family,butoyi,26.994581
88,5d4bf330cf0b3a0f3f359a20,Burundi,bi,af,image,assets/5d4bf330cf0b3a0f3f359a20/5d4bf330cf0b3a...,Family snapshots,kabura,28.5
102,5d4bf333cf0b3a0f3f359a88,Burundi,bi,af,image,assets/5d4bf333cf0b3a0f3f359a88/5d4bf333cf0b3a...,Family,kabura,28.5


In [44]:
# Display the column labels of ds_data as this cell's output.
ds_data.columns

Index(['id', 'country.name', 'country.id', 'region.id', 'type', 'imageRelPath',
       'topics', 'place', 'income'],
      dtype='object')

### Copy Images: Family, Family Snapshots

In [55]:
from tqdm import tqdm
import shutil

# Copy every asset listed in ds_data into a fresh
# "dollarstreet_family/<country>/" tree and collect one metadata record per
# asset, including whether the copy succeeded and where the copy lives.
ds_folder = "/projects/belongielab/people/vsl333/ds/dollarstreet_family"
base_folder = "/projects/belongielab/people/vsl333/ds/dollarstreet"

# Start from an empty output tree so files from earlier runs cannot linger.
if os.path.exists(ds_folder):
    shutil.rmtree(ds_folder)
os.makedirs(ds_folder, exist_ok=True)

countries = ds_data["country.name"].unique()

# image_metadata is a list with one entry per country; each entry is the list
# of per-asset record dicts for that country.
image_metadata = []
for country in tqdm(countries):
    # ds_folder was just recreated, so every country folder is new here.
    country_folder = f"{ds_folder}/{country}"
    os.makedirs(country_folder, exist_ok=True)

    country_rows = ds_data[ds_data["country.name"] == country]
    country_metadata = []
    for idx, row in country_rows.iterrows():
        image_path = row["imageRelPath"]
        source_path = os.path.join(base_folder, image_path)
        # The copy is flat: only the base filename is kept inside the country
        # folder. Record that exact location; joining country_folder with the
        # full relative path would point at a file that is never created.
        target_path = os.path.join(country_folder, os.path.basename(image_path))

        record = {
            "id": row["id"],
            "country.name": row["country.name"],
            "country.id": row["country.id"],
            "region.id": row["region.id"],
            "type": row["type"],
            "topics": row["topics"],
            "place": row["place"],
            "income": row["income"]
        }

        try:
            shutil.copy(source_path, target_path)
            record["copied"] = True
        except Exception as e:
            # Best effort: report the failure and keep going so that one
            # missing asset does not abort the whole export.
            print(f"Failed to copy image: {image_path} for {country}")
            print(e)
            record["copied"] = False

        # Intended destination, stored for failed copies as well so the
        # missing file can be identified from the metadata.
        record["image-path"] = target_path
        country_metadata.append(record)

    image_metadata.append(country_metadata)


100%|██████████| 63/63 [00:16<00:00,  3.88it/s]


### Save Metadata for Family, Family Snapshots

In [56]:
# Flatten the per-country record lists into a single list, turn it into a
# table and write it to metadata.csv inside ds_folder.
all_records = []
for country_records in image_metadata:
    all_records.extend(country_records)

image_metadata_df = pd.DataFrame(all_records)
family_metadata_csv = f"{ds_folder}/metadata.csv"
image_metadata_df.to_csv(family_metadata_csv, index=False)
print(image_metadata_df.shape)
image_metadata_df.head()

(567, 10)


Unnamed: 0,id,country.name,country.id,region.id,type,topics,place,income,copied,image-path
0,5d4bf31ccf0b3a0f3f359814,Burundi,bi,af,image,Family snapshots,butoyi,26.994581,True,/projects/belongielab/people/vsl333/ds/dollars...
1,5d4bf31ccf0b3a0f3f35982e,Burundi,bi,af,image,Family,butoyi,26.994581,True,/projects/belongielab/people/vsl333/ds/dollars...
2,5fd029591f89a90d2a769d21,Burundi,bi,af,video,Family,butoyi,26.994581,True,/projects/belongielab/people/vsl333/ds/dollars...
3,5d4bf330cf0b3a0f3f359a20,Burundi,bi,af,image,Family snapshots,kabura,28.5,True,/projects/belongielab/people/vsl333/ds/dollars...
4,5d4bf333cf0b3a0f3f359a88,Burundi,bi,af,image,Family,kabura,28.5,True,/projects/belongielab/people/vsl333/ds/dollars...


----------------------------------------------------------------

### Save Metadata for Accurate Photos

In [61]:
# Re-read the full metadata table from disk so ds_data holds every row of
# images_v2.csv again.
full_metadata_csv = "/projects/belongielab/people/vsl333/ds/dollarstreet/images_v2.csv"
ds_data = pd.read_csv(full_metadata_csv)

column_labels = ds_data.columns
table_shape = ds_data.shape
print(column_labels)
print(table_shape)
ds_data.head()

Index(['id', 'country.name', 'country.id', 'region.id', 'type', 'imageRelPath',
       'topics', 'place', 'income'],
      dtype='object')
(38479, 9)


Unnamed: 0,id,country.name,country.id,region.id,type,imageRelPath,topics,place,income
0,5d4be6cecf0b3a0f3f344586,Burundi,bi,af,image,assets/5d4be6cecf0b3a0f3f344586/5d4be6cecf0b3a...,icons,butoyi,26.994581
1,5d4bf31ccf0b3a0f3f359814,Burundi,bi,af,image,assets/5d4bf31ccf0b3a0f3f359814/5d4bf31ccf0b3a...,Family snapshots,butoyi,26.994581
2,5d4bf31ccf0b3a0f3f35982a,Burundi,bi,af,image,assets/5d4bf31ccf0b3a0f3f35982a/5d4bf31ccf0b3a...,Cutlery,butoyi,26.994581
3,5d4bf31ccf0b3a0f3f35982e,Burundi,bi,af,image,assets/5d4bf31ccf0b3a0f3f35982e/5d4bf31ccf0b3a...,Family,butoyi,26.994581
4,5d4bf31ccf0b3a0f3f359830,Burundi,bi,af,image,assets/5d4bf31ccf0b3a0f3f359830/5d4bf31ccf0b3a...,Place where eating dinner,butoyi,26.994581


In [86]:
from glob import glob
import os
import pandas as pd

# Directory whose .jpg files (at any depth) are matched against the metadata.
classification_dir = "/projects/belongielab/people/vsl333/ds/dollarstreet_accurate_images"

# "**" combined with recursive=True makes glob descend into subdirectories.
jpg_pattern = f"{classification_dir}/**/*.jpg"
image_files = glob(jpg_pattern, recursive=True)

file_count = len(image_files)
print("No. of files: ", file_count)
image_files[:5]

No. of files:  137


['/projects/belongielab/people/vsl333/ds/dollarstreet_accurate_images/Brazil/5d4be1bacf0b3a0f3f33b8af.jpg',
 '/projects/belongielab/people/vsl333/ds/dollarstreet_accurate_images/Brazil/5d4be173cf0b3a0f3f33b1ad.jpg',
 '/projects/belongielab/people/vsl333/ds/dollarstreet_accurate_images/Brazil/5d4be193cf0b3a0f3f33b465.jpg',
 '/projects/belongielab/people/vsl333/ds/dollarstreet_accurate_images/Brazil/5d4be163cf0b3a0f3f33afd5.jpg',
 '/projects/belongielab/people/vsl333/ds/dollarstreet_accurate_images/Brazil/5ec4fa8bf0611d7ddd742093.jpg']

In [87]:
import collections
from tqdm import tqdm  # imported here so the cell does not depend on an earlier cell

# Attach an "image_path" ("<parent dir>/<file name>") to every metadata row
# whose id has a matching "<id>.jpg" in image_files, and collect those rows.
ds_csv_path = "/projects/belongielab/people/vsl333/ds/dollarstreet/images_v2.csv"
ds_data = pd.read_csv(ds_csv_path)

# Index the files once by file name without extension. An exact match on the
# name avoids the false positives of a substring test (an id occurring inside
# a directory name or a longer file name) and replaces two scans of
# image_files per metadata row with one dictionary lookup.
relative_path_by_id = {}
for file_path in image_files:
    file_id = os.path.splitext(os.path.basename(file_path))[0]
    relative_path = "/".join(file_path.split("/")[-2:])
    # setdefault keeps the first file seen for an id.
    relative_path_by_id.setdefault(file_id, relative_path)

metadata = []
found_image_files = []
found = 0
not_found = 0
# total= lets tqdm show a real progress bar instead of a bare counter.
for idx, row in tqdm(ds_data.iterrows(), total=len(ds_data)):
    image_path = relative_path_by_id.get(row["id"])
    if image_path is None:
        not_found += 1
        continue
    row["image_path"] = image_path
    found_image_files.append(image_path)
    metadata.append(row)
    found += 1


print("Len of metadata: ", len(metadata))

# The same relative path appearing twice means two metadata rows share an id.
duplicates = [item for item, count in collections.Counter(found_image_files).items() if count > 1]
print("Duplicates: ", duplicates)

# Files on disk whose name matches no metadata id would otherwise be dropped
# without any trace.
files_without_metadata = sorted(set(relative_path_by_id) - set(ds_data["id"]))
print("Files without a metadata row: ", len(files_without_metadata))

38479it [00:02, 17247.35it/s]

Len of metadata:  129
Duplicates:  []





In [88]:
# Build a table from the matched rows and write it to metadata.csv inside
# classification_dir, deleting any file already at that path first.
metadata_csv = f"{classification_dir}/metadata.csv"
metadata_df = pd.DataFrame(metadata)

stale_csv_present = os.path.exists(metadata_csv)
if stale_csv_present:
    os.remove(metadata_csv)
metadata_df.to_csv(metadata_csv, index=False)

# Report the match counters and then the shape of the saved table.
print(f"Found: {found}, Not Found: {not_found}")
print(metadata_df.shape)

Found: 129, Not Found: 38350
(129, 10)
