## Importing the Dollar Street dataset
#### Imports

In [24]:
import os
from PIL import Image
import pandas as pd
import requests
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import datasets, transforms

#### Load CSV
The data was downloaded from Kaggle and extracted with the following commands:
```
kaggle datasets download -d mlcommons/the-dollar-street-dataset -p /mfsnic/u/apouget/data/dollarstreet/
unzip /mfsnic/u/apouget/data/dollarstreet/the-dollar-street-dataset.zip -d /mfsnic/u/apouget/data/dollarstreet
```

In [17]:
# Root folder of the extracted Kaggle archive. The original absolute path is kept
# as the default; set the DOLLARSTREET_DIR environment variable to point the
# notebook at a different copy of the data without editing this cell.
dataset_path = os.environ.get(
    'DOLLARSTREET_DIR',
    '/mfsnic/u/apouget/data/dollarstreet/dataset_dollarstreet/',
)
train_path = os.path.join(dataset_path, 'images_v2_imagenet_train.csv')
test_path = os.path.join(dataset_path, 'images_v2_imagenet_test.csv')

# Load both split CSVs as DataFrames and preview the first training rows.
data_train = pd.read_csv(train_path)
data_test = pd.read_csv(test_path)
display(data_train.head())

Unnamed: 0,id,country.name,country.id,region.id,type,imageRelPath,topics,place,income,imagenet_synonyms,imagenet_sysnet_id
0,5d4be7b3cf0b3a0f3f345ebc,Bangladesh,bd,as,image,assets/5d4be7b3cf0b3a0f3f345ebc/5d4be7b3cf0b3a...,['toilet paper'],paramanik-01,72.104573,['toilet paper'],[999]
1,5ec4f945f0611d7ddd7415eb,India,in,as,image,assets/5ec4f945f0611d7ddd7415eb/5ec4f945f0611d...,['waste dumps'],mukharjee-goshal,5795.0,['trash can'],[412]
2,5d4bee2ecf0b3a0f3f350f2a,Rwanda,rw,af,image,assets/5d4bee2ecf0b3a0f3f350f2a/5d4bee2ecf0b3a...,['shower'],ntambara,72.459433,['shower curtain'],[794]
3,5d4bde6fcf0b3a0f3f33612d,Serbia,rs,eu,image,assets/5d4bde6fcf0b3a0f3f33612d/5d4bde6fcf0b3a...,"['couch', 'sofa']",markovic,1522.0,"['studio couch', 'studio couch']","[831, 831]"
4,5d4be062cf0b3a0f3f33921a,Brazil,br,am,image,assets/5d4be062cf0b3a0f3f33921a/5d4be062cf0b3a...,['oven'],carneiro-dos-santos,1034.0,['stove'],[827]


In [19]:
# Sanity check: open the first training image.
# The relative path is looked up by column name instead of by position, so the
# lookup does not depend on how many columns precede `imageRelPath`.
img_name = os.path.join(dataset_path, data_train["imageRelPath"].iloc[0])
# The context manager closes the underlying file as soon as the RGB copy exists.
with Image.open(img_name) as raw_image:
    image = raw_image.convert("RGB")

#### Create torch dataset

In [28]:
class DollarStreetDataset(Dataset):
    """Torch dataset over one Dollar Street CSV split.

    Each item is ``(image, label)``: ``image`` is the file found at
    ``root_dir`` joined with the row's relative path, opened and converted to
    RGB (then passed through ``transform`` when one is given); ``label`` is the
    value stored in the label column of the same row, returned unparsed. In the
    sample rows displayed in this notebook those values look like ``[999]`` or
    ``[831, 831]``.

    Args:
        csv_file: path of the CSV file listing the images.
        root_dir: directory the relative image paths are resolved against.
        transform: optional callable applied to the RGB PIL image.
        path_column: name of the column holding the relative image path.
        label_column: name of the column holding the label.

    Raises:
        ValueError: if ``path_column`` or ``label_column`` is not a column of
            the CSV. Checked at construction time so the failure does not
            surface later inside a DataLoader worker.
    """

    def __init__(self, csv_file, root_dir, transform=None,
                 path_column="imageRelPath", label_column="imagenet_sysnet_id"):
        self.data = pd.read_csv(csv_file)
        self.root_dir = root_dir
        self.transform = transform
        self.path_column = path_column
        self.label_column = label_column

        # Columns are addressed by name: positional indices silently point at
        # the wrong column as soon as the CSV gains or loses a leading column.
        missing = [name for name in (path_column, label_column)
                   if name not in self.data.columns]
        if missing:
            raise ValueError(
                f"{csv_file} is missing required column(s) {missing}; "
                f"available columns: {list(self.data.columns)}"
            )

    def __len__(self):
        """Return the number of rows in the CSV."""
        return len(self.data)

    def _image_path(self, idx):
        """Return the full path of the image listed in row ``idx``."""
        return os.path.join(self.root_dir, self.data[self.path_column].iloc[idx])

    def _label(self, idx):
        """Return the raw label cell of row ``idx``."""
        return self.data[self.label_column].iloc[idx]

    def __getitem__(self, idx):
        """Load row ``idx`` and return ``(image, label)``."""
        # convert() produces an independent image, so the file can be closed
        # right away instead of staying open until garbage collection.
        with Image.open(self._image_path(idx)) as raw_image:
            image = raw_image.convert("RGB")
        label = self._label(idx)

        if self.transform is not None:
            image = self.transform(image)

        return image, label

In [None]:
# Statistics pass: images are only resized to 224x224 and turned into tensors
# (no normalisation), so the loader below feeds un-normalised values to the
# mean/std computation that follows.
transform = transforms.Compose(
    [transforms.Resize((224, 224)), transforms.ToTensor()]
)

# Training split, read in file order with 4 worker processes.
data_train = DollarStreetDataset(train_path, dataset_path, transform)
dataloader = DataLoader(dataset=data_train, batch_size=64, num_workers=4, shuffle=False)

def calculate_mean_std(dataloader, per_image_std=True):
    """Compute per-channel mean and standard deviation over batches of images.

    Args:
        dataloader: iterable yielding ``(images, _)`` pairs. ``images`` is a
            floating-point tensor whose first two dimensions are
            (batch, channels); all remaining dimensions are flattened and
            treated as the pixels of one image.
        per_image_std: selects what ``std`` means.
            * ``True`` (default, the original behaviour): each image's own
              per-channel std is computed and those values are averaged over
              all images. This is NOT the std of the dataset's pixels; it
              ignores brightness differences between images.
            * ``False``: population std of all pixels of all images pooled
              together, computed from running sums in float64.

    Returns:
        ``(mean, std)``: two 1-D tensors with one entry per channel. ``mean`` is
        the average over images of each image's per-channel mean.

    Raises:
        ValueError: if the loader yields no images.
    """
    mean = 0.
    std = 0.
    channel_sum = 0.
    channel_sq_sum = 0.
    total_pixels_count = 0
    total_images_count = 0

    for images, _ in dataloader:
        batch_samples = images.size(0)  # the last batch can be smaller
        # reshape (unlike view) also accepts non-contiguous batches
        images = images.reshape(batch_samples, images.size(1), -1)
        mean += images.mean(2).sum(0)  # sum of per-image channel means
        if per_image_std:
            std += images.std(2).sum(0)  # sum of per-image channel stds
        else:
            # float64 keeps the sum-of-squares formula numerically stable
            wide = images.double()
            channel_sum += wide.sum(dim=(0, 2))
            channel_sq_sum += (wide * wide).sum(dim=(0, 2))
            total_pixels_count += batch_samples * images.size(2)
        total_images_count += batch_samples

    if total_images_count == 0:
        raise ValueError("dataloader yielded no images; cannot compute mean and std")

    mean /= total_images_count
    if per_image_std:
        std /= total_images_count
    else:
        pixel_mean = channel_sum / total_pixels_count
        variance = channel_sq_sum / total_pixels_count - pixel_mean * pixel_mean
        # clamp guards against tiny negative values caused by rounding
        std = variance.clamp(min=0).sqrt().to(mean.dtype)

    return mean, std

# Calculate mean and std
# Iterates once over `dataloader` (the un-normalised training loader built
# earlier in this cell) and prints the two values returned by calculate_mean_std.
mean, std = calculate_mean_std(dataloader)
print(f'Mean: {mean}')
print(f'Std: {std}')

In [None]:
# Per-channel constants passed to Normalize as (mean, std).
# NOTE(review): presumably the rounded output of the mean/std cell above for the
# training split - confirm before relying on them.
NORMALIZE_MEAN = [0.4250, 0.3887, 0.3524]
NORMALIZE_STD = [0.2061, 0.2025, 0.2029]

# Final preprocessing: resize, convert to tensor, then normalise each channel.
transform = transforms.Compose(
    [
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(NORMALIZE_MEAN, NORMALIZE_STD),
    ]
)

# Both splits share the same transform and the same loader settings.
LOADER_KWARGS = dict(batch_size=64, shuffle=False, num_workers=4)

data_train = DollarStreetDataset(train_path, dataset_path, transform)
dataloader_train = DataLoader(data_train, **LOADER_KWARGS)
data_test = DollarStreetDataset(test_path, dataset_path, transform)
dataloader_test = DataLoader(data_test, **LOADER_KWARGS)