## Dollar Street dataset
#### Imports

In [None]:
import ast
import os
from PIL import Image
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from torchvision.models import resnet50, ResNet50_Weights

# TODO: check whether `from datasets.dollarstreet import get_dollarstreet` works here

#### Load CSV
The data was downloaded from Kaggle and unzipped with the following commands:
```
kaggle datasets download -d mlcommons/the-dollar-street-dataset -p /mfsnic/u/apouget/data/dollarstreet/
unzip /mfsnic/u/apouget/data/dollarstreet/the-dollar-street-dataset.zip -d /mfsnic/u/apouget/data/dollarstreet
```

In [None]:
# Folder the Kaggle archive was unzipped into; it holds the split CSVs, and
# the dataset class below joins image paths onto it.
dataset_path = '/mfsnic/u/apouget/data/dollarstreet/dataset_dollarstreet/'

# Metadata CSVs for the train and test splits.
train_path = os.path.join(dataset_path, 'images_v2_imagenet_train.csv')
test_path = os.path.join(dataset_path, 'images_v2_imagenet_test.csv')

# Raw metadata tables, loaded in the same order as before (train, then test).
data_train_csv, data_test_csv = map(pd.read_csv, (train_path, test_path))
# display(data_train_csv.head())

#### Create torch dataset

In [None]:
class DollarStreetDataset(Dataset):
    """Map-style dataset of Dollar Street images described by a metadata CSV.

    Each sample is ``(image, label)``: the image is opened from the row's
    relative image path and converted to RGB, and the label is the first
    entry of the row's list-valued label cell.

    Args:
        csv_file: Path of the metadata CSV, read with ``pd.read_csv``.
        root_dir: Directory that the relative image paths are joined onto.
        pre_filter: Optional callable applied to every row
            (``DataFrame.apply(..., axis=1)``); only rows for which it
            returns True are kept.
        transform: Optional callable applied to the RGB image before it is
            returned.
    """

    # Columns are resolved by name so a reordered CSV keeps working; the
    # positions below are the ones the code historically relied on and are
    # used only when the named column is absent.
    IMAGE_COLUMN = 'imageRelPath'
    LABEL_COLUMN = 'imagenet_sysnet_id'
    _IMAGE_COLUMN_FALLBACK = 5
    _LABEL_COLUMN_FALLBACK = 10

    def __init__(self, csv_file, root_dir, pre_filter=None, transform=None):
        self.data = pd.read_csv(csv_file)
        self.root_dir = root_dir
        self.transform = transform

        if pre_filter is not None:
            self.data = self.data[self.data.apply(pre_filter, axis=1)]

    def __len__(self):
        """Return the number of rows left after the optional pre-filter."""
        return len(self.data)

    def _column_position(self, name, fallback):
        """Return the positional index of column ``name``, else ``fallback``."""
        columns = list(self.data.columns)
        return columns.index(name) if name in columns else fallback

    def _image_path(self, idx):
        """Return the full path of the image for the row at position ``idx``."""
        column = self._column_position(self.IMAGE_COLUMN, self._IMAGE_COLUMN_FALLBACK)
        return os.path.join(self.root_dir, self.data.iloc[idx, column])

    def _label_at(self, idx):
        """Return the first label of the row at position ``idx``."""
        column = self._column_position(self.LABEL_COLUMN, self._LABEL_COLUMN_FALLBACK)
        # The cell stores a Python-literal list as text; only its first
        # entry is used as the target.
        return ast.literal_eval(self.data.iloc[idx, column])[0]

    def __getitem__(self, idx):
        """Return ``(image, label)`` for the row at position ``idx``."""
        # convert() yields a new, fully loaded image, so the file handle can
        # be released as soon as the block exits.
        with Image.open(self._image_path(idx)) as raw_image:
            image = raw_image.convert("RGB")
        label = self._label_at(idx)

        if self.transform:
            image = self.transform(image)

        return image, label

In [None]:
# Evaluation preprocessing: resize to 232, centre-crop to 224, convert to a
# tensor, then normalise per channel.
# NOTE(review): the mean/std below are not the standard ImageNet statistics
# (0.485, 0.456, 0.406 / 0.229, 0.224, 0.225) that torchvision documents for
# its pretrained ResNet-50 weights -- presumably dataset-specific values;
# confirm this is intended before comparing against ImageNet baselines.
transform = transforms.Compose(
    [
        transforms.Resize(232, interpolation=transforms.InterpolationMode.BILINEAR),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(
            mean=[0.4333, 0.3959, 0.3595],
            std=[0.2046, 0.2008, 0.2011],
        ),
    ]
)

# One dataset per split, both sharing the same preprocessing.
data_train = DollarStreetDataset(train_path, dataset_path, transform=transform)
data_test = DollarStreetDataset(test_path, dataset_path, transform=transform)

# Loaders keep the CSV order (no shuffling) and use 4 worker processes.
dataloader_train = DataLoader(data_train, batch_size=64, shuffle=False, num_workers=4)
dataloader_test = DataLoader(data_test, batch_size=64, shuffle=False, num_workers=4)

#### Check model accuracy

In [None]:
# ResNet-50 initialised with torchvision's IMAGENET1K_V2 pretrained weights.
weights = ResNet50_Weights.IMAGENET1K_V2
model = resnet50(weights=weights)

def get_accuracy(model, dataloader):
    """Return the top-1 accuracy of ``model`` over ``dataloader``, in percent.

    Args:
        model: Module that is switched to eval mode and called as
            ``model(images)``; its output is reduced with an argmax over
            dimension 1 to obtain the predicted class per sample.
        dataloader: Iterable yielding ``(images, labels)`` batches, where
            ``labels`` is a tensor comparable element-wise with the
            predictions.

    Returns:
        The percentage of samples whose predicted class equals the label,
        or ``0.0`` when the dataloader yields no samples (instead of
        raising ``ZeroDivisionError``).
    """
    model.eval()  # Evaluation mode (e.g. fixed batch-norm statistics)
    correct = 0
    total = 0

    with torch.no_grad():  # Gradients are not needed for evaluation
        for images, labels in dataloader:
            predicted = model(images).argmax(dim=1)

            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    if total == 0:
        # Nothing was evaluated, e.g. a filter that matched no rows.
        return 0.0
    return correct / total * 100

# Accuracy (%) on the full test split; the trailing number is presumably the
# output of an earlier run.
print(get_accuracy(model, dataloader_test)) # 23.81615598885794

#### Check model accuracy per region

In [None]:
# Evaluate the same model separately on each region found in the test CSV.
REGIONS = data_test_csv['region.id'].unique()
for region in REGIONS:
    # Re-read the test CSV, keeping only the rows of the current region.
    data_test_filtered = DollarStreetDataset(
        csv_file=test_path,
        root_dir=dataset_path,
        pre_filter=lambda row, wanted=region: row['region.id'] == wanted,
        transform=transform,
    )
    dataloader_test_filtered = DataLoader(
        data_test_filtered,
        batch_size=64,
        shuffle=False,
        num_workers=4,
    )
    print(f'Region {region}: {get_accuracy(model, dataloader_test_filtered)}')
# Region as: 22.25117248566962
# Region am: 28.971962616822427
# Region af: 15.11216056670602
# Region eu: 32.50728862973761