## Checking the Dollarstreet Dataset 
#### Core problem: my test accuracy is 8-10%, while even their lowest subsets were 15%. To investigate, I took a couple steps: 



1) They released a test dataset with imagenet ids (0-1000) and imagenet classes. I used this csv to confirm that my imagenet class mapping was correct, finding that all of my imagenet classes and ids matched their test set. 

In [None]:
### Check my label mapping with their label mapping: 
import pandas as pd 
a = pd.read_csv("/checkpoint/meganrichards/datasets/dollarstreet_kaggle/dataset_dollarstreet/images_v2_with_imagenet_indices.csv", index_col = 0)
test_set = pd.read_csv("/checkpoint/meganrichards/datasets/dollarstreet_kaggle/dataset_dollarstreet/images_v2_imagenet_test.csv")
combined = pd.merge(test_set, a[['id', 'imagenet_class', 'label_index', 'masked_imagenet_index_str', 'masked_imagenet_index']], on = 'id', how = 'left')[['id', 'imagenet_synonyms','imagenet_class', 'label_index', 'imagenet_sysnet_id', 'masked_imagenet_index_str', 'masked_imagenet_index', 'imageRelPath', 'id']]
assert (len(combined) == len(test_set))
# Checking that both the imagenet index and the imagenet class names match between their mapping version and mine
assert (combined['label_index'] == combined['label_index']).sum()/len(combined) == 1.0
assert (combined['imagenet_synonyms'] == combined['imagenet_class']).sum()/len(combined) == 1.0
# combined.to_csv("/checkpoint/meganrichards/datasets/dollarstreet_kaggle/dataset_dollarstreet/images_v2_imagenet_test_with_masked_index.csv")

2) I checked that the array I'm using to mask the model outputs were the same as those in the labels. 

In [75]:
### Check that the datamodule mask and labels have matching sets  
from ast import literal_eval
import pandas as pd 
from datasets.dollarstreet_kaggle import DollarStreetDataModule

dm = DollarStreetDataModule()
p = pd.read_csv("/checkpoint/meganrichards/datasets/dollarstreet_kaggle/dataset_dollarstreet/images_v2_with_imagenet_indices.csv", index_col=0)
p['masked_imagenet_index'] = p['masked_imagenet_index'].apply(literal_eval)
p['label_index'] = p['label_index'].apply(literal_eval)

dollar_indexes = []
for x in p['masked_imagenet_index'].tolist():
    dollar_indexes = dollar_indexes +x

imagenet_indexes = [] 
for x in p['label_index'].tolist():
    imagenet_indexes = imagenet_indexes +x
   
dollar_indexes = list(set(dollar_indexes))
dollar_indexes.sort()
mask = dm.mask 
mask.sort()

imagenet_indexes = list(set(imagenet_indexes))
imagenet_indexes.sort()

# CHECKS 
assert len(dollar_indexes) == len(mask)  # are we pulling the same number of classes as in dollarstreet
assert mask == imagenet_indexes # are the label indices the same as the mask 


3) I checked that my accuracy calculation was correct

In [77]:
#### Accuracy calculation appears to be correct 
import os 
import pandas as pd 
dir = "/checkpoint/meganrichards/logs/interplay_project/resnet152_base_measures/2023-02-17_10-36-36/"

pd.read_csv(dir + '/DollarStreetPerformance/dollarstreet_results.csv', index_col = 0)[['predictions', 'label', 'accurate_top5', 'accurate_top1']].iloc[40:60]

Unnamed: 0,predictions,label,accurate_top5,accurate_top1
8,"[63, 53, 54, 0, 2]",5151,False,False
9,"[23, 27, 1, 46, 57]",50,False,False
10,"[29, 14, 31, 36, 9]",12,False,False
11,"[8, 16, 37, 36, 43]",16,True,False
12,"[8, 4, 16, 24, 13]",44,False,False
13,"[8, 31, 29, 48, 16]",16,True,False
14,"[33, 52, 29, 40, 2]",50,False,False
15,"[4, 24, 27, 22, 43]",2,False,False
16,"[24, 2, 4, 8, 38]",49,False,False
17,"[27, 43, 5, 7, 26]",58,False,False


##### 4) I confirmed that my prediction range was (0, 63), and that I was using the imagenet transform

In [58]:
# Manual walk through 
from datasets.dollarstreet_kaggle import DollarStreetDataModule
from models.resnet.resnet import ResNet101dClassifierModule
import torch

dm = DollarStreetDataModule(batch_size=32)
dl = dm.test_dataloader()
X,y,id = next(iter(dl))
model = ResNet101dClassifierModule()
y_hat = model(X)
print(y_hat.shape)
masked_y_hat = y_hat[:, mask]
print(masked_y_hat.shape)
confidences5, indices5 = torch.nn.functional.softmax(masked_y_hat, dim=-1).topk(5)
print(f"Predictions range: {indices5.flatten().min().item(), indices5.flatten().max().item()}")

making dataset
Compose(
    Resize(size=256, interpolation=bilinear, max_size=None, antialias=None)
    CenterCrop(size=(224, 224))
    ToTensor()
    Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
)


#### Some more straightforward things I also checked: 
- tried a few resnet models (they didn't specify the whether they used 18/32/50/etc), included the specific pretrained torchvision one used in the paper  
- tried timm vs imagenet transform - no real difference 

## Generating the metadata file

In [78]:
# Filter for only images with imagenet mapping
import pandas as pd
p = pd.read_csv("/checkpoint/meganrichards/datasets/dollarstreet_kaggle/dataset_dollarstreet/images_v2.csv")
from datasets.dollarstreet_kaggle import MAPPING

def has_imagenet_mapping(x):
    for i in x['topics'].split(','):
        if i.lower().strip() in list(MAPPING.keys()):
            return True
    return False

p['match'] = p.apply(has_imagenet_mapping, axis = 1)
p_only_imagnet = p[p['match']]

assert len(p_only_imagnet) == 21536 # reported by paper

In [93]:
#### Converting CSV labels into dollarstreet labels 

def convert_dollarstreet_class_to_imagenet_class(row):
    row_label = row['topics']
    row_index = []
    for i in row_label.split(','): 
        if i.lower().strip() in MAPPING.keys():
            row_index = row_index + [MAPPING[i.lower().strip()]]
        
    return row_index

p_only_imagnet['imagenet_class'] = p_only_imagnet.apply(convert_dollarstreet_class_to_imagenet_class, axis =1)

# Test 
for x in list(p_only_imagnet['imagenet_class']): 
    assert len(x) > 0

from datasets.dollarstreet_kaggle import make_imagenet_class_to_dollarstreet_idx, make_idx_to_label
imagenet_class_to_dollarstreet_idx = make_imagenet_class_to_dollarstreet_idx()
idx_to_label = make_idx_to_label(imagenet_class_to_dollarstreet_idx)
imagenet_mask = list(idx_to_label.keys())

def convert_imagenet_class_to_index(row):
    row_label = row['imagenet_class']
    row_index = []
    for label in row_label:
        row_index = row_index + imagenet_class_to_dollarstreet_idx[label.lower().strip()]
    return row_index

p_only_imagnet['imagenet_index'] = p_only_imagnet.apply(convert_imagenet_class_to_index, axis =1)

imagenet_mask.sort()

imagenet_idx_to_masked_idx = {}
for i in range(len(imagenet_mask)):
    imagenet_idx_to_masked_idx[imagenet_mask[i]] = i

imagenet_idx_to_masked_idx

def convert_imagenet_index_to_masked_index(row):
    row_idx = row['imagenet_index']
    masked_index = []
    for idx in row_idx:
        masked_index = masked_index + [imagenet_idx_to_masked_idx[idx]]
    return masked_index

p_only_imagnet['masked_imagenet_index'] = p_only_imagnet.apply(convert_imagenet_index_to_masked_index, axis =1)

# #p_only_imagnet.drop(columns = ['label_index'])
# #p_only_imagnet.to_csv("/checkpoint/meganrichards/datasets/dollarstreet_kaggle/dataset_dollarstreet/images_v2_with_imagenet_indices.csv")

def combine_list_to_str(x):
    a = []
    for x_i in x: 
        a.append(str(x_i))
    return ','.join(a)

p_only_imagnet['masked_imagenet_index_str'] = p_only_imagnet["masked_imagenet_index"].apply(combine_list_to_str)
# #p_only_imagnet.to_csv("/checkpoint/meganrichards/datasets/dollarstreet_kaggle/dataset_dollarstreet/images_v2_with_imagenet_indices.csv")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  p_only_imagnet['imagenet_class'] = p_only_imagnet.apply(convert_dollarstreet_class_to_imagenet_class, axis =1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  p_only_imagnet['imagenet_index'] = p_only_imagnet.apply(convert_imagenet_class_to_index, axis =1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-cop