# Datasets Overlap Assessment

#### Here we will collect the number of images per class per dataset and analyze overlap.
From mapping, we know neither CIFAR-100 or ImageNet-1000 have Wasp or Moquito, additionally, CIFAR-100 does not have Ant, Dragonfly, Fly, Grasshopper, Ladybug. 
Class overlap:
- Clean: 11 classes
- CIFAR-100: 4 of the clean classes map (4 fine matches)
- ImageNet-1000: 9 of the clean classes map (27 fine matches)
- iNaturalist (36k): 9 of the clean classes map (35 fine matches)

In [9]:
import sys
import os
from PIL import Image
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

sys.path.append(os.path.abspath('..'))

#from utils.label_mappings import *
from datasets import load_dataset

### CIFAR-100 dataset

In [4]:
cifar100 = load_dataset("uoft-cs/cifar100")

In [5]:
cifar100['train']  # want to match cifar schema

Dataset({
    features: ['img', 'fine_label', 'coarse_label'],
    num_rows: 50000
})

In [6]:
cifar100['train'][0]  # want image type to be same

{'img': <PIL.PngImagePlugin.PngImageFile image mode=RGB size=32x32>,
 'fine_label': 19,
 'coarse_label': 11}

In [None]:
cifar100_df = pd.DataFrame({'fine_label': cifar100['train']['fine_label']})
def map_cifar100_to_clean_label(label):
    if label in cifar100_to_clean_map:
        return cifar100_to_clean_map[label]
    else:
        return None

cifar100_df['clean_label'] = cifar100_df['fine_label'].apply(map_cifar100_to_clean_label)
cifar100_df.groupby('clean_label').count()

### Clean insect validation dataset

In [6]:
base_path = '../clean_insect_images/'

class_dirs = ['Ant','Bee','Butterfly','Dragonfly','Fly','Grasshopper','Ladybug','Spider']

clean_ds = {'image':[], 'label':[], 'file_path':[]}

for c in class_dirs:
    target_dir = os.path.join(base_path, c)
    image_files = os.listdir(target_dir)
    for f in image_files:
        full_image_path = os.path.join(target_dir, f)
        clean_ds['image'].append(Image.open(full_image_path))
        clean_ds['label'].append(c)
        clean_ds['file_path'].append(full_image_path)



In [7]:
clean_df = pd.DataFrame({'label': clean_ds['label'], 'file_path': clean_ds['file_path']})
clean_df

Unnamed: 0,label,file_path
0,Ant,../clean_insect_images/Ant/Ant_472.jpg
1,Ant,../clean_insect_images/Ant/Ant_13.jpg
2,Ant,../clean_insect_images/Ant/Ant_719.jpg
3,Ant,../clean_insect_images/Ant/Ant_378.jpg
4,Ant,../clean_insect_images/Ant/Ant_641.jpg
...,...,...
6313,Spider,../clean_insect_images/Spider/Spider_376.jpg
6314,Spider,../clean_insect_images/Spider/Spider_400.jpg
6315,Spider,../clean_insect_images/Spider/Spider_558.jpg
6316,Spider,../clean_insect_images/Spider/Spider_139.jpg


Index(['Ant', 'Bee', 'Butterfly', 'Dragonfly', 'Fly', 'Grasshopper', 'Ladybug',
       'Spider'],
      dtype='object', name='label')

In [26]:
#stratefied random sample 50 images from each category

random_samples = np.array([])
for insect in clean_df.groupby('label').count().index.unique():
    cur_sample = np.random.choice(clean_df[clean_df['label']==insect]['file_path'], 50)
    random_samples = np.concatenate((samples,cur_sample), axis=0)

val_set = list(random_samples)
with open('../, 'w') as file:
    file.write('\n'.join(my_list))

array(['../clean_insect_images/Ant/Ant_547.jpg',
       '../clean_insect_images/Ant/Ant_163.jpg',
       '../clean_insect_images/Ant/Ant_524.jpg',
       '../clean_insect_images/Ant/Ant_8.jpg',
       '../clean_insect_images/Ant/Ant_139.jpg',
       '../clean_insect_images/Bee/Bee_866.jpg',
       '../clean_insect_images/Bee/Bee_858.jpg',
       '../clean_insect_images/Bee/Bee_109.jpg',
       '../clean_insect_images/Bee/Bee_208.jpg',
       '../clean_insect_images/Bee/Bee_991.jpg',
       '../clean_insect_images/Butterfly/Butterfly_48.jpg',
       '../clean_insect_images/Butterfly/Butterfly_118.jpg',
       '../clean_insect_images/Butterfly/Butterfly_688.jpg',
       '../clean_insect_images/Butterfly/Butterfly_723.jpg',
       '../clean_insect_images/Butterfly/Butterfly_273.jpg',
       '../clean_insect_images/Dragonfly/Dragonfly_489.jpg',
       '../clean_insect_images/Dragonfly/Dragonfly_174.jpg',
       '../clean_insect_images/Dragonfly/Dragonfly_313.jpg',
       '../clean_insect_i

### iNaturalist dataset

In [None]:
iNat36 = load_dataset("sxj1215/inaturalist") #36k rows
iNat36['train']['messages'][0]
iNat36_df = pd.DataFrame({'messages': iNat36['train']['messages']})
def get_iNat_label(messages):
    return messages[1]['content']
iNat36_df['species'] = iNat36_df['messages'].apply(get_iNat_label)
#list(iNat36_df.groupby('species').count().index)
def map_inat_to_clean_label(label):
    if label in iNat_to_clean_map:
        return iNat_to_clean_map[label]
    else:
        return None
iNat36_df['clean_label'] = iNat36_df['species'].apply(map_inat_to_clean_label)
grouped_counts = iNat36_df.groupby('clean_label').count()
grouped_counts

Unnamed: 0_level_0,messages,species
clean_label,Unnamed: 1_level_1,Unnamed: 2_level_1
Ant,181,181
Bee,300,300
Beetle,1770,1770
Butterfly,4869,4869
Fly,300,300
Grasshopper,398,398
Ladybug,300,300
Spider,600,600
Wasp,175,175


In [None]:
grouped_counts.sum()

In [None]:
len(iNat36_df.groupby('species').count().index)

139

In [8]:
# iNat100 = load_dataset("zguo0525/inat_2021") #100k rows
# iNat100#['train']['messages'][0]
# iNat100_df = pd.DataFrame({'messages': iNat100['train']['messages']})
# def get_iNat_label(messages):
#     return messages[1]['content']
# iNat100_df['species'] = iNat100_df['messages'].apply(get_iNat_label)
# #list(iNat36_df.groupby('species').count().index)
# len(iNat36_df.groupby('species').count().index)
# def map_inat_to_clean_label(label):
#     if label in iNat_to_clean_map:
#         return iNat_to_clean_map[label]
#     else:
#         return None
# iNat100_df['clean_label'] = iNat100_df['species'].apply(map_inat_to_clean_label)
# iNat100_df.groupby('clean_label').count()

README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00015.parquet:   0%|          | 0.00/477M [00:00<?, ?B/s]

data/train-00001-of-00015.parquet:   0%|          | 0.00/473M [00:00<?, ?B/s]

data/train-00002-of-00015.parquet:   0%|          | 0.00/480M [00:00<?, ?B/s]



data/train-00003-of-00015.parquet:   0%|          | 0.00/477M [00:00<?, ?B/s]

RuntimeError: Data processing error: CAS service error : IO Error: No space left on device (os error 28)

In [None]:
#would need to check overlap with following
#juppy44/inat2021-train-mini-test #500k #unsure image format (no preview)
#MVRL/iNat-2021-train #500k 
#MVRL/iNat-2021-train #2.69mil #weird and inconsistent image formats