# Datasets Overlap Assessment

#### Here we will collect the number of images per class per dataset and analyze overlap.
From the mapping, we know neither CIFAR-100 nor ImageNet-1000 has Wasp or Mosquito; additionally, CIFAR-100 does not have Ant, Dragonfly, Fly, Grasshopper, or Ladybug.
Class overlap:
- Clean: 11 classes
- CIFAR-100: 4 of the clean classes map (4 fine matches)
- ImageNet-1000: 9 of the clean classes map (27 fine matches)
- iNaturalist (36k): 9 of the clean classes map (35 fine matches)

In [1]:
import sys
import os
from PIL import Image
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

sys.path.append(os.path.abspath('..'))

from utils.label_mappings import *
from datasets import load_dataset

### Clean insect validation dataset
https://www.kaggle.com/datasets/ismail703/insects/data

In [4]:
# Root folder of the clean validation images; every class has its own sub-directory.
base_path = '../data/clean_insect_images/'

class_dirs = ['Ant','Bee','Beetle','Butterfly','Dragonfly','Fly','Grasshopper','Ladybug','Spider']

# Parallel lists: position i in each list describes the same image.
clean_ds = {'image':[], 'label':[], 'file_path':[]}

for class_name in class_dirs:
    class_dir = os.path.join(base_path, class_name)
    for entry in os.listdir(class_dir):
        # The '.ipynb_checkpoints' entry is not an image, so it is left out.
        if entry != '.ipynb_checkpoints':
            image_path = os.path.join(class_dir, entry)
            # NOTE(review): Image.open is lazy and keeps the underlying file open
            # until the pixel data is read, so this holds one open handle per
            # image -- confirm that is intended.
            clean_ds['image'].append(Image.open(image_path))
            clean_ds['label'].append(class_name)
            clean_ds['file_path'].append(image_path)



In [5]:
# Tabular view of the clean set: only the label and the path are kept,
# the opened image objects stay in clean_ds.
clean_df = pd.DataFrame({column: clean_ds[column] for column in ('label', 'file_path')})
clean_df

Unnamed: 0,label,file_path
0,Ant,../data/clean_insect_images/Ant/Ant_472.jpg
1,Ant,../data/clean_insect_images/Ant/Ant_13.jpg
2,Ant,../data/clean_insect_images/Ant/Ant_719.jpg
3,Ant,../data/clean_insect_images/Ant/Ant_378.jpg
4,Ant,../data/clean_insect_images/Ant/Ant_641.jpg
...,...,...
7245,Spider,../data/clean_insect_images/Spider/Spider_376.jpg
7246,Spider,../data/clean_insect_images/Spider/Spider_400.jpg
7247,Spider,../data/clean_insect_images/Spider/Spider_558.jpg
7248,Spider,../data/clean_insect_images/Spider/Spider_139.jpg


In [6]:
# Number of clean images per class (non-null file paths counted per label).
grouped_counts_clean = clean_df.groupby(by='label').count()
grouped_counts_clean

Unnamed: 0_level_0,file_path
label,Unnamed: 1_level_1
Ant,734
Bee,1088
Beetle,932
Butterfly,795
Dragonfly,677
Fly,880
Grasshopper,839
Ladybug,654
Spider,651


In [7]:
# Total number of clean images across all classes (column-wise sum).
grouped_counts_clean.sum(axis=0)

file_path    7250
dtype: int64

### iNaturalist dataset

In [2]:
iNat36 = load_dataset("sxj1215/inaturalist", split='train') #36k rows #3.3 GB

# Only the 'messages' column is copied into pandas. The column is read once
# here; a stray `iNat36['messages'][0]` peek that used to sit above this line
# was never displayed and only materialised the column a second time.
iNat36_df = pd.DataFrame({'messages': iNat36['messages']})


def get_iNat_label(messages):
    """Return the species string stored in one sample's message list.

    Reads the 'content' field of the second message (index 1).
    NOTE(review): presumably message 0 is the prompt and message 1 the answer
    holding the species name -- confirm against the dataset card.

    Raises:
        IndexError: if the sample has fewer than two messages.
        KeyError: if the second message has no 'content' field.
    """
    return messages[1]['content']


iNat36_df['species'] = iNat36_df['messages'].apply(get_iNat_label)
#list(iNat36_df.groupby('species').count().index)


def map_inat_to_clean_label(label):
    """Translate an iNaturalist species name into a clean-dataset class.

    Returns the entry of `iNat_to_clean_map` for `label`, or None when the
    species has no counterpart among the clean classes.
    """
    return iNat_to_clean_map.get(label)


iNat36_df['clean_label'] = iNat36_df['species'].apply(map_inat_to_clean_label)

# Rows whose clean_label is None drop out of the groupby, so this counts only
# the iNaturalist samples that map onto a clean class.
grouped_counts = iNat36_df.groupby('clean_label').count()
grouped_counts

Unnamed: 0_level_0,messages,species
clean_label,Unnamed: 1_level_1,Unnamed: 2_level_1
Ant,181,181
Bee,300,300
Beetle,1770,1770
Butterfly,1957,1957
Grasshopper,168,168
Ladybug,300,300
Spider,600,600


In [3]:
# Total number of iNaturalist samples that map onto a clean class (column-wise sum).
grouped_counts.sum(axis=0)

messages    5276
species     5276
dtype: int64

In [11]:
# List every distinct species present in the iNaturalist sample, one per line,
# in the sorted order produced by the groupby index.
species_index = iNat36_df.groupby('species').count().index
for species_name in species_index:
    print(species_name)

Abudefduf sexfasciatus
Agapostemon virescens
Aglais urticae
Agrotis segetum
Alaus lusciosus
Alectoris rufa
Anartia amathea
Anolis equestris
Aphomia sociella
Aphonopelma chalcodes
Apogeshna stenialis
Aratus pisonii
Ardisia crenata
Artemisiospiza nevadensis
Asparagus officinalis
Avicennia germinans
Baptisia alba
Blatta orientalis
Briza minor
Bromus catharticus
Buprestis aurulenta
Buteo jamaicensis
Cabera pusaria
Calcarius lapponicus
Calochortus venustus
Camponotus planatus
Canis mesomelas
Carcinus maenas
Carex stipata
Carissa bispinosa
Carphophis amoenus
Carya illinoinensis
Castilleja rhexiifolia
Centaurea montana
Ceratomia catalpae
Cerorhinca monocerata
Chaetopappa ericoides
Charadrius leschenaultii
Chlorion aerarium
Chrysolina americana
Chrysosplenium alternifolium
Cicindela hirticollis
Cigaritis lohita
Cirsium vulgare
Clarkia rhomboidea
Cleome viscosa
Commelina africana
Coprosma robusta
Coprosma rotundifolia
Coptis trifolia
Coracias benghalensis
Crocodylus moreletii
Cyanistes caeruleu

## Insects that are present in iNat but not in validation set (yet)
# Moths
"Agrotis segetum": "Moth",
"Aphomia sociella": "Moth",
"Apogeshna stenialis": "Moth",
"Cabera pusaria": "Moth",
"Ceratomia catalpae": "Moth",
"Enyo lugubris": "Moth",
"Gastrina cristaria": "Moth",
"Gymnandrosoma punctidiscanum": "Moth",
"Heterophleps triguttaria": "Moth",
"Ochropleura plecta": "Moth",
"Thaumetopoea processionea": "Moth",
"Triphosa haesitata": "Moth"

# Cockroaches
"Blatta orientalis": "Cockroach"

# True Bugs
"Leptoglossus occidentalis": "True Bug",
"Palomena prasina": "True Bug",
"Zelus luridus": "True Bug"

In [7]:
# iNat100 = load_dataset("zguo0525/inat_2021") #100k rows
# iNat100#['train']['messages'][0]
# iNat100_df = pd.DataFrame({'messages': iNat100['train']['messages']})
# def get_iNat_label(messages):
#     return messages[1]['content']
# iNat100_df['species'] = iNat100_df['messages'].apply(get_iNat_label)
# #list(iNat36_df.groupby('species').count().index)
# len(iNat36_df.groupby('species').count().index)
# def map_inat_to_clean_label(label):
#     if label in iNat_to_clean_map:
#         return iNat_to_clean_map[label]
#     else:
#         return None
# iNat100_df['clean_label'] = iNat100_df['species'].apply(map_inat_to_clean_label)
# iNat100_df.groupby('clean_label').count()

In [None]:
#would need to check overlap with following
#juppy44/inat2021-train-mini-test #500k #unsure image format (no preview)
#MVRL/iNat-2021-train #500k 
#MVRL/iNat-2021-train #2.69mil #weird and inconsistent image formats