# Step 1: Data Preparation

In [26]:
# import packages
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
from PIL import Image
import torchvision.transforms as transforms
from torch.utils.data import DataLoader

In [15]:
# load the label table, then collect every distinct species name in sorted order
df = pd.read_csv('Training_set.csv')
species = list(df['label'].unique())
species.sort()

In [17]:
# give every species name an integer id: its position in the sorted `species` list
species_id = {name: index for index, name in enumerate(species)}

# look up the id of each row's label and keep it as a new column
df['id'] = df['label'].map(species_id)
df

Unnamed: 0,filename,label,id
0,Image_1.jpg,SOUTHERN DOGFACE,66
1,Image_2.jpg,ADONIS,0
2,Image_3.jpg,BROWN SIPROETA,12
3,Image_4.jpg,MONARCH,44
4,Image_5.jpg,GREEN CELLED CATTLEHEART,33
...,...,...,...
6494,Image_6495.jpg,MANGROVE SKIPPER,40
6495,Image_6496.jpg,MOURNING CLOAK,45
6496,Image_6497.jpg,APPOLLO,4
6497,Image_6498.jpg,ELBOWED PIERROT,29


In [21]:
# add a column with each image's path inside the "train" folder
df['filepath'] = df['filename'].map(lambda name: os.path.join("train", name))
df

Unnamed: 0,filename,label,id,filepath
0,Image_1.jpg,SOUTHERN DOGFACE,66,train/Image_1.jpg
1,Image_2.jpg,ADONIS,0,train/Image_2.jpg
2,Image_3.jpg,BROWN SIPROETA,12,train/Image_3.jpg
3,Image_4.jpg,MONARCH,44,train/Image_4.jpg
4,Image_5.jpg,GREEN CELLED CATTLEHEART,33,train/Image_5.jpg
...,...,...,...,...
6494,Image_6495.jpg,MANGROVE SKIPPER,40,train/Image_6495.jpg
6495,Image_6496.jpg,MOURNING CLOAK,45,train/Image_6496.jpg
6496,Image_6497.jpg,APPOLLO,4,train/Image_6497.jpg
6497,Image_6498.jpg,ELBOWED PIERROT,29,train/Image_6498.jpg


In [23]:
# hold out 20% of the rows as the test frame; stratifying on the class id
# keeps the per-species proportions the same in both frames, and the fixed
# random_state makes the split repeatable
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['id'])

In [25]:
# create dataset class
class ButterflyDataset(Dataset):
    """Map-style dataset that yields ``(image, label)`` pairs from a DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide a ``'filepath'`` column (locations handed to
        ``Image.open``) and an ``'id'`` column (values convertible with
        ``int``).
    transform : callable, optional
        Called on the RGB ``PIL.Image`` of each sample; its return value is
        what ``__getitem__`` yields as the image. When ``None`` the PIL image
        itself is returned.

    Raises
    ------
    KeyError
        If ``df`` lacks the ``'filepath'`` or ``'id'`` column.
    """

    def __init__(self, df, transform=None):
        # Pull the two columns out as arrays so that __getitem__ indexes them
        # by position (0..len-1), independent of the DataFrame's index labels.
        self.paths = df['filepath'].to_numpy()
        self.labels = df['id'].to_numpy()
        self.transform = transform

    def __len__(self):
        """Return the number of samples (one per row of the source frame)."""
        return len(self.paths)

    def __getitem__(self, idx):
        """Load sample ``idx`` and return it as ``(image, label)``.

        The image is opened from ``self.paths[idx]``, converted to RGB and, if
        a transform was supplied, passed through it. ``label`` is a plain
        Python ``int``.
        """
        # Image.open is lazy and holds on to the file; the context manager
        # releases it as soon as convert() has produced an in-memory RGB image.
        with Image.open(self.paths[idx]) as source:
            img = source.convert("RGB")
        label = int(self.labels[idx])

        # Explicit None check so that a transform object which happens to
        # evaluate as falsy is still applied.
        if self.transform is not None:
            img = self.transform(img)

        return img, label

In [29]:
# create dataloaders

# for now the pipeline only converts PIL images to tensors;
# further transformations are to be added eventually
transform = transforms.Compose([transforms.ToTensor()])

# wrap each split in a dataset that shares the same transform
train_dataset = ButterflyDataset(train_df, transform=transform)
test_dataset = ButterflyDataset(test_df, transform=transform)

# batches of 32; only the training loader shuffles its samples
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=32, shuffle=False)