# Tutorial on PyTorch dataset class

We provide a `BaseDatasetCP` PyTorch Dataset class for loading the data, enhanced with a `BaseSamplerCP` Sampler class to sample data for few-shot prediction.

In [1]:
# Setting things up for importing local modules
import os 
import sys

# Add folder FSL_CP to sys.path so that the local `fsl_cp` package is importable.
# The repository is expected to live at ~/FSL_CP. The home directory is resolved
# with os.path.expanduser instead of os.environ['HOME'] because HOME is not
# guaranteed to be defined (e.g. on Windows), in which case the lookup raises
# KeyError. When HOME is set, both forms resolve to the same directory.
FSL_CP_PATH = os.path.join(os.path.expanduser('~'), 'FSL_CP')
sys.path.insert(0, FSL_CP_PATH)

# Change working directory to FSL_CP
os.chdir(FSL_CP_PATH)

In [2]:
# Imports
import json
import pandas as pd
from torch.utils.data import DataLoader
from fsl_cp.datamodule.base import BaseDatasetCP, BaseSamplerCP

In [3]:
# Initialisations

# --- Episode configuration ---
support_set_size = 64     # number of samples drawn for each support set
query_set_size = 32       # number of samples drawn for each query set
meta_batch_size = 3       # multiplied with num_episodes to give the loader length
num_episodes = 1000       # training episodes
num_episodes_val = 100    # presumably episodes for validation (not used in this tutorial)
num_episodes_test = 100   # presumably episodes for testing (not used in this tutorial)

# --- Input files, all located below FSL_CP_PATH ---
json_path = os.path.join(FSL_CP_PATH, 'data/output/data_split.json')
label_df_path = os.path.join(FSL_CP_PATH, 'data/output/FINAL_LABEL_DF.csv')
df_assay_id_map_path = os.path.join(FSL_CP_PATH, 'data/output/assay_target_map.csv')
# A list, because the dataset accepts several feature files.
cp_f_path = [os.path.join(FSL_CP_PATH, 'data/output/norm_CP_feature_df.csv')]

Information about which assay is in the train/validation/test set is stored in a JSON file.

In [4]:
# Read the split file and pull out the assay keys of each partition.
with open(json_path) as split_file:
    data = json.load(split_file)

train_split, val_split, test_split = (
    data['train'],
    data['val'],
    data['test'],
)

Let's first have a look at the label dataframe.

In [5]:
# Load the label table from the CSV file configured above.
label_df = pd.read_csv(label_df_path)
# A bare trailing expression makes Jupyter render the dataframe as the cell output.
label_df

Unnamed: 0,INCHIKEY,CPD_SMILES,SAMPLE_KEY,VIEWS,LABEL,NUM_ROW_CP_FEATURES,ASSAY
0,AUVVAXYIELKVAI-CKBKHPSWSA-N,CC[C@H]1CN2CCc3cc(OC)c(OC)cc3[C@@H]2C[C@@H]1C[...,24305-D04,1_2_3_4_5_6,1.0,2636,737823
1,AUVVAXYIELKVAI-CKBKHPSWSA-N,CC[C@H]1CN2CCc3cc(OC)c(OC)cc3[C@@H]2C[C@@H]1C[...,24306-D04,1_2_3_4_5_6,1.0,2870,737823
2,AUVVAXYIELKVAI-CKBKHPSWSA-N,CC[C@H]1CN2CCc3cc(OC)c(OC)cc3[C@@H]2C[C@@H]1C[...,24307-D04,1_2_3_4_5_6,1.0,3105,737823
3,AUVVAXYIELKVAI-CKBKHPSWSA-N,CC[C@H]1CN2CCc3cc(OC)c(OC)cc3[C@@H]2C[C@@H]1C[...,24352-D04,1_2_3_4_5_6,1.0,5531,737823
4,AUVVAXYIELKVAI-CKBKHPSWSA-N,CC[C@H]1CN2CCc3cc(OC)c(OC)cc3[C@@H]2C[C@@H]1C[...,25955-D04,1_2_3_4_5_6,1.0,34149,737823
...,...,...,...,...,...,...,...
286336,ZZRJKLPEGBZEAO-UHFFFAOYSA-N,COCCNC1=NN=C(CS1)c1ccc(NC(C)=O)cc1,25738-F12,1_2_3_4_5_6,0.0,32530,1495346
286337,ZZZPZDFBUDZIOU-UHFFFAOYSA-N,O=S(=O)(c1ccccc1)n1ccc(n1)-c1cnc(s1)-c1ccccc1,25689-D07,1_2_3_4_5_6,0.0,29280,1495346
286338,ZZZPZDFBUDZIOU-UHFFFAOYSA-N,O=S(=O)(c1ccccc1)n1ccc(n1)-c1cnc(s1)-c1ccccc1,25690-D07,1_2_3_4_5_6,0.0,29533,1495346
286339,ZZZPZDFBUDZIOU-UHFFFAOYSA-N,O=S(=O)(c1ccccc1)n1ccc(n1)-c1cnc(s1)-c1ccccc1,25692-D07,1_2_3_4_5_6,0.0,29786,1495346


Imagine each assay is a small dataset, then this `label_df` is just a concatenation of all of these small datasets. 

The PyTorch dataset class is called `BaseDatasetCP`. This dataset class concatenates all feature dataframes read from `cp_f_path` to create a single feature dataframe. 
If you get the nth entry, it returns the `label_df['NUM_ROW_CP_FEATURES'][n]`-th row in the feature matrix, and the label `label_df['LABEL'][n]`.

In [6]:
# Build the training dataset from the assays listed in the train split.
train_data = BaseDatasetCP(train_split, label_df_path=label_df_path, cp_f_path=cp_f_path)

# Index a single entry to inspect what the dataset returns for it.
train_data[2]

(tensor([-5.5312e-02,  8.2471e-01, -8.5358e-01, -1.1394e-01, -2.2541e-01,
         -9.7531e-02,  5.4378e-01, -1.3228e-01,  3.3213e-02, -1.1026e+00,
         -1.5479e-01,  3.7139e-01,  2.6772e-01,  6.3412e-01,  5.3069e-01,
          6.5574e-01, -2.1781e-01,  2.8737e-01, -6.5207e-01,  3.5818e-01,
          4.9921e-01,  1.5727e-01,  1.9808e-01,  9.9292e-01,  3.2036e-01,
         -2.2425e-01,  3.6343e-01,  3.3449e-01, -1.2459e+00, -3.6888e-01,
          7.8672e-01, -1.9010e-01,  2.1694e-01,  7.7779e-01,  5.4923e-02,
         -2.4673e-01, -3.2580e-02,  8.8714e-01,  1.6694e-01,  1.0584e+00,
          3.2362e-01, -1.0239e-01, -2.6977e-01,  2.7995e-01, -2.2113e-01,
          1.2746e-01, -1.5621e-01,  2.6918e-01,  8.7997e-01, -1.1835e-01,
          3.6365e-01,  5.6145e-01,  2.3562e-01,  2.4998e-01,  4.8559e-01,
          3.5615e-01,  3.3826e-01,  2.2122e-01,  3.7559e-01,  5.8952e-01,
          1.6489e-01,  2.4026e-01,  2.1044e-01,  3.8701e-01,  2.4031e-01,
          2.4409e-01,  6.0047e-01,  1.

For few-shot learning, we consider each assay as a task. For each batch, `BaseSamplerCP` chooses a random task (assay) and samples the support and query sets in a stratified manner. These batches are then loaded using the default `DataLoader`.

In [7]:
# Load train data: an episodic sampler over the training dataset, wrapped in a
# standard DataLoader that uses the sampler's own collate function.
train_sampler = BaseSamplerCP(
    task_dataset=train_data,
    support_set_size=support_set_size,
    query_set_size=query_set_size,
    num_episodes=num_episodes,
    meta_batch_size=meta_batch_size,
)
train_loader = DataLoader(
    train_data,
    batch_sampler=train_sampler,
    collate_fn=train_sampler.episodic_collate_fn,
    num_workers=12,
    pin_memory=True,
)

# Fetch one batch and inspect it: the first four entries are tensors (print
# their sizes), the fifth entry is printed as-is.
batch = next(iter(train_loader))
for position in range(4):
    print(batch[position].size())
print(batch[4])

torch.Size([64, 993])
torch.Size([64])
torch.Size([32, 993])
torch.Size([32])
[0.0, 1.0]


We can see above that each batch contains the features and labels for the support set (size 64), the same for the query set (size 32), and a list of unique labels (just for debugging).

Also notice that `train_loader` returns 3000 batches.

In [8]:
# Number of batches the loader yields in one pass.
len(train_loader)

3000

This is equal to `num_episodes * meta_batch_size`. So normally, when one trains a model, one accumulates the gradients (`loss.backward()`) from `meta_batch_size` batches, then performs a parameter update step with `optimizer.step()`.