# Rainbow Memory: Make a dataset of each task.

In [35]:
import sys
import pandas as pd
import numpy as np
import torch.nn as nn
from pathlib import Path
import PIL
import PIL.ImageOps
import PIL.ImageEnhance
import PIL.ImageDraw
from PIL import Image
from typing import List
from torchvision import datasets
from torch.utils.data import Dataset
from torchvision import transforms

You should read json file which follows below format. 

[{"klass": "truck", "file_name": "test/truck/01.jpg"}, ...]

You should change the file name as below. 


In [5]:
train = pd.read_json('C:/smh/AI/code/rainbow-memory/dataset/train_json.json')
test = pd.read_json('C:/smh/AI/code/rainbow-memory/dataset/test_json.json')

Change the variables `rnd_seed` and `num_tasks` 

In [6]:
rnd_seed = 3 # random seed 
num_tasks = 5 # the number of tasks. 
np.random.seed(rnd_seed)

In [7]:
klass = train.klass.unique()
num_cls_per_task = len(klass) // num_tasks
#print(num_cls_per_task) # cifar10이므로 10/5 = 2
np.random.shuffle(klass)
# enumerate : index 번호와 class를 tuple의 형태로 반환
class2label = {cls_:idx for idx, cls_ in enumerate(klass)}  # house : 0, cat : 1 이런식으로 순서대로 배정해주는 것
'''
apply와 lambda
lambda를 하면 대입을 해야하는 것을 대신해주게 된다. 여기서는 x에 class2label[x]를 넣어주는 역할을 한줄의 코드로 가능하게 만들어준다.
원래 x에 0~9까지의 숫자가 들어 있었기 때문에, 각 class에 해당하는 애를 matching 시켜줌
이를 바탕으로 label이라는 column을 새롭게 matching 시켜줌
'''
train["label"] = train.klass.apply(lambda x: class2label[x])
test["label"] = test.klass.apply(lambda x: class2label[x])

task_class = np.split(klass, num_tasks) # task별로 class를 나눠줌 (2개씩!!)

# list comprehension
'''
[(변수를 활용할 방법) for (사용할 변수 이름) in (순회할 수 있는 값)]
'''
# task별로 쪼갠 것 (isin을 통해서 tc에 있는 애들은 true, 없으면 false를 반환하게 하고, train[true]인 애들끼리 묶어줌)
# 이를 통해서 task별로 train data와 test data들을 묶음 단위로 쪼개줄 수 있다.
task_train = [train[train.klass.isin(tc)] for tc in task_class]
task_test = [test[test.klass.isin(tc)] for tc in task_class]
#print(task_class)
#print(task_train)

# Augmentation Part

In [51]:
class ImageDataset(Dataset):
    def __init__(self, data_frame: pd.DataFrame, dataset: str, transform=None):
        self.data_frame = data_frame
        self.dataset = dataset
        self.transform = transform

    def __len__(self):
        return len(self.data_frame)

    def __getitem__(self, idx):
        sample = dict()
        if torch.is_tensor(idx):
            idx = idx.tolist()

        img_name = self.data_frame.iloc[idx]["file_name"]
        label = self.data_frame.iloc[idx].get("label", -1)

        img_path = os.path.join("dataset", self.dataset, img_name)
        image = PIL.Image.open(img_path).convert("RGB")
        if self.transform:
            image = self.transform(image)
        sample["image"] = image
        sample["label"] = label
        sample["image_name"] = img_name
        return sample

    def get_image_class(self, y):
        return self.data_frame[self.data_frame["label"] == y]


In [44]:
def get_dataloader(batch_size, n_worker, train_list, test_list):
        # Loader
        train_loader = None
        test_loader = None
        if train_list is not None and len(train_list) > 0:
            train_dataset = ImageDataset(
                pd.DataFrame(train_list),
                dataset="cifar10",
                transform=self.train_transform,
            )
            # drop last becasue of BatchNorm1D in IcarlNet
            train_loader = DataLoader(
                train_dataset,
                shuffle=True,
                batch_size=batch_size,
                num_workers=n_worker,
                drop_last=True,
            )

        if test_list is not None:
            test_dataset = ImageDataset(
                pd.DataFrame(test_list),
                dataset="cifar10",
                transform=self.test_transform,
            )
            test_loader = DataLoader(
                test_dataset, shuffle=False, batch_size=batch_size, num_workers=n_worker
            )

        return train_loader, test_loader

In [45]:
def get_test_datalist(args, exp_name: str, cur_iter: int) -> List:
    if exp_name is None:
        exp_name = args.exp_name

    if exp_name in ["joint", "blurry10", "blurry30"]:
        # merge over all tasks
        tasks = list(range(args.n_tasks))
    elif exp_name == "disjoint":
        # merge current and all previous tasks
        tasks = list(range(cur_iter + 1))
    else:
        raise NotImplementedError

    datalist = []
    for iter_ in tasks:
        collection_name = "{dataset}_test_rand{rnd}_cls{n_cls}_task{iter}".format(
            dataset=args.dataset, rnd=args.rnd_seed, n_cls=args.n_cls_a_task, iter=iter_
        )
        datalist += pd.read_json(
            f"collections/{args.dataset}/{collection_name}.json"
        ).to_dict(orient="records")
        logger.info(f"[Test ] Get datalist from {collection_name}.json")

    return datalist

# Finetune

## Disjoint Task Boundaries Benchmark

Configure disjoint dataset which does not share the classes of each task.


In [37]:
origin_name = "cifar10_train" # Need to change the name of your dataset.
root = Path('../collections/disjoint')
root.mkdir(exist_ok=True)

for idx, train_task in enumerate(task_train):
    file_name = origin_name
    train_prefix = {'_disjoint':'', 
              '_rand':rnd_seed, 
              '_cls':num_cls_per_task,
              '_task':idx
             }
 
    for name, value in train_prefix.items():
        file_name += name + str(value)
    file_path = (root/file_name).with_suffix('.json')
    train_task.to_json(file_path, orient='records')
    print(f"{file_path}")

../collections/disjoint/cifar10_train_disjoint_rand3_cls2_task0.json
../collections/disjoint/cifar10_train_disjoint_rand3_cls2_task1.json
../collections/disjoint/cifar10_train_disjoint_rand3_cls2_task2.json
../collections/disjoint/cifar10_train_disjoint_rand3_cls2_task3.json
../collections/disjoint/cifar10_train_disjoint_rand3_cls2_task4.json


## Blurry Task Boundaries Benchmark

Configure blurry task dataset which can share the classes of each task

There are two types of classes per each task as described in the paper. 

- **Major Classes** account for 90(70) percent of whole dataset of the corresponding dataset in blurry-10(30). 
- **Minor Classes** account for 10(30) percent of whole dataset of the corresponding dataset in blurry-10(30). 


In [48]:
# data 버젼
def make_blurry(major_ratio, num_labeled, num_classes):
    # major_ratio = 0.9 # 0.9 for blurry-10, 0.7 for blurry-30.
    # num_labeled = 4000 # cifar10 train total 50000
    # num_classes = 10 # cifar10
    major_classes = []
    label_per_class = num_labeled // num_classes
    
    task_trainM = []
    task_trainN = []
    for t in task_train:
        major_classes.append(list(t.klass.unique()))
        sub_task_trainN = []

        # sample 함수를 통해서 M%에 해당하는 data 추출
        taskM = t.sample(n = int(len(t) * major_ratio), replace=False)
        taskN = pd.concat([taskM, t]).drop_duplicates(keep=False)
        taskN_size = len(taskN)

        task_trainM.append(taskM)

        # 각각의 task에서 M/5%씩 추출
        for _ in range(len(task_train)-1):
            sub_task_trainN.append(taskN.sample(n=taskN_size//(len(task_train)-1)))

        task_trainN.append(sub_task_trainN)

    task_mixed_train = []
    for idx, task in enumerate(task_trainM):
        other_task_samples = pd.DataFrame() 
        for j in range(len(task_trainM)):
            if idx != j: 
                other_task_samples = pd.concat([other_task_samples, task_trainN[j].pop(0)])
        mixed_task = pd.concat([task, other_task_samples])
        task_mixed_train.append(mixed_task)
    labeled_idx = []
    total_data = []
    for idx, data_per_task in enumerate(task_mixed_train):
        major_class = major_classes[idx]
        for mc in major_class:
            index = list(np.where(data_per_task.klass == mc)[0])
            # print(len(index))
            index = np.random.choice(index, label_per_class, False) # choice 함수 : index에서 label_per_class만큼 choose하고 replace = False
            labeled_data = []
            # labeled data로 일부 추출
            for i in index:
                labeled_data.append(task_train[idx].iloc[i, :])
            # unlabeled data는 전체 data
            unlabeled_data = data_per_task
            total_data.append((pd.DataFrame(labeled_data), unlabeled_data, task_test[idx]))
    return total_data

In [53]:
total_data = make_blurry(1.0, 4000, 10)
print(len(total_data))
print(len(total_data[0]))
train_loader, test_loader = get_dataloader(128, 0, total_data[0][0], total_data[0][2])

10
3


NameError: name 'self' is not defined

In [None]:
origin_name = "cifar10_train" # Need to change the name of your dataset.
root = Path('../my_collections/blurry')
root.mkdir(exist_ok=True)

for idx, task in enumerate(task_mixed_train):
    file_name = origin_name
    prefix = {'_blurry':f'{int(round((1.0 - major_ratio)*100))}', 
              '_rand':rnd_seed, 
              '_cls':num_cls_per_task,
              '_task':idx
             }
    
    for name, value in prefix.items():
        file_name += name + str(value)

    file_path = (root/file_name).with_suffix('.json')
    task.to_json(file_path, orient='records')
    print(f"{file_path}")

## Test 

In [101]:
origin_name = "cifar10_test" # Need to change the name of your dataset.
task_test = [test[test.klass.isin(tc)] for tc in task_class]

root = Path('../collections/test')
root.mkdir(exist_ok=True)

for idx, task in enumerate(task_test):
    file_name = origin_name
    prefix = {'_rand':rnd_seed, 
              '_cls':num_cls_per_task,
              '_task':idx
             }
    for name, value in prefix.items():
        file_name += name + str(value)
        
    file_path = (root/file_name).with_suffix('.json')
    task.to_json(file_path, orient='records')
    print(f"{file_path}")

../collections/test/cifar10_test_rand3_cls2_task0.json
../collections/test/cifar10_test_rand3_cls2_task1.json
../collections/test/cifar10_test_rand3_cls2_task2.json
../collections/test/cifar10_test_rand3_cls2_task3.json
../collections/test/cifar10_test_rand3_cls2_task4.json
