In [5]:
!pip install dioptra

In [7]:
####
#
#  Set env variables
#
####
import os

# Settings exported as environment variables for the rest of the notebook.
# The values are template placeholders: replace them with your own, and avoid
# committing a real API key together with the notebook.
os.environ.update({
    'DIOPTRA_API_KEY': 'api_key',
    'DIOPTRA_UPLOAD_BUCKET': 'your_bucket',
    'DIOPTRA_UPLOAD_PREFIX': 'your_path',
})

In [None]:
####
#
#  Download the class names from HF
#
#  This section runs first because `class_names` is needed by the upload call
#  below and by the learning curve loop later in the notebook.
#
####
import json
from huggingface_hub import hf_hub_download

repo_id = 'huggingface/label-files'
filename = 'ade20k-id2label.json'
label_file_path = hf_hub_download(repo_id=repo_id, filename=filename, repo_type='dataset')
# Read through a context manager so the file handle is closed right away.
with open(label_file_path, 'r') as label_file:
    id2label = json.load(label_file)

# Shift every class id up by one so that id 0 can be assigned to 'ignore'.
id2label = {int(k) + 1: v for k, v in id2label.items()}
label2id = {v: k for k, v in id2label.items()}
label2id['ignore'] = 0
id2label[0] = 'ignore'
num_labels = len(id2label)
# Class name -> class id, ordered by ascending id ('ignore' comes first).
class_names = dict(sorted(label2id.items(), key=lambda item: item[1]))


####
#
#  (optional) Upload the scene_parse_150 open source dataset to the Lake
#  this uploads the images to your s3 bucket, so it takes about 15 min
#
####

from datasets import load_dataset
from dioptra.lake.utils import upload_image_dataset, wait_for_upload

# Set to False to skip the upload (e.g. the dataset is already in the Lake)
# while still defining `class_names` above.
upload_dataset = True

if upload_dataset:
    all_dataset = load_dataset('scene_parse_150')

    # The HF 'validation' split is uploaded under the 'test' data_split tag.
    for split_name, split_data in {'train': all_dataset['train'], 'test': all_dataset['validation']}.items():

        upload_ids = upload_image_dataset(
            dataset=split_data,
            image_field='image',
            dataset_type='SEGMENTATION',
            groundtruth_field='annotation',
            dataset_metadata={
                'tags': {
                    'dataset_id': 'scene_parse_150',
                    'data_split': split_name
                }
            },
            # One tag per datapoint recording its position in the split.
            datapoints_metadata=[{'tags': {'dataset_index': index}} for index in range(len(split_data))],
            class_names=list(class_names.keys())
        )

        # Block until every upload returned above has been waited on.
        for upload_id in upload_ids:
            wait_for_upload(upload_id)

In [None]:
####
#
#  Define the training routine
#
####

def run_training(model, training_dataloader, val_dataloader):
    """Train ``model`` and report its metrics (template placeholder).

    No training happens yet: the three arguments are ignored and a constant
    metrics dictionary is returned. Replace the body with a real routine.

    Args:
        model: the model to train.
        training_dataloader: loader over the training samples.
        val_dataloader: loader over the validation samples.

    Returns:
        dict: a freshly created dictionary holding the key 'test_metrics'.
    """
    # Implement training routine  <--------- Change this
    metrics = dict(test_metrics=0.1)
    return metrics


In [14]:
####
#
#  Create the datasets
#
####

from dioptra.lake.utils import select_datapoints
from dioptra.lake.datasets import Dataset as DioptraDataset

# Filters matching the tag name 'data_split' and the tag value 'test'.
test_split_filters = [
    {'left': 'tags.name', 'op': '=', 'right': 'data_split'},
    {'left': 'tags.value', 'op': '=', 'right': 'test'},
]
test_uuids = select_datapoints(filters=test_split_filters)
test_datapoint_ids = list(test_uuids['id'])

# Test set: add the selected datapoints and commit them.
test_dataset = DioptraDataset()
test_dataset.get_or_create('Scene parse test set')
test_dataset.add_datapoints(test_datapoint_ids)
test_dataset.commit('initial commit')

# Training set: no datapoints are added here; the learning curve loop
# further down adds them step by step.
training_dataset = DioptraDataset()
training_dataset.get_or_create('Scene parse training set')

In [24]:
####
#
#  Run the learning curve loop
#
####
import json
import time

import torch

from dioptra.inference.torch.torch_runner import TorchInferenceRunner
from dioptra.lake.torch.object_store_datasets import ImageDataset
from dioptra.lake.utils import select_groundtruths, join_on_datapoints
from dioptra.miners.coreset_miner import CoresetMiner
from dioptra.miners.entropy_miner import EntropyMiner
from dioptra.miners.random_miner import RandomMiner

## Define arguments

# Sampling budget: number of mining steps and datapoints requested per step.
num_steps = 10  # <--------- Change this
num_data_per_step = 100  # <--------- Change this

# Mining strategy; the loop below handles 'random', 'entropy' and 'coreset'.
strategy = 'random'  # <--------- Change this

# Model description.
model_name = 'my_model_name'  # <--------- Change this
model_type = 'SEGMENTATION'  # <--------- Change this

# Names of the model layers used for logits (entropy) and embeddings (coreset).
logits_layer = 'my_logit_layer' # <--------- Change this
embeddings_layer = 'my_embeddings_layer' # <--------- Change this

def train_transforms(batch):
    """Template transform for training samples.

    Reads the 'image' and 'segmentation_class_mask' entries of ``batch`` and
    returns an ``(input_tensor, input_mask)`` pair. Both elements are ``None``
    until the processing step is filled in.

    Raises:
        KeyError: if ``batch`` lacks one of the two entries.
    """
    image, mask = batch['image'], batch['segmentation_class_mask']
    #process ...              # <--------- Change this
    input_tensor = input_mask = None
    return input_tensor, input_mask

def val_transforms(batch):
    """Template transform for validation and test samples.

    Reads the 'image' and 'segmentation_class_mask' entries of ``batch`` and
    returns an ``(input_tensor, input_mask)`` pair. Both elements are ``None``
    until the processing step is filled in.

    Raises:
        KeyError: if ``batch`` lacks one of the two entries.
    """
    image, mask = batch['image'], batch['segmentation_class_mask']
    #process ...                  # <--------- Change this
    input_tensor = input_mask = None
    return input_tensor, input_mask

def unlabeled_transforms(batch):
    """Template transform for unlabeled samples (no mask is read).

    Reads the 'image' entry of ``batch`` and returns ``input_tensor``, which
    is ``None`` until the processing step is filled in.

    Raises:
        KeyError: if ``batch`` has no 'image' entry.
    """
    image = batch['image']
    #process ...             # <--------- Change this
    input_tensor = None
    return input_tensor


# Download the datapoints of the test dataset and fetch the matching
# segmentation groundtruth.
test_df = test_dataset.download_datapoints()
test_groundtruth = select_groundtruths(
    filters=[{'left': 'datapoint', 'op': 'in', 'right': list(test_df['id'])}],
    fields=['datapoint', 'task_type', 'encoded_segmentation_class_mask', 'class_names']
)

test_df = join_on_datapoints(
    datapoints=test_df,
    groundtruths=test_groundtruth)

# Keep the image dataset under its own name instead of rebinding
# `test_dataset`: that name must keep referring to the Dioptra dataset so this
# cell can be re-run (it calls `test_dataset.download_datapoints()` above).
test_image_dataset = ImageDataset(test_df)
test_image_dataset.transform = val_transforms
test_data_loader = torch.utils.data.DataLoader(
    test_image_dataset, batch_size=4, shuffle=False, num_workers=4)


model = '' # model definition here <--------- Change this


# Fail fast on an unsupported strategy: otherwise no miner would be created and
# the loop would only fail later with a NameError on `my_miner`.
supported_strategies = ('random', 'entropy', 'coreset')
if strategy not in supported_strategies:
    raise ValueError(
        f'Unknown strategy {strategy!r}; expected one of {supported_strategies}')

# Each step mines `num_data_per_step` new datapoints from the train split,
# adds them to the training dataset, trains, and appends the metrics to a
# per-strategy results file.
for step_index in range(num_steps):

    # Candidate pool: filters on the tag name 'data_split' and value 'train' ...
    my_select_filters = [
        {'left': 'tags.name', 'op': '=', 'right': 'data_split'},
        {'left': 'tags.value', 'op': '=', 'right': 'train'}]

    # ... excluding every datapoint already present in the training dataset.
    already_selected_df = training_dataset.download_datapoints()
    if len(already_selected_df) > 0:
        my_select_filters.append(
            {'left': 'datapoints.id', 'op': 'not in', 'right': list(already_selected_df['id'])})

    if strategy == 'random':

        my_miner = RandomMiner(
            display_name=f'Random miners {step_index}',
            size=num_data_per_step,
            select_filters=my_select_filters)

    else:
        # Model-based strategies: create the miner, then run inference on the
        # candidate pool before the miner is started.
        # NOTE(review): the miners are given the model name
        # f'{model_name}_{step_index}'; confirm the inference runner uploads its
        # predictions under that same name.
        if strategy == 'entropy':

            my_logits_layer = logits_layer
            my_embeddings_layers = []

            my_miner = EntropyMiner(
                display_name=f'Entropy miners {step_index}',
                model_name=f'{model_name}_{step_index}',
                size=num_data_per_step,
                select_filters=my_select_filters)

        else:  # strategy == 'coreset' (validated before the loop)
            my_logits_layer = None
            my_embeddings_layers = [embeddings_layer]

            my_miner = CoresetMiner(
                display_name=f'Coreset miners {step_index}',
                model_name=f'{model_name}_{step_index}',
                size=num_data_per_step,
                select_filters=my_select_filters)

        unlabeled_datapoints = select_datapoints(filters=my_select_filters)
        unlabeled_data = ImageDataset(
            dataframe=unlabeled_datapoints)

        unlabeled_data.transform = unlabeled_transforms
        # Inference must iterate over the unlabeled candidates (not the test
        # set); shuffle stays off so batches follow the order of `metadata`.
        unlabeled_data_loader = torch.utils.data.DataLoader(
            unlabeled_data, batch_size=4, shuffle=False, num_workers=4)

        inference_runner = TorchInferenceRunner(
            model=model,
            model_type=model_type,
            logits_layer=my_logits_layer,
            # Per-strategy list: empty for entropy, [embeddings_layer] for coreset.
            embeddings_layers=my_embeddings_layers,
            # Same list-of-names form as passed to upload_image_dataset.
            class_names=list(class_names.keys()),
            metadata=[{'id': my_id} for my_id in list(unlabeled_datapoints['id'])],
            device='cuda'
        )

        inference_runner.run(unlabeled_data_loader)
        inference_runner.wait_for_uploads()

    my_miner.run()

    # NOTE(review): this polls until the status is exactly 'SUCCESS'; if the
    # miner can end in another terminal state this never returns — confirm the
    # possible statuses and add a failure exit.
    while my_miner.get_status() != 'SUCCESS':
        print('waiting for results')
        time.sleep(10)

    training_dataset.add_datapoints(my_miner.get_results())
    training_dataset.commit(f'iteration {step_index}')

    training_datapoints = training_dataset.download_datapoints()
    training_groundtruth = select_groundtruths(
        filters=[{'left': 'datapoint', 'op': 'in', 'right': list(training_datapoints['id'])}],
        fields=['datapoint', 'task_type', 'encoded_segmentation_class_mask', 'class_names']
    )
    training_df = join_on_datapoints(
        datapoints=training_datapoints,
        groundtruths=training_groundtruth)

    # Positional 80/20 split. The validation slice runs to the end of the
    # frame; a `-1` end index would silently drop the last row.
    split_index = int(len(training_df) * 0.8)
    training_data = ImageDataset(
        dataframe=training_df.iloc[:split_index])
    validation_data = ImageDataset(
        dataframe=training_df.iloc[split_index:])

    training_data.transform = train_transforms
    validation_data.transform = val_transforms

    training_data_loader = torch.utils.data.DataLoader(
        training_data, batch_size=4, shuffle=True, num_workers=4)

    validation_data_loader = torch.utils.data.DataLoader(
        validation_data, batch_size=4, shuffle=True, num_workers=4)

    results = run_training(model, training_data_loader, validation_data_loader)
    results['training_size'] = len(training_df)

    # Append one JSON line per step so the learning curve can be rebuilt later.
    with open(f'results_{strategy}', 'a') as file:
        file.write(json.dumps(results) + '\n')