# Creating a new dataset
In this notebook we will have a look at creating a new dataset from a set of records. First we will take a look at the results of a training run, then make a selection based on those results, and finally train on the new dataset.

In [1]:
# Imports
import panel as pn

from icevision.all import *
import icedata
from icevision_dashboards.dashboards import *
from icevision_dashboards.data import *

In [2]:
# Load the Panel extension so the dashboards below can render inside the notebook
pn.extension()

# Look at training results


In [3]:
# Load the saved results dataset that the overview dashboard below visualizes
results_file = "test_data/pets_valid.dat"
trainings_results = ObjectDetectionResultsDataset.load(results_file)

If we take a look at the data below, we can see that the model works better on images with only one annotation. Furthermore, the score distribution shows that the model is much more confident in some classes than in others. (The reason here is that the training was stopped before it could converge.) Let's say we now want to create a subset and train a model on it, where the number of annotations equals 1 and only the following classes are used: Pug, Shiba Inu, Great Pyrenees and Sphynx. We can use the `ObjectDetectionDatasetGenerator` dashboard to do this.

In [4]:
# Build the results overview dashboard from the loaded results and display its loss tab
trainings_results_overview = ObjectDetectionResultOverview(trainings_results)
trainings_results_overview.show_loss_tab()

# Create a subdataset

In [5]:
# Load the Pets data location and the matching class map through icedata
path = icedata.pets.load_data()
class_map = icedata.pets.class_map()

# Parse all records with the parser that icedata provides for this dataset.
# The splitter is given a single probability of 1, and the first returned
# split is the one used as `records` below.
parser = icedata.pets.parser(data_dir=path, class_map=class_map)
single_split = RandomSplitter([1])
record_splits = parser.parse(single_split)
records = record_splits[0]

  0%|          | 0/3686 [00:00<?, ?it/s]

[1m[1mINFO    [0m[1m[0m - [1m[34m[1mAutofixing records[0m[1m[34m[0m[1m[0m | [36micevision.parsers.parser[0m:[36mparse[0m:[36m126[0m


In [6]:
# create a dataset that can be consumed by a dashboard
dash_ds = BboxRecordDataset(records, class_map=class_map)

First select the ranges with the range slider, then select the classes you want.

In [18]:
# Build the interactive dataset generator on top of the dashboard dataset and display it
dash_generator = ObjectDetectionDatasetGenerator(
    dash_ds,
    width=1000,
    height=700,
)
dash_generator.show()

In [None]:
# load the created dataset
# NOTE(review): this cell relies on "datasets/dataset.json" already existing on disk;
# presumably it is the selection exported interactively from the generator dashboard
# above -- confirm the export path, otherwise a fresh "Run All" will fail here.
new_dataset = BboxRecordDataset("datasets/dataset.json")

In [None]:
# Quick visual overview of the newly created dataset
dataset_overview = ObjectDetectionDatasetOverview(new_dataset, height=700, width=1000)
dataset_overview.show()

In [None]:
# split the records into a training and a validation set
# (presumably 0.8 is the fraction of records that goes into the training split -- TODO confirm)
train_records, valid_records = new_dataset.split_in_train_and_val(0.8)

## Train a model on the new Dataset
Now that we have the new dataset we can train a model on the data.

In [None]:
# Define the transforms. Training uses the augmentation transforms, validation
# only resizes and pads; both pipelines finish with the same normalization step.
image_size, presize = 384, 512

train_tfms = tfms.A.Adapter(
    [
        *tfms.A.aug_tfms(size=image_size, presize=presize),
        tfms.A.Normalize(),
    ]
)
valid_tfms = tfms.A.Adapter(
    [
        *tfms.A.resize_and_pad(image_size),
        tfms.A.Normalize(),
    ]
)

In [None]:
# Wrap the record splits in datasets that apply the matching transforms
train_ds = Dataset(train_records, train_tfms)
valid_ds = Dataset(valid_records, valid_tfms)

# Both dataloaders share batch size and worker count; only the training loader shuffles
loader_kwargs = dict(batch_size=10, num_workers=4)
train_dl = faster_rcnn.train_dl(train_ds, shuffle=True, **loader_kwargs)
valid_dl = faster_rcnn.valid_dl(valid_ds, shuffle=False, **loader_kwargs)

In [None]:
# Create a Faster R-CNN model sized to the class map of the new dataset
num_classes = len(new_dataset.class_map)
model = faster_rcnn.model(num_classes=num_classes)

# Wrap the model and both dataloaders in a fastai learner
learn = faster_rcnn.fastai.learner(model=model, dls=[train_dl, valid_dl])

# Train for some epochs.
# NOTE(review): fastai documents the learning-rate argument of `fine_tune` as
# `base_lr`; confirm that the `lr=` keyword is accepted by the installed version.
learn.fine_tune(5, lr=1e-4)

In [None]:
# Plot the top-loss validation sample (sorted by total loss) and keep the returned
# samples, predictions and loss statistics for the results dashboard below
samples_plus_losses, preds, losses_stats = faster_rcnn.interp.plot_top_losses(
    model=model,
    dataset=valid_ds,
    sort_by="loss_total",
    n_samples=1,
)

In [None]:
# Create a dataset that can be consumed by the analysis dashboard.
# The model was built from `new_dataset.class_map`, so the same class map is used
# here to keep label ids and names consistent with the trained model (rather than
# the class map of the original, unfiltered dataset).
valid_result_ds = ObjectDetectionResultsDataset.init_from_preds_and_samples(
    preds,
    samples_plus_losses,
    class_map=new_dataset.class_map,
)

In [None]:
# create the results overview dashboard for the newly trained model's validation results
result_overview_dash = ObjectDetectionResultOverview(valid_result_ds)

In [None]:
# show the loss tab of the results overview dashboard
result_overview_dash.show_loss_tab()

In [None]:
# show the AP tab of the results overview dashboard
# (AP presumably stands for average precision -- TODO confirm)
result_overview_dash.show_ap_tab()