# Compute Dataset Stats

**Author:** Prisca Dotti

**Last modified:** 30.03.2024

This notebook is used for the following purposes:
- Compute the number of individual instances per type
- ...


In [1]:
# autoreload is used to reload modules automatically before entering the
# execution of code typed at the IPython prompt.
%load_ext autoreload
%autoreload 2
# To import modules from parent directory in Jupyter Notebook
import sys

sys.path.append("..")

In [2]:
import logging
import os
import numpy as np

from config import TrainingConfig, config
from data.data_processing_tools import masks_to_instances_dict, process_raw_predictions
from utils.in_out_tools import write_videos_on_disk
from utils.training_inference_tools import do_inference
from utils.training_script_utils import get_sample_ids, init_dataset, init_model

# Logger named after the current module (shows up as `__main__` in the
# notebook's log output).
logger = logging.getLogger(__name__)


# Set the verbosity level on the imported `config` object.
config.verbosity = 3  # To get debug messages

In [3]:
############################ Get default parameters ############################

# Sample IDs are zero-padded, two-character strings (e.g. 5 -> "05").

# IDs listed as test samples.
test_ids = [f"{movie_nb:02d}" for movie_nb in (5, 10, 15, 20, 25, 32, 34, 40, 45)]

# IDs listed as training samples.
train_ids = [
    f"{movie_nb:02d}"
    for movie_nb in (
        1, 2, 3, 4, 6, 7, 8, 9,
        11, 12, 13, 14, 16, 17, 18, 19,
        21, 22, 23, 24, 27, 28, 29, 30,
        33, 35, 36, 38, 39,
        41, 42, 43, 44, 46,
    )
]

# Full list of samples: test IDs first, followed by the training IDs.
sample_ids = [*test_ids, *train_ids]

# Initialize general parameters with default values
# (`TrainingConfig` is called without arguments here).
params = TrainingConfig()



In [4]:
############################## Configure dataset ###############################

logger.info(f"Loading samples {sample_ids}.")
logger.info(f"Using {params.dataset_dir} as dataset root path.")

# Collect the dataset options in one place, then create the dataset from them.
dataset_options = dict(
    params=params,
    sample_ids=sample_ids,
    apply_data_augmentation=False,
    print_dataset_info=True,
    load_instances=True,
)
dataset = init_dataset(**dataset_options)

[11:27:10] [  INFO  ] [  __main__  ] < 3  > -- Loading samples ['05', '10', '15', '20', '25', '32', '34', '40', '45', '01', '02', '03', '04', '06', '07', '08', '09', '11', '12', '13', '14', '16', '17', '18', '19', '21', '22', '23', '24', '27', '28', '29', '30', '33', '35', '36', '38', '39', '41', '42', '43', '44', '46'].
[11:27:10] [  INFO  ] [  __main__  ] < 4  > -- Using C:\Users\dotti\Code\sparks_project\data\sparks_dataset as dataset root path.


In [None]:
# Fetch the instance data and the label data held by the dataset.
# NOTE(review): both objects are indexed with `[0]` and iterated with
# `.values()` in later cells, so they appear to be dicts with integer keys —
# confirm against `get_instances` / `get_labels`.
instances = dataset.get_instances()
labels = dataset.get_labels()

In [None]:
# Inspect the entry stored under key/index 0: array shapes and the distinct
# values found in the instance and label arrays.
first_instances = instances[0]
first_labels = labels[0]
(
    first_instances.shape,
    np.unique(first_instances),
    first_labels.shape,
    np.unique(first_labels),
)

((500, 64, 512),
 array([  0,   5,   6,   7,   8,   9,  10,  11,  12,  13,  14,  15,  16,
         17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,  28,  29,
         30,  31,  32,  33,  34,  35,  36,  37,  38,  39,  40,  41,  42,
         43,  44,  45,  46,  47,  48,  49,  50,  51,  52,  53,  54,  55,
         56,  57,  58,  59,  60,  61,  62,  63,  64,  65,  66,  67,  68,
         69,  70,  71,  72,  73,  74,  75,  76,  77,  78,  79,  80,  81,
         82,  83,  84,  85,  86,  87,  88,  89,  90,  91,  92,  93,  94,
         95,  96,  97,  98,  99, 100, 101, 102, 103, 104, 105, 106, 107,
        108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120,
        121, 122], dtype=int8),
 (500, 64, 512),
 array([0, 1, 3, 4], dtype=int8))

In [None]:
# Compute the number of instances per class per movie
# NOTE(review): this dictionary is only initialised here (one empty dict per
# movie ID); the loop below does not fill it yet.
n_instances_per_movie = {movie_id: {} for movie_id in sample_ids}

# NOTE(review): pairing by position assumes `instances` and `labels` iterate in
# the same order as `sample_ids` — confirm.
for movie_id, instances_mask, labels_mask in zip(
    sample_ids, instances.values(), labels.values()
):
    # Convert the masks once and reuse the result below, instead of calling
    # `masks_to_instances_dict` a second time on the same inputs.
    movie_instances_dict = masks_to_instances_dict(instances_mask, labels_mask)

    print(f"Movie {movie_id}:")
    print(movie_instances_dict["sparks"].shape)
    # Only the first movie is inspected for now; drop this `break` to process
    # every movie.
    break

Movie 05:
(500, 64, 512)
