# Compute Dataset Stats

**Author:** Prisca Dotti

**Last modified:** 30.03.2024

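This notebook is used for the following purposes:
- Compute the number of individual instances per event type (sparks, waves, puffs), for each movie and over the whole dataset
- Count how many movies contain each event type
- Compare the number of instances of each event type between the training and test movies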


In [1]:
# autoreload is used to reload modules automatically before entering the
# execution of code typed at the IPython prompt.
# The extension must be loaded before the `%autoreload` magic can be used.
%load_ext autoreload
%autoreload 2
# To import modules from parent directory in Jupyter Notebook
# (".." is resolved relative to the kernel's current working directory, so this
# cell has to run before the project imports in the next cell).
import sys

sys.path.append("..")

In [23]:
import logging
import numpy as np
import pandas as pd

from config import TrainingConfig, config
from data.data_processing_tools import masks_to_instances_dict
from utils.training_script_utils import init_dataset

# Logger for this notebook; when run in a notebook, `__name__` is "__main__",
# which is the logger name shown in the log lines of the cells below.
logger = logging.getLogger(__name__)


# Mutates the shared project configuration object imported from `config`.
config.verbosity = 3  # To get debug messages

In [3]:
############################ Get default parameters ############################

# IDs of the movies that belong to the test split.
test_ids = ["05", "10", "15", "20", "25", "32", "34", "40", "45"]

# IDs of the movies that belong to the training split (one row per decade).
train_ids = [
    "01", "02", "03", "04", "06", "07", "08", "09",
    "11", "12", "13", "14", "16", "17", "18", "19",
    "21", "22", "23", "24", "27", "28", "29",
    "30", "33", "35", "36", "38", "39",
    "41", "42", "43", "44", "46",
]  # fmt: skip

# Every movie analysed in this notebook: test movies first, then training movies.
sample_ids = [*test_ids, *train_ids]

# Initialize general parameters with default values
params = TrainingConfig()



In [5]:
############################## Configure dataset ###############################

logger.info(f"Loading samples {sample_ids}.")
logger.info(f"Using {params.dataset_dir} as dataset root path.")

# Options passed to `init_dataset`: all selected movies, data augmentation
# disabled, dataset summary printed, and instances loaded (they are queried
# from the dataset further down in the notebook).
dataset_options = dict(
    params=params,
    sample_ids=sample_ids,
    apply_data_augmentation=False,
    print_dataset_info=True,
    load_instances=True,
)

# Create dataset
dataset = init_dataset(**dataset_options)

[12:08:32] [  INFO  ] [  __main__  ] < 3  > -- Loading samples ['05', '10', '15', '20', '25', '32', '34', '40', '45', '01', '02', '03', '04', '06', '07', '08', '09', '11', '12', '13', '14', '16', '17', '18', '19', '21', '22', '23', '24', '27', '28', '29', '30', '33', '35', '36', '38', '39', '41', '42', '43', '44', '46'].
[12:08:32] [  INFO  ] [  __main__  ] < 4  > -- Using C:\Users\dotti\Code\sparks_project\data\sparks_dataset as dataset root path.
[12:13:12] [  INFO  ] [utils.training_script_utils] <137 > -- Samples in dataset: 830


In [6]:
# Fetch the instance masks and the class-label masks from the dataset
# (presumably one entry per movie — TODO confirm against the dataset class).
instances, labels = dataset.get_instances(), dataset.get_labels()

In [13]:
# Compute the number of instances per class per movie.
#
# NOTE(review): movies are matched to their masks purely by position, i.e. this
# assumes that `instances` and `labels` iterate in the same order as
# `sample_ids` — confirm against `init_dataset`.
# `zip` would silently drop the surplus entries if the lengths differed, so
# check them explicitly.
assert len(sample_ids) == len(instances) == len(labels), (
    "sample_ids, instances and labels must contain one entry per movie "
    f"(got {len(sample_ids)}, {len(instances)} and {len(labels)})"
)

n_instances_per_movie = {movie_id: {} for movie_id in sample_ids}

for movie_id, instances_mask, labels_mask in zip(
    sample_ids, instances.values(), labels.values()
):
    # Split the movie's masks into one instance mask per event type.
    movie_instances_dict = masks_to_instances_dict(instances_mask, labels_mask)

    # Every distinct non-zero value in a mask is one event instance. Counting
    # the non-zero IDs (rather than `len(unique) - 1`) stays correct when a mask
    # is empty or contains no background pixel.
    # Assumes 0 is the background ID — TODO confirm in `masks_to_instances_dict`.
    n_instances_per_type = {
        event_type: int(np.count_nonzero(np.unique(event_mask)))
        for event_type, event_mask in movie_instances_dict.items()
    }
    n_instances_per_movie[movie_id] = n_instances_per_type

In [21]:
# Convert nested dictionary into a DataFrame for easier analysis
# (one row per movie, one column per event type).
df = pd.DataFrame.from_dict(n_instances_per_movie, orient="index")

# Per-event-type aggregates over all movies.
total_events = df.sum()  # total number of instances
average_events = df.mean()  # average number of instances per movie
max_events = df.idxmax()  # ID of the movie with the most instances
max_counts = df.max()  # number of instances found in that movie

# Gather statistics in a dictionary for easier presentation.
# The entries are generated per event type instead of being written out three
# times, and the maximum is read from `df.max()` rather than through chained
# `df.loc[row][col]` indexing. Key names and key order are unchanged.
event_types = ("sparks", "waves", "puffs")
stats = {
    **{f"total_{event}": total_events[event] for event in event_types},
    **{f"average_{event}": average_events[event] for event in event_types},
    **{
        f"movie_most_{event}": (max_events[event], max_counts[event])
        for event in event_types
    },
    "total_instances": total_events.sum(),  # Total of all instances
}

stats

{'total_sparks': 1380,
 'total_waves': 37,
 'total_puffs': 299,
 'average_sparks': 32.093023255813954,
 'average_waves': 0.8604651162790697,
 'average_puffs': 6.953488372093023,
 'movie_most_sparks': ('23', 188),
 'movie_most_waves': ('38', 8),
 'movie_most_puffs': ('05', 26),
 'total_instances': 1716}

In [25]:
# Flag, for every movie and event type, whether at least one instance is
# present (1 if the count is > 0, else 0). `df` holds counts, so it is first
# compared against 0 and the boolean result is then cast to integers. The
# comparison is vectorized and does not depend on `DataFrame.map`, which only
# exists in recent pandas versions.
movies_with_events_int = (df > 0).astype("int64")

# Now, sum each column to get the number of movies that contain each event type
num_movies_with_sparks = movies_with_events_int["sparks"].sum()
num_movies_with_waves = movies_with_events_int["waves"].sum()
num_movies_with_puffs = movies_with_events_int["puffs"].sum()

# Output the results
print(f"Number of movies with sparks: {num_movies_with_sparks}")
print(f"Number of movies with waves: {num_movies_with_waves}")
print(f"Number of movies with puffs: {num_movies_with_puffs}")

Number of movies with sparks: 40
Number of movies with waves: 16
Number of movies with puffs: 37


In [28]:
# Running totals of each event type, kept separately for the training movies
# and for the test movies.
train_event_counts = dict.fromkeys(("sparks", "waves", "puffs"), 0)
test_event_counts = dict.fromkeys(("sparks", "waves", "puffs"), 0)

# Walk through the per-movie counts and add them to the totals of the split the
# movie belongs to; movies that are in neither list are ignored.
for movie_id, events in n_instances_per_movie.items():
    if movie_id in train_ids:
        split_counts = train_event_counts
    elif movie_id in test_ids:
        split_counts = test_event_counts
    else:
        continue

    for event, count in events.items():
        split_counts[event] += count

train_event_counts, test_event_counts

({'sparks': 1115, 'waves': 30, 'puffs': 225},
 {'sparks': 265, 'waves': 7, 'puffs': 74})

In [32]:
# Now we have the total counts for each event type in the train and test datasets.
#
# `ratios`: number of training instances per test instance for each event type
#   - inf if the event type only occurs in the training movies,
#   - nan if it occurs in neither split (0 / 0 is undefined).
# `percentages`: share (in %) of each event type's instances that belong to the
#   test movies. It is computed directly from the counts, not from `ratios`,
#   and is nan when the event type does not occur at all, so that an absent
#   event type no longer triggers a division by zero.
ratios = {}
percentages = {}
for event_type, n_train in train_event_counts.items():
    n_test = test_event_counts[event_type]
    n_total = n_train + n_test

    if n_test > 0:
        ratios[event_type] = n_train / n_test
    elif n_train > 0:
        ratios[event_type] = float("inf")
    else:
        ratios[event_type] = float("nan")

    percentages[event_type] = (
        100 * n_test / n_total if n_total > 0 else float("nan")
    )

percentages

{'sparks': 19.202898550724637,
 'waves': 18.91891891891892,
 'puffs': 24.74916387959866}