In [None]:
import numpy as np
import os
from pathlib import Path
from PIL import Image
import json
import matplotlib.pyplot as plt
from spyder_kernels.customize.spydercustomize import cell_count
from tqdm import tqdm
from torchvision import transforms as T
import torch
import random
from torch.utils.data import Dataset
from torchvision import datasets, transforms
from torch.utils.data import DataLoader

In [None]:
# ------------  CONFIG  -------------------
# presumably the target image side length -- not used in the cells visible here, TODO confirm
TARGET_SIZE = 224

# Every dataset location below is derived from this single root directory.
base_path = '/scratch/cv-course-group-5/data/dataset_jpg'

# Source images and their annotation file live side by side under `dataset/`.
src_root = Path(base_path, 'dataset')
anno_file = src_root / 'annotations.json'

# Output locations (preprocessed images and the LMDB store).
dst_root = Path(base_path, 'preprocessed_dataset')
lmdb_path = Path(base_path, 'lmdb')

In [None]:
# Parse the annotation file. JSON is UTF-8 by specification, so decode it
# explicitly instead of relying on the platform's locale encoding.
annos_dict = json.loads(anno_file.read_text(encoding='utf-8'))

# Per-image annotation records; a missing 'images' key yields an empty list.
images = annos_dict.get('images', [])

# Preview only: report the record count and show the first few entries
# rather than dumping the entire list into the notebook output.
print(f'{len(images)} annotated images')
images[:5]

In [None]:
# Split definition file; the relative path is resolved against the kernel's
# current working directory.
json_path = Path('train_test_split.json')

# Load the file (decoded explicitly as UTF-8, the JSON standard encoding)
with open(json_path, 'r', encoding='utf-8') as f:
    split_data = json.load(f)

# Only the "train" entry is used here; a missing key yields an empty list.
train_list = split_data.get("train", [])

# Preview only: show the count and the first few entries instead of the full list.
print(f'{len(train_list)} training videos')
train_list[:5]

In [None]:
# Sum the per-image cell counts for every training video.
# Result maps video id -> {'cells': alive + dead, 'dead_cells': ..., 'cells_alive': ...}.

# Collect the training video ids once so each image needs a single set lookup
# instead of a linear scan over train_list. This also avoids calling the
# builtin `next`, which the sampling cell further down rebinds.
train_video_ids = {train_video['id'] for train_video in train_list}

cells_by_video = {}

for image in images:
    video_id = image['video_id']
    if video_id not in train_video_ids:
        # Image belongs to a video outside the training split.
        continue

    # Create the counters the first time a video is seen, then accumulate.
    totals = cells_by_video.setdefault(
        video_id, {'cells': 0, 'dead_cells': 0, 'cells_alive': 0}
    )
    totals['cells'] += image['cells_alive'] + image['dead_cells']
    totals['dead_cells'] += image['dead_cells']
    totals['cells_alive'] += image['cells_alive']

cells_by_video

In [None]:
# Four stacked histograms over the training videos: total cells, dead cells,
# alive cells, and the fraction of dead cells per video.
fig, ax = plt.subplots(4, figsize=(20, 30))

# The three count panels differ only in the key they read and their x label.
count_panels = (
    ('cells', 'cells'),
    ('dead_cells', 'dead cells'),
    ('cells_alive', 'alive cells'),
)
for panel, (count_key, x_label) in zip(ax, count_panels):
    panel.hist([video[count_key] for video in cells_by_video.values()], bins=100)
    panel.set_xlabel(x_label)
    panel.set_ylabel('video count')

# The dead-cell fraction is undefined for a video with no cells at all, so such
# videos are left out instead of raising ZeroDivisionError.
fractions_of_dead_cells = np.array([
    video['dead_cells'] / video['cells']
    for video in cells_by_video.values()
    if video['cells'] > 0
])
ax[3].hist(fractions_of_dead_cells, bins=100)
# Mark mean (red) and median (blue); the legend makes the two lines distinguishable.
ax[3].axvline(np.mean(fractions_of_dead_cells), color='r', label='mean')
ax[3].axvline(np.median(fractions_of_dead_cells), color='b', label='median')
ax[3].legend()
ax[3].set_xlabel('fraction of dead cells')
ax[3].set_ylabel('video count')

# plt.show() renders under the notebook's inline backend; Figure.show() only
# emits a warning there because that backend has no GUI event loop.
plt.show()

In [None]:

# Report the mean and median of the per-video dead-cell fractions.
print(f'avg fraction of dead cells: {np.mean(fractions_of_dead_cells)}')
print(f'median fraction of dead cells: {np.median(fractions_of_dead_cells)}')

In [None]:
# Draw a reproducible random subset of the training videos whose combined cell
# count reaches CELL_THRESHOLD, then write that subset to reduced_videos.json.
CELL_THRESHOLD = 100000
random.seed(42)

# Running total of cells in the selected videos. Note that this rebinds the
# `cell_count` name imported from spyder_kernels in the import cell.
cell_count = 0
reduced_videos = []

# Sample from a copy so train_list stays intact: re-running this cell (or the
# aggregation cell above) then gives the same result every time.
remaining_videos = list(train_list)

# Stop when the threshold is reached or when there is nothing left to draw;
# random.randint(0, -1) on an empty list would raise ValueError.
while cell_count < CELL_THRESHOLD and remaining_videos:
    video = remaining_videos.pop(random.randint(0, len(remaining_videos) - 1))
    video_totals = cells_by_video.get(video['id'])
    if video_totals is None:
        # Training video without any annotated image: it has no cell counts,
        # so it is skipped rather than raising KeyError.
        continue
    reduced_videos.append(video)
    cell_count += video_totals['cells']

if cell_count < CELL_THRESHOLD:
    print(f'warning: only {cell_count} cells available, below CELL_THRESHOLD={CELL_THRESHOLD}')

# The context manager guarantees the file is flushed and closed.
with open('reduced_videos.json', 'w') as reduced_file:
    json.dump(reduced_videos, reduced_file)