In [1]:
import pandas as pd
import json
import random
import os
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import DBSCAN
from sklearn.mixture import BayesianGaussianMixture
from sklearn.preprocessing import StandardScaler
from scipy.spatial.distance import pdist, squareform

from abyss_deep_learning.datasets.translators import CloudFactoryCaptionTranslator

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
def grouper(iterable, max_dist):
    prev = None
    group = []
    for item in iterable:
        if not prev or item[1] - prev <= max_dist:
            group.append(item)
        else:
            yield group
            group = [item]
        prev = item[1]
    if group:
        yield group
# numbers = [123, 124, 128, 160, 167, 213, 215, 230, 245, 255, 257, 400, 401, 402, 430]
# dict(enumerate(grouper(numbers,50), 1))

In [3]:
# img_ids_to_keep = []
# for vid in imgs:
#     clumps = dict(enumerate(grouper(imgs[vid],12),1))
#     for c in clumps:
#         rand_sample = clumps[c][random.randint(0,len(clumps[c])-1)]
#         print(vid, c, rand_sample, clumps[c])
#         img_ids_to_keep.append(rand_sample[0])
# sorted(img_ids_to_keep)

In [4]:
def calc_caption_map(input_json, translator):
    caption_map = []
    for ann_idx, ann in enumerate(input_json['annotations']):
        if 'caption' not in ann or not translator.filter(ann):
            continue
        caption_map.append(translator.translate(ann))
    return {l: k for k, l in enumerate(sorted(list(set([j for i in caption_map for j in i]))))}

def extract_timestamps(input_json, translator, caption_map):
    imgs = {}
    idxs = []
    img_id_map = {img['id']: idx for idx, img in enumerate(input_json['images'])} # map id to idx
    for ann_idx, ann in enumerate(input_json['annotations']):
        if 'caption' not in ann:
            continue
        idxs.append((ann_idx, ann['image_id']))
    for ann_idx, img_id in idxs:
        ann = input_json['annotations'][ann_idx]
        img = input_json['images'][img_id_map[img_id]]
        img_id = img['id']
        file_list = img['file_name'][:-4].split('_')
        file = '_'.join(file_list[:-1])
        frame_num = int(file_list[-1])
        fps = (25 / 6 if 'reduced' in file else 25)
        if file not in imgs:
            imgs[file] = []
        if not translator.filter(ann):
            continue
        for caption in translator.translate(ann):
            imgs[file].append(
                (img_id, frame_num, frame_num / fps, caption_map[caption]))
    return imgs

def unique_instances(input_json, seconds=1):
    imgs = extract_timestamps(input_json)
    img_ids_to_keep = []
    
    for name, vid in imgs.items():
        print(name)
        max_dist = fps * seconds
        clumps = dict(enumerate(grouper(vid, max_dist), 1))
        for c in clumps:
            rand_sample = clumps[c][random.randint(0,len(clumps[c])-1)]
            img_ids_to_keep.append(rand_sample[0])
    return sorted(img_ids_to_keep)


# extract_timestamps(json.load(open(json_path, 'r')), translator, caption_map)

In [13]:
max_time_diff = 10
seconds_per_sample = 10
json_file = json.load(open(json_path, 'r'))
translator = CloudFactoryCaptionTranslator()
caption_map = calc_caption_map(json_file, translator)
caption_map

{'C': 0,
 'ED': 1,
 'F': 2,
 'IP': 3,
 'JD': 4,
 'RI': 5,
 'SJ': 6,
 'SO': 7,
 'U': 8,
 'X': 9,
 'background': 10}

In [None]:

D_list = []
def lograndom(low, high, size=None):
    return 10 ** np.random.uniform(low, high, size)
keep_ids = []
for name, data in extract_timestamps(json_file, translator, caption_map).items():
    data = np.array(data)
    print(name, data.shape)
    plt.figure()
    cls = DBSCAN(eps=max_time_diff, min_samples=1)
    n_samples = data.shape[0]
    n_components = n_samples // 2

    X = data[:, 2:4]
#     plt.scatter(X[:, 0], 1 + X[:, 1], c=X[:, 1])
    
    for label in caption_map.values():
        mask = X[:, 1] == label
        if mask.sum() == 0:
            continue
        X2 = X[mask, ...]
        y2 = cls.fit_predict(X2).ravel()
        mask2 = y2 >= 0
        X2 = X2[mask2, ...]
        y2 = y2[mask2]
        
        keep_ids += [
            int(i)
            for l in np.unique(y2)
            for i in np.random.choice(
                data[mask][mask2][y2 == l, 0],
                size=(int(np.round((X2[y2 == l, 1].max() - X2[y2 == l, 1].min()) / seconds_per_sample)) + 1))
        ]
        plot_samples = np.array([
            int(i)
            for l in np.unique(y2)
            for i in np.random.choice(
                data[mask][mask2][y2 == l, 0],
                size=(int(np.round((X2[y2 == l, 1].max() - X2[y2 == l, 1].min()) / seconds_per_sample)) + 1))
        ])
        plt.scatter(X2[:, 0], label * np.ones_like(y2), c=y2, cmap='Paired')
        plt.plot(plot_samples, 0 * plot_samples, '+k', ms=10, fillstyle='none')
    
        
#         plt.plot(X2_discard[:, 0], label * np.ones_like(X2_discard[:, 0]),  'kx')

    
    break


In [6]:
def thin_json_unique(working_json, seconds, save_here=False, save_to=None):
    
    working = json.load(open(working_json,'r'))
    
    if 'categories' in working:
        new_json = {'info':working['info'],'licenses':working['licenses'],'categories':working['categories'],'captions':[],'annotations':[],'images':[]}
    else:
        new_json = {'info':working['info'],'licenses':working['licenses'],'annotations':[],'images':[]}

    conversion_dict = {}
    for uid in unique_instances(working, seconds=seconds):
#         print(id, len(new_json['images'])+1)
        img = working['images'][uid-1]
        if uid != img['id']:
            print("{} != {}".format(uid,img['id']))
        img['id'] = len(new_json['images'])+1
        new_json['images'].append(img)
        if uid in conversion_dict:
            print('{} already in conversion_dict with val {}, trying to add {}'.format(uid,conversion_dict[uid],img['id']))
        conversion_dict[uid] = img['id']

    if 'captions' in working:  
        for cap in working['captions']:
            if cap['image_id'] in conversion_dict:
                cap['image_id'] = conversion_dict[cap['image_id']]
                new_json['captions'].append(cap)

    for ann in working['annotations']:
        if ann['image_id'] in conversion_dict:
            ann['image_id'] = conversion_dict[ann['image_id']]
            new_json['annotations'].append(ann)

    if save_here is True:
        split = os.path.splitext(working_json)
        save_path = split[0]+"-unique.json"
        with open(save_path,"w") as f:
            json.dump(new_json,f)
    elif save_to is not None:
        with open(save_to,"w") as f:
            json.dump(new_json,f)
            
    return new_json

In [7]:
# fp = '/mnt/ssd1/processed/industry-data/project-max/ml/cloud-factory-data/with-bg/binary-datasets/forwards/IP/earl-originals/ip-bg-forwards-train.json'
# working = json.load(open(fp,'r'))
# for i, img in enumerate(working['images']):
#     print(i,img['id']-1)

In [8]:
json_path = '/data/abyss/projectmax/feature-detection/large-fromCF/alltogether/test.json.2'
with open(json_path, 'r') as file:
    old = json.load(file)
new = thin_json_unique(json_path, seconds=5)
print(len(old['images']), len(new['images']))

TypeError: extract_timestamps() missing 2 required positional arguments: 'translator' and 'caption_map'

In [56]:
thin_json_unique(json_path, seconds=5, save_here=True);