# Automatic clean-up of YFCC100M by Nearest neighbors

Select a subset of YFCC100M images that are close, in feature space, to moments in DiDeMo.

In [1]:
# --- Inputs -----------------------------------------------------------------
# CSV describing the candidate YFCC100M images (read with pandas below).
image_csv = '../data/interim/yfcc100m/001.csv'
# DiDeMo annotation files; only videos listed in these files are used.
didemo_jsons = [
    '../data/raw/train_data.json',
    '../data/raw/val_data.json',
]
# JSON mapping nouns to the DiDeMo videos (and moment times) mentioning them.
nouns2video_json = '../data/interim/didemo/nouns_to_video.json'
# HDF5 feature files for the images and for the videos, respectively.
image_h5 = '../data/interim/yfcc100m/resnet152/320x240_001.h5'
video_h5 = '../data/interim/didemo/resnet152/320x240_max.h5'
# Per-tag image budget; it is split evenly among the moments of a tag.
IMAGES_PER_TAG = 100


import json
import random
import time
import h5py
import numpy as np
import pandas as pd
from tqdm import tqdm

# TODO. unit-norm features
class MomentDescriptor():
    """Callable that turns a (video, time span) pair into one feature vector.

    The HDF5 file is re-opened on every call; only its path is stored.
    """

    def __init__(self, filename):
        """Remember the path of the HDF5 file holding the video features."""
        self.file = filename

    def __call__(self, video, time):
        """Return the mean feature over the rows of a moment.

        Args:
            video: key of the dataset to read inside the HDF5 file.
            time: pair ``(start, end)`` of row indices; ``end`` is inclusive.

        Returns:
            The average over axis 0 of rows ``start`` to ``end`` of the
            dataset stored under ``video``.
        """
        first_row, last_row = time
        with h5py.File(self.file, 'r') as h5_file:
            # `[:]` copies the whole dataset into memory, so the file can be
            # closed before any further processing.
            all_rows = h5_file[video][:]
        # `last_row` is inclusive, hence the +1 on the slice stop.
        # TODO: try max?
        moment_rows = all_rows[first_row:last_row + 1, :]
        return moment_rows.mean(axis=0)
    
def load_image_features(filename):
    """Read every top-level entry of an HDF5 file into a single array.

    Args:
        filename: path of the HDF5 file to read.

    Returns:
        The entries stacked along a new first axis, in the file's iteration
        order, with all singleton dimensions squeezed out.

    Prints the time elapsed since the start of the call twice: once after
    reading and once after stacking (so the second figure is cumulative).
    """
    tic = time.time()
    with h5py.File(filename, 'r') as h5_file:
        # `[:]` materialises each dataset before the file is closed.
        per_entry = [dataset[:] for dataset in h5_file.values()]
    print(f'Loaded image features: {time.time() - tic}')
    stacked = np.squeeze(np.stack(per_entry))
    print(f'Stacking features: {time.time() - tic}')
    return stacked

# Collect the ids of all videos annotated in the train/val files.
didemo_videos = set()
for filename in didemo_jsons:
    with open(filename, 'r') as fid:
        # Every record in the annotation file carries its video id under the
        # 'video' key; feed them straight into the set instead of building a
        # throwaway list of `None`s through a side-effect comprehension.
        didemo_videos.update(moment['video'] for moment in json.load(fid))

# Mapping of NOUNs to DiDeMo videos. The cleaning loop reads its 'nouns',
# 'videos' and 'time' entries.
with open(nouns2video_json, 'r') as fid:
    didemo_nouns2video = json.load(fid)

# Callable returning the descriptor of a (video, time span) moment.
get_descriptor = MomentDescriptor(video_h5)

# Image metadata and image features. The cleaning loop uses the DataFrame
# index to address rows of `image_descriptors`, so both must share row order.
df_yfcc100m = pd.read_csv(image_csv)
image_descriptors = load_image_features(image_h5)

# For every tag, keep the images that are nearest (squared Euclidean distance)
# to the DiDeMo moments associated with that tag. Each tag gets a budget of
# IMAGES_PER_TAG images, split evenly among its moments.
# TODO: generalize it? It assumes a single top-1 tag.
clean_idxs = []
end = time.time()  # start of the cleaning stage, despite the name
debug = []
for tag, df_i in tqdm(df_yfcc100m.groupby('topk_tags')):
    assert tag in didemo_nouns2video['nouns']
    moments_videos = didemo_nouns2video['videos'][tag]
    moments_time = didemo_nouns2video['time'][tag]
    assert len(moments_videos) == len(moments_time)

    # Only moments whose video appears in the train/val annotations count.
    moment_idxs = [j for j, video_j in enumerate(moments_videos)
                   if video_j in didemo_videos]
    if not moment_idxs:
        # No usable moment for this tag: there is nothing to compare the
        # images with, and the budget split below would divide by zero.
        continue
    n_per_j = IMAGES_PER_TAG // len(moment_idxs)

    idxs_i = df_i.index
    # Features of the images carrying this tag. The gather does not depend
    # on the moment, so do it once per tag rather than once per moment.
    # TODO: study purity checking overall NN.
    image_descriptors_i = image_descriptors[idxs_i, :]
    clean_idxs_i = set()
    # TODO: use pdist2.
    for j in moment_idxs:
        moment_j = get_descriptor(moments_videos[j], moments_time[j])
        # TODO: other distances?
        distance = ((image_descriptors_i - moment_j)**2).sum(axis=1)
        idxs_sorted = distance.argsort()
        # TODO: fancy selections based on bipartite graph stuff.
        # Add the n_per_j images most similar to moment_j, skipping images
        # already claimed by a previous moment of the same tag. Fewer are
        # added if the tag runs out of images.
        n_added = 0
        for idx in idxs_i[idxs_sorted]:
            if n_added == n_per_j:
                break
            if idx not in clean_idxs_i:
                clean_idxs_i.add(idx)
                n_added += 1

        # Keep roughly 1% of the moments, with their top-5 image URLs, for
        # visual inspection.
        if random.random() < 0.01:
            debug.append((moments_videos[j], moments_time[j], tag, df_i.loc[idxs_i[idxs_sorted[:5]], 'url']))
    clean_idxs.extend(clean_idxs_i)
print(f'Cleaning: {time.time() - end}')

clean_df = df_yfcc100m.loc[clean_idxs, :]
# NOTE(review): the original halted here with a bare `raise`, which surfaces
# as "RuntimeError: No active exception to reraise". The halt is kept (same
# exception type) but made explicit; the export below stays unreachable until
# this line is removed. Presumably deliberate, confirm before removing.
raise RuntimeError('Stopped before writing ../data/interim/yfcc100m/002.csv')
clean_df.to_csv('../data/interim/yfcc100m/002.csv', index=None)
# damn, there are so many degrees of freedom; I definitely can't reject the
# hypothesis, only conclude that I'm unlucky and not smart

Loaded image features: 175.57180213928223


  0%|          | 0/437 [00:00<?, ?it/s]

Stacking features: 188.22845602035522


100%|██████████| 437/437 [01:44<00:00,  4.16it/s]


Cleaning: 105.02398562431335


# Visualize top-5 neighbors for a given moment

In [None]:
from IPython.display import Image, display, HTML

# Position in `debug` of the sampled moment to inspect.
ind = 7
if ind >= len(debug):
    # Explicit error instead of print + bare `raise`, which would surface as
    # the unrelated "RuntimeError: No active exception to reraise".
    raise IndexError(f'Max ind is: {len(debug) - 1}')

# Each debug entry is (video id, moment time span, tag, top-5 image URLs).
dbg_video, dbg_time, dbg_tag, dbg_urls = debug[ind]
# The time span is scaled by 5 for display.
# NOTE(review): presumably converts 5-second chunks to seconds, confirm.
print(dbg_tag, (dbg_time[0] * 5, dbg_time[1] * 5 + 5))

# The first two '_'-separated fields of the video id form the Flickr path.
video_url = '/'.join(dbg_video.split('_')[:2])
# NOTE(review): the thumbnail <img> source and alt text are hard-coded and
# do not depend on the selected video.
EMBED_VIDEO = (
    '<a data-flickr-embed="true" data-context="true" href="https://'
    f'www.flickr.com/photos/{video_url}/in/photostream/"> <img src='
    '"https://farm4.staticflickr.com/3259/2408598493_655c93f5f9.jpg"'
    ' width="320" height="240" alt="2005_03_13__11_28_05"></a><script'
    ' async src="//embedr.flickr.com/assets/client-code.js" charset='
    '"utf-8"></script>'
)
display(HTML(EMBED_VIDEO))
# Show the nearest-neighbour images of the moment.
for url in dbg_urls:
    display(Image(url))