# Automatic clean-up of YFCC100M by Nearest neighbors

Select a subset of images from YFCC100M that look similar to moments in DIDEMO in feature space.

## S1. Neighbors among images picked for a given tag

In [None]:
image_csv = '../data/interim/yfcc100m/001.csv'
didemo_jsons = ['../data/raw/train_data.json',
                '../data/raw/val_data.json']
nouns2video_json = '../data/interim/didemo/nouns_to_video.json'
image_h5 = '../data/interim/yfcc100m/resnet152/320x240_001.h5'
video_h5 = '../data/interim/didemo/resnet152/320x240_max.h5'
IMAGES_PER_TAG = 100


import json
import random
import time
import h5py
import numpy as np
import pandas as pd
from tqdm import tqdm

# TODO. unit-norm features
class MomentDescriptor():
    def __init__(self, filename):
        self.file = filename
        
    def __call__(self, video, time):
        start, end = time
        end += 1
        with h5py.File(self.file, 'r') as fid:
            feature = fid[video][:]
            # TODO: try max?
            descriptor = feature[start:end, :].mean(axis=0)
        return descriptor
    
def load_image_features(filename):
    feat_db_list = []
    end = time.time()
    with h5py.File(filename, 'r') as fid:
        for _, v in fid.items():
            feat_db_list.append(v[:])
    print(f'Loaded image features: {time.time() - end}')
    feat_db = np.stack(feat_db_list).squeeze()
    print(f'Stacking features: {time.time() - end}')
    return feat_db

# get videos in train-val
didemo_videos = set()
for filename in didemo_jsons:
    with open(filename, 'r') as fid:
        status = [didemo_videos.add(moment['video'])
                  for moment in json.load(fid)]

# mapping of NOUNs to didemo videos
with open(nouns2video_json, 'r') as fid:
    didemo_nouns2video = json.load(fid)
        
get_descriptor = MomentDescriptor(video_h5)

df_yfcc100m = pd.read_csv(image_csv)
image_descriptors = load_image_features(image_h5)

# TODO: generalize it?. It assumes a single top-1 tag
clean_idxs = []
end = time.time()
debug = []
for tag, df_i in tqdm(df_yfcc100m.groupby('topk_tags')):
    assert tag in didemo_nouns2video['nouns']
    moments_videos = didemo_nouns2video['videos'][tag]
    moments_time = didemo_nouns2video['time'][tag]
    assert len(moments_videos) == len(moments_time)

    moment_idxs = []
    for j, video_j in enumerate(moments_videos):
        if video_j not in didemo_videos:
            continue
        moment_idxs.append(j)
    n_per_j = IMAGES_PER_TAG // len(moment_idxs)
    
    idxs_i = df_i.index
    clean_idxs_i = set()
    # TODO. use pdist2.
    for j in moment_idxs:
        moment_j = get_descriptor(moments_videos[j], moments_time[j])
        # TODO. study purite checking overall NN.
        image_descriptors_i = image_descriptors[idxs_i, :]
        # TODO. other distances?
        distance = ((image_descriptors_i - moment_j)**2).sum(axis=1)
        idxs_sorted = distance.argsort()
        # TODO. fancy selections based on bipartite graph stuff
        # I was tired so...
        # Add n_per_j most similar images to moment_j taking into account
        # that other moments could have added the same image
        n_before = len(clean_idxs_i)
        idx_start, idx_end = 0, n_per_j
        while True:
            blah = idxs_i[idxs_sorted[idx_start:idx_end]]
            clean_idxs_i.update(blah)
            idx_start += len(blah)
            items_added = len(clean_idxs_i) - n_before
            if items_added == n_per_j:
                break
            else:
                idx_end += n_per_j - items_added
            idx_end = min(idx_end, len(idxs_sorted))
            if idx_start > len(idxs_i) or idx_end - idx_start <= 0:
                break
        
        if random.random() < 0.01:
            debug.append((moments_videos[j], moments_time[j], tag, df_i.loc[idxs_i[idxs_sorted[:5]], 'url']))
    clean_idxs.extend(clean_idxs_i)
print(f'Cleaning: {time.time() - end}')

clean_df = df_yfcc100m.loc[clean_idxs, :]
raise
clean_df.to_csv('../data/interim/yfcc100m/002.csv', index=None)
# damm there are so many degrees of freedom, definetily I can't reject the hypothesis
# only conclude that I'm unlucky and not smart

## S2. Neighbors among entire image subset

Find 500 nearest neighbors image over the entire dataset, but only retain those where the NOUNs was tagged to the image. In a nutshell, similar to the above procedure but switching stages.

In [None]:
image_csv = '../data/interim/yfcc100m/001.csv'
didemo_jsons = ['../data/raw/train_data.json',
                '../data/raw/val_data.json']
nouns2video_json = '../data/interim/didemo/nouns_to_video.json'
image_h5 = '../data/interim/yfcc100m/resnet152/320x240_001.h5'
video_h5 = '../data/interim/didemo/resnet152/320x240_max.h5'
IMAGES_PER_TAG = 100
RELAX_FACTOR = 5
MINIMORUM = 75
MODE = 1
OUTPUT_FILE = f'../data/interim/yfcc100m/003-{RELAX_FACTOR}-{MODE}.csv'

import json
import random
import time
import h5py
import numpy as np
import pandas as pd
from scipy.spatial import cKDTree
from tqdm import tqdm

# TODO. unit-norm features
class MomentDescriptor():
    def __init__(self, filename):
        self.file = filename
        
    def __call__(self, video, time):
        start, end = time
        end += 1
        with h5py.File(self.file, 'r') as fid:
            feature = fid[video][:]
            # TODO: try max?
            descriptor = feature[start:end, :].mean(axis=0)
        return descriptor
    
    def get_features(self, videos, times):
        assert len(videos) == len(times)
        descriptors = []
        with h5py.File(self.file, 'r') as fid:
            for i, video_i in enumerate(videos):
                feature_i = fid[video_i][:]
                start, end = times[i]
                end += 1
                descriptors.append(feature_i[start:end, :].mean(axis=0, keepdims=True))
        descriptors = np.concatenate(descriptors, axis=0)
        return descriptors
    
def load_image_features(filename, id_list):
    feat_db_list = []
    end = time.time()
    with h5py.File(filename, 'r') as fid:
        for v in id_list:
            feat_db_list.append(fid[v][:])
    print(f'Loaded image features: {time.time() - end}')
    end = time.time()
    feat_db = np.stack(feat_db_list).squeeze()
    print(f'Stacking features: {time.time() - end}')
    return feat_db

# get videos in train-val
didemo_videos = set()
didemo_moments = {}
for filename in didemo_jsons:
    with open(filename, 'r') as fid:
        for moment in json.load(fid):
            didemo_videos.add(moment['video'])
            moment_id = moment['annotation_id']
            didemo_moments[moment_id] = moment 

# mapping of NOUNs to didemo videos
with open(nouns2video_json, 'r') as fid:
    didemo_nouns2video = json.load(fid)
        
get_descriptor = MomentDescriptor(video_h5)

df_yfcc100m = pd.read_csv(image_csv)
df_yfcc100m.loc[:, 'tags'] = df_yfcc100m.loc[:, 'tags'].apply(lambda x: x + ';')
image_descriptors = load_image_features(image_h5, df_yfcc100m['h5_id'].tolist())

end = time.time()
image_tree = cKDTree(image_descriptors)
print(f'Building tree: {time.time() - end}')
end = time.time()

clean_idxs = set()
debug = []
chekalebn = []
for tag, _ in tqdm(df_yfcc100m.groupby('topk_tags')):
    assert tag in didemo_nouns2video['nouns']
    moments_videos = didemo_nouns2video['videos'][tag]
    moments_time = didemo_nouns2video['time'][tag]
    # DEBUG: get description
    assert len(moments_videos) == len(moments_time)
    
    moment_idxs = [j for j, video_j in enumerate(moments_videos)
                   if video_j in didemo_videos]
    n_per_j = (IMAGES_PER_TAG * RELAX_FACTOR) // len(moment_idxs)

    clean_idxs_i = set()
    for j in moment_idxs:
        moment_j = get_descriptor(moments_videos[j], moments_time[j])
        distance_j, ind_j = image_tree.query(moment_j, k=n_per_j, n_jobs=-1)
        # filter by tag
        if MODE == 0:
            pick_j = df_yfcc100m.loc[ind_j, 'topk_tags'] == tag
        elif MODE == 1:
            pick_j = df_yfcc100m.loc[ind_j, 'tags'].apply(lambda x: tag in x)
        else:
            raise
        clean_idxs_i.update(ind_j[pick_j].tolist())
        
        if random.random() < 0.01 and len(pick_j) > 0:
            debug.append((moments_videos[j],
                          moments_time[j],
                          tag,
                          df_i.loc[ind_j[pick_j[:min(5, len(pick_j))]], 'url'],
                          )
                        )
    if len(clean_idxs_i) >= MINIMORUM:
        clean_idxs_i = list(clean_idxs_i)
        clean_idxs.update(clean_idxs_i[:min(IMAGES_PER_TAG, len(clean_idxs_i))])
    chekalebn.append(len(clean_idxs_i))
    
    # Abort this path because scipy-hungarian was taking a lot of time
#     # Pull descriptors from train/val videos
#     video_descriptors = get_descriptor.get_features(
#         *zip(*[(video_i, moments_time[i])
#                for i, video_i in enumerate(moments_videos)
#                if video_i in didemo_videos
#               ]
#             )
#     )    
    # TODO: switch to cosine distance
#     dist_matrix = cdist(video_descriptors, image_descriptors)
#     extra = ((IMAGES_PER_TAG * RELAX_FACTOR) // len(dist_matrix)) + 1
#     dist_matrix = np.tile(dist_matrix, [extra, 1])
#     aja, cev = linear_sum_assignment(dist_matrix)

clean_df = df_yfcc100m.loc[clean_idxs, :]
with open(OUTPUT_FILE, 'x') as fid:
    clean_df.to_csv(fid, index=None)
with open(OUTPUT_FILE.replace('.csv', '.json'), 'x') as fid:
    json.dump({'len_per_tag': chekalebn,
               'dataset_size': len(clean_idxs),
               'debug_instances': debug,
              },
              fid)
# damm there are so many degrees of freedom, definetily I can't reject the hypothesis
# only conclude that I'm unlucky and not smart

### profiling

man, that thing was taking so much time, so @escorcia spends some time figuring out what was going on. The results are below, it seems that we hit the wall.

gotta the meisters, if that's what we wanna do 😕

<img src="https://static1.squarespace.com/static/5160bb45e4b0e13a258812c8/t/5549542ae4b04cef2f6cd895/1430869036049/?format=750w">

In [1]:
IMAGE_CSV = '../data/interim/yfcc100m/001.csv'
DIDEMO_JSONS = ['../data/raw/train_data.json',
                '../data/raw/val_data.json']
NOUNS2VIDEO_JSON = '../data/interim/didemo/nouns_to_video.json'
image_h5 = '../data/interim/yfcc100m/resnet152/320x240_001.h5'
video_h5 = '../data/interim/didemo/resnet152/320x240_max.h5'
IMAGES_PER_TAG = 100
RELAX_FACTOR = 100
MINIMORUM = 75
MODE = 0
OUTPUT_FILE = f'../data/interim/yfcc100m/003-{RELAX_FACTOR}-{MODE}.csv'

import json
import random
import time
import h5py
import numpy as np
import pandas as pd
from scipy.spatial import cKDTree

class MomentDescriptor():
    def __init__(self, filename):
        self.file = filename
        
    def __call__(self, video, time):
        start, end = time
        end += 1
        with h5py.File(self.file, 'r') as fid:
            feature = fid[video][:]
            # TODO: try max?
            descriptor = feature[start:end, :].mean(axis=0)
        return descriptor
    
    def get_features(self, videos, times):
        assert len(videos) == len(times)
        descriptors = []
        with h5py.File(self.file, 'r') as fid:
            for i, video_i in enumerate(videos):
                feature_i = fid[video_i][:]
                start, end = times[i]
                end += 1
                descriptors.append(feature_i[start:end, :].mean(axis=0, keepdims=True))
        descriptors = np.concatenate(descriptors, axis=0)
        return descriptors
    
def load_image_features(filename, id_list):
    feat_db_list = []
    with h5py.File(filename, 'r') as fid:
        for v in id_list:
            feat_db_list.append(fid[v][:])
    feat_db = np.stack(feat_db_list).squeeze()
    return feat_db

def get_videos_of_interest(files):
    videos = set()
    moments = {}
    for filename in files:
        with open(filename, 'r') as fid:
            for moment in json.load(fid):
                videos.add(moment['video'])
                moment_id = moment['annotation_id']
                moments[moment_id] = moment 
    return videos, moments

def load_didemo_nouns_metadata(filename):
    # mapping of NOUNs to didemo videos
    with open(filename, 'r') as fid:
        return json.load(fid)
    
def get_indices_of_moi(whitelist, video_list):
    # only consider videos in whitelist
    return [j for j, video_j in enumerate(video_list)
            if video_j in whitelist]

In [2]:
def honorable_cev():
    didemo_videos, didemo_moments = get_videos_of_interest(DIDEMO_JSONS)
    didemo_nouns2video = load_didemo_nouns_metadata(NOUNS2VIDEO_JSON)
    get_descriptor = MomentDescriptor(video_h5)

    df_yfcc100m = pd.read_csv(IMAGE_CSV)
    df_yfcc100m.loc[:, 'tags'] = df_yfcc100m.loc[:, 'tags'].apply(lambda x: x + ';')
    image_descriptors = load_image_features(image_h5, df_yfcc100m['h5_id'].tolist())

    image_tree = cKDTree(image_descriptors)

    clean_idxs = set()
    debug = []
    chekalebn = []
    counter = 0
    end = time.time()
    for tag, _ in df_yfcc100m.groupby('topk_tags'):
        assert tag in didemo_nouns2video['nouns']
        moments_videos = didemo_nouns2video['videos'][tag]
        moments_time = didemo_nouns2video['time'][tag]
        assert len(moments_videos) == len(moments_time) 
        moment_idxs = get_indices_of_moi(didemo_videos, moments_videos)
        n_per_j = (IMAGES_PER_TAG * RELAX_FACTOR) // len(moment_idxs)

        clean_idxs_i = set()
        for j in moment_idxs:
            counter += 1

            moment_j = get_descriptor(moments_videos[j], moments_time[j])
            distance_j, ind_j = image_tree.query(moment_j, k=n_per_j, n_jobs=-1)
            # filter by tag
            if MODE == 0:
                pick_j = df_yfcc100m.loc[ind_j, 'topk_tags'] == tag
            elif MODE == 1:
                pick_j = df_yfcc100m.loc[ind_j, 'tags'].apply(lambda x: tag in x)
            else:
                raise
            clean_idxs_i.update(ind_j[pick_j].tolist())

            if random.random() < 0.01 and len(pick_j) > 0:
                debug.append((moments_videos[j],
                              moments_time[j],
                              tag,
                              df_yfcc100m.loc[ind_j[pick_j], 'url'].iloc[:min(5, pick_j.sum())],
                              )
                            )
        if len(clean_idxs_i) >= MINIMORUM:
            clean_idxs_i = list(clean_idxs_i)
            clean_idxs.update(clean_idxs_i[:min(IMAGES_PER_TAG, len(clean_idxs_i))])
        chekalebn.append(len(clean_idxs_i))
        if counter == 100:
            break
        if counter % 10:
            print(counter, time.time() - end)
            end = time.time()

    clean_df = df_yfcc100m.loc[clean_idxs, :]
    with open(OUTPUT_FILE, 'x') as fid:
        clean_df.to_csv(fid, index=None)
    with open(OUTPUT_FILE.replace('.csv', '.json'), 'x') as fid:
        json.dump({'len_per_tag': chekalebn,
                   'dataset_size': len(clean_idxs),
                   'debug_instances': debug,
                  },
                  fid)
    # damm there are so many degrees of freedom, definetily I can't reject the hypothesis
    # only conclude that I'm unlucky and not smart

In [3]:
%load_ext line_profiler

In [4]:
%lprun -s -f honorable_cev honorable_cev()

*** KeyboardInterrupt exception caught in code being profiled.

Timer unit: 1e-06 s

Total time: 2825.31 s
File: <ipython-input-2-1865751d39d6>
Function: honorable_cev at line 1

Line #      Hits         Time  Per Hit   % Time  Line Contents
     1                                           def honorable_cev():
     2         1     381430.0 381430.0      0.0      didemo_videos, didemo_moments = get_videos_of_interest(DIDEMO_JSONS)
     3         1     228510.0 228510.0      0.0      didemo_nouns2video = load_didemo_nouns_metadata(NOUNS2VIDEO_JSON)
     4         1         14.0     14.0      0.0      get_descriptor = MomentDescriptor(video_h5)
     5                                           
     6         1    2425621.0 2425621.0      0.1      df_yfcc100m = pd.read_csv(IMAGE_CSV)
     7         1     249944.0 249944.0      0.0      df_yfcc100m.loc[:, 'tags'] = df_yfcc100m.loc[:, 'tags'].apply(lambda x: x + ';')
     8         1  195611099.0 195611099.0      6.9      image_descriptors = load_image_features(image_h5, df_yfcc100m['h5_id'].tolist())
  

# Visualize top-5 neighbors for a given moment

Run this cell if using S2, given that the initial NN implementation is not approximate and takes so much time.

In [5]:
MINIMORUM = 75
filename = '../data/interim/yfcc100m/003-25-1.json'

import json
with open(filename, 'r') as fid:
    data = json.load(fid)
debug = data['debug_instances']
print('Trainable of NOUNs:',
      len([i for i in data['len_per_tag'] if i >= MINIMORUM]))
print('Dataset size:', data['dataset_size'])

Trainable of NOUNs: 175
Dataset size: 16037


In [None]:
ind = 19
if ind >= len(debug):
    print(f'Max ind is: {len(debug) - 1}')
    raise
from IPython.display import Image, display, HTML
print(debug[ind][2], (debug[ind][1][0] * 5, debug[ind][1][1] * 5 + 5) )
video_url = '/'.join(debug[ind][0].split('_')[:2])
EMBED_VIDEO = (
    '<a data-flickr-embed="true" data-context="true" href="https://'
    f'www.flickr.com/photos/{video_url}/in/photostream/"> <img src='
    '"https://farm4.staticflickr.com/3259/2408598493_655c93f5f9.jpg"'
    ' width="320" height="240" alt="2005_03_13__11_28_05"></a><script'
    ' async src="//embedr.flickr.com/assets/client-code.js" charset='
    '"utf-8"></script>'
)
display(HTML(EMBED_VIDEO))
for i in debug[ind][-1]:
    display(Image(i))