# DiDeMo data

Collect and count all `NOUN`s from DiDeMo

In [None]:
import json
from collections import Counter
import spacy

# Noun counts will be dumped here
filename = 'data/interim/didemo/nouns_count.csv'
# Make sure you downloaded the DiDeMo data and placed it in
# data/raw/{train,val,test}_data.json

nlp = spacy.load('en_core_web_sm')
subsets = ['train', 'val', 'test']
num_descriptions = 0
didemo_nouns = Counter()
for subset in subsets:
    # The input path gets its own name: reusing `filename` here would
    # clobber the output path needed by the CSV dump at the end of the cell.
    subset_file = f'data/raw/{subset}_data.json'
    with open(subset_file) as f:
        data = json.load(f)
    for d in data:
        num_descriptions += 1
        d_i = d['description']
        doc_i = nlp(d_i)
        # Count every occurrence of a NOUN, keyed by its lemma
        doc_i_nouns = Counter(
            token.lemma_ for token in doc_i if token.pos_ == 'NOUN')
        didemo_nouns.update(doc_i_nouns)
print('Number of descriptions', num_descriptions)
print('Number of NOUNs', len(didemo_nouns))

# Comment out the following lines if you are not interested in
# dumping a CSV with the counts of NOUNs.
# Mode 'x' refuses to overwrite an existing dump.
with open(filename, 'x') as fid:
    fid.write('tag,count\n')
    for noun, count in didemo_nouns.most_common():
        fid.write(f'{noun},{count}\n')

Map NOUN to videos

In [19]:
import json
import random
from collections import Counter
import spacy

# Mapping from NOUNs to videos/times will be dumped here
filename = 'data/interim/didemo/nouns_to_videos.json'
# Make sure you downloaded the DiDeMo data and placed it in
# data/raw/{train,val,test}_data.json

nlp = spacy.load('en_core_web_sm')
subsets = ['train', 'val', 'test']
num_descriptions = 0
didemo_nouns = Counter()
videos = {}
time = {}
for subset in subsets:
    # The input path gets its own name: reusing `filename` here would
    # clobber the output path needed by the JSON dump at the end of the cell.
    subset_file = f'data/raw/{subset}_data.json'
    with open(subset_file) as f:
        data = json.load(f)
    for d in data:
        num_descriptions += 1
        d_i = d['description']
        doc_i = nlp(d_i)
        doc_i_nouns = Counter()
        for token in doc_i:
            if token.pos_ != 'NOUN':
                continue
            lemma = token.lemma_
            doc_i_nouns.update({lemma: 1})
            # Pick one of the annotated times at random for this occurrence
            random.shuffle(d['times'])
            # Work on a copy: scaling the list stored in d['times'] in place
            # would rescale it again whenever it is picked for another NOUN
            # of the same description, and would also alter entries that
            # were already appended to `time`.
            time_i = list(d['times'][0])
            # NOTE(review): presumably converts 5-second chunk indices into
            # a [start, end] interval in seconds -- confirm against DiDeMo
            time_i[0] *= 5
            time_i[1] *= 5
            time_i[1] += 5
            videos.setdefault(lemma, []).append(d['video'])
            time.setdefault(lemma, []).append(time_i)
        didemo_nouns.update(doc_i_nouns)

# Mode 'x' refuses to overwrite an existing dump
with open(filename, 'x') as fid:
    json.dump({'nouns': didemo_nouns, 'videos': videos, 'time': time}, fid)

A more fine-grained version of the previous cell (i.e., most of its code is repeated). It additionally collects:

- NOUNs per subset

- annotation ids per NOUN per subset

In [6]:
%%time
import json
import random
from collections import Counter
import spacy

output_file = 'data/interim/didemo/nouns_to_video.json'
# Expects the DiDeMo annotations in data/raw/{train,val,test}_data.json

nlp = spacy.load('en_core_web_sm')
subsets = ['train', 'val', 'test']
num_descriptions = 0
didemo_nouns = Counter()
# Per-subset bookkeeping: NOUN counts, NOUN vocabulary and annotation ids
noun_counts_per_set = {name: Counter() for name in subsets}
nouns_per_set = {name: set() for name in subsets}
aid_per_set = {name: {} for name in subsets}
videos = {}
time = {}
for subset in subsets:
    filename = f'data/raw/{subset}_data.json'
    with open(filename) as f:
        data = json.load(f)
    subset_vocabulary = nouns_per_set[subset]
    subset_annotations = aid_per_set[subset]
    for d in data:
        num_descriptions += 1
        doc_i = nlp(d['description'])
        doc_i_nouns = Counter()
        for token in doc_i:
            if token.pos_ != 'NOUN':
                continue
            lemma = token.lemma_
            doc_i_nouns[lemma] += 1
            subset_vocabulary.add(lemma)

            # One randomly picked annotated time per NOUN occurrence
            random.shuffle(d['times'])
            time_i = d['times'][0]
            videos.setdefault(lemma, []).append(d['video'])
            time.setdefault(lemma, []).append(time_i)

        # Each annotation id is registered once per distinct NOUN it contains
        for noun in doc_i_nouns:
            subset_annotations.setdefault(noun, []).append(d['annotation_id'])

        noun_counts_per_set[subset].update(doc_i_nouns)
        didemo_nouns.update(doc_i_nouns)

if filename is None:
    raise ValueError('never mind ;)')

# Sets are not JSON serializable, so turn them into lists before dumping
nouns_per_set = {name: list(nouns) for name, nouns in nouns_per_set.items()}
payload = {
    'nouns': didemo_nouns,
    'videos': videos,
    'time': time,
    'nouns_per_subset': nouns_per_set,
    'counts_per_subset': noun_counts_per_set,
    'annotations_per_subset': aid_per_set,
}
with open(output_file, 'w') as fid:
    json.dump(payload, fid)

[Test] `NOUN` extraction

In [4]:
import json
import random
import spacy
from spacy import displacy

# Set SEED to an integer (e.g. 123) to inspect the same description on
# every run; None leaves the random generator unseeded.
SEED = None
if SEED is not None:
    random.seed(SEED)

# Gather the descriptions of all subsets
subsets = ['train', 'val', 'test']
descriptions = []
for i in subsets:
    filename = f'data/raw/{i}_data.json'
    with open(filename) as f:
        data = json.load(f)
    descriptions.extend(d['description'] for d in data)
print('Number of descriptions', len(descriptions))

random.shuffle(descriptions)

nlp = spacy.load('en_core_web_sm')
print()
# Only the first description of the shuffled list is inspected
for d_i in descriptions[:1]:
    doc = nlp(d_i)
    print(d_i)
    displacy.render(doc, style='dep', jupyter=True)
    noun_lemmas = [token.lemma_ for token in doc if token.pos_ == 'NOUN']
    for lemma in noun_lemmas:
        print(lemma)
    # Uncomment to dump every attribute of every token:
    # for token in doc:
    #     print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
    #           token.shape_, token.is_alpha, token.is_stop)

Number of descriptions 41206

man in maroon has left arm in air, two frames


man
maroon
arm
air
frame


# Outside data

## YFCC100M

[project](https://webscope.sandbox.yahoo.com/catalog.php?datatype=i&did=67&guccounter=1)

[another description](http://riemenschneider.hayko.at/vision/dataset/task.php?did=280)

data@adobe: `/mnt/ilcompf2d1/data/yfcc100m/`

Branches

- [Bryan's previous project](http://deep-tagging.cs.washington.edu/)

    [paper about data](https://arxiv.org/pdf/1411.6909.pdf)

@escorciav's attempt to parse the data.

He abandoned this approach after realizing how demanding it was and that a similar endeavor already existed.

In [None]:
import glob
import pandas as pd

# Column positions used to address the tab-separated YFCC100M files, which
# are read below with header=None (i.e. columns are integer positions).
USER_TAGS = 8
PHOTO_OR_VIDEO = 22

for i in glob.glob('/mnt/ilcompf2d1/data/yfcc100m/yfcc100m_dataset-*'):
    # Parsing was postponed: this raise aborts on the first matched file, so
    # none of the statements below it in the loop body ever run.
    raise NotImplementedError('postponed')
    df = pd.read_csv(i, delimiter='\t', header=None)
    # Select rows whose PHOTO_OR_VIDEO column equals 0 and that have user tags
    # NOTE(review): presumably 0 marks photos -- confirm against dataset docs
    idx = ((df.loc[:, PHOTO_OR_VIDEO] == 0) &
           (pd.notna(df.loc[:, USER_TAGS])))
    # TODO copy column of interest and append it to list
    # df.loc[idx, USER_TAGS]
# TODO concat all dataFrames

# TODO for each row
# parse tags with spacy to remove plurals and focus on nouns

[Website](http://deep-tagging.cs.washington.edu/imagenet_correspondence.html) from Bryan's previous work, with data that is easy to scrape

In [3]:
from bs4 import BeautifulSoup

# Destination CSV for the tag frequencies
filename = 'data/interim/yfcc100m/tag_frequency.csv'
# Local copy of the website mentioned above.
# Download it and place it here:
html_file = 'data/raw/yfcc100m/tags.html'

with open(html_file) as fid:
    page = fid.read()
soup = BeautifulSoup(page, 'html.parser')
content = soup.find('div', attrs={'id': 'content'})
table = content.find('tbody').find_all('tr')

# Comment out this block if you are not interested in dumping a CSV with
# the tag counts. Mode 'x' refuses to overwrite an existing dump.
with open(filename, 'x') as fid:
    fid.write('tag,count\n')
    # One line per table row: the <th> text is the tag, the <svg> text its
    # frequency
    fid.writelines(
        f"{row.find('th').text},{row.find('svg').text}\n" for row in table)

Merging DiDeMo and YFCC100M tags

In [6]:
import pandas as pd

tags, rows = {}, []
found_tags = 0
file_ref = 'data/interim/didemo/nouns_count.csv'
filename = 'data/interim/yfcc100m/tag_frequency.csv'
newfile = 'data/interim/didemo/nouns_yfcc100m.csv'

# DiDeMo NOUNs: one row per line of the reference CSV
with open(file_ref) as fid:
    next(fid, None)  # skip the header line
    for line in fid:
        tag, count = line.strip().split(',')
        tags[tag] = None
        rows.append({'tag': tag, 'instances': count, 'dataset': 'DiDeMo'})

# YFCC100M tags: only those that also appear among the DiDeMo NOUNs
with open(filename) as fid:
    next(fid, None)  # skip the header line
    for line in fid:
        tag, count = line.strip().split(',')
        if tag not in tags:
            continue
        found_tags += 1
        rows.append({'tag': tag, 'instances': count, 'dataset': 'YFCC100M'})
print(f'Tags found: {found_tags}')

df = pd.DataFrame(rows)
#df.to_csv(newfile, index=None)

Tags found: 951


{'url': 'altair-data-41217a5eb806a0e8ca497668ef5f1003.csv',
 'format': {'type': 'csv'}}

Export data to layered plot with two axes

In [1]:
import numpy as np
import pandas as pd

tags, rows = {}, []
found_tags = 0
file_ref = 'data/interim/didemo/nouns_count.csv'
filename = 'data/interim/yfcc100m/tag_frequency.csv'
newfile = 'data/interim/didemo/nouns_didemo_vs_yfcc100m.csv'

# DiDeMo NOUNs: one row per tag; `tags` remembers the position of each tag
# inside `rows` so the YFCC100M count can be attached to it afterwards
with open(file_ref, 'r') as fid:
    next(fid, None)  # skip the header line
    for line in fid:
        tag, count = line.strip().split(',')
        tags[tag] = len(rows)
        rows.append({'tag': tag, 'instances_didemo': int(count)})

# YFCC100M counts for the tags that also appear in DiDeMo
with open(filename, 'r') as fid:
    next(fid, None)  # skip the header line
    for line in fid:
        tag, count = line.strip().split(',')
        if tag not in tags:
            continue
        found_tags += 1
        rows[tags[tag]]['instances_yfcc100m'] = int(count)
print(f'Tags found: {found_tags}')

df = pd.DataFrame(rows)
#df.to_csv(newfile, index=None)

Tags found: 951


Plot YFCC100M vs DiDeMo

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

# Sanity check: every tag found in YFCC100M must have more instances there
# than in DiDeMo.
ind = (pd.notna(df['instances_yfcc100m']) &
       (df.loc[:, 'instances_yfcc100m'] > df.loc[:, 'instances_didemo']))
# Series.nonzero() was removed in pandas 1.0; np.flatnonzero returns the
# same positional indices of the True entries.
ind = np.flatnonzero(ind)
assert len(ind) == pd.notna(df['instances_yfcc100m']).sum()

# Layered bar plot: DiDeMo counts on the left axis, YFCC100M counts on a
# secondary (right) axis.
fig, ax = plt.subplots(figsize=(40, 10))
df.instances_didemo.plot(kind='bar', ax=ax, color='#a6cee3')
ax2 = df.instances_yfcc100m.plot(kind='bar', ax=ax, color='#fb9a99', secondary_y=True)
# Label only 100 evenly spaced tags, otherwise the x axis is unreadable
idx = np.linspace(0, len(df) - 1, 100).astype(int)
ax.set_xticks(idx)
_ = ax.set_xticklabels(df.loc[idx, 'tag'], rotation=90, size=12)
ax.tick_params(labelsize=12)
ax.set_ylabel('Instances DiDeMo', size=12)
ax2.tick_params(labelsize=12)
# ax2.set_ylabel('Instances YFCC100M', size=12)
#plt.savefig("data/interim/didemo/nouns_didemo_vs_yfcc100m.svg")

In [3]:
# For several cut-offs over the NOUNs ranked by DiDeMo frequency, report how
# many of the NOUNs at or below the cut-off value are also YFCC100M tags.
df = df.sort_values('instances_didemo', ascending=False)
percentile = [50, 25, 10, 5, 3, 1, 1-0.99]
lind_in_yfc100m = pd.notna(df.loc[:, 'instances_yfcc100m'])
num_nouns = len(df)
num_in_yfcc100m = lind_in_yfc100m.sum()
for i in percentile:
    # Rank (position in the sorted frame) that corresponds to this cut-off
    ind = int(num_nouns * i / 100)
    # `ind` is a position, not an index label: sort_values keeps the original
    # labels, so .loc would only return the right row when the frame happened
    # to be sorted already. .iloc is correct regardless of the initial order.
    percentile_value = df['instances_didemo'].iloc[ind]
    lind_below_pctile = df.loc[:, 'instances_didemo'] <= percentile_value
    tags_below_pctile = (lind_below_pctile & lind_in_yfc100m).sum()
    # Printed as: count [% of all NOUNs][% of the NOUNs found in YFCC100M]
    print(f'NOUNs with <= to {percentile_value} instances get {tags_below_pctile} [{100 * tags_below_pctile / num_nouns:.2f}][{100 * tags_below_pctile / num_in_yfcc100m:.2f}]')

NOUNs with <= to 2 instances get 238 [6.17][25.03]
NOUNs with <= to 8 instances get 481 [12.46][50.58]
NOUNs with <= to 31 instances get 707 [18.32][74.34]
NOUNs with <= to 79 instances get 814 [21.09][85.59]
NOUNs with <= to 134 instances get 866 [22.44][91.06]
NOUNs with <= to 490 instances get 923 [23.91][97.06]
NOUNs with <= to 6313 instances get 951 [24.64][100.00]


Generate word cloud

In [None]:
# Print each tag as many times as its (rescaled) YFCC100M count so that the
# output can be pasted into a word-cloud generator.
df['diff'] = df['instances_yfcc100m'] - df['instances_didemo']
# Last 100 rows of the frame among the NOUNs that are also YFCC100M tags.
# .copy() makes df2 an independent frame, so the rescaling below neither
# triggers SettingWithCopyWarning nor risks writing back into df.
df2 = df.loc[lind_in_yfc100m, :].tail(n=100).copy()
# Rescale every count to a 0-100 range relative to the largest YFCC100M
# count. The previous expression took .max() of the numerator as well, which
# assigned the same scalar to every row and made all tags equally frequent.
# NOTE(review): normalising by the global maximum is kept from the original
# denominator -- confirm that this is the intended reference.
max_yfcc100m = df.loc[:, 'instances_yfcc100m'].max()
df2.loc[:, 'instances_yfcc100m'] = 100 * df2.loc[:, 'instances_yfcc100m'] / max_yfcc100m
# Tags whose rescaled weight is below 1 are not printed at all
ind = df2.loc[:, 'instances_yfcc100m'] < 1
for _, row in df2.iterrows():
    for _ in range(int(row['instances_yfcc100m'])):
        print(row['tag'], end=' ')

## Open images

[project](https://storage.googleapis.com/openimages/web/index.html)

[details](https://storage.googleapis.com/openimages/web/download.html) about annotations 

data at adobe

all images (v1): `/mnt/ilcompf5d0/data/google_openimages`

boxes: TBD

image-level labels: `/mnt/ilcompf9d1/user/escorcia/image_level`

In [31]:
import pandas as pd

# Image-level label files of Open Images
file_trainable = '/mnt/ilcompf9d1/user/escorcia/openimages/image_level/classes-trainable.txt'
file_description = '/mnt/ilcompf9d1/user/escorcia/openimages/image_level/class-descriptions.csv'

# Neither file is read with a header row, so column names are supplied via
# `names`. read_csv has no `columns` keyword and raises TypeError if given one.
trainable_df = pd.read_csv(file_trainable, header=None, names=['key'])
# Dict keyed by class key, used for membership lookups
trainable = dict.fromkeys(trainable_df['key'].tolist())
description_df = pd.read_csv(file_description, header=None, names=['key', 'description'])
# Number of trainable classes (displayed as the cell output)
len(trainable_df)
# NOTE(review): the commented-out block below is a leftover template copied
# from the YFCC100M scraping cell; `filename` and `table` are not defined by
# this cell.
# Comment the last line if you are not interested in
# dumping CSV with counts of tags
# with open(filename, 'w') as fid:
#     fid.write('tag,count\n')
#     for row in table:
#         tag = row.find('th').text
#         frequency = row.find('svg').text
#         fid.write(f'{tag},{frequency}\n')

7186

## HICO

## COCO

[project](http://cocodataset.org/#captions-2015)

## Visual genome

[project](http://visualgenome.org/)

## Flickr30k

[project](http://web.engr.illinois.edu/~bplumme2/Flickr30kEntities/)

[website](http://web.engr.illinois.edu/~bplumme2/Flickr30kEntities/phraseList.html) to scrape for tags