# Map tags to images

In [2]:
%%time
# Load the clean (scraped) tag vocabulary into a set for fast membership tests.
# NOTE(review): another cell of this notebook reads the same file through
# 'data/interim/...' (no leading '../') -- confirm the intended working directory.
import csv
filename = '../data/interim/yfcc100m/tag_frequency.csv'
# newline='' leaves line-ending handling to the csv module, as its
# documentation asks for, and matches how the dataset files are opened below.
with open(filename, 'r', newline='') as fid:
    # Only the 'tag' column is read; repeated tags collapse in the set.
    clean_tags = {row['tag'] for row in csv.DictReader(fid, delimiter=',')}

# Lemmatizer used to fold plural tags onto their singular form (it is called
# with the 'NOUN' part of speech further down).
# NOTE(review): this import path / constructor looks like the spaCy 2.x
# rule-based lemmatizer API -- confirm the installed spaCy version.
from spacy.lemmatizer import Lemmatizer
from spacy.lang.en import LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES
lemmatizer = Lemmatizer(LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES)

# Extract the tag -> image mapping
import glob
import pandas as pd
import sys
# Raise the csv parser's maximum field size to sys.maxsize; per the original
# note, fields in the dataset files are too large for the default limit.
csv.field_size_limit(sys.maxsize)

# Maximum number of clean tags kept per image (the tag loop stops once reached).
TOPK = 5
# Column positions in the tab-separated dataset rows.
# IMAGE_ID, USER_ID and MACHINE_TAGS are not referenced in the visible cells.
IMAGE_ID = 0
USER_ID = 1
# Comma-separated user tags; rows where this field is empty are skipped.
USER_TAGS = 8
MACHINE_TAGS = 9
# Stored as the image identifier in `images`.
IMAGE_URL = 14
# Parsed as int; only rows where it equals 0 are kept (presumably 0 = photo -- TODO confirm).
PHOTO_OR_VIDEO = 22

# Build the tag <-> image index from the dataset files.
#   images[k]      -> URL of the k-th kept row
#   image_tags[k]  -> up to TOPK raw user tags of image k whose noun lemma is in clean_tags
#   tag2image[t]   -> indices into `images` of the rows carrying lemma t
# A row is recorded in `images` even when none of its tags survive the filter
# (its entry in `image_tags` is then an empty list).
images = []
tag2image = {}
image_tags = []
# glob returns matches in arbitrary order; sorting keeps the image indices
# stable from one run to the next.
for dataset_path in sorted(glob.glob('/mnt/ilcompf2d1/data/yfcc100m/yfcc100m_dataset-0*')):
    with open(dataset_path, newline='') as csvfile:
        reader = csv.reader(csvfile, delimiter='\t', quoting=csv.QUOTE_NONE)
        for row in reader:
            # Keep only rows whose media marker is 0 and that carry user tags.
            if int(row[PHOTO_OR_VIDEO]) != 0 or len(row[USER_TAGS]) == 0:
                continue

            image_index = len(images)
            images.append(row[IMAGE_URL])
            kept_tags = []
            image_tags.append(kept_tags)
            # Lemmas already credited to this image: two tags sharing a lemma
            # must not list the image twice under that lemma nor use up two
            # of the TOPK slots.
            seen_lemmas = set()
            for tag in row[USER_TAGS].split(','):
                # The lemmatizer returns a list of candidates; the first one is used.
                lemmatized_tag = lemmatizer(tag, u'NOUN')[0]
                if lemmatized_tag not in clean_tags or lemmatized_tag in seen_lemmas:
                    continue

                seen_lemmas.add(lemmatized_tag)
                tag2image.setdefault(lemmatized_tag, []).append(image_index)
                # NOTE(review): the raw tag is stored here while tag2image is
                # keyed by the lemma -- confirm that consumers expect raw tags.
                kept_tags.append(tag)

                if len(kept_tags) == TOPK:
                    break
print('Num tags:', len(tag2image))
print('Num clean tags:', len(clean_tags))

Num tags: 2839
Num clean tags: 2981
CPU times: user 3min 15s, sys: 5.99 s, total: 3min 21s
Wall time: 3min 39s


In [11]:
import json

# Persist the image list, the per-image tags and the tag index as one JSON document.
tag_index_payload = {'images': images, 'image_tags': image_tags, 'tag2image': tag2image}
with open('../data/interim/yfcc100m/tag_to_images-0.json', 'w') as fid:
    json.dump(tag_index_payload, fid)

# Check relevance of top-k tags in YFCC100M

Compare raw top-k tags vs relevant* top-k tags for about 100 random images.

- This generates the data for the webpage `yfcc100m_original_filtered_v2`

\* *Relevant* means the tag is used in [this project](http://deep-tagging.cs.washington.edu/imagenet_correspondence.html).

~~goal: visualize 100 random images with top-5 tags vs filtered top-5 tags~~

In [3]:
%%time

# Load the clean tag vocabulary into a set for fast membership tests.
# NOTE(review): another cell of this notebook reads the same file through
# '../data/interim/...' -- confirm the intended working directory.
import csv
filename = 'data/interim/yfcc100m/tag_frequency.csv'
# newline='' leaves line-ending handling to the csv module, as its
# documentation asks for, and matches how the dataset files are opened below.
with open(filename, 'r', newline='') as fid:
    # Only the 'tag' column is read; repeated tags collapse in the set.
    clean_tags = {row['tag'] for row in csv.DictReader(fid, delimiter=',')}

# Lemmatizer used to fold tags onto their noun lemma (called with 'NOUN' below).
# NOTE(review): this import path / constructor looks like the spaCy 2.x
# rule-based lemmatizer API -- confirm the installed spaCy version.
from spacy.lemmatizer import Lemmatizer
from spacy.lang.en import LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES
lemmatizer = Lemmatizer(LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES)

import glob
import csv
# Raise the csv parser's maximum field size to sys.maxsize; per the original
# note, fields in the dataset files are too large for the default limit.
import sys
csv.field_size_limit(sys.maxsize)
import random

# Seed for the `random` module so the row sub-sampling below is repeatable;
# set to None to leave the generator unseeded.
SEED = 13029
if SEED is not None:
    random.seed(SEED)
# Maximum number of tags kept per image in each of the two lists built below.
TOPK = 5
# Column positions in the tab-separated dataset rows.
# IMAGE_ID and USER_ID are not referenced in the visible cells.
IMAGE_ID = 0
USER_ID = 1
# Comma-separated user tags; rows where this field is empty are skipped.
USER_TAGS = 8
# Used as the key of `image_tag_mapping`.
IMAGE_URL = 14
# Parsed as int; only rows where it equals 0 are kept (presumably 0 = photo -- TODO confirm).
PHOTO_OR_VIDEO = 22
SUBSET = 180 # numerator of the per-row keep probability SUBSET / 10000000; None keeps every row

# Sample rows from the dataset files and record, per image URL, the first TOPK
# raw user tags ('original') next to the first TOPK distinct noun lemmas that
# belong to clean_tags ('filtered').
rows = []  # not used by any visible cell; kept in case another cell relies on it
image_tag_mapping = {}
# Per-row keep probability, computed once; None disables sub-sampling.
# NOTE(review): 10000000 is presumably the approximate row count of one
# dataset file, which would make SUBSET the expected sample size -- confirm.
keep_probability = None if SUBSET is None else SUBSET / 10000000
# glob returns matches in arbitrary order; sorting makes the seeded sample
# reproducible, since the draw sequence follows the order rows are visited.
for dataset_path in sorted(glob.glob('/mnt/ilcompf2d1/data/yfcc100m/yfcc100m_dataset-0*')):
    with open(dataset_path, newline='') as csvfile:
        reader = csv.reader(csvfile, delimiter='\t', quoting=csv.QUOTE_NONE)
        for row in reader:
            # Keep only rows whose media marker is 0 and that carry user tags.
            if int(row[PHOTO_OR_VIDEO]) != 0 or len(row[USER_TAGS]) == 0:
                continue
            # One random draw per eligible row, only when sub-sampling is on.
            if keep_probability is not None and random.random() > keep_probability:
                continue

            topk_original_tags, topk_filtered_tags = [], []
            for tag in row[USER_TAGS].split(','):
                if len(topk_original_tags) < TOPK:
                    topk_original_tags.append(tag)

                # The lemmatizer returns a list of candidates; the first one is used.
                lemmatized_tag = lemmatizer(tag, u'NOUN')[0]
                # Skip lemmas already listed so that two tags sharing a lemma
                # do not take two of the TOPK filtered slots.
                if lemmatized_tag in clean_tags and lemmatized_tag not in topk_filtered_tags:
                    topk_filtered_tags.append(lemmatized_tag)

                if len(topk_filtered_tags) == TOPK:
                    break
            # A URL seen more than once keeps only its last entry.
            image_tag_mapping[row[IMAGE_URL]] = {'original': topk_original_tags,
                                                 'filtered': topk_filtered_tags}
print('Num images:', len(image_tag_mapping))

CPU times: user 57.4 s, sys: 1.23 s, total: 58.7 s
Wall time: 58.8 s


In [5]:
import json

# Write the sampled image URL -> {'original', 'filtered'} tag mapping to disk as JSON.
output_path = 'data/interim/yfcc100m/top5_tags_100.json'
with open(output_path, 'w') as fid:
    json.dump(image_tag_mapping, fid)