# Importing and Loading COCO, GloVe, ResNet

In [12]:
import numpy as np
from pathlib import Path
import json
import io
import requests
from PIL import Image
from gensim.models import KeyedVectors
import pickle
from collections import defaultdict
import random
import time

In [13]:
# Load the COCO 2014 training-caption metadata.
# The "images" and "annotations" entries of this dict are read later when the
# Database is built.
# JSON text is UTF-8, so the encoding is pinned explicitly instead of being
# inherited from the platform locale (which is not UTF-8 everywhere).
filename = "data/captions_train2014.json"
with Path(filename).open(encoding="utf-8") as f:
    coco_data = json.load(f)

In [14]:
# Loading GloVe-200 (text word2vec format) and reporting how long the load took.
# perf_counter() is a monotonic, high-resolution clock, so the measured interval
# cannot be distorted by system clock adjustments the way time.time() can.
filename = "data/glove.6B.200d.txt.w2v"
t0 = time.perf_counter()
glove = KeyedVectors.load_word2vec_format(filename, binary=False)
t1 = time.perf_counter()
print("elapsed %ss" % (t1 - t0))

elapsed 50.193023920059204s


In [15]:
# Loading ResNet Descriptors
# The unpickled object is used below as Database.ID_to_descriptor, whose keys
# are treated as image IDs.
# NOTE(security): pickle.load can execute arbitrary code embedded in the file;
# only load descriptor files obtained from a trusted source.
with open('data/resnet18_features.pkl', 'rb') as f:
    resnet18_features = pickle.load(f)

In [16]:
def download_image(img_url: str, timeout: float = 30.0) -> Image.Image:
    """Fetches an image from the web.

    Parameters
    ----------
    img_url : string
        The url of the image to fetch.

    timeout : float, optional (default=30.0)
        Number of seconds to wait for the server before giving up. Without a
        timeout, `requests.get` can block indefinitely on an unresponsive host.

    Returns
    -------
    PIL.Image.Image
        The image.

    Raises
    ------
    requests.exceptions.HTTPError
        If the server answers with a 4xx/5xx status code.
    requests.exceptions.Timeout
        If the server does not respond within `timeout` seconds."""

    response = requests.get(img_url, timeout=timeout)
    # Fail loudly on an error response instead of handing an HTML error page
    # to PIL, which would otherwise surface as a confusing decode error.
    response.raise_for_status()
    return Image.open(io.BytesIO(response.content))

In [25]:
import re, string
import numpy as np
import mygrad as mg

from collections import Counter

# Character class matching any single character of string.punctuation
punc_regex = re.compile('[{}]'.format(re.escape(string.punctuation)))

def tokenize(text):
    """Split `text` into lowercase word tokens.

    Punctuation characters are deleted (not replaced by spaces), the result
    is lowercased, and the string is split on runs of whitespace.

    Parameters
    ----------
    text : str

    Returns
    -------
    List[str]
        The tokens, in their order of appearance.
    """
    without_punctuation = punc_regex.sub('', text)
    lowered = without_punctuation.lower()
    return lowered.split()

def to_vocab(list_of_counters, k=None, stop_words=tuple()):
    """
    Merge word counts and return the top-k words as an alphabetically sorted list.

    Words listed in `stop_words` are dropped before the top-k selection.

    Parameters
    ----------
    list_of_counters : Iterable[Iterable[str]]
        Each element is fed to `Counter.update`, so it may be a Counter
        (its counts are added) or a plain iterable of words (each
        occurrence counts once).

    k : Optional[int]
        If specified, only the k most common words are kept

    stop_words : Collection[str]
        A collection of words to be ignored when populating the vocabulary

    Returns
    -------
    List[str]
        The selected words, sorted.
    """
    # <COGINST>
    totals = Counter()
    for counts in list_of_counters:
        totals.update(counts)

    # Discard stop words; words that never occurred are simply skipped
    for ignored in stop_words:
        if ignored in totals:
            del totals[ignored]

    top_words = [word for word, _count in totals.most_common(k)]
    top_words.sort()
    return top_words
    # </COGINST>

def phrase_idf(phrase_vocab, list_of_counters):
    """Compute the inverse document frequency of each term in `phrase_vocab`.

    For a term that occurs in n_t of the N counters, the value is
    log10(N / n_t). A term that occurs in none of the counters gets an
    IDF of 0, so it contributes nothing to an IDF-weighted sum.

    Parameters
    ----------
    phrase_vocab : Iterable[str]
        The terms to score.

    list_of_counters : Sequence[Container[str]]
        One word counter per document; only membership (`term in counter`)
        is used.

    Returns
    -------
    numpy.ndarray, shape-(len(phrase_vocab),)
        The IDF of each term, in the order given. All zeros when
        `list_of_counters` is empty.
    """
    phrase_vocab = list(phrase_vocab)
    N = len(list_of_counters)

    if N == 0:
        # With no documents every term is unseen; return the same 0 that unseen
        # terms get below instead of evaluating 0 / 0 (which would yield nan).
        return np.zeros(len(phrase_vocab), dtype=float)

    # nt[i]: number of documents that contain term i
    nt = np.array(
        [sum(1 for counter in list_of_counters if term in counter) for term in phrase_vocab],
        dtype=float,
    )

    # If term i appears in no document, set nt[i] = N so that its idf is log10(1) = 0
    nt[nt == 0] = N

    return np.log10(N / nt)

# Initializing Database

In [26]:
class Database:
    """Ties COCO image IDs to their descriptors, URLs and captions.

    Built from the module-level `resnet18_features` and `coco_data` objects.
    After construction the instance exposes:

    - ID_to_descriptor : the `resnet18_features` mapping (keys are image IDs)
    - ID_to_URL : image ID --> 'coco_url'
    - ID_to_captions : image ID --> list of caption strings
    - dataset : int64 array of rows (caption_index, image ID, confuser ID)
    - training_set / validation_set : first 4/5 and last 1/5 of `dataset`
    - list_of_caption_counters : one word Counter per caption
    """

    def __init__(self):
        self.ID_to_descriptor = resnet18_features
        self.ID_to_URL = dict()
        self.ID_to_captions = defaultdict(list)  # key:value | ID --> list of captions corresponding to ID
        # NOTE: USE "glove" TO CONVERT WORD --> WORD EMBEDDING

        # Getting ID --> URL:
        for image in coco_data["images"]:
            self.ID_to_URL[image['id']] = image['coco_url']

        # Getting ID --> Captions
        for annotation in coco_data["annotations"]:
            self.ID_to_captions[annotation['image_id']].append(annotation['caption'])

        # Initialize the dataset
        self.make_dataset()

        # Shuffle datasets
        self.shuffle_dataset()

        # Making list of caption counters (one Counter of tokens per caption)
        self.list_of_caption_counters = [
            Counter(tokenize(caption))
            for captions in self.ID_to_captions.values()
            for caption in captions
        ]

    def make_dataset(self):
        """Create `self.dataset` (only call this once during initialization).

        Each row is (caption_index, ID, confuser_ID):

        - ID is an image ID taken from `ID_to_descriptor`,
        - caption_index is a random index into `ID_to_captions[ID]`,
        - confuser_ID is a different image ID drawn uniformly from the
          remaining IDs.

        IDs without any caption are left out, since no caption index can be
        drawn for them.

        Raises
        ------
        ValueError
            If exactly one usable image exists, so no confuser can be chosen.
        """
        # .get() avoids inserting empty lists into the defaultdict for unknown IDs
        list_of_IDs = [ID for ID in self.ID_to_descriptor.keys() if self.ID_to_captions.get(ID)]
        N = len(list_of_IDs)
        if N == 1:
            raise ValueError("at least two captioned images are needed to pick a confuser")

        self.dataset = np.zeros((N, 3), dtype=np.int64)  # Shape: N, 3

        for i, ID in enumerate(list_of_IDs):
            # Draw a position among the N-1 *other* images, then map it to an
            # actual image ID. Skipping over position i guarantees the confuser
            # differs from the image itself without a retry loop.
            j = random.randint(0, N - 2)
            if j >= i:
                j += 1
            confuser_ID = list_of_IDs[j]

            caption_index = random.randint(0, len(self.ID_to_captions[ID]) - 1)

            self.dataset[i] = (caption_index, ID, confuser_ID)

    def shuffle_dataset(self):
        """Shuffle the rows of `self.dataset` in place and re-cut the splits.

        The first 4/5 of the shuffled rows become `self.training_set` and
        the rest `self.validation_set`. Call this when you want to reshuffle
        the sets after each epoch.
        """
        np.random.shuffle(self.dataset)
        cut = (self.dataset.shape[0] * 4) // 5
        self.training_set = self.dataset[0:cut]
        self.validation_set = self.dataset[cut:]

    def parse_query(self, phrase):
        """Embed a text query as a single vector.

        The query is tokenized, each distinct term's GloVe vector is weighted
        by the term's IDF over the stored captions, and the weighted vectors
        are summed. Terms missing from `glove` contribute nothing.

        Parameters
        ----------
        phrase : str
            The query text.

        Returns
        -------
        numpy.ndarray, shape-(glove.vector_size,)
            The weighted sum scaled to unit length, or the all-zero vector
            when the sum is zero (e.g. an empty query).
        """
        phrase_vocab = to_vocab([Counter(tokenize(phrase))])
        idf = phrase_idf(phrase_vocab, self.list_of_caption_counters)

        # Start from an explicit zero vector so an empty query still yields an array
        w_phrase = np.zeros(glove.vector_size)
        for term, weight in zip(phrase_vocab, idf):
            if term in glove:
                w_phrase = w_phrase + glove[term] * weight

        norm = np.sqrt((w_phrase ** 2).sum())
        if norm == 0:
            # Nothing to normalize; dividing would produce nan
            return w_phrase
        return w_phrase / norm  # normalized
    

In [27]:
# Build the database from the loaded COCO metadata and ResNet descriptors;
# the constructor also creates and shuffles the train/validation rows.
db = Database()

In [28]:
# Sanity check: embed a sample query as one normalized, IDF-weighted vector
w_caption = db.parse_query("Hello World it is I.")

In [29]:
# The embedding should be a 1-D vector with one entry per GloVe dimension
w_caption.shape

(200,)

In [30]:
# Display the embedding values for the sample query
w_caption

array([ 6.98279738e-02,  7.27819130e-02, -4.74345013e-02, -1.03809431e-01,
       -3.65419425e-02,  3.88387367e-02, -9.68886986e-02,  5.37098572e-03,
        6.08133525e-02,  3.19068842e-02, -4.80326638e-02,  8.43193009e-02,
        3.15515548e-02,  8.50647986e-02,  3.07553746e-02,  3.58984130e-03,
       -6.91734776e-02,  1.11817487e-01,  4.32791673e-02,  5.25470115e-02,
        1.07439242e-01,  4.49819475e-01,  3.19547392e-02, -5.35684824e-02,
        2.89007947e-02, -1.57520063e-02,  9.80453752e-03,  9.94066056e-03,
       -4.44947556e-02, -3.38789038e-02, -3.37655395e-02,  2.79248320e-02,
       -7.52623156e-02, -5.02482206e-02, -8.38084370e-02, -2.70092376e-02,
       -1.17717616e-01, -5.26799588e-03, -8.19397997e-03, -1.00290030e-02,
       -5.10581918e-02,  5.90292476e-02, -4.90671657e-02,  3.86257172e-02,
       -5.01311980e-02,  5.37534244e-02,  1.34071857e-01,  1.09133730e-02,
       -1.30879907e-02,  4.76207845e-02,  3.51809859e-02, -2.90490184e-02,
        5.57997450e-02,  