In [1]:
from dataloader import *
%matplotlib inline
import sys
sys.path.append("./coco-caption")
import matplotlib.pyplot as plt
import skimage.io as io
import pylab
pylab.rcParams['figure.figsize'] = (10.0, 8.0)
from json import encoder
encoder.FLOAT_REPR = lambda o: format(o, '.3f')
from dataloader import get_dataset_configuration, load_all_captions_flickr, load_all_captions_coco
import glob
from IPython.display import display, clear_output
from collections import deque
from ipywidgets import HBox, Output, Button, widgets
from prettytable import PrettyTable
from sklearn.feature_extraction.text import CountVectorizer
import pickle

In [2]:
def load_images_flickr(dataset_configuration):
    """
    Method to map images ids to pictures for data in Flickr structure

    Parameters
    ----------
    images_dir: str
        Path to the directory with all images from  Flickr type dataset
    Returns
    -------
    all_images_mapping: dict->
        paths to all images

    """
    
    all_captions = load_all_captions_flickr(dataset_configuration["captions_file_path"])

    images_dir = dataset_configuration["images_dir"]
    train_images = set(open(dataset_configuration[
                    "train_images_names_file_path"], 'r').read().strip().split('\n'))
    
    
    # add global paths to the all images in images_dir directory
    all_images = glob.glob(images_dir + '*.jpg')
    all_images_mapping = dict()
    train_images_mapping = dict()
    for i in all_images:  # img is list of full path names of all images
        image_name = i.split("/")[-1]
        image_id = image_name.split(".")[0]
        all_images_mapping[image_id] = i 
        if image_name in train_images and image_id in all_captions.keys():
            train_images_mapping[image_id] = all_captions[image_id]
    return all_images_mapping, train_images_mapping




def load_images_coco(configuration):
    """
    Method to map images ids to pictures for data in COCO structure

    Parameters
    ----------
    configuration
        Configuration of the dataset, with paths to the images and
         datasets specific files like file that mapps images with image id
    Returns
    -------
    all_images_mapping: dict->
        paths to all images from coco type data identidied by image ids

    """
    all_captions = load_all_captions_coco(configuration["captions_file_path"])
    
    file_with_images_def = configuration["images_names_file_path"]
    images_folder = configuration["images_dir"]
    info = json.load(open(file_with_images_def))
    all_images_mapping = dict()
    train_images_mapping = dict()
    for ix in range(len(info['images'])):
        img = info['images'][ix]
        image_filename = img['file_path'].rsplit(".", 1)[0]
        #create global path to the image by users directory
        file_path = images_folder + "/" + img['file_path']

        if image_filename.find("/") != -1:
            image_filename = img['file_path'].rsplit("/", 1)[1].rsplit(".", 1)[0]
        #define data splits
        if img['split'] in ['train','val', 'test', 'restval']:
            all_images_mapping[image_filename] = file_path
            print(image_filename)
            print(all_captions[image_filename])
            if img['split'] == 'train' and image_filename in all_captions.keys():
                    train_images_mapping[image_filename] = all_captions[image_filename]
            all_images_mapping[image_filename] = file_path

    return all_images_mapping, train_images_mapping

def get_images_for_split(dataset_name):
    # Load dataset configuration, by the name of the dataset assigned for training/testing
    train_dataset_configuration = get_dataset_configuration(dataset_name)
    # Therefore Flickr and COCO have different file and data structures, to show captions and split of data
    # different methods for loading captions and images are used.
    # Datasets Flickr30k, COCO2017, COCO2014 have the same strucutre of files with captions and split informations.
    if train_dataset_configuration["data_name"] in ["flickr30k", "coco17", "coco14"]:
        all_images, train_captions_mapping = load_images_coco(train_dataset_configuration)
    # Datasets Flickr30k, Flickr8k_polish, AIDe, Flickr8k  have the same strucutre of files with captions and split informations.
    if train_dataset_configuration["data_name"] in ["flickr30k_polish", "flickr8k_polish", "aide", "flickr8k"]:
        all_images, train_captions_mapping = load_images_flickr(train_dataset_configuration)
    return all_images, train_captions_mapping

In [3]:
import json
import os
from config import general
intersection=dict()
max_set_of_ids=set()
max_dict_of_ids=dict()
min_dict_of_ids=dict()
min_set_of_ids=set()

bleu1_ids=list()
bleu2_ids=list()
bleu3_ids=list()
bleu4_ids=list()
cider_ids=list()
meteor_ids=list()
rouge_ids=list()




def sort_and_get_max_min(metric, name):
    sorted_by=sorted(metric.items(), key=lambda x:x[1])

    sorted_max_n=sorted_by[-1]
    sorted_min_n=sorted_by[1]

    image_id_min=sorted_min_n[0]
    image_id_max=sorted_max_n[0]

    min_set_of_ids.add(image_id_min)
    if image_id_min not in min_dict_of_ids:
        min_dict_of_ids[image_id_min]={"measures":{name},
                                       "dataset_name":dataset_name,
                                       "Bleu_1":results['imgToEval'][image_id_min]['Bleu_1'],
                                       "Bleu_2":results['imgToEval'][image_id_min]['Bleu_2'],
                                        "Bleu_3":results['imgToEval'][image_id_min]['Bleu_3'],
                                        "Bleu_4":results['imgToEval'][image_id_min]['Bleu_4'],
                                        "ROUGE_L":results['imgToEval'][image_id_min]['ROUGE_L'],
                                        "METEOR":results['imgToEval'][image_id_min]['METEOR'],
                                        "CIDEr":results['imgToEval'][image_id_min]['CIDEr'],
                                       "caption":results['imgToEval'][image_id_min]['caption'],
                                       "ground_truth_captions":results['imgToEval'][image_id_min]['ground_truth_captions'],
                                       "result_config_name":result_config_name,
                                       "image_path":all_images_from_split[image_id_min]}
    else:
        a=min_dict_of_ids[image_id_min]["measures"]
        a.add(name)

    max_set_of_ids.add(image_id_max)
    if image_id_max not in max_dict_of_ids:
        max_dict_of_ids[image_id_max]={"measures":{name},
                                       "dataset_name":dataset_name,
                                        "Bleu_1":results['imgToEval'][image_id_max]['Bleu_1'],
                                       "Bleu_2":results['imgToEval'][image_id_max]['Bleu_2'],
                                        "Bleu_3":results['imgToEval'][image_id_max]['Bleu_3'],
                                        "Bleu_4":results['imgToEval'][image_id_max]['Bleu_4'],
                                        "ROUGE_L":results['imgToEval'][image_id_max]['ROUGE_L'],
                                        "METEOR":results['imgToEval'][image_id_max]['METEOR'],
                                        "CIDEr":results['imgToEval'][image_id_max]['CIDEr'],
                                       "caption":results['imgToEval'][image_id_max]['caption'],
                                       "ground_truth_captions":results['imgToEval'][image_id_max]['ground_truth_captions'],
                                       "result_config_name":result_config_name,
                                       "image_path":all_images_from_split[image_id_max]}
    else:
        a=max_dict_of_ids[image_id_max]["measures"]
        a.add(name)
    return image_id_max, image_id_min, sorted_by




for result_config_name in os.listdir(general["results_directory"]):
    if result_config_name.endswith(".json"):
        results = json.load(open("./" + general["results_directory"] + "/" + result_config_name, 'r'))
        dataset_name=results["dataset_name"]

        result_images_ids = list(results['imgToEval'].keys())
        bleu1=dict()
        bleu2=dict()
        bleu3=dict()
        bleu4=dict()
        rouge=dict()
        meteor=dict()
        cider=dict()
        for img_id in result_images_ids:
            bleu1[img_id]=results['imgToEval'][img_id]['Bleu_1']
            bleu2[img_id]=results['imgToEval'][img_id]['Bleu_2']
            bleu3[img_id]=results['imgToEval'][img_id]['Bleu_3']
            bleu4[img_id]=results['imgToEval'][img_id]['Bleu_4']
            rouge[img_id]=results['imgToEval'][img_id]['ROUGE_L']
            meteor[img_id]=results['imgToEval'][img_id]['METEOR']
            cider[img_id]=results['imgToEval'][img_id]['CIDEr']


        all_images_from_split, train_captions = get_images_for_split(dataset_name)
        bleu1_sorted_max_n, bleu1_sorted_min_n, bleu1_sorted=sort_and_get_max_min(bleu1, "Bleu_1")
        bleu2_sorted_max_n, bleu2_sorted_min_n, bleu2_sorted=sort_and_get_max_min(bleu2, "Bleu_2")
        bleu3_sorted_max_n, bleu3_sorted_min_n, bleu3_sorted=sort_and_get_max_min(bleu3,'Bleu_3')
        bleu4_sorted_max_n, bleu4_sorted_min_n, bleu4_sorted=sort_and_get_max_min(bleu4, 'Bleu_4')
        meteor_sorted_max_n, meteor_sorted_min_n, meteor_sorted=sort_and_get_max_min(meteor,'METEOR')
        rouge_sorted_max_n, rouge_sorted_min_n, rouge_sorted=sort_and_get_max_min(rouge, 'ROUGE_L')
        cider_sorted_max_n, cider_sorted_min_n, cider_sorted=sort_and_get_max_min(cider, 'CIDEr')


metrics=['Bleu_4', 'METEOR','ROUGE_L','CIDEr']
for x in max_dict_of_ids.keys():
    max_dict_of_ids[x]["is_best"]=all(item in max_dict_of_ids[x]['measures'] for item in metrics)
for x in min_dict_of_ids.keys():
    min_dict_of_ids[x]["is_best"]=all(item in min_dict_of_ids[x]['measures'] for item in metrics)

print(max_dict_of_ids)
class SetEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, set):
            return list(obj)
        return json.JSONEncoder.default(self, obj)
with open("./" + general["results_directory"] +"/intersection"+ "/intersection_results.json", 'w') as outfile:
    json.dump(
        {"max_dict_of_ids":max_dict_of_ids,
         "min_dict_of_ids":min_dict_of_ids},outfile, cls=SetEncoder)

{'3123463486_f5b36a3624': {'measures': {'Bleu_1'}, 'dataset_name': 'flickr8k', 'Bleu_1': 0.9999999998750002, 'Bleu_2': 0.8451542546153265, 'Bleu_3': 0.4919340733318464, 'Bleu_4': 6.985342055472009e-05, 'ROUGE_L': 0.7331730769230769, 'METEOR': 0.134383761300984, 'CIDEr': 0.9780245030146637, 'caption': 'a black dog is running on the sand', 'ground_truth_captions': ['A brown , black , and white dog runs along on the gravel .', 'A dog runs', 'A little dog running on sand .', 'The brown , white and black dog runs on a gravel surface .', 'The dog is running across the gravel .'], 'result_config_name': 'mixed_flickr8k_flickr8k_resnet50_glove.json', 'image_path': '/home2/data/images/flickr8k/Images/3123463486_f5b36a3624.jpg', 'is_best': False}, '3364026240_645d533fda': {'measures': {'Bleu_3', 'Bleu_2'}, 'dataset_name': 'flickr8k', 'Bleu_1': 0.9999999997500004, 'Bleu_2': 0.9258200995328305, 'Bleu_3': 0.8298265331423224, 'Bleu_4': 0.6914415690877681, 'ROUGE_L': 0.7724312590448625, 'METEOR': 0.12

In [4]:
def get_ngrams(text, n=2, m=None, max_features=20000):
    
    vec = CountVectorizer(ngram_range = (n, n), 
                          max_features = max_features).fit(text)
    bag_of_words = vec.transform(text)
    sum_words = bag_of_words.sum(axis = 0)
    words_freq = dict()
    words_freq = {word: sum_words[0, i] for word, i in vec.vocabulary_.items()}
    words_freq = {k: v for k, v in sorted(words_freq.items(), key=lambda item: item[1], reverse=True)}
   
    return words_freq

def n_gram_train_captions(train_captions_dict, n=2):
    #     train_captions = set()
#     for val in train_captions_dict.values():
#         for sentence in val:
#             train_captions.add(sentence)

    train_captions = list()
    for val in train_captions_dict.values():
        for sentence in val:
            train_captions.append(sentence)
    unigrams = get_ngrams(train_captions, n)
    file = open(f'captions_n_gram/train_captions{n}', 'wb')
    pickle.dump(unigrams, file)
    file.close()
#     vectorizer = CountVectorizer()
#     X = vectorizer.fit_transform(train_captions)
#     captions_words = vectorizer.get_feature_names()
#     vectorizer2 = CountVectorizer(analyzer='word', ngram_range=(n, n))
#     X2 = vectorizer2.fit_transform(train_captions)
#     captions_grams = vectorizer2.get_feature_names()
#     print(vectorizer2.get_params())
#     return captions_grams



In [5]:
n_gram_train_captions(train_captions,5)


Czy dobrze jest policzona suma wszystkich n-gramów, czy dobrze jest wyliczony procent?

In [6]:
def n_gram_stat(n, cap):
    
    # open a file, where you stored the pickled data
    with open(f'captions_n_gram/train_captions{n}', 'rb') as file:
        all_captions_train = pickle.load(file)

    
    sum_all_cap = sum(all_captions_train.values())
    caption = [cap]
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(caption)
    captions_words = vectorizer.get_feature_names()
    vectorizer2 = CountVectorizer(analyzer='word', ngram_range=(n, n))
    X2 = vectorizer2.fit_transform(caption)
    captions_grams = vectorizer2.get_feature_names()
    perc_gram = []
    for cap_gram in captions_grams:

        try:
            perc_gram.append((cap_gram, all_captions_train[cap_gram]/sum_all_cap))
        except KeyError as e:
            perc_gram.append((cap_gram, 0))

    return perc_gram
    

In [41]:
def show_image_results_captions(image_id, intersection_results):
    """
    Method to show image, ground truth captions, predicted caption and results of metrics
    Parameters
    ----------
    image_id: str
        ID of image
    Returns
    -------
    Prints image, ground truth captions, predicted caption and results of metrics
    """
    #Load results of metrics from file
    image_results = intersection_results[image_id]
    print('Dataset name: {}'.format(image_results["dataset_name"]))
    #Load image
    I = io.imread(image_results['image_path'])
    plt.imshow(I)
    plt.axis('off')
    plt.show()
    print("Ground truth captions")
    print(image_results['ground_truth_captions'])
    print("Predicted captions")
    print(image_results['caption'])
    print(n_gram_stat(2, image_results['caption']))
    #Display results in pretty table
    print( f'\n===== Results =====' )
#     print(image_results["metrics"])
#     print(image_results["is_best"])
    t = PrettyTable(("Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"))
    t.add_row((image_results["Bleu_1"], image_results["Bleu_2"], image_results["Bleu_3"], image_results["Bleu_4"]))
    t2 = PrettyTable(("METEOR", "ROUGE_L", "CIDEr"))
    t2.add_row((image_results["METEOR"],image_results["ROUGE_L"], image_results["CIDEr"]))
    print(t)
    print(t2)
    print()

In [42]:
#List all files from results directory to view data
type_of_filter = ["max_dict_of_ids", "min_dict_of_ids"]
selectbox = widgets.Select(
    options=type_of_filter,
    value=type_of_filter[1],
    description='Min/max:',
    disabled=False
)

In [43]:
print(selectbox.value)
intersection_results = json.load(open(general["results_directory"] + "/intersection/intersection_results.json"))
inter_results=intersection_results[selectbox.value]
images_ids=list(inter_results.keys())
#Create fancy viever for images, captions and results of evaluation
d=deque(images_ids)
#Button to read image back
left = Button(description="<")
#Button to read next image
right = Button(description=">")

switch = [left, right]

combined = HBox([items for items in switch])
out = Output()

def on_button_left(ex):
    with out:
        clear_output()
        d.rotate(1)
        show_image_results_captions(d[0], inter_results)
def on_button_right(ex):
    with out:
        clear_output()
        d.rotate(-1)
        show_image_results_captions(d[0], inter_results)
l=switch[0].on_click(on_button_left)
r=switch[1].on_click(on_button_right)
display(combined)
display(out)

min_dict_of_ids


HBox(children=(Button(description='<', style=ButtonStyle()), Button(description='>', style=ButtonStyle())))

Output()

In [76]:
type_of_filter = ["max_dict_of_ids", "min_dict_of_ids"]
selectbox = widgets.Select(
    options=type_of_filter,
    value=type_of_filter[1],
    description='Min/max:',
    disabled=False
)
freq_for_min = {}
freq_for_max = {}
freq = {}

def freq_calc(freq, selectbox, dataset_name):
    intersection_results = json.load(open(general["results_directory"] + "/intersection/intersection_results.json"))
    inter_results=intersection_results[selectbox.value]
    images_ids=list(inter_results.keys())
    #Create fancy viever for images, captions and results of evaluation
    d=deque(images_ids)
    for element in d:
        image_results = inter_results[element]
        n_grams = n_gram_stat(2, image_results['caption'])
        for n_gram_words in n_grams:
            if n_gram_words[0] in freq:
                freq[n_gram_words[0]] = freq[n_gram_words[0]] + 1
            else:
                freq[n_gram_words[0]] = 1
    return image_results


freq_calc(freq, selectbox, dataset_name)

selectbox = widgets.Select(
    options=type_of_filter,
    value=type_of_filter[0],
    description='Min/max:',
    disabled=False
)
img_results = freq_calc(freq, selectbox, dataset_name)
dataset = image_results["dataset_name"]

freq = {k: v for k, v in sorted(freq.items(), key=lambda item: item[1], reverse=True)}


In [77]:
freq

{'dog is': 5,
 'is running': 4,
 'shirt is': 3,
 'in red': 2,
 'is standing': 2,
 'man in': 2,
 'red shirt': 2,
 'running through': 2,
 'through river': 2,
 'black dog': 2,
 'running on': 2,
 'down snowy': 2,
 'is skiing': 2,
 'skiing down': 2,
 'in the': 1,
 'standing in': 1,
 'the street': 1,
 'black shirt': 1,
 'in black': 1,
 'in white': 1,
 'next to': 1,
 'on stone': 1,
 'shirt sitting': 1,
 'sitting on': 1,
 'standing next': 1,
 'stone floor': 1,
 'to man': 1,
 'white shirt': 1,
 'woman in': 1,
 'boy in': 1,
 'is eating': 1,
 'young boy': 1,
 'grassy field': 1,
 'through grassy': 1,
 'on the': 1,
 'the sand': 1,
 'skier is': 1,
 'snowy mountain': 1,
 'and white': 1,
 'black and': 1,
 'on sand': 1,
 'white dog': 1,
 'is swimming': 1,
 'swimming through': 1,
 'man is': 1,
 'snowy hill': 1}

In [71]:
import re
from config import fastText, glove
from tqdm import tqdm
import numpy as np

def isfloat(value):
    try:
        float(value)
        return True
    except ValueError:
        return False


if dataset == 'flickr8k':
    from config_datasets import config_flickr8k
    language = config_flickr8k['language']
    
word_embedings_path = glove[language]["word_embedings_path"]
embedings_dim = glove[language]["embedings_dim"]

embeddings_index = {}
# From the embeddings matrix get coefficients of particular words and store the in dictionarym by key - words
f = open(word_embedings_path, encoding="utf-8")
for line in tqdm(f):
    values = line.split()
    word = values[0]
    if isfloat(values[1]):
        coefs = np.asarray(values[2:], dtype='float32')
    elif isfloat(values[2]):
        coefs = np.asarray(values[3:], dtype='float32')
    elif isfloat(values[3]):
        coefs = np.asarray(values[4:], dtype='float32')
    elif isfloat(values[4]):
        coefs = np.asarray(values[5:], dtype='float32')
    else:
        coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

400000it [00:10, 36653.38it/s]


In [88]:
for key_freq in freq.keys():
    for word in key_freq.split(' '):
        print(word, embeddings_index[word])

dog [-4.7601e-01 -5.6369e-02 -3.9082e-01 -1.7544e-01 -6.2244e-01 -3.9816e-01
  2.9620e-01 -6.0647e-02 -6.7017e-02  1.1466e-01 -3.3015e-01 -2.0318e-02
  6.0616e-01 -1.3920e-01  1.3896e-01 -5.4781e-01  3.0864e-01  1.7354e-01
  3.9927e-01  2.1137e-01  1.3004e+00  8.8030e-01  2.3946e-01  2.8838e-01
 -4.6336e-01  2.5745e-01 -3.1755e-01 -3.2877e-01 -5.9534e-01  2.3983e-01
  3.4159e-01  1.2754e-01 -8.8208e-01  1.4258e-01 -1.8857e-01 -1.6961e-01
  2.7808e-01 -2.4600e-01  1.9122e-01  5.0244e-01  5.3660e-01 -5.3568e-01
  2.4827e-01  3.2561e-01  6.7882e-01  9.6401e-01 -2.8892e-01  5.1206e-01
  5.8496e-01 -3.1934e-02 -2.4849e-02  8.8564e-02  1.7360e-01  5.4166e-01
 -8.6743e-02 -3.8412e-01  1.3974e-01 -7.4122e-03  9.2210e-01 -2.5799e-01
 -4.7018e-01 -5.5742e-01 -2.1213e-02 -7.1072e-01  8.0995e-02 -4.7254e-01
 -3.2925e-01  6.8052e-01  1.7242e-01  8.7783e-02 -2.6560e-01 -6.0070e-01
 -8.5217e-02 -3.6977e-02 -3.6593e-01 -6.2576e-01 -3.4162e-01  5.4672e-02
 -1.1734e-01  1.9686e-01  8.3758e-02  4.3157e-0