# Plotting stats of the raw context

In [1]:
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn
from collections import OrderedDict

In [5]:
# Directory holding the preprocessed context archives (train and val).
CONTEXT_DIR = '/media/Data/flipvanrijn/datasets/coco/processed/reduced/'

# np.load on an .npz returns a lazily-read archive that keeps the file open;
# the `with` blocks close each handle once the 'data' array has been read.
# NOTE(review): if 'data' is stored as an object array, recent NumPy releases
# require allow_pickle=True here — confirm against the files.
with np.load(CONTEXT_DIR + 'context_train_filtered_stemmed.npz') as loader:
    data_train = loader['data']
with np.load(CONTEXT_DIR + 'context_val_filtered_stemmed.npz') as loader:
    data_val = loader['data']

In [14]:
def count_lengths(data):
    """Tally how many items of ``data`` share each length.

    Parameters
    ----------
    data : iterable
        Every element must support ``len()``.

    Returns
    -------
    counts : list of int
        Number of items per distinct length, ordered by ascending length.
    sorted_lengths : OrderedDict
        Maps length -> number of items, ordered by decreasing count; ties
        are ordered by ascending length.
    """
    # Count the items that have the same length
    lengths = {}
    for d in data:
        length = len(d)
        lengths[length] = lengths.get(length, 0) + 1
    # Return a real list in a defined order: dict.values() has no guaranteed
    # order on Python 2 and is a view (not a sequence) on Python 3.
    counts = [lengths[length] for length in sorted(lengths)]
    # Sort by decreasing count; the length breaks ties deterministically
    sorted_lengths = OrderedDict(
        sorted(lengths.items(), key=lambda item: (-item[1], item[0]))
    )
    return counts, sorted_lengths
# Keep both return values under real names: assigning to `_` shadows
# IPython's last-output variable and throws away the count-sorted mapping.
lengths_train, sorted_lengths_train = count_lengths(data_train)
lengths_val, sorted_lengths_val = count_lengths(data_val)

In [16]:
def plot_context_sizes(data, title, out_path):
    """Plot the number of items per context length and save the figure.

    Parameters
    ----------
    data : iterable
        Items whose ``len()`` is the context size, as accepted by
        ``count_lengths``.
    title : str
        Figure title.
    out_path : str
        File the figure is written to.

    Returns
    -------
    matplotlib.figure.Figure
        The figure that was drawn and saved.
    """
    # The second return value of count_lengths maps length -> count; use it so
    # the x axis really is the number of words rather than a positional index.
    counts_by_length = count_lengths(data)[1]
    n_words = sorted(counts_by_length)
    frequencies = [counts_by_length[n] for n in n_words]

    fig, ax = plt.subplots()
    ax.plot(n_words, frequencies)
    ax.set_xlabel('# words')
    ax.set_ylabel('Frequency')
    ax.set_title(title)
    fig.savefig(out_path, dpi=1000)
    return fig

# Plot lengths train
fig = plot_context_sizes(
    data_train,
    'Textual context size [TRAIN]',
    '/home/flipvanrijn/plots/textual-context-size-train.pdf',
)

# Plot lengths validation + test
fig = plot_context_sizes(
    data_val,
    'Textual context size [VAL]',
    '/home/flipvanrijn/plots/textual-context-size-val.pdf',
)

In [58]:
# Zipf-style rank/frequency plot of the context lengths
# `sorted_lengths` is never bound at notebook level (count_lengths only
# returns it), so rebuild it here to survive Restart & Run All.
# NOTE(review): the train split is assumed — confirm which split was intended.
sorted_lengths = count_lengths(data_train)[1]

fig, ax = plt.subplots()
# list() so the counts are a plain sequence on Python 3 as well
ax.plot(list(sorted_lengths.values()))
ax.set_yscale('log')
ax.set_xlabel('Rank of length (most frequent first)')
ax.set_ylabel('Frequency')
ax.set_title('Context length frequency by rank')
plt.show()

In [80]:
# Number of contexts that contain at most MAX_WORDS words.
# `lengths` only exists inside count_lengths(), so rebuild the
# length -> count mapping here instead of relying on leftover kernel state.
# NOTE(review): the train split is assumed — confirm which split was intended.
MAX_WORDS = 100
lengths = count_lengths(data_train)[1]
sum(count for n_words, count in lengths.items() if n_words <= MAX_WORDS)

77673

# Evaluate context using metrics

In [82]:
import sys

# Directory put on the import path; presumably it provides the `rouge`
# package imported in the next cell — confirm.
PYCOCOEVALCAP_DIR = '/home/flipvanrijn/Workspace/Dedicon-Thesis/server/pycocoevalcap'
# Guard the append so re-running this cell does not pile up duplicate entries.
if PYCOCOEVALCAP_DIR not in sys.path:
    sys.path.append(PYCOCOEVALCAP_DIR)

In [85]:
from rouge.rouge import Rouge

In [137]:
# Hard-coded example context: a list of (token, score) pairs in text order.
# Tokens such as 'this', 'to', 'the' and '.' carry a score of 0; every other
# token has a positive float score.
# NOTE(review): the scores look like per-token weights with stop words and
# punctuation zeroed out — confirm where they were computed.
data = [(u'luang', 0.20321628880092443),
 (u'prabang', 0.2050751395771906),
 (u'199', 0.22953895277526),
 (u'this', 0),
 (u'dog', 0.15507775312144545),
 (u'loved', 0.1070854724854866),
 (u'to', 0),
 (u'sack', 0.2050751395771906),
 (u'out', 0),
 (u'on', 0),
 (u'the', 0),
 (u'shoe', 0.22634186845755885),
 (u'rack', 0.16392433862000122),
 (u'at', 0),
 (u'villa', 0.16745461600994188),
 (u'merry', 0.18885814309810428),
 (u'no', 0),
 (u'.', 0),
 (u'1', 0),
 (u'.', 0),
 (u'laos', 0.2974918213250064),
 (u'lao', 0.2974918213250064),
 (u'vacation', 0.11953598919265084),
 (u'travel', 0.10236591509290169),
 (u'trip', 0.11494678531178285),
 (u'luangprabang', 0.20234419599159267),
 (u'dog', 0.15507775312144545),
 (u'shoes', 0.22634186845755885),
 (u'cute', 0.12711024332054496)]

In [151]:
from colored import fg, bg
# Background colour codes 252 .. 232; list() keeps this a real list on Python 3.
colors = list(range(232, 253))[::-1]
scores = [weight for token, weight in data]
words = [token for token, weight in data]

# One bin edge per colour over the score range [0, 1]
bins = np.linspace(0, 1, len(colors))
# The original passed the undefined name `sc` here. np.digitize returns
# len(bins) for a score >= 1.0, which would index past the end of `colors`,
# so cap the index at the last colour.
pos = np.minimum(np.digitize(scores, bins), len(colors) - 1)

pieces = []
for word, slot, score in zip(words, pos, scores):
    if score > 0:
        # Scored word: white text on a background picked by its score bin
        pieces.append('%s%s %s ' % (fg(255), bg(colors[slot]), word))
    else:
        # Zero-score word: black text on a white background
        pieces.append('%s%s %s ' % (fg('black'), bg(255), word))
# A single print of the joined pieces gives the same line on Python 2 and 3
# (the Python 2 trailing-comma prints separated the items with one space).
print(' '.join(pieces))

[38;5;255m[48;5;247m luang  [38;5;255m[48;5;247m prabang  [38;5;255m[48;5;247m 199  [38;5;0m[48;5;255m this  [38;5;255m[48;5;248m dog  [38;5;255m[48;5;249m loved  [38;5;0m[48;5;255m to  [38;5;255m[48;5;247m sack  [38;5;0m[48;5;255m out  [38;5;0m[48;5;255m on  [38;5;0m[48;5;255m the  [38;5;255m[48;5;247m shoe  [38;5;255m[48;5;248m rack  [38;5;0m[48;5;255m at  [38;5;255m[48;5;248m villa  [38;5;255m[48;5;248m merry  [38;5;0m[48;5;255m no  [38;5;0m[48;5;255m .  [38;5;0m[48;5;255m 1  [38;5;0m[48;5;255m .  [38;5;255m[48;5;246m laos  [38;5;255m[48;5;246m lao  [38;5;255m[48;5;249m vacation  [38;5;255m[48;5;249m travel  [38;5;255m[48;5;249m trip  [38;5;255m[48;5;247m luangprabang  [38;5;255m[48;5;248m dog  [38;5;255m[48;5;247m shoes  [38;5;255m[48;5;249m cute 


In [128]:
# Show the colour codes; the call form with list() prints the same list on
# Python 2 and Python 3.
# NOTE(review): the saved output of this cell lists 255..232, which does not
# match range(232, 253)[::-1] as defined in the In [151] cell — it appears to
# come from an earlier execution (In [128]).
print(list(colors))

[255, 254, 253, 252, 251, 250, 249, 248, 247, 246, 245, 244, 243, 242, 241, 240, 239, 238, 237, 236, 235, 234, 233, 232]
