In [166]:
%matplotlib inline
import numpy as np
import skimage.io as io
import matplotlib.pyplot as plt
import pylab
pylab.rcParams['figure.figsize'] = (8.0, 10.0)
from IPython.display import Image
from IPython.core.display import HTML 

import sys
sys.path.append('./cocoapi/PythonAPI')
from pycocotools.coco import COCO
import json

from sklearn.feature_extraction.text import CountVectorizer

In [99]:
coco = COCO('./data/foilv1.0_train_2017.json')

loading annotations into memory...
Done (t=0.91s)
creating index...
index created!


In [100]:
# Get all dataset annotations
anns = coco.loadAnns(coco.getAnnIds())
print(anns[:5])

[{'id': 789366, 'foil_id': 2000003, 'image_id': 57870, 'caption': 'a long restaurant couch with rattan rounded back chairs.', 'target_word': '', 'foil_word': '', 'foil': True}, {'id': 789366, 'foil_id': 2000003, 'image_id': 57870, 'caption': 'a long restaurant couch with rattan rounded back chairs.', 'target_word': '', 'foil_word': '', 'foil': True}, {'id': 789888, 'foil_id': 2000006, 'image_id': 57870, 'caption': 'a long couch with a plant on top of it surrounded with wooden chairs', 'target_word': '', 'foil_word': '', 'foil': True}, {'id': 789888, 'foil_id': 2000006, 'image_id': 57870, 'caption': 'a long couch with a plant on top of it surrounded with wooden chairs', 'target_word': '', 'foil_word': '', 'foil': True}, {'id': 791316, 'foil_id': 2000008, 'image_id': 57870, 'caption': 'a long couch with a flower arrangement in the middle for meetings', 'target_word': '', 'foil_word': '', 'foil': True}]


In [139]:
def display_img(img_ids, caption=''):
    imgs = coco.loadImgs(img_ids)
    for img in imgs:
        display(Image(url=img['flickr_url'], width=100, height=100))
        display(caption)
        

In [5]:
# Input can be 'id', foil_id, 'image_id'
def get_unique_ids(id_type):
    anns = coco.loadAnns(coco.getAnnIds())
    ids = {ann[id_type] for ann in anns}
    return list(ids)

In [6]:
# There are more foil ids than img ids since there are multiple ways of foiling one image
print("# Foils: ", len(get_unique_ids('foil_id')))
print("# Images: ", len(get_unique_ids('image_id')))

# Foils:  197788
# Images:  65697


In [7]:
# Look at captions for the same image
imgs = get_unique_ids('image_id')
img_0 = imgs[11]
captions_0 = {(ann['caption'], ann['foil']) for ann in anns if ann['image_id'] == img_0}
display_img(img_0, captions_0)

{('a cat on a surfboard is riding in on the wake.', True),
 ('a cat riding a surfboard in the ocean.', True),
 ('a cat surfing on a surfboard on a body of water.', True),
 ('a cow laying on a surf board and riding a small wave', True)}

### Reproducing Defoiling Foiled Image Captions Paper
https://github.com/sheffieldnlp/foildataset

Bag of Objects representation:
1. object mention
2. object frequency
3. use object annotations + predictions using YOLO to generate this

In [189]:
# object_mentions: key = image id, vals = dict of object id : frequency count
def collect_object_mentions(instances, object_mentions):
    for obj in instances.loadAnns(instances.getAnnIds()):
        object_mentions[obj['image_id']] = object_mentions.get(obj['image_id'], {})
        frequencies = object_mentions[obj['image_id']]
        count = frequencies.get(obj['category_id'], 0)
        frequencies[obj['category_id']] = count + 1

In [218]:
def get_feats_y(instances, foil_data):
    categories = instances.loadCats(instances.getCatIds())
    categories = {cat['id'] : cat['name'] for cat in categories}
    object_mentions = {}
    collect_object_mentions(instances, object_mentions)
    
    annotations = [l['caption'] for l in foil_data['annotations']]
    foils = [(l['target_word'], l['foil_word']) for l in foil_data['annotations']]
    
    test_img = foil_data['annotations'][0]['image_id']
    current_objects = object_mentions[test_img]
    print('Verifying objects in image and frequency')
    for obj in current_objects.keys():
        if obj > 0: 
            print(categories[obj], current_objects[obj])
    
    # term frequency, inverse document frequenct (tf-idf)
    # tf = freq t in document, idf = ln (1 / freq document has t)
    tf_vectorizer = CountVectorizer(max_features=None, lowercase=True)
    # row = word, col = document, val = count

    # Fit the training dataset and save the model 
    tf_model = tf_vectorizer.fit(training_annotations) 
    # convert annotation to tf idf feat

    # Obtain training features - this is a sparse matrix
    training_feats = tf_model.transform(training_annotations)
    # get the above format using training_feats.toarray()

    # Obtain the outputs (0/1)
    training_y = [0 if not f['foil'] else 1 for f in foil_data['annotations']]

    # just obtaining the image features - iterating over image id and just saving in the list
    training_image_feats = [object_mentions[i['image_id']] for i in foil_data['annotations'] if i['image_id'] in object_mentions]
    
    return tf_vectorizer, training_feats, training_y, training_image_feats

In [191]:
foil_train = json.load(open('./data/foilv1.0_train_2017.json'))
instances_train2014 = COCO('./data/coco/annotations/instances_train2014.json')

loading annotations into memory...
Done (t=8.37s)
creating index...
index created!


In [220]:
_, training_feats, training_y, training_image_feats = get_feats_y(instances_train2014, foil_train)

# Printing the exact bag of words active for the first sample
print(np.array(tf_model.get_feature_names())[([np.array(training_feats[0].todense()) > 0][0][0]).tolist()].tolist())

# printing the original sentence
print('Original Sentence: ',   training_annotations[0])

# checking if Y is correct -> Original = 0 and Fake = 1 
# The original code looked at 'foil_word' == 'ORIG'
print('VALUE: ', training_y[0], foil_train['annotations'][0]['foil'])

Verifying objects in image and frequency
chair 11
dining table 2
vase 1
potted plant 1
book 2
['back', 'chairs', 'long', 'rattan', 'restaurant', 'rounded', 'table', 'with']
  (0, 912)	1
  (0, 2401)	1
  (0, 7716)	1
  (0, 10571)	1
  (0, 10820)	1
  (0, 11055)	1
  (0, 13254)	1
  (0, 15066)	1
  (1, 912)	1
  (1, 2401)	1
  (1, 3249)	1
  (1, 7716)	1
  (1, 10571)	1
  (1, 10820)	1
  (1, 11055)	1
  (1, 15066)	1
  (2, 2401)	1
  (2, 6950)	1
  (2, 7716)	1
  (2, 8906)	1
  (2, 8957)	1
  (2, 9854)	1
  (2, 13140)	1
  (2, 13254)	1
  (2, 13786)	1
  :	:
  (395573, 15066)	1
  (395574, 5477)	1
  (395574, 5733)	1
  (395574, 6723)	1
  (395574, 6950)	1
  (395574, 7902)	1
  (395574, 8906)	1
  (395574, 8957)	1
  (395574, 11938)	1
  (395574, 12315)	1
  (395574, 13254)	1
  (395574, 15022)	1
  (395574, 15066)	1
  (395575, 3249)	1
  (395575, 5477)	1
  (395575, 5733)	1
  (395575, 6723)	1
  (395575, 6950)	1
  (395575, 7902)	1
  (395575, 8906)	1
  (395575, 8957)	1
  (395575, 11938)	1
  (395575, 12315)	1
  (395575, 15022

In [195]:
foil_test = json.load(open('data/foilv1.0_test_2017.json'))
instances_val2014 = COCO('./data/coco/annotations/instances_val2014.json')

loading annotations into memory...
Done (t=4.99s)
creating index...
index created!


In [200]:
tf_vectorizer, testing_feats, testing_y, testing_image_feats = get_feats_y(instances_val2014, foil_test)

Verifying objects in image and frequency
motorcycle 1
person 2
bicycle 1


In [213]:
X_train = training_feats
X_test = testing_feats

Y_train = np.array(training_y)
Y_test = np.array(testing_y)

print(X_train.shape, Y_train.shape)

(395576, 15294) (395576,)


In [214]:
from sklearn.neural_network import MLPClassifier
from sklearn import metrics

# using a MultiLayerPerceptron model - default settings
mlp = MLPClassifier(solver='lbfgs', alpha=1e-5, random_state=1)

# fitting over training data
mlp.fit(X_train, Y_train)

# printing overall accuracy 
print 'Accuracy = ', metrics.accuracy_score(Y_test, mlp.predict(X_test.toarray()))

# printing the precision and recall over each class 
#print('Accuracy = ', metrics.accuracy_score(Y_test, mlp.predict(X_test.toarray())))
target_names = ['REAL', 'FAKE']
print(metrics.classification_report(Y_test, mlp.predict(X_test.toarray()), 
                                    target_names=target_names, digits=4))

# printing accuracy over each class
cmat = metrics.confusion_matrix(Y_test, mlp.predict(X_test.toarray()))
print(zip(target_names, cmat.diagonal()/cmat.sum(axis=1)))

KeyboardInterrupt: 

In [216]:
def explain_img(explainer, idx, d):
    exp = explainer.explain_instance(testing_img_annotations[idx], d.predict_proba, num_features=100)
    print('prediction: %s' % class_names[d.predict([testing_img_annotations[idx]])[0]])
    print('Probabiliy of it being fake: ', d.predict_proba([testing_img_annotations[idx]]).round(4)[0,1])
    print('True class: %s' % class_names[testing_y[idx]])
    print ('sentence: %s' % testing_annotations[idx])
    print ('correct word:%s, foil word:%s' % testing_foils[idx])
    print ('image_id: %d' % testimglist[idx])
    
    %matplotlib inline
    exp.show_in_notebook(text=True)
    exp.as_pyplot_figure()
    
    return exp

In [None]:
# Explain raw text prediction
# predict proba converts featurized to raw representation
d = make_pipeline(tf_vectorizer, mlp)
d.predict_proba([testing_img_annotations[0]]).round(3)
class_names = ['REAL', 'FAKE']
from lime.lime_text import LimeTextExplainer
explainor = LimeTextExplainer(class_names=class_names)
explanation = explain_img(explainor, 3, d)