In [15]:
%matplotlib inline
import numpy as np
import skimage.io as io
import matplotlib.pyplot as plt
import pylab
pylab.rcParams['figure.figsize'] = (8.0, 10.0)
from IPython.display import Image
from IPython.core.display import HTML 

import sys
sys.path.append('./cocoapi/PythonAPI')
from pycocotools.coco import COCO
import json

from sklearn.feature_extraction.text import CountVectorizer
import scipy.sparse as ss

In [99]:
coco = COCO('./data/foilv1.0_train_2017.json')

loading annotations into memory...
Done (t=0.91s)
creating index...
index created!


In [100]:
# Get all dataset annotations
anns = coco.loadAnns(coco.getAnnIds())
print(anns[:5])

[{'id': 789366, 'foil_id': 2000003, 'image_id': 57870, 'caption': 'a long restaurant couch with rattan rounded back chairs.', 'target_word': '', 'foil_word': '', 'foil': True}, {'id': 789366, 'foil_id': 2000003, 'image_id': 57870, 'caption': 'a long restaurant couch with rattan rounded back chairs.', 'target_word': '', 'foil_word': '', 'foil': True}, {'id': 789888, 'foil_id': 2000006, 'image_id': 57870, 'caption': 'a long couch with a plant on top of it surrounded with wooden chairs', 'target_word': '', 'foil_word': '', 'foil': True}, {'id': 789888, 'foil_id': 2000006, 'image_id': 57870, 'caption': 'a long couch with a plant on top of it surrounded with wooden chairs', 'target_word': '', 'foil_word': '', 'foil': True}, {'id': 791316, 'foil_id': 2000008, 'image_id': 57870, 'caption': 'a long couch with a flower arrangement in the middle for meetings', 'target_word': '', 'foil_word': '', 'foil': True}]


In [139]:
def display_img(img_ids, caption=''):
    imgs = coco.loadImgs(img_ids)
    for img in imgs:
        display(Image(url=img['flickr_url'], width=100, height=100))
        display(caption)
        

In [5]:
# Input can be 'id', foil_id, 'image_id'
def get_unique_ids(id_type):
    anns = coco.loadAnns(coco.getAnnIds())
    ids = {ann[id_type] for ann in anns}
    return list(ids)

In [6]:
# There are more foil ids than img ids since there are multiple ways of foiling one image
print("# Foils: ", len(get_unique_ids('foil_id')))
print("# Images: ", len(get_unique_ids('image_id')))

# Foils:  197788
# Images:  65697


In [7]:
# Look at captions for the same image
imgs = get_unique_ids('image_id')
img_0 = imgs[11]
captions_0 = {(ann['caption'], ann['foil']) for ann in anns if ann['image_id'] == img_0}
display_img(img_0, captions_0)

{('a cat on a surfboard is riding in on the wake.', True),
 ('a cat riding a surfboard in the ocean.', True),
 ('a cat surfing on a surfboard on a body of water.', True),
 ('a cow laying on a surf board and riding a small wave', True)}

### Reproducing Defoiling Foiled Image Captions Paper
https://github.com/sheffieldnlp/foildataset

Bag of Objects representation:
1. object mention
2. object frequency
3. use object annotations + predictions using YOLO to generate this

In [3]:
# object_mentions: key = image id, vals = dict of object id : frequency count
def collect_object_mentions(instances, object_mentions):
    for obj in instances.loadAnns(instances.getAnnIds()):
        object_mentions[obj['image_id']] = object_mentions.get(obj['image_id'], {})
        frequencies = object_mentions[obj['image_id']]
        count = frequencies.get(obj['category_id'], 0)
        frequencies[obj['category_id']] = count + 1

In [4]:
def get_feats_y(instances, foil_data, tf_model=None):
    categories = instances.loadCats(instances.getCatIds())
    categories = {cat['id'] : cat['name'] for cat in categories}
    object_mentions = {}
    collect_object_mentions(instances, object_mentions)
    
    annotations = [l['caption'] for l in foil_data['annotations']]
    foils = [(l['target_word'], l['foil_word']) for l in foil_data['annotations']]
    
    test_img = foil_data['annotations'][0]['image_id']
    current_objects = object_mentions[test_img]
    print('Verifying objects in image and frequency')
    for obj in current_objects.keys():
        if obj > 0: 
            print(categories[obj], current_objects[obj])
    
    # term frequency, inverse document frequenct (tf-idf)
    # tf = freq t in document, idf = ln (1 / freq document has t)
    # row = word, col = document, val = count
    tf_vectorizer = CountVectorizer(max_features=None, lowercase=True)
    
    # Use the training model for testing set
    if not tf_model: 
        # Fit the training dataset and save the model 
        tf_model = tf_vectorizer.fit(annotations) 

    # Obtain training features - this is a sparse matrix
    feats = tf_model.transform(annotations)
    # get the above format using training_feats.toarray()

    # Obtain the outputs (0/1)
    y = [0 if not f['foil'] else 1 for f in foil_data['annotations']]

    # just obtaining the image features - iterating over image id and just saving in the list
    image_feats = [object_mentions[i['image_id']] for i in foil_data['annotations'] if i['image_id'] in object_mentions]
    
    return tf_model, tf_vectorizer, feats, y, image_feats

In [5]:
foil_train = json.load(open('./data/foilv1.0_train_2017.json'))
instances_train2014 = COCO('./data/coco/annotations/instances_train2014.json')

loading annotations into memory...
Done (t=9.91s)
creating index...
index created!


In [8]:
tf_model, tf_vectorizer, training_feats, training_y, training_image_feats = get_feats_y(instances_train2014, foil_train)

# Printing the exact bag of words active for the first sample
print(np.array(tf_model.get_feature_names())[([np.array(training_feats[0].todense()) > 0][0][0]).tolist()].tolist())

# checking if Y is correct -> Original = 0 and Fake = 1 
# The original code looked at 'foil_word' == 'ORIG'
print('VALUE: ', training_y[0], foil_train['annotations'][0]['foil'])

Verifying objects in image and frequency
chair 11
dining table 2
vase 1
potted plant 1
book 2
['back', 'chairs', 'long', 'rattan', 'restaurant', 'rounded', 'table', 'with']
VALUE:  0 False


In [9]:
foil_test = json.load(open('data/foilv1.0_test_2017.json'))
instances_val2014 = COCO('./data/coco/annotations/instances_val2014.json')

loading annotations into memory...
Done (t=4.25s)
creating index...
index created!


In [10]:
_, _, testing_feats, testing_y, testing_image_feats = get_feats_y(instances_val2014, foil_test, tf_model)

Verifying objects in image and frequency
motorcycle 1
person 2
bicycle 1


In [11]:
# Predict using only text features 

X_train = training_feats
X_test = testing_feats

Y_train = np.array(training_y)
Y_test = np.array(testing_y)

In [12]:
from sklearn.neural_network import MLPClassifier
from sklearn import metrics

# using a MultiLayerPerceptron model - default settings
mlp = MLPClassifier(solver='lbfgs', alpha=1e-5, random_state=1)

# fitting over training data
mlp.fit(X_train, Y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


MLPClassifier(activation='relu', alpha=1e-05, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(100,), learning_rate='constant',
              learning_rate_init=0.001, max_fun=15000, max_iter=200,
              momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
              power_t=0.5, random_state=1, shuffle=True, solver='lbfgs',
              tol=0.0001, validation_fraction=0.1, verbose=False,
              warm_start=False)

In [13]:
predictions = mlp.predict(X_test.toarray())

print('Accuracy = ', metrics.accuracy_score(Y_test, predictions))

# printing the precision and recall over each class 
target_names = ['REAL', 'FAKE']
print(metrics.classification_report(Y_test, predictions, 
                                    target_names=target_names, digits=4))

# printing accuracy over each class
cmat = metrics.confusion_matrix(Y_test, predictions)
print(zip(target_names, cmat.diagonal()/cmat.sum(axis=1)))

Accuracy =  0.6144652191395256
              precision    recall  f1-score   support

        REAL     0.5775    0.8534    0.6888     99480
        FAKE     0.7192    0.3756    0.4934     99480

    accuracy                         0.6145    198960
   macro avg     0.6483    0.6145    0.5911    198960
weighted avg     0.6483    0.6145    0.5911    198960

<zip object at 0x1bbe1c988>


In [216]:
def explain_img(explainer, idx, d):
    exp = explainer.explain_instance(testing_img_annotations[idx], d.predict_proba, num_features=100)
    print('prediction: %s' % class_names[d.predict([testing_img_annotations[idx]])[0]])
    print('Probabiliy of it being fake: ', d.predict_proba([testing_img_annotations[idx]]).round(4)[0,1])
    print('True class: %s' % class_names[testing_y[idx]])
    print('sentence: %s' % testing_annotations[idx])
    print('correct word:%s, foil word:%s' % testing_foils[idx])
    print('image_id: %d' % testimglist[idx])
    
    %matplotlib inline
    exp.show_in_notebook(text=True)
    exp.as_pyplot_figure()
    
    return exp

In [None]:
# Explain raw text prediction
# predict proba converts featurized to raw representation
d = make_pipeline(tf_vectorizer, mlp)
d.predict_proba([testing_img_annotations[0]]).round(3)
class_names = ['REAL', 'FAKE']
from lime.lime_text import LimeTextExplainer
explainor = LimeTextExplainer(class_names=class_names)
explanation = explain_img(explainor, 3, d)

In [17]:
# text and image features
training_image_feats_sparse = ss.csr_matrix(np.array(training_image_feats))
testing_image_feats_sparse = ss.csr_matrix(np.array(testing_image_feats))

X_train = ss.hstack([(training_feats), training_image_feats_sparse])
X_test = ss.hstack([(testing_feats), testing_image_feats_sparse])

Y_train = np.array(training_y)
Y_test = np.array(testing_y)

TypeError: no supported conversion for types: (dtype('O'),)

In [None]:
predictions = mlp.predict(X_test.toarray())
mlp = MLPClassifier(solver='lbfgs', alpha=1e-5, random_state=1)
mlp.fit(X_train, Y_train)
print('Accuracy = ', metrics.accuracy_score(Y_test, predictions))
target_names = ['REAL', 'FAKE']
print(metrics.classification_report(Y_test, predictions, 
                                    target_names=target_names, digits=4))
cmat = metrics.confusion_matrix(Y_test, predictions)
print(zip(target_names, cmat.diagonal()/cmat.sum(axis=1)))