# source code
### normData.py 
### caption_accuracy.py
### image_cnn.py
### tsne_viz.py
# file 
### normData : pickled instance of class NormData, holding all the caption vectors, images and labels
### cap_vec : Result after Doc2Vec and NN, 1000 vectors of 20 dimensions
### fc1_pre : 1000 vectors of 20 dimensions, produced by the CNN with a single fully-connected layer
### fc2_pre : 1000 vectors of 20 dimensions, produced by the CNN with two fully-connected layers
### fc2_cap_40dim_viz.png : result of visualization using union of fc2_pre and cap_vec

 # normData.py
 ### use a class to generate the elements of data sets 

In [1]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import gensim
import numpy as np
from os import listdir
from os.path import isfile, join
from collections import namedtuple
from skimage import io
from skimage.transform import resize
from PIL import ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True


class NormData(object):
    """Load captions, labels and images of a dataset into in-memory vectors.

    Each of the three sources is optional; a source that is ``None`` is
    skipped and the matching attribute stays an empty list.

    Attributes:
        captions: Doc2Vec vectors, one per caption file, stored at position
            ``int(file stem) - 1``.
        images: image arrays resized to 28x28, one per image file.
        labels: one-hot lists, one per non-blank line of the label file.
        class_label_dict: maps each label string to its 1-based class index
            (labels are numbered in sorted order).
    """

    def __init__(self, captions_directory=None, images_directory=None, label_file=None):
        self.captions_src = captions_directory
        self.images_src = images_directory
        self.label_src = label_file
        self.captions = []
        self.images = []
        self.labels = []
        self.class_label_dict = {}
        if captions_directory is not None:
            self.caps2vec()
        if label_file is not None:
            self.labels2vec()
        if images_directory is not None:
            self.img2vec()

    @staticmethod
    def _sorted_file_names(directory):
        """Return the regular files of *directory* in a deterministic order.

        Names whose stem (text before the first '.') is an integer come first,
        in numeric order; all other names follow in lexicographic order.
        ``listdir`` alone returns entries in arbitrary order.
        """
        def order(name):
            stem = name.split('.')[0]
            try:
                return (0, int(stem), name)
            except ValueError:
                return (1, 0, name)

        names = [f for f in listdir(directory) if isfile(join(directory, f))]
        return sorted(names, key=order)

    def caps2vec(self):
        """Train a Doc2Vec model on the caption files and store one vector per file.

        Every file stem must be an integer ``k >= 1``; its vector is written to
        ``self.captions[k - 1]``.
        """
        # extract captions and file numbers as a tuple
        caption_file = namedtuple('namedtuple_file', 'doc number')
        captions_names = []
        for file_name in self._sorted_file_names(self.captions_src):
            # join() works whether or not the directory ends with a separator
            with open(join(self.captions_src, file_name), 'r') as f:
                captions_names.append(caption_file(f.read(), file_name.split('.')[0]))

        # split each file into lower-cased words, tagged with its file number
        analyzed_document = namedtuple('analyzed_document', 'words tags')
        words_number = [
            analyzed_document(doc.doc.replace('.', '\n').lower().split(), [doc.number])
            for doc in captions_names
        ]

        # construct a doc2vec model
        # NOTE(review): `size=` and `docvecs` look like the pre-4.0 gensim
        # names -- confirm against the installed gensim version.
        model = gensim.models.Doc2Vec(words_number, dbow_words=1, size=1024, window=8, min_count=5, workers=4)

        # extract captions vectors and place them by their numeric tag
        self.captions = [None] * len(model.docvecs)
        for i, vec in enumerate(model.docvecs):
            index = int(model.docvecs.index_to_doctag(i))
            self.captions[index - 1] = vec

    def labels2vec(self):
        """Read one label per line and append a one-hot vector per label to ``self.labels``.

        Reading stops at the first blank (or whitespace-only) line or at end of
        file. ``self.class_label_dict`` is replaced with the label -> class
        index mapping.
        """
        labels = []
        with open(self.label_src, 'r') as labels_file:
            for raw_line in labels_file:
                line = raw_line.rstrip()
                if not line:
                    break
                labels.append(line)

        # match label and class: classes are numbered from 1 in sorted order
        classes = sorted(set(labels))
        self.class_label_dict = {name: idx for idx, name in enumerate(classes, start=1)}

        # construct labels list
        for label in labels:
            label_vec = [0] * len(classes)
            label_vec[self.class_label_dict[label] - 1] = 1
            self.labels.append(label_vec)

    def img2vec(self):
        """Read every image file, resize it to 28x28 and append it to ``self.images``.

        Files are visited in the order given by ``_sorted_file_names`` so that
        the result does not depend on the directory listing order.
        NOTE(review): presumably image ``k.*`` belongs to label line ``k`` --
        verify against the dataset layout.
        """
        for img_name in self._sorted_file_names(self.images_src):
            img = io.imread(join(self.images_src, img_name))
            self.images.append(resize(img, (28, 28)))



# caption_accuracy.py
### construct the neural network for captions
### test the accuracy of the captions after Doc2Vec
### get the vectors using the model

In [2]:
import numpy as np
import tensorflow as tf
import pickle



# load datasets
# n = norm_dataset.NormData(captions_directory="../data/pascal-sentences/ps_captions/"
#                           ,label_file="../data/pascal-sentences/labels.txt")
#                           #,images_directory="../data/pascal-sentences/ps_images/")

# n = pickle.load(open("../captions_vectors_nn/normData", "rb"))
# NOTE: unpickling can execute arbitrary code -- only load a trusted file.
# The context manager closes the file handle once the object is loaded.
with open("normData", "rb") as normdata_file:
    n = pickle.load(normdata_file)

# The model is trained on the complete data set.
train_data_nd = np.array(n.captions)

train_label_nd = np.array(n.labels)

data_nd = np.array(n.captions)
label_nd = np.array(n.labels)

# Within every run of 10 consecutive samples the first TRAIN_PER_TEN go to the
# train lists and the rest to the test lists. With the value 10 every sample is
# a training sample and the test lists (and arrays below) stay empty.
TRAIN_PER_TEN = 10

train_data = []
test_data = []
train_label = []
test_label = []
for i in range(1000):
    if i % 10 < TRAIN_PER_TEN:
        train_data.append(data_nd[i])
        train_label.append(label_nd[i])
    else:
        test_data.append(data_nd[i])
        test_label.append(label_nd[i])


test_data_nd = np.array(test_data)
test_label_nd = np.array(test_label)


# implement the regression
# A single linear layer maps a caption vector to 20 class scores.
captions_dim = 1024
x = tf.placeholder(tf.float32, [None, captions_dim])


W1 = tf.Variable(tf.zeros([captions_dim, 20]))
b1 = tf.Variable(tf.zeros([20]))
y1 = tf.add(tf.matmul(x, W1), b1)


# output
y = y1
y_ = tf.placeholder(tf.float32, [None, 20])

# cross_entropy
# y holds unnormalised scores; the softmax is applied inside the loss op.
per_example_loss = tf.nn.softmax_cross_entropy_with_logits(logits=y, labels=y_)
cross_entropy = tf.reduce_mean(per_example_loss)
train_step = tf.train.GradientDescentOptimizer(0.5).minimize(cross_entropy)

sess = tf.InteractiveSession()
sess.run(tf.global_variables_initializer())

# accuracy
# Fraction of rows whose highest-scoring class equals the labelled class.
predicted_class = tf.argmax(y, 1)
labelled_class = tf.argmax(y_, 1)
correct_prediction = tf.equal(predicted_class, labelled_class)
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

# train
# Every step uses the whole training set; the accuracy printed every 200 steps
# is measured on that same set.
train_feed = {x: train_data_nd, y_: train_label_nd}
for i in range(20000):
    sess.run(train_step, feed_dict=train_feed)
    if i % 200 != 0:
        continue
    print(i, ',accuracy:', sess.run(accuracy, feed_dict=train_feed))


# get the captions vectors
# cap_vec = sess.run(tf.nn.softmax(y),feed_dict={x:train_data_nd})
# pickle.dump(cap_vec,open('cap_vec','wb'))




0 ,accuracy: 0.278
200 ,accuracy: 0.398
400 ,accuracy: 0.421
600 ,accuracy: 0.437
800 ,accuracy: 0.456
1000 ,accuracy: 0.469
1200 ,accuracy: 0.485
1400 ,accuracy: 0.502
1600 ,accuracy: 0.513
1800 ,accuracy: 0.516
2000 ,accuracy: 0.527
2200 ,accuracy: 0.533
2400 ,accuracy: 0.538
2600 ,accuracy: 0.546
2800 ,accuracy: 0.548
3000 ,accuracy: 0.551
3200 ,accuracy: 0.554
3400 ,accuracy: 0.556
3600 ,accuracy: 0.56
3800 ,accuracy: 0.564
4000 ,accuracy: 0.566
4200 ,accuracy: 0.568
4400 ,accuracy: 0.569
4600 ,accuracy: 0.573
4800 ,accuracy: 0.573
5000 ,accuracy: 0.576
5200 ,accuracy: 0.575
5400 ,accuracy: 0.577
5600 ,accuracy: 0.579
5800 ,accuracy: 0.581
6000 ,accuracy: 0.582
6200 ,accuracy: 0.586
6400 ,accuracy: 0.585
6600 ,accuracy: 0.585
6800 ,accuracy: 0.588
7000 ,accuracy: 0.588
7200 ,accuracy: 0.59
7400 ,accuracy: 0.589
7600 ,accuracy: 0.588
7800 ,accuracy: 0.588
8000 ,accuracy: 0.588
8200 ,accuracy: 0.589
8400 ,accuracy: 0.59
8600 ,accuracy: 0.59
8800 ,accuracy: 0.589
9000 ,accuracy: 0.589

# images_cnn.py
### to construct a convolutional neural network to train with the images and test the accuracy

In [None]:
import pickle
import random

import numpy as np
import tensorflow as tf

#
# n = norm_dataset.NormData(images_directory="../data/pascal-sentences/ps_images/"
#                           ,label_file="../data/pascal-sentences/labels.txt"
#                           ,captions_directory="../data/pascal-sentences/ps_captions/")
# pickle.dump(n,open('normData','wb'))

# NOTE: unpickling can execute arbitrary code -- only load a trusted file.
# The context manager closes the file handle once the object is loaded.
with open("normData", "rb") as normdata_file:
    n = pickle.load(normdata_file)

data_nd = np.array(n.images)
label_nd = np.array(n.labels)

# Within every run of 10 consecutive samples the first 6 go to the train split
# and the remaining 4 to the test split.
train_data = []
test_data = []
train_label = []
test_label = []
for i in range(1000):
    if i % 10 < 6:
        train_data.append(data_nd[i])
        train_label.append(label_nd[i])
    else:
        test_data.append(data_nd[i])
        test_label.append(label_nd[i])

train_image = np.array(train_data)
train_label = np.array(train_label)
test_image = np.array(test_data)
test_label = np.array(test_label)

def shuffle_dataset(images, labels):
    """Shuffle images and labels together, keeping each image with its label.

    Args:
        images: indexable sequence of image data.
        labels: indexable sequence with at least ``len(images)`` entries.

    Returns:
        A pair ``(images, labels)`` of float32 NumPy arrays in the same new
        random order. The order comes from the module-level ``random`` state.
    """
    pairs = [(images[idx], labels[idx]) for idx in range(len(images))]
    random.shuffle(pairs)
    shuffled_images = [pair[0] for pair in pairs]
    shuffled_labels = [pair[1] for pair in pairs]
    return (np.array(shuffled_images, np.float32),
            np.array(shuffled_labels, np.float32))

def compute_accuracy(v_xs, v_ys):
    """Return the fraction of samples whose predicted class matches the label.

    Args:
        v_xs: batch of inputs fed to the ``xs`` placeholder.
        v_ys: matching label vectors; the labelled class of a row is the index
            of its largest entry.

    Returns:
        The accuracy as a ``numpy.float32`` between 0 and 1.
    """
    # keep_prob is fed as 1 so dropout drops nothing during evaluation.
    y_pre = sess.run(prediction, feed_dict={xs: v_xs, keep_prob: 1})
    # Compare in NumPy: creating tf.argmax/tf.equal ops here would add new
    # nodes to the graph on every call.
    hits = np.argmax(y_pre, 1) == np.argmax(v_ys, 1)
    return np.float32(np.mean(hits))


def weight_variable(shape):
    """Return a tf.Variable of the given shape drawn from a truncated normal (stddev 0.1)."""
    return tf.Variable(tf.truncated_normal(shape, stddev=0.1))


def bias_variable(shape):
    """Return a tf.Variable of the given shape with every entry set to 0.1."""
    return tf.Variable(tf.constant(0.1, shape=shape))

def conv2d(x, W):
    """Convolve ``x`` with filter ``W`` using stride 1 and 'SAME' padding."""
    unit_strides = [1, 1, 1, 1]
    return tf.nn.conv2d(x, W, strides=unit_strides, padding='SAME')


def max_pool_2x2(x):
    """Max-pool ``x`` with stride 2 and 'SAME' padding.

    NOTE(review): despite the name, the pooling window is 3x3 (overlapping),
    not 2x2 -- confirm this is intended.
    """
    window = [1, 3, 3, 1]
    step = [1, 2, 2, 1]
    return tf.nn.max_pool(x, ksize=window, strides=step, padding='SAME')


# define placeholder for inputs to network
xs = tf.placeholder(tf.float32, [None, 28,28,3])  # batch of 28x28 images with 3 channels
ys = tf.placeholder(tf.float32, [None, 20])  # one 20-entry label vector per image
keep_prob = tf.placeholder(tf.float32)  # dropout keep probability, fed on every run

x_image = xs

## conv1 layer ##
W_conv1 = weight_variable([5, 5, 3, 16])  # patch 5x5, in size 3, out size 16
b_conv1 = bias_variable([16])
h_conv1 = tf.nn.relu(conv2d(x_image, W_conv1) + b_conv1)    # output size 28x28x16 ('SAME' padding, stride 1)
h_pool1 = max_pool_2x2(h_conv1)  # output size 14x14x16 (stride-2 pooling)

## conv2 layer ##
W_conv2 = weight_variable([5, 5, 16, 32])  # patch 5x5, in size 16, out size 32
b_conv2 = bias_variable([32])
h_conv2 = tf.nn.relu(conv2d(h_pool1, W_conv2) + b_conv2)  # output size 14x14x32
h_pool2 = max_pool_2x2(h_conv2)  # output size 7x7x32

## Two alternative heads follow. The commented-out one maps the flattened
## features straight to 20 outputs (one fully-connected layer); the active
## one uses two fully-connected layers. To use the single-layer head, replace
## the active "fc1 layer" and "fc2 layer" sections with the commented code.
## fc1 layer ##
# h_pool2_flat = tf.reshape(h_pool2, [-1,32*7*7])
# W_fc1 = weight_variable([32*7*7, 20])
# b_fc1 = bias_variable([20])
# h_fc1 = tf.nn.relu(tf.matmul(h_pool2_flat, W_fc1) + b_fc1)
# prediction = tf.nn.softmax(h_fc1)

## fc1 layer ##
# flatten the 7x7x32 feature maps into one vector per image
h_pool2_flat = tf.reshape(h_pool2, [-1,32*7*7])
W_fc1 = weight_variable([32*7*7, 1024])
b_fc1 = bias_variable([1024])
h_fc1 = tf.nn.relu(tf.matmul(h_pool2_flat, W_fc1) + b_fc1)
h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob)

## fc2 layer ##
W_fc2 = weight_variable([1024, 20])
b_fc2 = bias_variable([20])
# softmax turns the 20 scores into class probabilities
prediction = tf.nn.softmax(tf.matmul(h_fc1_drop, W_fc2) + b_fc2)

# Cross-entropy between the label vectors and the predicted probabilities,
# averaged over the batch. The probabilities are clipped away from 0 first:
# tf.log(0) is -inf and would turn the loss (and the gradients) into NaN.
clipped_prediction = tf.clip_by_value(prediction, 1e-10, 1.0)
cross_entropy = tf.reduce_mean(-tf.reduce_sum(ys * tf.log(clipped_prediction),reduction_indices=[1]))  # loss
train_step = tf.train.AdamOptimizer(1e-4).minimize(cross_entropy)

sess = tf.Session()
# Pick the initializer by feature detection. Comparing only the minor version
# number misfires on 1.x releases (e.g. "1.4" gives 4 < 12) and selects the
# deprecated initialize_all_variables there.
if hasattr(tf, 'global_variables_initializer'):
    init = tf.global_variables_initializer()
else:
    init = tf.initialize_all_variables()
sess.run(init)

# Train for 1000 steps with dropout (keep probability 0.5) and print the
# accuracy in percent every 10 steps.
# NOTE(review): both training and the reported accuracy use the complete data
# set (data_nd / label_nd), not the train/test split -- confirm this is intended.
train_feed = {xs: data_nd, ys: label_nd, keep_prob: 0.5}
for i in range(1000):
    sess.run(train_step, feed_dict=train_feed)
    if i % 10 != 0:
        continue
    accuracy = 100 * compute_accuracy(data_nd, label_nd)
    print(accuracy)


# NOTE(review): the two exports below feed keep_prob 0.5, so dropout is still
# active while the vectors are computed; keep_prob 1 is probably what is wanted.

# get the vectors of fc2
# fc2_pre = sess.run(prediction,feed_dict={xs:data_nd,keep_prob:0.5})
# pickle.dump(fc2_pre,open('fc2_pre','wb'))

# get the vectors of fc1
# fc1_pre = sess.run(prediction,feed_dict={xs:data_nd,keep_prob:0.5})
# pickle.dump(fc1_pre,open('fc1_pre','wb'))

# tsne_viz.py
### including the union of the vectors, t-sne and the visualization

In [1]:
import numpy as np
import matplotlib.pyplot as plt

import pickle
from sklearn.manifold import TSNE



# load the vectors
# NOTE: unpickling can execute arbitrary code -- only load trusted files.
# The context managers close the file handles once the objects are loaded.
with open('fc2_pre', 'rb') as fc2_file:
    fc2_pre = pickle.load(fc2_file)
with open('cap_vec', 'rb') as cap_file:
    cap_vec = pickle.load(cap_file)

# union the vectors: row i is caption vector i followed by image vector i
captions_images_vectors = np.array(
    [np.append(cap_vec[i], fc2_pre[i]) for i in range(1000)])

# tsne
tsne = TSNE(perplexity=10, n_components=2, n_iter=5000, random_state=0)
low_dim_embs = tsne.fit_transform(captions_images_vectors)

# reshape into 20 groups of 50 consecutive 2-D points
# NOTE(review): assumes the samples are ordered by class, 50 per class -- confirm.
low_dim_embs_rs = low_dim_embs.reshape((20,50,2))


# 20 hex colours; the visualization function below draws one group of points per colour
colorbar = [
    '#e4007f', '#a40000','#ea68a2','#a84200','#f19149',
    '#fff45c','#8fc31f','#009944','#00736d','#0075a9',
    '#004986','#500047','#b28850','#81511c','#6a3906',
    '#59493f','#616e81','#898989','#89c997','#000000']

def visualization(data):
    """Scatter-plot groups of 2-D points, one colour per group, and show the figure.

    Args:
        data: sequence of groups; each group is a 2-D array whose column 0
            holds the x coordinates and column 1 the y coordinates. Group ``i``
            is drawn in ``colorbar[i]`` and labelled ``class<i+1>``. Groups
            beyond the number of colours are not drawn.
    """
    fig, ax = plt.subplots(figsize=(20, 10))
    for class_index, (color, points) in enumerate(zip(colorbar, data)):
        ax.scatter(points[:, 0], points[:, 1], c=color, s=20,
                   label="class" + str(class_index + 1), alpha=1, edgecolors='none')
    # Build the legend once, after every group has been added.
    ax.legend()
    plt.show()


# plot the reshaped t-SNE embedding, one colour per group of points
visualization(low_dim_embs_rs)