In [None]:
from pickle import dump
from keras.applications.vgg16 import VGG16
from keras.preprocessing.image import load_img
from keras.preprocessing.image import img_to_array
from keras.models import Model, load_model
from keras.applications.vgg16 import preprocess_input
import os
import tokenize
import string
from numpy import array
from pickle import load
from pickle import dump
from keras.preprocessing.text import Tokenizer
from keras.callbacks import ModelCheckpoint
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.utils import plot_model
from keras.models import Model
from keras.layers import Input
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding
from keras.layers import Dropout
from keras.layers.merge import add
from keras.callbacks import ModelCheckpoint
import keras.backend as K
from numpy import argmax
from nltk.translate.bleu_score import corpus_bleu

In [None]:
#load dataset and use features 
def extract(path):
    """Extract a CNN feature vector for every image file in ``path``.

    Args:
        path: directory containing the images to process.

    Returns:
        dict mapping an image id (the file name up to its first '.') to the
        array returned by ``model.predict`` for that image.
    """
    # VGG16 is the network this notebook imports (together with its matching
    # preprocess_input); the previously used name VGG19 was never imported.
    # Architecture reference: https://gist.github.com/baraldilorenzo/07d7802847aaad0a35d3
    base = VGG16()
    # Tap the layer just before the final classifier directly. Popping from
    # `model.layers` is not a reliable way to drop the last layer, because the
    # list may be a copy that does not affect the model graph.
    model = Model(inputs=base.inputs, outputs=base.layers[-2].output)
    features = dict()
    # Dataset folders may also contain non-image files (e.g. .txt); skip them
    # instead of letting load_img fail.
    image_suffixes = ('.jpg', '.jpeg', '.png', '.bmp')
    for name in os.listdir(path):
        if not name.lower().endswith(image_suffixes):
            continue
        filename = os.path.join(path, name)
        print(filename)
        # target_size is (height, width); 224x224 is the VGG input resolution.
        image = load_img(filename, target_size=(224, 224))
        image = img_to_array(image)
        # Add a leading batch dimension of 1 for model.predict.
        image = image.reshape((1, image.shape[0], image.shape[1], image.shape[2]))
        image = preprocess_input(image)
        feature = model.predict(image, verbose=0)
        image_id = name.split('.')[0]
        features[image_id] = feature
    return features
#features = extract('Flicker8k_Dataset/Data')
#print('Extracted Features: %d' % len(features))
#dump(features, open('features.pkl', 'wb'))

        


In [None]:
#loading text data
# Local dataset directory (only assigned here in the visible notebook).
# Raw string so the Windows backslashes are never interpreted as escape sequences.
path = r"D:\Machine Learning\datasets\Flicker8k_Dataset\Data"
def load_text(filename):
    """Return the full contents of the text file ``filename`` as one string."""
    # The context manager closes the handle even if read() raises.
    with open(filename, 'r') as file:
        return file.read()
# Read the raw caption token file into memory as a single string.
filename="Flickr8k_text/Flickr8k.token.txt"
doc=load_text(filename)


In [None]:
#mapping image to multiple text description in train set using image id
def mapi(doc):
    """Group caption lines by image id.

    Each line of ``doc`` is split on whitespace: the first token (truncated at
    its first '.') is the image id and the remaining tokens, re-joined with
    single spaces, form one caption. Lines with fewer than two tokens are
    ignored.

    Returns:
        dict mapping image id -> list of caption strings, in file order.
    """
    captions_by_image = {}
    for line in doc.split("\n"):
        parts = line.split()
        if len(parts) < 2:
            continue
        key = parts[0].split('.')[0]
        captions_by_image.setdefault(key, []).append(' '.join(parts[1:]))
    return captions_by_image
# Image id -> list of raw captions, built from the loaded token file.
description=mapi(doc)

In [None]:
#clean descriptions: lowercase, strip punctuation, drop single-character and non-alphabetic tokens
def clean(description):
    """Normalise every caption in ``description`` in place.

    Each caption is split on whitespace; every token is lower-cased and has
    all ``string.punctuation`` characters removed. Tokens that end up with a
    single character or fewer, or that are not purely alphabetic, are
    dropped. The surviving tokens are re-joined with single spaces and written
    back to the same list position. Returns None.
    """
    strip_punct = str.maketrans('', '', string.punctuation)
    for captions in description.values():
        for idx, caption in enumerate(captions):
            kept = []
            for token in caption.split():
                token = token.lower().translate(strip_punct)
                if len(token) > 1 and token.isalpha():
                    kept.append(token)
            captions[idx] = ' '.join(kept)

In [None]:
# convert the loaded descriptions into a vocabulary of words
def to_vocabulary(description):
    """Return the set of distinct whitespace-separated words across all captions.

    Args:
        description: dict mapping image id -> list of caption strings.
    """
    words = set()
    for captions in description.values():
        for caption in captions:
            words.update(caption.split())
    return words
# Normalise the captions in place before anything is derived from them.
# clean() was defined but never called, so the vocabulary (and the
# descriptions file written later) were being built from the raw captions.
clean(description)
vocabulary = to_vocabulary(description)


In [None]:
# save descriptions to file, one per line
def save_descriptions(descriptions, filename):
    """Write every caption to ``filename``, one ``<image id> <caption>`` per line.

    Args:
        descriptions: dict mapping image id -> list of caption strings.
        filename: path of the text file to (over)write.

    Lines are joined with '\\n' and no trailing newline is written.
    """
    lines = [
        key + ' ' + desc
        for key, desc_list in descriptions.items()
        for desc in desc_list
    ]
    # The context manager guarantees the file is flushed and closed even if
    # write() raises.
    with open(filename, 'w') as file:
        file.write('\n'.join(lines))
# Persist the caption mapping so later cells can reload it from disk.
save_descriptions(description, 'descriptions.txt')

In [None]:
#putting all data together
def load_doc(filename):
    """Return the full contents of the text file ``filename`` as one string."""
    # The context manager closes the handle even if read() raises.
    with open(filename, 'r') as file:
        return file.read()
def load_set(filename):
    """Load a split file and return the set of image identifiers it lists.

    Every non-empty line contributes the text before its first '.'.
    """
    identifiers = set()
    for row in load_doc(filename).split('\n'):
        if row:
            identifiers.add(row.split('.')[0])
    return identifiers
#cleaned desc
def load_clean_descriptions(filename, dataset):
    """Load the saved captions for the images in ``dataset``.

    Args:
        filename: text file with one ``<image id> <caption>`` entry per line.
        dataset: collection of image ids to keep.

    Returns:
        dict mapping image id -> list of captions, each wrapped as
        ``'startseq ' + caption + ' endseq'``.
    """
    doc = load_doc(filename)
    descriptions = dict()
    for line in doc.split('\n'):
        tokens = line.split()
        # Blank lines (e.g. a trailing newline) have no tokens; skip them
        # rather than failing with IndexError on tokens[0].
        if not tokens:
            continue
        image_id, image_desc = tokens[0], tokens[1:]
        if image_id in dataset:
            if image_id not in descriptions:
                descriptions[image_id] = list()
            # Sequence delimiters used as start/stop markers during generation.
            desc = 'startseq ' + ' '.join(image_desc) + ' endseq'
            descriptions[image_id].append(desc)
    return descriptions
#load image features
def load_image_features(filename,dataset):
    """Load pickled image features and keep only the ids in ``dataset``.

    Args:
        filename: path to a pickle file holding a dict keyed by image id.
        dataset: iterable of image ids to select.

    Returns:
        dict mapping each id in ``dataset`` to its stored feature.

    Raises:
        KeyError: if an id in ``dataset`` is missing from the pickle.
    """
    # NOTE: unpickling can execute arbitrary code; only load feature files
    # that you generated yourself.
    with open(filename, 'rb') as handle:
        all_features = load(handle)
    features = {k: all_features[k] for k in dataset}
    return features
# Load the list of training image ids.
filename = 'Flickr8k_text/Flickr_8k.trainImages.txt'
train = load_set(filename)
print('Dataset: %d' % len(train))
# captions for the training images, wrapped in startseq/endseq markers
train_descriptions = load_clean_descriptions('descriptions.txt', train)
print('Descriptions: train=%d' % len(train_descriptions))
# image features for the training images; requires features.pkl to exist on disk
train_features = load_image_features('features.pkl', train)
print('Photos: train=%d' % len(train_features))


In [None]:
#convert the dict of descriptions into a flat list of description strings
def to_lines(descriptions):
    """Flatten ``{image id: [caption, ...]}`` into a single list of captions."""
    flat = []
    for captions in descriptions.values():
        flat.extend(captions)
    return flat
# fit a tokenizer given caption descriptions
def create_tokenizer(descriptions):
    """Return a Keras ``Tokenizer`` fitted on every caption in ``descriptions``."""
    fitted = Tokenizer()
    fitted.fit_on_texts(to_lines(descriptions))
    return fitted
 
# Fit the tokenizer on the training captions only.
tokenizer = create_tokenizer(train_descriptions)
# One more than the number of distinct words; presumably this leaves index 0
# free for padding (the Embedding layer uses mask_zero=True) - confirm.
vocab_size = len(tokenizer.word_index) + 1
print('Vocabulary Size: %d' % vocab_size)


In [None]:
# data generator, intended to be used in a call to model.fit_generator()
def data_generator(descriptions, photos, tokenizer, max_length):
    """Yield one training batch per image, cycling over the data forever.

    For each image id the first row of its stored feature array is paired with
    all of that image's captions via ``create_sequences``. Every yielded item
    is ``[[image_inputs, sequence_inputs], next_word_targets]``.
    """
    while True:
        for image_id in descriptions:
            captions = descriptions[image_id]
            feature_vec = photos[image_id][0]
            batch = create_sequences(tokenizer, max_length, captions, feature_vec)
            yield [[batch[0], batch[1]], batch[2]]

In [None]:
# create sequences of images, input sequences and output words for an image
def create_sequences(tokenizer, max_length, desc_list, photo, vocab_size=None):
    """Build (image, partial caption) -> next word samples for one image.

    Args:
        tokenizer: fitted tokenizer used to encode the captions.
        max_length: length every input sequence is padded to.
        desc_list: captions belonging to the image.
        photo: feature vector of the image, repeated for every sample.
        vocab_size: width of the one-hot targets. Defaults to
            ``len(tokenizer.word_index) + 1`` so the function no longer relies
            on a notebook-level global being defined.

    Returns:
        Tuple of arrays ``(X1, X2, y)``: image features, padded input
        sequences and one-hot encoded next words.
    """
    if vocab_size is None:
        vocab_size = len(tokenizer.word_index) + 1
    X1, X2, y = list(), list(), list()
    for desc in desc_list:
        seq = tokenizer.texts_to_sequences([desc])[0]
        # Every prefix of the caption predicts the word that follows it.
        for i in range(1, len(seq)):
            in_seq, out_seq = seq[:i], seq[i]
            in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
            out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
            X1.append(photo)
            X2.append(in_seq)
            y.append(out_seq)
    return array(X1), array(X2), array(y)

# calculate the length of the description with the most words
def max_len(descriptions):
    """Return the word count of the longest caption in ``descriptions``.

    Raises ValueError when there are no captions at all.
    """
    word_counts = (len(caption.split()) for caption in to_lines(descriptions))
    return max(word_counts)


In [None]:
#captioning model
def caption_model(vocabsize,maxlen):
    """Build and compile the merge-style captioning network.

    Args:
        vocabsize: number of tokenizer indices; sizes the embedding and the
            softmax output layer.
        maxlen: length of the padded input word sequences.

    Returns:
        A compiled Keras ``Model`` taking ``[image_features, word_sequence]``
        and producing a softmax over the vocabulary.
    """
    # Image branch: 4096-wide feature vector -> 256 units.
    inputs1 = Input(shape=(4096,))
    fe1 = Dropout(0.5)(inputs1)
    fe2 = Dense(256, activation='relu')(fe1)
    # Text branch: use the function arguments rather than the notebook-level
    # globals, which the original body read by mistake.
    inputs2 = Input(shape=(maxlen,))
    se1 = Embedding(vocabsize, 256, mask_zero=True)(inputs2)
    se2 = Dropout(0.5)(se1)
    se3 = LSTM(256)(se2)
    # Merge both branches by element-wise addition, then predict the next word.
    decoder1 = add([fe2, se3])
    decoder2 = Dense(256, activation='relu')(decoder1)
    outputs = Dense(vocabsize, activation='softmax')(decoder2)
    model = Model(inputs=[inputs1, inputs2], outputs=outputs)
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    # summary() prints the table itself; wrapping it in print() only added a
    # stray "None" line.
    model.summary()
    #plot_model(model, to_file='model.png', show_shapes=True)
    return model



In [None]:
# Load the training split, its captions and its image features.
filename = 'Flickr8k_text/Flickr_8k.trainImages.txt'
train = load_set(filename)
print('Dataset: %d' % len(train))
train_descriptions = load_clean_descriptions('descriptions.txt', train)
print('Descriptions: train=%d' % len(train_descriptions))
train_features = load_image_features('features.pkl', train)
print('Photos: train=%d' % len(train_features))
# Fit the tokenizer and derive the sizes the model is built with.
tokenizer = create_tokenizer(train_descriptions)
vocab_size = len(tokenizer.word_index) + 1
print('Vocabulary Size: %d' % vocab_size)
max_length = max_len(train_descriptions)
print('Description Length: %d' % max_length)
model = caption_model(vocab_size, max_length)
epochs = 1
# The generator yields one batch per image, so one full pass over the data
# takes len(train_descriptions) steps.
steps = len(train_descriptions)
for i in range(100):
    # A fresh generator starts again at the first image on every iteration.
    generator = data_generator(train_descriptions, train_features, tokenizer, max_length)
    # Use the computed `steps`: with steps_per_epoch=1 every call trained on
    # the first image only and `steps` was never used.
    model.fit_generator(generator, epochs=epochs, steps_per_epoch=steps, verbose=2)
    # Checkpoint after every pass.
    model.save('final_'+str(i)+'.h5')

In [None]:
def word_for_id(integer, tokenizer):
    """Return the word whose tokenizer index equals ``integer``, or None.

    Performs a linear scan over ``tokenizer.word_index`` and returns the first
    match.
    """
    matches = (word for word, index in tokenizer.word_index.items() if index == integer)
    return next(matches, None)
# generate a description for an image
def generate_desc(model, tokenizer, photo, max_length):
    """Greedily decode a caption for ``photo``.

    Starting from 'startseq', the text so far is encoded and padded to
    ``max_length``, the model predicts the next word, and the highest-scoring
    index is appended as a word. Decoding stops after ``max_length`` words,
    when the index maps to no word, or once 'endseq' has been appended.

    Returns:
        The generated text, including the 'startseq' prefix and, when it was
        produced, the 'endseq' suffix.
    """
    caption = 'startseq'
    for _ in range(max_length):
        encoded = tokenizer.texts_to_sequences([caption])[0]
        padded = pad_sequences([encoded], maxlen=max_length)
        scores = model.predict([photo, padded], verbose=1)
        next_word = word_for_id(argmax(scores), tokenizer)
        if next_word is None:
            return caption
        caption = caption + ' ' + next_word
        if next_word == 'endseq':
            return caption
    return caption
 
# evaluate the skill of the model
def evaluate_model(model, descriptions, photos, tokenizer, max_length):
    """Print corpus-level BLEU-1..4 for captions generated by ``model``.

    Args:
        model: trained captioning model.
        descriptions: dict mapping image id -> list of reference captions.
        photos: dict mapping image id -> image feature array.
        tokenizer: tokenizer used at training time.
        max_length: padded sequence length the model expects.
    """
    actual, predicted = list(), list()
    for key, desc_list in descriptions.items():
        yhat = generate_desc(model, tokenizer, photos[key], max_length)
        # corpus_bleu expects token lists: several references per hypothesis.
        actual.append([d.split() for d in desc_list])
        predicted.append(yhat.split())
    # Each weight tuple covers 1- to 4-grams and sums to 1. The original
    # BLEU-4 call passed only three weights (so 4-grams were never scored)
    # and the BLEU-3 weights summed to 0.9.
    print('BLEU-1: %f' % corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0)))
    print('BLEU-2: %f' % corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0)))
    print('BLEU-3: %f' % corpus_bleu(actual, predicted, weights=(1.0 / 3, 1.0 / 3, 1.0 / 3, 0)))
    print('BLEU-4: %f' % corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25)))


In [None]:
# Load the held-out test split. The original pointed at the *train* image
# list, so any evaluation on `test_*` would have scored the training data.
filename = 'Flickr8k_text/Flickr_8k.testImages.txt'
test = load_set(filename)
print('Dataset: %d' % len(test))
# descriptions
test_descriptions = load_clean_descriptions('descriptions.txt', test)
print('Descriptions: test=%d' % len(test_descriptions))
# image features
test_features = load_image_features('features.pkl', test)
print('Photos: test=%d' % len(test_features))

In [None]:
#save tokenizer in pkl
# Use a context manager so the pickle is flushed and closed before anything
# tries to read it back.
with open('tokenizer.pkl', 'wb') as tokenizer_file:
    dump(tokenizer, tokenizer_file)

In [None]:
# load the tokenizer
# NOTE: unpickling can execute arbitrary code; only load a tokenizer.pkl that
# this notebook wrote.
with open('tokenizer.pkl', 'rb') as tokenizer_file:
    tokenizer = load(tokenizer_file)
# pre-define the max sequence length (from training)
# NOTE(review): this overrides the max_length computed from train_descriptions
# when the model was built; confirm that 40 matches the model's sequence input.
max_length = 40

In [None]:
# Extract features for the images in the 'test' directory and keep the entry
# keyed 'face' (a file whose name up to its first '.' is 'face').
photo=(extract('test'))['face']


In [None]:
# Caption the extracted photo with the model trained earlier in the notebook.
# NOTE(review): this rebinds `description`, which an earlier cell used for the
# image-id -> captions dict; re-running earlier cells afterwards will see a str.
description = generate_desc(model, tokenizer, photo, max_length)
print(description)

In [None]:
import numpy as np
import cv2

# Grab a single frame from the default webcam, save it and caption it.
cap = cv2.VideoCapture(0)
try:
    ret, frame = cap.read()
    # read() reports failure through `ret`; without this check a missing or
    # busy camera would only surface later as an obscure imwrite error.
    if not ret:
        raise RuntimeError('Could not read a frame from the webcam')

    # extract() reads images from disk, so write the frame out first.
    cv2.imwrite("test/face.jpg", frame)
    photo = extract('test')['face']
    description = generate_desc(model, tokenizer, photo, max_length)
    print(description)
    # The original wrapped this in `while True` with an unconditional `break`,
    # so exactly one frame was ever processed and the key-press check after it
    # was unreachable.
finally:
    # When everything is done (or failed), release the capture.
    cap.release()
    cv2.destroyAllWindows()