In [1]:
from dataloader import *
from  config import *
from data_processor import preprocess_data

In [None]:
# Build the loader object for this experiment. `DataLoader` and the configuration
# object are not imported by name in this notebook; presumably they come from the
# star imports of `dataloader` and `config` in the first cell - TODO confirm.
data = DataLoader(config_mixed_flickr8k_flickr8k_inception_fastText)

In [4]:
# Run the preprocessing step and rebind `data` to its return value.
# NOTE(review): because the input name is overwritten, re-running this cell feeds
# already-preprocessed data back into preprocess_data - re-run the loader cell first.
data=preprocess_data(data)

Number of train images:  6000
Number of test images:  1000
Number of train captions:  6000
Number of test captions:  1000
Descriptions cleaned.
[['a', 'brown', 'and', 'a', 'black', 'dog', 'are', 'wearing', 'collars', 'and', 'playing', 'with', 'each', 'other'], ['a', 'brown', 'dog', 'biting', 'a', 'black', 'dog', 'on', 'a', 'sidewalk'], ['the', 'brown', 'dog', 'with', 'a', 'red', 'collar', 'is', 'biting', 'the', 'black', 'dog'], ['two', 'dogs', 'one', 'brown', 'and', 'one', 'black', 'playing', 'with', 'each', 'other'], ['two', 'dogs', 'one', 'brown', 'the', 'other', 'black', 'and', 'white', 'are', 'playing', 'on', 'a', 'sidewalk']]
Descriptions wraped into start and stop words.
['START a child in a pink dress is climbing up a set of stairs in an entry way STOP', 'START a girl going into a wooden building STOP', 'START a little girl climbing into a wooden playhouse STOP', 'START a little girl climbing the stairs to her playhouse STOP', 'START a little girl in a pink dress going into a wo

2023-01-15 23:12:41.542665: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-01-15 23:12:41.543537: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.


Encoded images loaded from: 
./mixed_flickr8k_8k_n/Pickle/encoded_train_images.pkl
Encoded images loaded from: 
./mixed_flickr8k_8k_n/Pickle/encoded_train_images.pkl
Number of training captions  30000
Description Length: 37
preprocessed words 7589 -> 1654
Vocab size:  1655
Found 400000 word vectors.


In [5]:
from keras.layers import LSTM, Embedding, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from keras.layers.merge import add
from keras.models import Model
from keras import Input
from keras import callbacks
from eval_utils import calculate_results, prepare_for_evaluation
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from numpy import array
from eval_utils import generate_report

In [6]:
def data_generator(descriptions, photos, wordtoix, max_length, num_photos_per_batch, vocab_size):
    """
    Endlessly yield training batches for the captioning model.

    Every caption is translated to a sequence of word ids and expanded into
    one training sample per prefix: given the photo features and the first
    ``i`` words, the target is word ``i + 1`` (one-hot encoded). A batch is
    emitted each time ``num_photos_per_batch`` photos have been processed, so
    the number of samples in a batch depends on the caption lengths. A partial
    batch left at the end of one pass over ``descriptions`` is carried over
    into the next pass.

    Parameters
    ----------
    descriptions: dict
        Dictionary where key is image id and value is a list of captions
        (wrapped in start and stop words, lemmatized, without punctuation).
        Captions are split on single spaces.
    photos: dict
        Dictionary with encoded images (vector of image features extracted by
        the specified image feature extractor, e.g. Inception), identified by
        image id.
    wordtoix: dict
        Dictionary with keys - words, values - id of word. Words missing from
        it are silently dropped from the caption.
    max_length: int
        Max number of words in a caption in the dataset; input sequences are
        padded to this length.
    num_photos_per_batch: int
        Number of photos whose samples make up one yielded batch.
    vocab_size: int
        Number of classes used for the one-hot encoded target word.

    Yields
    ------
    tuple
        ``([photo_features, padded_prefixes], one_hot_next_words)`` as arrays.

    Raises
    ------
    ValueError
        When the generator is first advanced, if ``descriptions`` is empty or
        ``num_photos_per_batch`` is smaller than 1. Without this check either
        case would loop forever without ever yielding a batch.
    """
    if not descriptions:
        raise ValueError("descriptions is empty; there is nothing to generate batches from")
    if num_photos_per_batch < 1:
        raise ValueError("num_photos_per_batch must be a positive integer")

    image_features, caption_prefixes, next_words = [], [], []
    photos_in_batch = 0
    # Iterate over the image descriptions forever: each caption becomes a
    # vector of numbers (via wordtoix), which is turned into many
    # (photo + words so far) -> next word pairs, i.e. the rest of the sentence
    # is predicted from the photo and the beginning of the sentence.
    while True:
        for image_id, captions in descriptions.items():
            photos_in_batch += 1
            # retrieve the photo feature from the dictionary
            photo = photos[image_id]
            for caption in captions:
                # encode the sentence as word ids, skipping out-of-vocabulary words
                seq = [wordtoix[word] for word in caption.split(' ') if word in wordtoix]
                # split one sequence into multiple (prefix, next word) pairs
                for split_at in range(1, len(seq)):
                    # pad the input prefix to the fixed model input length
                    prefix = pad_sequences([seq[:split_at]], maxlen=max_length)[0]
                    # one-hot encode the word that follows the prefix
                    target = to_categorical([seq[split_at]], num_classes=vocab_size)[0]
                    image_features.append(photo)
                    caption_prefixes.append(prefix)
                    next_words.append(target)
            # yield the batch data once enough photos were collected
            if photos_in_batch == num_photos_per_batch:
                yield ([array(image_features), array(caption_prefixes)], array(next_words))
                image_features, caption_prefixes, next_words = [], [], []
                photos_in_batch = 0

class ModelImpl:
    """
    Merge-style image captioning model together with its train/evaluate steps.

    The network has two inputs: an image feature vector and a padded sequence
    of word ids. The image branch (Dropout -> Dense) and the text branch
    (Embedding -> Dropout -> LSTM) are summed and decoded by two Dense layers
    into a softmax over the vocabulary.
    """

    def __init__(self, data):
        """
        Build and compile the model from the preprocessed ``data`` object.

        Parameters
        ----------
        data
            Object providing ``configuration``, ``max_length``, ``vocab_size``,
            ``language``, ``embedding_matrix`` and the training/test
            collections read by the other methods.
        """
        self.data = data
        configuration = data.configuration

        # Image branch: the feature vector size depends on the extractor used.
        image_feature_dim = 4096 if configuration['images_processor'] == 'vgg16' else 2048
        inputs1 = Input(shape=(image_feature_dim,))
        fe1 = Dropout(0.5)(inputs1)
        fe2 = Dense(256, activation='relu')(fe1)

        # Text branch.
        inputs2 = Input(shape=(self.data.max_length,))
        # The Embedding layer can be understood as a lookup table that maps from integer
        # indices (which stand for specific words) to dense vectors (their embeddings).
        if configuration["text_processor"] == "fastText":
            embedding_dim = fastText[self.data.language]["embedings_dim"]
        else:
            embedding_dim = glove[self.data.language]["embedings_dim"]
        # Keep a direct reference to the layer instead of looking it up later by
        # its position in model.layers, which silently breaks if the graph changes.
        embedding_layer = Embedding(self.data.vocab_size, embedding_dim, mask_zero=True)
        se1 = embedding_layer(inputs2)
        se2 = Dropout(0.5)(se1)
        se3 = LSTM(256)(se2)

        # Decoder: sum both branches and predict the next word.
        decoder1 = add([fe2, se3])
        decoder2 = Dense(256, activation='relu')(decoder1)
        outputs = Dense(self.data.vocab_size, activation='softmax')(decoder2)
        self.model = Model(inputs=[inputs1, inputs2], outputs=outputs)
        self.model.summary()

        # Load the pre-computed embedding matrix and freeze it before compiling.
        embedding_layer.set_weights([self.data.embedding_matrix])
        embedding_layer.trainable = False

        self.model.compile(loss='categorical_crossentropy', optimizer=self.optimizer())
        self.setup()

    def optimizer(self):
        """Return the Adam optimizer instance used to compile the model."""
        return Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)

    def setup(self):
        """Set the training hyper-parameters: epochs, photos per batch and steps per epoch."""
        # model.optimizer.lr = 0.001
        self.epochs = 2
        self.number_pics_per_bath = 100
        # One step consumes number_pics_per_bath photos from the generator.
        self.steps = len(self.data.train_captions_wrapped) // self.number_pics_per_bath

    def train(self):
        """
        Fit the model, or load previously saved weights.

        When ``configuration["train_model"]`` is truthy the model is fitted on
        batches from ``data_generator`` and, if ``configuration["save_model"]``
        is truthy, the full model and its weights are written to disk.
        Otherwise the weights are loaded from the configured path.
        """
        configuration = self.data.configuration
        model_weights_path = "./" + configuration["data_name"] + configuration["model_save_dir"]
        if configuration["train_model"]:
            callback = callbacks.EarlyStopping(monitor='loss', min_delta=0.001, patience=3)
            generator = data_generator(self.data.train_captions_wrapped,
                                       self.data.encoded_images_train,
                                       self.data.wordtoix,
                                       self.data.max_length,
                                       self.number_pics_per_bath,
                                       self.data.vocab_size)
            self.model.fit(generator, epochs=self.epochs,
                           steps_per_epoch=self.steps,
                           callbacks=[callback],
                           verbose=1)
            if configuration["save_model"]:
                # Full model goes to <dir>/model.h5, weights to the configured path.
                writepath = model_weights_path + "/" + 'model' + '.h5'
                self.model.save(writepath)
                self.model.save_weights(model_weights_path
                                        + configuration["model_save_path"])
        else:
            self.model.load_weights(model_weights_path
                                    + configuration["model_save_path"])

    def evaluate(self):
        """Generate captions for the test images, score them and print the result."""
        expected, results = prepare_for_evaluation(self.data.encoded_images_test, self.data.test_captions_mapping,
                                                   self.data.wordtoix, self.data.ixtoword, self.data.max_length,
                                                   self.model, self.data.configuration["images_processor"])
        out = calculate_results(expected, results, self.data.configuration)
        print(out)

# Build and compile the captioning model from the preprocessed data object;
# the constructor also prints the Keras model summary shown below.
model=ModelImpl(data)


Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_3 (InputLayer)           [(None, 37)]         0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, 2048)]       0           []                               
                                                                                                  
 embedding (Embedding)          (None, 37, 199)      329345      ['input_3[0][0]']                
                                                                                                  
 dropout (Dropout)              (None, 2048)         0           ['input_2[0][0]']                
                                                                                            

In [7]:
# Fit the model, or load saved weights instead when configuration["train_model"] is falsy.
model.train()

In [8]:
# Caption the test images and print the computed scores.
# NOTE(review): the saved output of this cell ends with
# "ModuleNotFoundError: No module named 'gensim'" - install gensim in the kernel's
# environment before re-running, otherwise evaluation does not complete.
model.evaluate()

Preparing for evaluation
Processed:
0
Processed:
100
Processed:
200
Processed:
300
Processed:
400
Processed:
500
Processed:
600
Processed:
700
Processed:
800
Processed:
900


ModuleNotFoundError: No module named 'gensim'

In [None]:
# Produce the report from the configured results directory. `general` is not
# defined in this notebook; presumably it comes from `from config import *` - TODO confirm.
generate_report(general["results_directory"])
