In [6]:
'''
In this file we train many models and evaluate their caption predictions on the validation and test sets

This file will also be used to generate some captions for the report

All data gathered is in the report

'''

import pandas as pd
from LogisticDecoder import LogisticDecoder
from common import clean_descriptions, samples_to_dict, VALIDATION_FILENAMES, corpus_bleu_score, TEST_FILENAMES
from RNNDecoder import RNNModel

In [2]:
'''
Initialize the samples and dictionaries that will be used in training and evaluation.

Each CSV split is read with clean_descriptions and then converted with
samples_to_dict. The validation and test dictionaries supply the reference
captions passed to corpus_bleu_score in the evaluation cells below.
'''
# get the samples with the given filenames
small_train_samples = clean_descriptions('../data/flickr_8k/small_train.csv')
validation_samples = clean_descriptions('../data/flickr_8k/validation.csv')
train_samples = clean_descriptions('../data/flickr_8k/train.csv')
test_samples = clean_descriptions('../data/flickr_8k/test.csv')
train_and_val_samples = clean_descriptions('../data/flickr_8k/train_and_val.csv')


# one dictionary per split, built from the cleaned samples above
small_train_dict = samples_to_dict(small_train_samples)
train_dict = samples_to_dict(train_samples)
test_dict = samples_to_dict(test_samples)
train_and_val_dict = samples_to_dict(train_and_val_samples)
validation_dict = samples_to_dict(validation_samples)

# get the captions for the validation set and the test set
# NOTE(review): these lists are later compared with captions generated for
# VALIDATION_FILENAMES / TEST_FILENAMES; presumably the dict value order matches
# those filename lists -- confirm in common.py.
val_captions = list(validation_dict.values())
test_captions = list(test_dict.values())

In [4]:
'''
The following 3 models are used to select which model type is best.
We will then tune hyperparameters using that model structure.

Model 1: logistic regression decoder
- 5 epochs
- small training set
- evaluated on the validation set
'''
# NOTE(review): the meaning of the first argument (15) is defined in LogisticDecoder;
# possibly the number of per-position decoders -- TODO confirm.
logistic_decoder = LogisticDecoder(15, small_train_samples)
# training call kept for reference; the previously saved model is loaded instead
# logistic_decoder.fit(small_train_dict, 5, '../models/compare_3_models/LogisticModel',verbose=True)
logistic_decoder.load('../models/compare_3_models/LogisticModel')

Model loaded from ../models/compare_3_models/LogisticModel


In [7]:
# Generate a caption for every validation file and score them against val_captions.
# In the recorded run the captions had zero 4-gram overlaps with the references
# (see the warning in the output), so the reported BLEU-4 is effectively 0.
logistic_captions = logistic_decoder.generate_captions_for_files(VALIDATION_FILENAMES, verbose=False)
logistic_bleu_scores = corpus_bleu_score(val_captions, logistic_captions)

The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


BLEU-1: 0.29785561917592907
BLEU-2: 0.07837888206846033
BLEU-3: 0.01807101168302499
BLEU-4: 6.019673799174596e-79


In [3]:
'''
Model 2: RNN model without dropout layers
- 5 epochs
- small training set
- evaluated on the validation set
'''
# first argument False -> no dropout layers (per the description above);
# no optimizer is passed, so RNNModel's default is used
rnn_without_dropout_model = RNNModel(False, small_train_samples)
# training call kept for reference; the previously saved model is loaded instead
# rnn_without_dropout_model.train_save_model(input_dict=small_train_dict, save_path='../models/compare_3_models/RNN_without_dropout', epochs=5)
rnn_without_dropout_model.load('../models/compare_3_models/RNN_without_dropout')

The top 30 sequence lengths are:
[35, 35, 34, 33, 33, 33, 33, 32, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 29, 29, 29, 29, 29, 29, 29, 29, 28, 28, 28, 28]
The longest sequence length from the training and validation samples is 35
The average sequence length from the training and validation samples is 12


  layer_config = serialize_layer_fn(layer)


<keras.engine.functional.Functional at 0x128b82735c8>

In [5]:
# Caption the validation files with the no-dropout RNN and score against val_captions.
without_dropout_captions = rnn_without_dropout_model.generate_captions_for_files(VALIDATION_FILENAMES, verbose=False)
without_dropout_bleu_scores = corpus_bleu_score(val_captions, without_dropout_captions)

BLEU-1: 0.3256668642560759
BLEU-2: 0.12261347302117036
BLEU-3: 0.03905333234637138
BLEU-4: 0.009979650225174266


In [4]:
'''
Model 3: RNN model with dropout layers
- 5 epochs
- small training set
- evaluated on the validation set
'''
# first argument True -> dropout layers enabled (per the description above);
# no optimizer is passed, so RNNModel's default is used
rnn_with_dropout_model = RNNModel(True, small_train_samples)
# training call kept for reference; the previously saved model is loaded instead
# rnn_with_dropout_model.train_save_model(input_dict=small_train_dict, save_path='../models/compare_3_models/RNN_with_dropout', epochs=5)
rnn_with_dropout_model.load('../models/compare_3_models/RNN_with_dropout')

model loaded successfully!


In [6]:
# Caption the validation files with the dropout RNN and score against val_captions.
with_dropout_captions = rnn_with_dropout_model.generate_captions_for_files(VALIDATION_FILENAMES, verbose=False)
with_dropout_bleu_scores = corpus_bleu_score(val_captions, with_dropout_captions)

BLEU-1: 0.330918147854636
BLEU-2: 0.12436767922869052
BLEU-3: 0.04006884227079066
BLEU-4: 0.011906023055355095


In [3]:
'''
Note:

The best model was the RNN model with dropout layers.

Now do hyperparameter tuning on that model:

optimizer   | # epochs
adam        | 5
adam        | 10
sgd         | 5
sgd         | 10

Each configuration is evaluated on the validation set.

This cell: adam optimizer, 5 epochs.
'''
adam_5_epochs = RNNModel(True, small_train_samples, optimizer='adam')
# training call kept for reference; the previously saved model is loaded instead
# adam_5_epochs.train_save_model(input_dict=small_train_dict, save_path='../models/hyperparameter_tuning/rnn_5_epochs_adam', epochs=5)
adam_5_epochs.load('../models/hyperparameter_tuning/rnn_5_epochs_adam')

The top 30 sequence lengths are:
[35, 35, 34, 33, 33, 33, 33, 32, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 29, 29, 29, 29, 29, 29, 29, 29, 28, 28, 28, 28]
The longest sequence length from the training and validation samples is 35
The average sequence length from the training and validation samples is 12


  layer_config = serialize_layer_fn(layer)


<keras.engine.functional.Functional at 0x2632f1b8bc8>

In [5]:
# Validation-set captions and BLEU scores for the adam / 5-epoch configuration.
adam_5_captions = adam_5_epochs.generate_captions_for_files(VALIDATION_FILENAMES, verbose=False)
adam_5_bleu_scores = corpus_bleu_score(val_captions, adam_5_captions)

BLEU-1: 0.3141850995928249
BLEU-2: 0.12398919098008251
BLEU-3: 0.044220494743823155
BLEU-4: 0.015097778742097986


In [3]:
'''
RNN model with dropout
adam optimizer, 10 epochs, small training set
'''
adam_10_epochs = RNNModel(True, small_train_samples, optimizer='adam')
# training call kept for reference; the previously saved model is loaded instead
# adam_10_epochs.train_save_model(input_dict=small_train_dict, save_path='../models/hyperparameter_tuning/rnn_10_epochs_adam', epochs=10)
adam_10_epochs.load('../models/hyperparameter_tuning/rnn_10_epochs_adam')

The top 30 sequence lengths are:
[35, 35, 34, 33, 33, 33, 33, 32, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 29, 29, 29, 29, 29, 29, 29, 29, 28, 28, 28, 28]
The longest sequence length from the training and validation samples is 35
The average sequence length from the training and validation samples is 12


  layer_config = serialize_layer_fn(layer)


<keras.engine.functional.Functional at 0x1d33cef2bc8>

In [5]:
# Validation-set captions and BLEU scores for the adam / 10-epoch configuration.
adam_10_captions = adam_10_epochs.generate_captions_for_files(VALIDATION_FILENAMES, verbose=False)
adam_10_bleu_scores = corpus_bleu_score(val_captions, adam_10_captions)

BLEU-1: 0.32845714285714284
BLEU-2: 0.1268539236921717
BLEU-3: 0.04595459169877701
BLEU-4: 0.016645342560671805


In [6]:
'''
RNN model with dropout
sgd optimizer, 5 epochs, small training set
'''
sgd_5_epochs = RNNModel(True, small_train_samples, optimizer='sgd')
# training call kept for reference; the previously saved model is loaded instead
# sgd_5_epochs.train_save_model(input_dict=small_train_dict, save_path='../models/hyperparameter_tuning/rnn_5_epochs_sgd', epochs=5)
sgd_5_epochs.load('../models/hyperparameter_tuning/rnn_5_epochs_sgd')

The top 30 sequence lengths are:
[35, 35, 34, 33, 33, 33, 33, 32, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 29, 29, 29, 29, 29, 29, 29, 29, 28, 28, 28, 28]
The longest sequence length from the training and validation samples is 35
The average sequence length from the training and validation samples is 12


<keras.engine.functional.Functional at 0x1d5305f4f08>

In [8]:
'''
Evaluate the RNN model with dropout (sgd optimizer, 5 epochs) on the validation set.
'''
# In the recorded run the captions had zero 4-gram overlaps with the references
# (see the warning in the output), so the reported BLEU-4 is effectively 0.
sgd_5_captions = sgd_5_epochs.generate_captions_for_files(VALIDATION_FILENAMES, verbose=False)
sgd_5_bleu_scores = corpus_bleu_score(val_captions, sgd_5_captions)

The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


BLEU-1: 0.21240199572344975
BLEU-2: 0.04881005024237703
BLEU-3: 0.006275917064585577
BLEU-4: 2.723290711024397e-79


In [9]:
'''
RNN model with dropout
sgd optimizer, 10 epochs, small training set
'''
sgd_10_epochs = RNNModel(True, small_train_samples, optimizer='sgd')
# training call kept for reference; the previously saved model is loaded instead
# sgd_10_epochs.train_save_model(input_dict=small_train_dict, save_path='../models/hyperparameter_tuning/rnn_10_epochs_sgd', epochs=10)
sgd_10_epochs.load('../models/hyperparameter_tuning/rnn_10_epochs_sgd')

The top 30 sequence lengths are:
[35, 35, 34, 33, 33, 33, 33, 32, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 29, 29, 29, 29, 29, 29, 29, 29, 28, 28, 28, 28]
The longest sequence length from the training and validation samples is 35
The average sequence length from the training and validation samples is 12


<keras.engine.functional.Functional at 0x1d562882088>

In [11]:
# Validation-set captions and BLEU scores for the sgd / 10-epoch configuration.
# In the recorded run the captions had zero 3-gram overlaps with the references
# (see the warning in the output), so the reported BLEU-3 and BLEU-4 are effectively 0.
sgd_10_captions = sgd_10_epochs.generate_captions_for_files(VALIDATION_FILENAMES, verbose=False)
sgd_10_bleu_scores = corpus_bleu_score(val_captions, sgd_10_captions)

The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


BLEU-1: 0.23887316875615988
BLEU-2: 0.059317915316757834
BLEU-3: 4.089993830033175e-104
BLEU-4: 3.3961826536174215e-155


In [3]:
'''
The next three models are all three model types, trained on the full
training set with the best hyperparameters:

adam optimizer with 10 epochs

First the RNN model with dropout layers
Second the RNN model without dropout layers
Lastly the Logistic Decoder model

All will be tested on the validation set again.
'''
full_train_10_epochs = RNNModel(True, train_samples, optimizer='adam')
# training call kept for reference; the previously saved model is loaded instead
# full_train_10_epochs.train_save_model(input_dict=train_dict, save_path='../models/full_training_data/rnn_10_epochs_adam', epochs=10)
full_train_10_epochs.load('../models/full_training_data/rnn_10_epochs_adam')

The top 30 sequence lengths are:
[35, 35, 34, 33, 33, 33, 33, 33, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 30, 30]
The longest sequence length from the training and validation samples is 35
The average sequence length from the training and validation samples is 12


  layer_config = serialize_layer_fn(layer)


<keras.engine.functional.Functional at 0x1c337cca948>

In [5]:
# Validation-set captions and BLEU scores for the full-training-set RNN with dropout.
full_train_10_captions = full_train_10_epochs.generate_captions_for_files(VALIDATION_FILENAMES, verbose=False)
full_train_10_bleu_scores = corpus_bleu_score(val_captions, full_train_10_captions)

BLEU-1: 0.3318903318903319
BLEU-2: 0.1377854245892623
BLEU-3: 0.05011937620648899
BLEU-4: 0.018407808337102996


In [4]:
'''
RNN model without dropout, full training set, adam optimizer, 10 epochs
'''
full_train_without_dropout_10_epochs = RNNModel(False, train_samples, optimizer='adam')
# training call kept for reference; the previously saved model is loaded instead
# full_train_without_dropout_10_epochs.train_save_model(input_dict=train_dict, save_path='../models/full_training_data/no_dropout_10_epochs_adam', epochs=10)
full_train_without_dropout_10_epochs.load('../models/full_training_data/no_dropout_10_epochs_adam')

The top 30 sequence lengths are:
[35, 35, 34, 33, 33, 33, 33, 33, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 30, 30]
The longest sequence length from the training and validation samples is 35
The average sequence length from the training and validation samples is 12


  layer_config = serialize_layer_fn(layer)


<keras.engine.functional.Functional at 0x223964bbbc8>

In [6]:
# Validation-set captions and BLEU scores for the full-training-set RNN without dropout.
full_train_without_dropout_10_captions = full_train_without_dropout_10_epochs.generate_captions_for_files(VALIDATION_FILENAMES, verbose=False)
full_train_without_dropout_10_bleu_scores = corpus_bleu_score(val_captions, full_train_without_dropout_10_captions)

BLEU-1: 0.31642430391162213
BLEU-2: 0.11737997545158703
BLEU-3: 0.03715791149037828
BLEU-4: 0.013360267231810248


In [42]:
'''
Logistic model trained on the full training set
'''
full_train_logistic_decoder = LogisticDecoder(15, train_samples)
# training call kept for reference; the previously saved model is loaded instead.
# NOTE(review): the saved cell output still shows a training log, apparently from
# a run made while fit() was active.
# full_train_logistic_decoder.fit(train_dict, 10, '../models/full_training_data/logistic_model',verbose=True)
full_train_logistic_decoder.load('../models/full_training_data/logistic_model')

Training model #1




INFO:tensorflow:Assets written to: ../models/full_training_data/logistic_model/decoder1\assets


INFO:tensorflow:Assets written to: ../models/full_training_data/logistic_model/decoder1\assets


Garbage collected.
Model #1 saved to ../models/full_training_data/logistic_model/decoder1
Training model #2




INFO:tensorflow:Assets written to: ../models/full_training_data/logistic_model/decoder2\assets


INFO:tensorflow:Assets written to: ../models/full_training_data/logistic_model/decoder2\assets


Garbage collected.
Model #2 saved to ../models/full_training_data/logistic_model/decoder2
Training model #3




INFO:tensorflow:Assets written to: ../models/full_training_data/logistic_model/decoder3\assets


INFO:tensorflow:Assets written to: ../models/full_training_data/logistic_model/decoder3\assets


Garbage collected.
Model #3 saved to ../models/full_training_data/logistic_model/decoder3
Training model #4




INFO:tensorflow:Assets written to: ../models/full_training_data/logistic_model/decoder4\assets


INFO:tensorflow:Assets written to: ../models/full_training_data/logistic_model/decoder4\assets


Garbage collected.
Model #4 saved to ../models/full_training_data/logistic_model/decoder4
Training model #5




INFO:tensorflow:Assets written to: ../models/full_training_data/logistic_model/decoder5\assets


INFO:tensorflow:Assets written to: ../models/full_training_data/logistic_model/decoder5\assets


Garbage collected.
Model #5 saved to ../models/full_training_data/logistic_model/decoder5
Training model #6




INFO:tensorflow:Assets written to: ../models/full_training_data/logistic_model/decoder6\assets


INFO:tensorflow:Assets written to: ../models/full_training_data/logistic_model/decoder6\assets


Garbage collected.
Model #6 saved to ../models/full_training_data/logistic_model/decoder6
Training model #7




INFO:tensorflow:Assets written to: ../models/full_training_data/logistic_model/decoder7\assets


INFO:tensorflow:Assets written to: ../models/full_training_data/logistic_model/decoder7\assets


Garbage collected.
Model #7 saved to ../models/full_training_data/logistic_model/decoder7
Training model #8




INFO:tensorflow:Assets written to: ../models/full_training_data/logistic_model/decoder8\assets


INFO:tensorflow:Assets written to: ../models/full_training_data/logistic_model/decoder8\assets


Garbage collected.
Model #8 saved to ../models/full_training_data/logistic_model/decoder8
Training model #9




INFO:tensorflow:Assets written to: ../models/full_training_data/logistic_model/decoder9\assets


INFO:tensorflow:Assets written to: ../models/full_training_data/logistic_model/decoder9\assets


Garbage collected.
Model #9 saved to ../models/full_training_data/logistic_model/decoder9
Training model #10




INFO:tensorflow:Assets written to: ../models/full_training_data/logistic_model/decoder10\assets


INFO:tensorflow:Assets written to: ../models/full_training_data/logistic_model/decoder10\assets


Garbage collected.
Model #10 saved to ../models/full_training_data/logistic_model/decoder10
Training model #11




INFO:tensorflow:Assets written to: ../models/full_training_data/logistic_model/decoder11\assets


INFO:tensorflow:Assets written to: ../models/full_training_data/logistic_model/decoder11\assets


Garbage collected.
Model #11 saved to ../models/full_training_data/logistic_model/decoder11
Training model #12




INFO:tensorflow:Assets written to: ../models/full_training_data/logistic_model/decoder12\assets


INFO:tensorflow:Assets written to: ../models/full_training_data/logistic_model/decoder12\assets


Garbage collected.
Model #12 saved to ../models/full_training_data/logistic_model/decoder12
Training model #13




INFO:tensorflow:Assets written to: ../models/full_training_data/logistic_model/decoder13\assets


INFO:tensorflow:Assets written to: ../models/full_training_data/logistic_model/decoder13\assets


Garbage collected.
Model #13 saved to ../models/full_training_data/logistic_model/decoder13
Training model #14




INFO:tensorflow:Assets written to: ../models/full_training_data/logistic_model/decoder14\assets


INFO:tensorflow:Assets written to: ../models/full_training_data/logistic_model/decoder14\assets


Garbage collected.
Model #14 saved to ../models/full_training_data/logistic_model/decoder14
Training model #15




INFO:tensorflow:Assets written to: ../models/full_training_data/logistic_model/decoder15\assets


INFO:tensorflow:Assets written to: ../models/full_training_data/logistic_model/decoder15\assets


Garbage collected.
Model #15 saved to ../models/full_training_data/logistic_model/decoder15
Model loaded from ../models/full_training_data/logistic_model


In [43]:
# Validation-set captions and BLEU scores for the full-training-set logistic decoder.
full_train_logistic_captions = full_train_logistic_decoder.generate_captions_for_files(VALIDATION_FILENAMES, verbose=False)
full_train_logistic_bleu_scores = corpus_bleu_score(val_captions, full_train_logistic_captions)









BLEU-1: 0.31083639603519325
BLEU-2: 0.0852507420770845
BLEU-3: 0.019049863044182973
BLEU-4: 0.005696527662864323


In [11]:
'''
Train all 3 model types on the full training + validation set with the best hyperparameters.

These models are used for generating sample captions and are scored on the test set.

This cell: RNN model with dropout, adam optimizer, 10 epochs.
'''

full_train_val_rnn_10_epochs = RNNModel(True, train_and_val_samples, optimizer='adam')
# training call kept for reference; the previously saved model is loaded instead
# full_train_val_rnn_10_epochs.train_save_model(input_dict=train_and_val_dict, save_path='../models/full_training_val_data/rnn_10_epochs_adam', epochs=10)
full_train_val_rnn_10_epochs.load('../models/full_training_val_data/rnn_10_epochs_adam')


The top 30 sequence lengths are:
[35, 35, 35, 34, 33, 33, 33, 33, 33, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30]
The longest sequence length from the training and validation samples is 35
The average sequence length from the training and validation samples is 12
model loaded successfully!


In [12]:
# Test-set captions and BLEU scores for the train+validation RNN with dropout.
full_train_val_rnn_10_captions = full_train_val_rnn_10_epochs.generate_captions_for_files(TEST_FILENAMES, verbose=False)
full_train_val_10_bleu_scores = corpus_bleu_score(test_captions, full_train_val_rnn_10_captions)

BLEU-1: 0.3113781939020562
BLEU-2: 0.10670162658613169
BLEU-3: 0.033282317301041336
BLEU-4: 0.010834366325297313


In [13]:
'''
RNN model without dropout on the training + validation set (adam optimizer, 10 epochs)
'''
full_train_val_without_dropout_10_epochs = RNNModel(False, train_and_val_samples, optimizer='adam')
# training call kept for reference; the previously saved model is loaded instead
# full_train_val_without_dropout_10_epochs.train_save_model(input_dict=train_and_val_dict, save_path='../models/full_training_val_data/no_dropout_10_epochs_adam', epochs=10)
full_train_val_without_dropout_10_epochs.load('../models/full_training_val_data/no_dropout_10_epochs_adam')

The top 30 sequence lengths are:
[35, 35, 35, 34, 33, 33, 33, 33, 33, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30]
The longest sequence length from the training and validation samples is 35
The average sequence length from the training and validation samples is 12
model loaded successfully!


In [14]:
# Test-set captions and BLEU scores for the train+validation RNN without dropout.
full_train_val_without_dropout_10_captions = full_train_val_without_dropout_10_epochs.generate_captions_for_files(TEST_FILENAMES, verbose=False)
full_train_val_without_dropout_10_bleu_scores = corpus_bleu_score(test_captions, full_train_val_without_dropout_10_captions)

BLEU-1: 0.31462000858737654
BLEU-2: 0.12268801995467031
BLEU-3: 0.04204070609542318
BLEU-4: 0.013412006916045766


In [3]:
'''
Logistic model trained on the training + validation set
'''
full_train_val_logistic_decoder = LogisticDecoder(15, train_and_val_samples)
# training call kept for reference; the previously saved model is loaded instead
# full_train_val_logistic_decoder.fit(train_and_val_dict, 10, '../models/full_training_val_data/logistic_model',verbose=True)
full_train_val_logistic_decoder.load('../models/full_training_val_data/logistic_model')

Model loaded from ../models/full_training_val_data/logistic_model


In [4]:
# Test-set captions and BLEU scores for the train+validation logistic decoder.
full_train_val_logistic_captions = full_train_val_logistic_decoder.generate_captions_for_files(TEST_FILENAMES, verbose=False)
full_train_val_logistic_bleu_scores = corpus_bleu_score(test_captions, full_train_val_logistic_captions)

BLEU-1: 0.3000620604054613
BLEU-2: 0.07453068567495304
BLEU-3: 0.019038151757266815
BLEU-4: 0.0055555105652504615


In [34]:
'''
combine and export all captions generated by our 3 fully trained models for viewing
'''
def concat_string(tokens):
    '''
    Join a sequence of string tokens into one space-separated string.

    tokens: iterable of str (e.g. the words of one caption)
    returns: the tokens separated by single spaces, with no leading or
             trailing space; an empty input yields ''
    '''
    # str.join builds the result in one pass instead of repeated += concatenation
    return ' '.join(tokens)

def convert_lists(lists_of_toks):
    '''
    Turn each token list into a plain caption string without its boundary tokens.

    The first and last token of every list (the <start> / <end> markers on the
    true labels) are dropped, and the remaining tokens are joined with single
    spaces.

    lists_of_toks: iterable of token lists (each a list of str)
    returns: list of caption strings, one per input token list, in input order;
             a list with two or fewer tokens yields ''
    '''
    # [1:-1] strips exactly the first and last token, matching the original
    # [1:len(token_list)-1] slice for every list length (including 0 and 1)
    return [' '.join(token_list[1:-1]) for token_list in lists_of_toks]


# commented to not overwrite files
# pd.concat([
#     pd.Series(test_captions, name='True Captions', index=TEST_FILENAMES).apply(convert_lists),
#     pd.Series(full_train_val_rnn_10_captions, name='RNN With Dropout Layers', index=TEST_FILENAMES).apply(concat_string),
#     pd.Series(full_train_val_without_dropout_10_captions, name='RNN Without Dropout', index=TEST_FILENAMES).apply(concat_string),
#     pd.Series(full_train_val_logistic_captions, name='LogisticDecoder', index=TEST_FILENAMES).apply(concat_string)
# ], axis=1).to_csv('../data/generated_captions.csv')

### Below this cell are just some extra experiments

In [50]:
'''
Train our best model for 50 epochs on the training + validation set and evaluate some captions.
'''
dropout_train_val_50_epochs = RNNModel(True, train_and_val_samples, optimizer='adam')
# NOTE(review): both the training call and the load call below are commented out, so
# this object is neither trained nor loaded. The saved output shows the training run
# ending in a KeyboardInterrupt, so a saved checkpoint may not exist -- confirm before
# re-enabling load().
# dropout_train_val_50_epochs.train_save_model(input_dict=train_and_val_dict, save_path='../models/full_training_val_data/with_dropout_50_epochs_adam', epochs=50)
# dropout_train_val_50_epochs.load('../models/full_training_val_data/with_dropout_50_epochs_adam')

The top 30 sequence lengths are:
[35, 35, 35, 34, 33, 33, 33, 33, 33, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30]
The longest sequence length from the training and validation samples is 35
The average sequence length from the training and validation samples is 12
 456572/Unknown - 12182s 27ms/step - loss: 2.8844

KeyboardInterrupt: 

In [None]:
# Test-set captions and BLEU scores for the 50-epoch model.
# NOTE(review): this cell was never executed in the saved notebook, and the model it
# uses has its train/load calls commented out -- as written it would presumably score
# an untrained model. Train or load the model before relying on these scores.
dropout_train_val_50_test_captions = dropout_train_val_50_epochs.generate_captions_for_files(TEST_FILENAMES, verbose=False)
dropout_train_val_50_test_bleu_scores = corpus_bleu_score(test_captions, dropout_train_val_50_test_captions)

In [4]:
'''
The original goal for this model was to train it overnight on the entire training + validation set,
but it was accidentally trained on the training set only (150 epochs, adam optimizer, with dropout).
'''

full_train_150_epochs = RNNModel(True, train_samples, optimizer='adam')
# training call kept for reference; the previously saved model is loaded instead
# full_train_150_epochs.train_save_model(input_dict=train_dict, save_path='../models/full_training_data/with_dropout_150_epochs_adam', epochs=150)
full_train_150_epochs.load('../models/full_training_data/with_dropout_150_epochs_adam')

The top 30 sequence lengths are:
[35, 35, 34, 33, 33, 33, 33, 33, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 30, 30]
The longest sequence length from the training and validation samples is 35
The average sequence length from the training and validation samples is 12


  layer_config = serialize_layer_fn(layer)


<keras.engine.functional.Functional at 0x189786d4148>

In [6]:
'''
Scores of the 150-epoch model on the validation set
'''
full_train_150_val_captions = full_train_150_epochs.generate_captions_for_files(VALIDATION_FILENAMES, verbose=False)
full_train_150_val_bleu_scores = corpus_bleu_score(val_captions, full_train_150_val_captions)

BLEU-1: 0.32126058325493884
BLEU-2: 0.1148672362951185
BLEU-3: 0.04351843938060441
BLEU-4: 0.01755540276109546


In [24]:
'''
Scores of the 150-epoch model on the test set
'''
full_train_150_test_captions = full_train_150_epochs.generate_captions_for_files(TEST_FILENAMES, verbose=False)
full_train_150_test_bleu_scores = corpus_bleu_score(test_captions, full_train_150_test_captions)

BLEU-1: 0.31631936314680403
BLEU-2: 0.11440997030157263
BLEU-3: 0.04298068429853824
BLEU-4: 0.015962406999144314
