In [1]:
'''
In this file we train many models and evaluate their caption predictions on the validation and test sets

This file will also be used to generate some captions for the report

All data gathered is in the report

'''

import pandas as pd
from LogisticDecoder import LogisticDecoder
from common import clean_descriptions, samples_to_dict, corpus_bleu_score
from RNNDecoder import RNNModel

In [2]:
''' 
Initialize the samples and dictionaries that will be used in training

'''
# Read the caption samples for every data split, one CSV file per split.
small_train_samples = clean_descriptions('../data/flickr_8k/small_train.csv')
validation_samples = clean_descriptions('../data/flickr_8k/validation.csv')
train_samples = clean_descriptions('../data/flickr_8k/train.csv')
test_samples = clean_descriptions('../data/flickr_8k/test.csv')
train_and_val_samples = clean_descriptions('../data/flickr_8k/train_and_val.csv')


# Convert each split's samples into a dict.
# NOTE(review): presumably keyed by image filename with that image's captions as the value — confirm in common.samples_to_dict.
small_train_dict = samples_to_dict(small_train_samples)
train_dict = samples_to_dict(train_samples)
test_dict = samples_to_dict(test_samples)
train_and_val_dict = samples_to_dict(train_and_val_samples)
validation_dict = samples_to_dict(validation_samples)

# Filenames and reference captions for the validation set. Both lists come from the same
# dict, so entry i of VALIDATION_FILENAMES lines up with entry i of val_captions.
VALIDATION_FILENAMES = list(validation_dict.keys())
val_captions = list(validation_dict.values())

# Same for the test set. The filenames are taken from test_dict rather than from a predefined
# variable because that variable has a different order, and the order must match test_captions
# when captions are generated and BLEU scores are calculated.
TEST_FILENAMES = list(test_dict.keys())
test_captions = list(test_dict.values())

In [3]:
'''
The following 3 models are used to select which model is best. We will then tune hyper parameters using that model structure

create logistic regression model
5 epochs
small training set
evaluate on validation set
'''
# Logistic decoder for the 3-model comparison, built from the small training samples.
# The fit call is kept commented out; the previously saved model is loaded from disk instead.
# NOTE(review): the meaning of the 15 passed to LogisticDecoder is not visible here — confirm.
logistic_decoder = LogisticDecoder(15, small_train_samples)
# logistic_decoder.fit(small_train_dict, 5, '../models/compare_3_models/LogisticModel',verbose=True)
logistic_decoder.load('../models/compare_3_models/LogisticModel')

Model loaded from ../models/compare_3_models/LogisticModel


In [4]:
# Generate captions for every validation file with the logistic decoder and score them
# against the reference captions; the BLEU-1..4 values appear in the cell output.
logistic_captions = logistic_decoder.generate_captions_for_files(VALIDATION_FILENAMES, verbose=False)
logistic_bleu_scores = corpus_bleu_score(val_captions, logistic_captions)

BLEU-1: 0.3688110474907376
BLEU-2: 0.11844459557724961
BLEU-3: 0.040301407647280026
BLEU-4: 0.010024117672673933


In [5]:
'''
create RNN model without dropout layers
5 epochs
small training set
evaluate on validation set
'''
# RNN for the 3-model comparison; per the cell note, False as the first argument means no dropout layers.
# Training is kept commented out; the previously saved model is loaded from disk instead.
rnn_without_dropout_model = RNNModel(False, small_train_samples)
# rnn_without_dropout_model.train_save_model(input_dict=small_train_dict, save_path='../models/compare_3_models/RNN_without_dropout', epochs=5)
rnn_without_dropout_model.load('../models/compare_3_models/RNN_without_dropout')

The top 30 sequence lengths are:
[35, 35, 34, 33, 33, 33, 33, 32, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 29, 29, 29, 29, 29, 29, 29, 29, 28, 28, 28, 28]
The longest sequence length from the training and validation samples is 35
The average sequence length from the training and validation samples is 12
model loaded successfully!


In [6]:
# Generate captions for every validation file with the no-dropout RNN and score them
# against the reference captions; the BLEU-1..4 values appear in the cell output.
without_dropout_captions = rnn_without_dropout_model.generate_captions_for_files(VALIDATION_FILENAMES, verbose=False)
without_dropout_bleu_scores = corpus_bleu_score(val_captions, without_dropout_captions)

BLEU-1: 0.36882039122703025
BLEU-2: 0.16086606150804358
BLEU-3: 0.06777855153160156
BLEU-4: 0.027481417559851205


In [7]:
'''
create RNN model with dropout layers
5 epochs
small training set
evaluate on validation set
'''
# RNN for the 3-model comparison; per the cell note, True as the first argument means dropout layers are used.
# Training is kept commented out; the previously saved model is loaded from disk instead.
rnn_with_dropout_model = RNNModel(True, small_train_samples)
# rnn_with_dropout_model.train_save_model(input_dict=small_train_dict, save_path='../models/compare_3_models/RNN_with_dropout', epochs=5)
rnn_with_dropout_model.load('../models/compare_3_models/RNN_with_dropout')

The top 30 sequence lengths are:
[35, 35, 34, 33, 33, 33, 33, 32, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 29, 29, 29, 29, 29, 29, 29, 29, 28, 28, 28, 28]
The longest sequence length from the training and validation samples is 35
The average sequence length from the training and validation samples is 12
model loaded successfully!


In [8]:
# Generate captions for every validation file with the dropout RNN and score them
# against the reference captions; the BLEU-1..4 values appear in the cell output.
with_dropout_captions = rnn_with_dropout_model.generate_captions_for_files(VALIDATION_FILENAMES, verbose=False)
with_dropout_bleu_scores = corpus_bleu_score(val_captions, with_dropout_captions)

BLEU-1: 0.3708819200724556
BLEU-2: 0.16598642265091337
BLEU-3: 0.07421146275530921
BLEU-4: 0.03623250071970148


In [9]:
'''
Note:

best model was the RNN model with dropout layers

now do hyperparameter tuning on best model

optimizer   | # epochs
adam        | 5
adam        | 10
sgd         | 5
sgd         | 10


evaluate each model on the validation set
'''
# Hyperparameter tuning run 1 of 4: dropout RNN, adam optimizer, 5 epochs, small training set.
# Training is kept commented out; the previously saved model is loaded from disk instead.
adam_5_epochs = RNNModel(True, small_train_samples, optimizer='adam')
# adam_5_epochs.train_save_model(input_dict=small_train_dict, save_path='../models/hyperparameter_tuning/rnn_5_epochs_adam', epochs=5)
adam_5_epochs.load('../models/hyperparameter_tuning/rnn_5_epochs_adam')

The top 30 sequence lengths are:
[35, 35, 34, 33, 33, 33, 33, 32, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 29, 29, 29, 29, 29, 29, 29, 29, 28, 28, 28, 28]
The longest sequence length from the training and validation samples is 35
The average sequence length from the training and validation samples is 12
model loaded successfully!


In [10]:
# Score the adam / 5-epoch model on the validation set; BLEU-1..4 appear in the cell output.
adam_5_captions = adam_5_epochs.generate_captions_for_files(VALIDATION_FILENAMES, verbose=False)
adam_5_bleu_scores = corpus_bleu_score(val_captions, adam_5_captions)

BLEU-1: 0.3481897215802795
BLEU-2: 0.15029988165849215
BLEU-3: 0.06072035219055026
BLEU-4: 0.024662053950138443


In [11]:
'''
RNN model with dropout 
adam optimizer 10 epochs
'''
# Hyperparameter tuning run 2 of 4: dropout RNN, adam optimizer, 10 epochs, small training set.
# Training is kept commented out; the previously saved model is loaded from disk instead.
adam_10_epochs = RNNModel(True, small_train_samples, optimizer='adam')
# adam_10_epochs.train_save_model(input_dict=small_train_dict, save_path='../models/hyperparameter_tuning/rnn_10_epochs_adam', epochs=10)
adam_10_epochs.load('../models/hyperparameter_tuning/rnn_10_epochs_adam')

The top 30 sequence lengths are:
[35, 35, 34, 33, 33, 33, 33, 32, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 29, 29, 29, 29, 29, 29, 29, 29, 28, 28, 28, 28]
The longest sequence length from the training and validation samples is 35
The average sequence length from the training and validation samples is 12
model loaded successfully!


In [12]:
# Score the adam / 10-epoch model on the validation set; BLEU-1..4 appear in the cell output.
adam_10_captions = adam_10_epochs.generate_captions_for_files(VALIDATION_FILENAMES, verbose=False)
adam_10_bleu_scores = corpus_bleu_score(val_captions, adam_10_captions)

BLEU-1: 0.36857142857142855
BLEU-2: 0.1693732493313534
BLEU-3: 0.07620747950748036
BLEU-4: 0.036732043324973485


In [13]:
'''
RNN model with dropout 
sgd optimizer 5 epochs
'''
# Hyperparameter tuning run 3 of 4: dropout RNN, sgd optimizer, 5 epochs, small training set.
# Training is kept commented out; the previously saved model is loaded from disk instead.
sgd_5_epochs = RNNModel(True, small_train_samples, optimizer='sgd')
# sgd_5_epochs.train_save_model(input_dict=small_train_dict, save_path='../models/hyperparameter_tuning/rnn_5_epochs_sgd', epochs=5)
sgd_5_epochs.load('../models/hyperparameter_tuning/rnn_5_epochs_sgd')

The top 30 sequence lengths are:
[35, 35, 34, 33, 33, 33, 33, 32, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 29, 29, 29, 29, 29, 29, 29, 29, 28, 28, 28, 28]
The longest sequence length from the training and validation samples is 35
The average sequence length from the training and validation samples is 12
model loaded successfully!


In [14]:
'''
RNN model with dropout 
sgd optimizer 5 epochs
'''
# Score the sgd / 5-epoch model on the validation set.
# The cell output carries a warning that the hypotheses contain no 4-gram overlaps,
# so the reported BLEU-4 (~3.7e-79) is effectively zero.
sgd_5_captions = sgd_5_epochs.generate_captions_for_files(VALIDATION_FILENAMES, verbose=False)
sgd_5_bleu_scores = corpus_bleu_score(val_captions, sgd_5_captions)

The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


BLEU-1: 0.26977904490377763
BLEU-2: 0.06311048157167536
BLEU-3: 0.009384591087888932
BLEU-4: 3.6825412758162053e-79


In [15]:
'''
RNN model with dropout 
sgd optimizer 10 epochs
'''
# Hyperparameter tuning run 4 of 4: dropout RNN, sgd optimizer, 10 epochs, small training set.
# Training is kept commented out; the previously saved model is loaded from disk instead.
sgd_10_epochs = RNNModel(True, small_train_samples, optimizer='sgd')
# sgd_10_epochs.train_save_model(input_dict=small_train_dict, save_path='../models/hyperparameter_tuning/rnn_10_epochs_sgd', epochs=10)
sgd_10_epochs.load('../models/hyperparameter_tuning/rnn_10_epochs_sgd')

The top 30 sequence lengths are:
[35, 35, 34, 33, 33, 33, 33, 32, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 29, 29, 29, 29, 29, 29, 29, 29, 28, 28, 28, 28]
The longest sequence length from the training and validation samples is 35
The average sequence length from the training and validation samples is 12
model loaded successfully!


In [16]:
# Score the sgd / 10-epoch model on the validation set.
# The reported BLEU-4 (~4.7e-79) is effectively zero, as for the sgd / 5-epoch model.
sgd_10_captions = sgd_10_epochs.generate_captions_for_files(VALIDATION_FILENAMES, verbose=False)
sgd_10_bleu_scores = corpus_bleu_score(val_captions, sgd_10_captions)

BLEU-1: 0.3113117620450892
BLEU-2: 0.08366985387481277
BLEU-3: 0.013668175954438951
BLEU-4: 4.7288634603256347e-79


In [3]:
'''
The next 3 models are all three model types retrained on the full training set

The RNN models use the best hyperparameters found above (adam optimizer, 10 epochs);
the Logistic Decoder is also trained for 10 epochs

First the RNN model with dropout layers
Second the RNN model without dropout layers
Lastly the Logistic Decoder model

All will be tested on the validation set again
'''
# Dropout RNN with the adam optimizer, built from the full training samples (10 epochs per the commented training call).
# Training is kept commented out; the previously saved model is loaded from disk instead.
full_train_10_epochs = RNNModel(True, train_samples, optimizer='adam')
# full_train_10_epochs.train_save_model(input_dict=train_dict, save_path='../models/full_training_data/rnn_10_epochs_adam', epochs=10)
full_train_10_epochs.load('../models/full_training_data/rnn_10_epochs_adam')

The top 30 sequence lengths are:
[35, 35, 34, 33, 33, 33, 33, 33, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 30, 30]
The longest sequence length from the training and validation samples is 35
The average sequence length from the training and validation samples is 12
model loaded successfully!


In [4]:
# Score the full-training-set dropout RNN on the validation set; BLEU-1..4 appear in the cell output.
full_train_10_captions = full_train_10_epochs.generate_captions_for_files(VALIDATION_FILENAMES, verbose=False)
full_train_10_bleu_scores = corpus_bleu_score(val_captions, full_train_10_captions)

BLEU-1: 0.36541236541236544
BLEU-2: 0.16888892197921862
BLEU-3: 0.07157808723253177
BLEU-4: 0.03164923529849848


In [5]:
'''
RNN model without dropout
'''
# No-dropout RNN with the adam optimizer, built from the full training samples (10 epochs per the commented training call).
# Training is kept commented out; the previously saved model is loaded from disk instead.
full_train_without_dropout_10_epochs = RNNModel(False, train_samples, optimizer='adam')
# full_train_without_dropout_10_epochs.train_save_model(input_dict=train_dict, save_path='../models/full_training_data/no_dropout_10_epochs_adam', epochs=10)
full_train_without_dropout_10_epochs.load('../models/full_training_data/no_dropout_10_epochs_adam')

The top 30 sequence lengths are:
[35, 35, 34, 33, 33, 33, 33, 33, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 30, 30]
The longest sequence length from the training and validation samples is 35
The average sequence length from the training and validation samples is 12
model loaded successfully!


In [6]:
# Score the full-training-set no-dropout RNN on the validation set; BLEU-1..4 appear in the cell output.
full_train_without_dropout_10_captions = full_train_without_dropout_10_epochs.generate_captions_for_files(VALIDATION_FILENAMES, verbose=False)
full_train_without_dropout_10_bleu_scores = corpus_bleu_score(val_captions, full_train_without_dropout_10_captions)

BLEU-1: 0.3501296358922331
BLEU-2: 0.15442092433130808
BLEU-3: 0.06434277732239087
BLEU-4: 0.02597076238778425


In [7]:
'''
logistic model trained on training set
'''
# Logistic decoder built from the full training samples (10 epochs per the commented fit call).
# The fit call is kept commented out; the previously saved model is loaded from disk instead.
# NOTE(review): the meaning of the 15 passed to LogisticDecoder is not visible here — confirm.
full_train_logistic_decoder = LogisticDecoder(15, train_samples)
# full_train_logistic_decoder.fit(train_dict, 10, '../models/full_training_data/logistic_model',verbose=True)
full_train_logistic_decoder.load('../models/full_training_data/logistic_model')

Model loaded from ../models/full_training_data/logistic_model


In [8]:
# Score the full-training-set logistic decoder on the validation set; BLEU-1..4 appear in the cell output.
full_train_logistic_captions = full_train_logistic_decoder.generate_captions_for_files(VALIDATION_FILENAMES, verbose=False)
full_train_logistic_bleu_scores = corpus_bleu_score(val_captions, full_train_logistic_captions)

BLEU-1: 0.4127408397371645
BLEU-2: 0.14567295125330063
BLEU-3: 0.048268953218352444
BLEU-4: 0.01710742831848821


In [3]:
'''
train all 3 model types on the full training + validation set with the best hyperparameters

these models are scored on the test set and used to generate the example captions
'''

# Dropout RNN with the adam optimizer, built from the combined training + validation samples
# (10 epochs per the commented training call). The saved model is loaded from disk instead of retraining.
full_train_val_rnn_10_epochs = RNNModel(True, train_and_val_samples, optimizer='adam')
# full_train_val_rnn_10_epochs.train_save_model(input_dict=train_and_val_dict, save_path='../models/full_training_val_data/rnn_10_epochs_adam', epochs=10)
full_train_val_rnn_10_epochs.load('../models/full_training_val_data/rnn_10_epochs_adam')


The top 30 sequence lengths are:
[35, 35, 35, 34, 33, 33, 33, 33, 33, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30]
The longest sequence length from the training and validation samples is 35
The average sequence length from the training and validation samples is 12
model loaded successfully!


In [4]:
# Generate captions for every test file with the train+val dropout RNN and score them
# against the test reference captions; BLEU-1..4 appear in the cell output.
full_train_val_rnn_10_captions = full_train_val_rnn_10_epochs.generate_captions_for_files(TEST_FILENAMES, verbose=False)
full_train_val_10_bleu_scores = corpus_bleu_score(test_captions, full_train_val_rnn_10_captions)

BLEU-1: 0.3578531948223495
BLEU-2: 0.15077712216509875
BLEU-3: 0.06486047253678623
BLEU-4: 0.028643511685858013


In [5]:
'''
RNN model without dropout on training + validation set
'''
# No-dropout RNN with the adam optimizer, built from the combined training + validation samples
# (10 epochs per the commented training call). The saved model is loaded from disk instead of retraining.
full_train_val_without_dropout_10_epochs = RNNModel(False, train_and_val_samples, optimizer='adam')
# full_train_val_without_dropout_10_epochs.train_save_model(input_dict=train_and_val_dict, save_path='../models/full_training_val_data/no_dropout_10_epochs_adam', epochs=10)
full_train_val_without_dropout_10_epochs.load('../models/full_training_val_data/no_dropout_10_epochs_adam')

The top 30 sequence lengths are:
[35, 35, 35, 34, 33, 33, 33, 33, 33, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30]
The longest sequence length from the training and validation samples is 35
The average sequence length from the training and validation samples is 12
model loaded successfully!


In [6]:
# Generate captions for every test file with the train+val no-dropout RNN and score them
# against the test reference captions; BLEU-1..4 appear in the cell output.
full_train_val_without_dropout_10_captions = full_train_val_without_dropout_10_epochs.generate_captions_for_files(TEST_FILENAMES, verbose=False)
full_train_val_without_dropout_10_bleu_scores = corpus_bleu_score(test_captions, full_train_val_without_dropout_10_captions)

BLEU-1: 0.3571275225418635
BLEU-2: 0.160156308197626
BLEU-3: 0.07407812945502383
BLEU-4: 0.03210307147322879


In [7]:
'''
logistic model trained on training + validation set
'''
# Logistic decoder built from the combined training + validation samples (10 epochs per the commented fit call).
# The fit call is kept commented out; the previously saved model is loaded from disk instead.
# NOTE(review): the meaning of the 15 passed to LogisticDecoder is not visible here — confirm.
full_train_val_logistic_decoder = LogisticDecoder(15, train_and_val_samples)
# full_train_val_logistic_decoder.fit(train_and_val_dict, 10, '../models/full_training_val_data/logistic_model',verbose=True)
full_train_val_logistic_decoder.load('../models/full_training_val_data/logistic_model')

Model loaded from ../models/full_training_val_data/logistic_model


In [8]:
# Generate captions for every test file with the train+val logistic decoder and score them
# against the test reference captions; BLEU-1..4 appear in the cell output.
full_train_val_logistic_captions = full_train_val_logistic_decoder.generate_captions_for_files(TEST_FILENAMES, verbose=False)
full_train_val_logistic_bleu_scores = corpus_bleu_score(test_captions, full_train_val_logistic_captions)

BLEU-1: 0.4072196938353331
BLEU-2: 0.135258190075289
BLEU-3: 0.04907325539219344
BLEU-4: 0.019574905251922445


In [9]:
'''
combine and export all captions generated by our 3 fully trained models for viewing
'''
def concat_string(tokens):
    '''
    Join string tokens into a single space-separated string.

    tokens: iterable of str (e.g. the tokens of one caption)
    returns: the tokens separated by single spaces; '' when there are no tokens
    raises: TypeError if any token is not a str
    '''
    # str.join builds the result in one pass (no repeated concatenation) and,
    # unlike an index-based loop, works for any iterable, not only sized sequences
    return ' '.join(tokens)

def convert_lists(lists_of_toks):
    '''
    Convert tokenized captions into plain caption strings.

    The first and last token of every token list are dropped (for the true
    labels these are the <start> and <end> markers) and the remaining tokens
    are joined with single spaces.

    lists_of_toks: iterable of token lists (each a list of str)
    returns: list of str, one caption string per token list, in the same order
    '''
    # [1:-1] strips the two boundary tokens; a list with fewer than 3 tokens yields ''
    return [' '.join(token_list[1:-1]) for token_list in lists_of_toks]

# commented to not overwrite files
# pd.concat([
#     pd.Series(test_captions, name='True Captions', index=TEST_FILENAMES).apply(convert_lists),
#     pd.Series(full_train_val_rnn_10_captions, name='RNN With Dropout Layers', index=TEST_FILENAMES).apply(concat_string),
#     pd.Series(full_train_val_without_dropout_10_captions, name='RNN Without Dropout', index=TEST_FILENAMES).apply(concat_string),
#     pd.Series(full_train_val_logistic_captions, name='LogisticDecoder', index=TEST_FILENAMES).apply(concat_string)
# ], axis=1).to_csv('../data/generated_captions.csv')

### Below this cell are just some extra experiments

In [9]:
'''
original goal for this model was to train it overnight on the entire training + validation set
accidentally trained it on only training set
'''

# Extra experiment: dropout RNN with the adam optimizer, built from the training samples only
# (150 epochs per the commented training call). The saved model is loaded from disk instead of retraining.
full_train_150_epochs = RNNModel(True, train_samples, optimizer='adam')
# full_train_150_epochs.train_save_model(input_dict=train_dict, save_path='../models/full_training_data/with_dropout_150_epochs_adam', epochs=150)
full_train_150_epochs.load('../models/full_training_data/with_dropout_150_epochs_adam')

The top 30 sequence lengths are:
[35, 35, 34, 33, 33, 33, 33, 33, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 30, 30]
The longest sequence length from the training and validation samples is 35
The average sequence length from the training and validation samples is 12
model loaded successfully!


In [10]:
'''
scores on validation set
'''
# Score the 150-epoch model on the validation set; BLEU-1..4 appear in the cell output.
full_train_150_val_captions = full_train_150_epochs.generate_captions_for_files(VALIDATION_FILENAMES, verbose=False)
full_train_150_val_bleu_scores = corpus_bleu_score(val_captions, full_train_150_val_captions)

BLEU-1: 0.34854186265286924
BLEU-2: 0.13711198910620406
BLEU-3: 0.06145770081029022
BLEU-4: 0.02654907368336846


In [11]:
'''
scores on test set
'''
# Score the 150-epoch model on the test set; BLEU-1..4 appear in the cell output.
full_train_150_test_captions = full_train_150_epochs.generate_captions_for_files(TEST_FILENAMES, verbose=False)
full_train_150_test_bleu_scores = corpus_bleu_score(test_captions, full_train_150_test_captions)

BLEU-1: 0.3364551627253571
BLEU-2: 0.12992880857508834
BLEU-3: 0.05701062669647279
BLEU-4: 0.025052644977417254
