In [1]:
from google.colab import drive
drive.mount('/content/drive')
print('mount success')

Mounted at /content/drive
mount success


In [2]:
import os
import json
from sklearn.model_selection import train_test_split

In [3]:
# training features
train_features_path = '/content/drive/Shareddrives/msvd-train-feats/train/custom_feat'
# training captions
train_captions_path = '/content/drive/Shareddrives/msvd-dataset/train/train_video_captions_refined.json'

# validation features
val_features_path = '/content/drive/Shareddrives/msvd-test-feats/val/custom_feat'
# validation captions
val_captions_path = '/content/drive/Shareddrives/msvd-dataset/val/val_video_captions_refined.json'

# test features
test_features_path = '/content/drive/Shareddrives/msvd-test-feats/test/custom_feat'
# test captions
test_captions_path = '/content/drive/Shareddrives/msvd-dataset/test/test_video_captions_refined.json'

In [4]:
# label file
TRAIN_LABEL_PATH = train_captions_path
VAL_LABEL_PATH = val_captions_path

# training labels
with open(TRAIN_LABEL_PATH) as train_data_file:
    train_labels = json.load(train_data_file)

# validation labels
with open(VAL_LABEL_PATH) as val_data_file:
    val_labels = json.load(val_data_file)

## MODEL TRAINING

In [5]:
!pip install keras_preprocessing

Collecting keras_preprocessing
  Downloading Keras_Preprocessing-1.1.2-py2.py3-none-any.whl (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.6/42.6 kB[0m [31m880.6 kB/s[0m eta [36m0:00:00[0m
Installing collected packages: keras_preprocessing
Successfully installed keras_preprocessing-1.1.2


In [9]:
import json
import os
import random

import keras
import numpy as np
from keras.callbacks import EarlyStopping
from keras.layers import Input, LSTM, Dense
from keras.models import Model
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical

import joblib

In [10]:
from tensorflow.keras.layers import Input, LSTM, Dense, Concatenate
from tensorflow.keras.models import Model
import tensorflow as tf

class CustomAttention(tf.keras.layers.Layer):
    def __init__(self, **kwargs):
        super(CustomAttention, self).__init__(**kwargs)

    def build(self, input_shape):
        self.num_categories = 4
        self.num_tokens = input_shape[1]

        # Initialize weights for each category
        self.wt1 = self.add_weight(name='wt1', shape=(self.num_tokens,), initializer='ones', trainable=True)
        self.wt2 = self.add_weight(name='wt2', shape=(self.num_tokens,), initializer='ones', trainable=True)
        self.wt3 = self.add_weight(name='wt3', shape=(self.num_tokens,), initializer='ones', trainable=True)
        self.wt4 = self.add_weight(name='wt4', shape=(self.num_tokens,), initializer='ones', trainable=True)

        super(CustomAttention, self).build(input_shape)

    def call(self, inputs):
        # Split the input into different categories
        print(inputs.shape)
        wt1_inputs = tf.slice(inputs, [0, 0, 0], [1, inputs.shape[1], 4096])  # Tokens 0-4095

        print('wt_inputs shapes')
        print(wt1_inputs.shape)

        # Gather specific tokens for wt2 and wt3 using tf.gather
        wt2_indices = tf.constant([4100, 4106, 4112, 4118, 4124]) #confidence score
        wt2_inputs = tf.gather(inputs, wt2_indices, axis=2)
        wt2_inputs = tf.reshape(wt2_inputs, (1,80,5))
        print(wt2_inputs.shape)

        wt3_indices = tf.constant([4101, 4107, 4113, 4119, 4125]) #classid
        wt3_inputs = tf.gather(inputs, wt3_indices, axis=2)
        wt3_inputs = tf.reshape(wt3_inputs, (1,80,5))
        print(wt3_inputs.shape)

        #coords
        wt4_indices = tf.concat([
          tf.range(4096, 4100, dtype=tf.int32),  # Elements from 4096 to 4099 (inclusive)
          tf.range(4102, 4106, dtype=tf.int32),  # Elements from 4102 to 4105 (inclusive)
          tf.range(4108, 4112, dtype=tf.int32),  # Elements from 4108 to 4111 (inclusive)
          tf.range(4114, 4118, dtype=tf.int32),  # Elements from 4114 to 4117 (inclusive)
          tf.range(4120, 4124, dtype=tf.int32)   # Elements from 4120 to 4123 (inclusive)
        ], axis=0)
        wt4_inputs = tf.gather(inputs, wt4_indices, axis=2)
        wt4_inputs = tf.reshape(wt4_inputs, (1,80,20))
        print(wt4_inputs.shape)

        # Apply weights to each category
        print(self.wt1.shape, ' weight 1 ', wt1_inputs)
        self.wt1 = tf.expand_dims(self.wt1, axis=0)
        print(self.wt1.shape, ' weight 2', wt1_inputs)
        wt1_weighted = self.wt1[:, :, tf.newaxis] * wt1_inputs
        print(wt1_weighted.shape)

        self.wt2 = tf.expand_dims(self.wt2, axis=0)
        wt2_weighted = self.wt2[:, :, tf.newaxis] * wt2_inputs
        print(wt2_weighted.shape)

        self.wt3 = tf.expand_dims(self.wt3, axis=0)
        wt3_weighted = self.wt3[:, :, tf.newaxis] * wt3_inputs
        print(wt3_weighted.shape)

        self.wt4 = tf.expand_dims(self.wt4, axis=0)
        wt4_weighted = self.wt4[:, :, tf.newaxis] * wt4_inputs
        print(wt4_weighted.shape)

        # Concatenate weighted inputs
        weighted_inputs = tf.concat([wt1_weighted, wt2_weighted, wt3_weighted, wt4_weighted], axis=2)

        # Calculate attention scores
        attention_scores = tf.reduce_sum(weighted_inputs, axis=-1, keepdims=True)

        # Apply softmax to get attention weights
        attention_weights = tf.nn.softmax(attention_scores, axis=1)

        # Apply attention weights to inputs
        attended_inputs = inputs * attention_weights
        print('final return',attended_inputs.shape)

        return attended_inputs

    def compute_output_shape(self, input_shape):
        return input_shape

In [13]:
class VideoDescriptionTrain():
    """
    Initialize the parameters for the model
    """

    def __init__(self):
        # self.train_path = train_path
        self.train_features_path = train_features_path
        self.val_features_path = val_features_path
        self.max_length = 10
        #self.batch_size = 320
        self.batch_size = 640
        self.lr = 0.0007
        self.epochs = 120
        self.latent_dim = 512
        self.num_encoder_tokens = 4126
        self.num_decoder_tokens = 1500
        self.time_steps_encoder = 80
        self.time_steps_decoder = None
        self.x_data = {}

        # processed data
        self.tokenizer = None
        # models
        self.encoder_model = None
        self.decoder_model = None
        self.inf_encoder_model = None
        self.inf_decoder_model = None
        self.save_model_path = '/content/drive/Shareddrives/FYP-models/cust_att_1500'

    # caption preprocessing
    def preprocessing(self):
        """
        Preprocessing the data
        dumps values of the json file into a list
        """

        # train_list contains all the captions with their video ID
        # vocab_list contains all the vocabulary from training data
        training_list = []
        vocab_list = []
        validation_list = []

        #training data
        for y in train_labels:
          for caption in y['caption']:
            caption = "<bos> " + caption + " <eos>"
            # we are only using sentences whose length lie between 6 and 10
            if len(caption.split())>10 or len(caption.split())<6:
              continue
            else:
              training_list.append([caption, y['id']])

        #validation data
        for y in val_labels:
          for caption in y['caption']:
            caption = "<bos> " + caption + " <eos>"
            # we are only using sentences whose length lie between 6 and 10
            if len(caption.split())>10 or len(caption.split())<6:
              continue
            else:
              validation_list.append([caption, y['id']])


        # training_list = train_list
        # validation_list = val_list

        print('val len', len(validation_list))
        print('train len', len(training_list))

        for train in training_list:
            vocab_list.append(train[0]) # vocab_list here has all captions

        # caption vocabulary
        self.tokenizer = Tokenizer(num_words=1500)
        self.tokenizer.fit_on_texts(vocab_list)

        #------ loading training features from cnn & yolo numpy files to dictionary

        TRAIN_FEATURE_DIR = os.path.join(self.train_features_path, 'feat')
        YOLO_FEATURE_DIR = os.path.join(self.train_features_path, 'yolo-feat')

        # Loading all the numpy arrays at once and saving them in a dictionary
        for filename in os.listdir(TRAIN_FEATURE_DIR):
            cnn_file_path = os.path.join(TRAIN_FEATURE_DIR, filename)
            yolo_file_path = os.path.join(YOLO_FEATURE_DIR, filename)

            try:
                # Check if the file is empty
                if os.path.getsize(cnn_file_path) == 0:
                    print(f"Warning: CNN npy file {filename} is empty.")
                if os.path.getsize(yolo_file_path) == 0:
                    print(f"Warning: YOLO npy file {filename} is empty.")

                # Load the numpy array
                cnn_f = np.load(cnn_file_path, allow_pickle=True)
                yolo_f = np.load(yolo_file_path, allow_pickle=True)
                yolo_f = yolo_f.reshape(-1, 30)

                #concatenate arrays
                combined_f = np.concatenate((cnn_f, yolo_f), axis=1)
                print(combined_f.shape)

                # Add the array to the dictionary
                self.x_data[filename[:-4]+'.avi'] = combined_f

            except Exception as e:
                # Handle exceptions (e.g., corrupted file)
                print(f"Error loading file {filename}: {str(e)}")

        # Perform additional checks on x_data if needed
        if len(self.x_data) == 0:
            print("Warning: No data loaded. Check the integrity of your files.")


        #------ loading validation features from cnn & yolo numpy files to dictionary

        TRAIN_FEATURE_DIR = os.path.join(self.val_features_path, 'feat')
        YOLO_FEATURE_DIR = os.path.join(self.val_features_path, 'yolo-feat')

        # Loading all the numpy arrays at once and saving them in a dictionary
        for filename in os.listdir(TRAIN_FEATURE_DIR):
            cnn_file_path = os.path.join(TRAIN_FEATURE_DIR, filename)
            yolo_file_path = os.path.join(YOLO_FEATURE_DIR, filename)

            try:
                # Check if the file is empty
                if os.path.getsize(cnn_file_path) == 0:
                    print(f"Warning: CNN npy file {filename} is empty.")
                if os.path.getsize(yolo_file_path) == 0:
                    print(f"Warning: YOLO npy file {filename} is empty.")

                # Load the numpy array
                cnn_f = np.load(cnn_file_path, allow_pickle=True)
                yolo_f = np.load(yolo_file_path, allow_pickle=True)
                yolo_f = yolo_f.reshape(-1, 30)

                #concatenate arrays
                combined_f = np.concatenate((cnn_f, yolo_f), axis=1)
                print(combined_f.shape)

                # Add the array to the dictionary
                self.x_data[filename[:-4]+'.avi'] = combined_f

            except Exception as e:
                # Handle exceptions (e.g., corrupted file)
                print(f"Error loading val file {filename}: {str(e)}")

        # Perform additional checks on x_data if needed
        if len(self.x_data) == 0:
            print("Warning: No data loaded. Check the integrity of your val files.")


        return training_list, validation_list


    # for feeding dataset into model
    def load_dataset(self, training_list):
        """
        Loads the dataset in batches for training
        :return: batch of data
        """
        encoder_input_data = []
        decoder_input_data = []
        decoder_target_data = []
        videoId = []
        videoSeq = []

        for idx, cap in enumerate(training_list):
            caption = cap[0]
            videoId.append(cap[1])
            videoSeq.append(caption)

        #tokenizing caption to be fed into encoder
        train_sequences = self.tokenizer.texts_to_sequences(videoSeq)
        train_sequences = np.array(train_sequences, dtype=object)

        #pad captions to a max len of 10
        train_sequences = pad_sequences(train_sequences, padding='post', truncating='post', maxlen=self.max_length)
        file_size = len(train_sequences)

        #create batches of data for feeding into the model
        n = 0
        for i in range(self.epochs):
            for idx in range(0, file_size):
                n += 1
                encoder_input_data.append(self.x_data[videoId[idx]])
                y = to_categorical(train_sequences[idx], self.num_decoder_tokens)
                decoder_input_data.append(y[:-1])
                decoder_target_data.append(y[1:])

                if n == self.batch_size:
                    encoder_input = np.array(encoder_input_data)
                    decoder_input = np.array(decoder_input_data)
                    decoder_target = np.array(decoder_target_data)
                    encoder_input_data = []
                    decoder_input_data = []
                    decoder_target_data = []
                    n = 0
                    yield ([encoder_input, decoder_input], decoder_target)

    def train_model(self):
        """
        an encoder decoder sequence to sequence model
        reference : https://arxiv.org/abs/1505.00487
        """
        time_steps_encoder=80
        num_encoder_tokens=4126
        latent_dim=512
        time_steps_decoder=10
        num_decoder_tokens=1500
        batch_size=640

        # Custom Attention Layer
        attention = CustomAttention()

        # Encoder
        encoder_inputs = Input(shape=(time_steps_encoder, num_encoder_tokens), name="encoder_inputs")
        print(encoder_inputs.shape)

        attention_vector = attention(encoder_inputs)
        print('attention vector shape', attention_vector.shape)

        encoder = LSTM(latent_dim, return_state=True,return_sequences=True, name='endcoder_lstm')
        encoder_outputs, state_h, state_c = encoder(attention_vector)
        encoder_states = [state_h, state_c]

        # Decoder
        decoder_inputs = Input(shape=(time_steps_decoder, num_decoder_tokens), name="decoder_inputs")
        decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True, name='decoder_lstm')
        decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states)
        decoder_dense = Dense(num_decoder_tokens, activation='softmax', name='decoder_relu')
        decoder_outputs = decoder_dense(decoder_outputs)

        # Model
        model = Model([attention_vector, decoder_inputs], decoder_outputs)
        model.summary()
        training_list, validation_list = self.preprocessing()

        train = self.load_dataset(training_list)
        valid = self.load_dataset(validation_list)

        early_stopping = EarlyStopping(monitor='val_loss', patience=4, verbose=1, mode='min')

        # Run training
        opt = keras.optimizers.Adam(learning_rate=0.0003)
        reduce_lr = keras.callbacks.ReduceLROnPlateau(monitor="val_loss",
                                                      factor=0.1, patience=5, verbose=0,
                                                      mode="auto")
        model.compile(metrics=['accuracy'], optimizer=opt, loss='categorical_crossentropy')

        validation_steps = len(validation_list)//self.batch_size
        steps_per_epoch = len(training_list)//self.batch_size
        print('val len', len(validation_list))
        print('train len', len(training_list))
        print('val steps', validation_steps)
        print('batch size', batch_size)
        print('spe', steps_per_epoch)

        model.fit(train, validation_data=valid, validation_steps=validation_steps,
                  epochs=self.epochs, steps_per_epoch=steps_per_epoch,
                  callbacks=[reduce_lr, early_stopping])

        if not os.path.exists(self.save_model_path):
            os.makedirs(self.save_model_path)

        self.encoder_model = Model(attention_vector, encoder_states)
        decoder_state_input_h = Input(shape=(self.latent_dim,))
        decoder_state_input_c = Input(shape=(self.latent_dim,))
        decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
        decoder_outputs, state_h, state_c = decoder_lstm(
            decoder_inputs, initial_state=decoder_states_inputs)
        decoder_states = [state_h, state_c]
        decoder_outputs = decoder_dense(decoder_outputs)
        self.decoder_model = Model(
            [decoder_inputs] + decoder_states_inputs,
            [decoder_outputs] + decoder_states)
        self.encoder_model.summary()
        self.decoder_model.summary()

        # saving the models
        self.encoder_model.save(os.path.join(self.save_model_path, 'encoder_model.h5'))
        self.decoder_model.save_weights(os.path.join(self.save_model_path, 'decoder_model_weights.h5'))
        with open(os.path.join(self.save_model_path, 'tokenizer' + str(self.num_decoder_tokens)), 'wb') as file:
            joblib.dump(self.tokenizer, file)

In [None]:
video_to_text = VideoDescriptionTrain()
video_to_text.train_model()

(None, 80, 4126)
(None, 80, 4126)
wt_inputs shapes
(1, 80, 4096)
(1, 80, 5)
(1, 80, 5)
(1, 80, 20)
(80,)  weight 1  Tensor("custom_attention_1/Slice:0", shape=(1, 80, 4096), dtype=float32)
(1, 80)  weight 2 Tensor("custom_attention_1/Slice:0", shape=(1, 80, 4096), dtype=float32)
(1, 80, 4096)
(1, 80, 5)
(1, 80, 5)
(1, 80, 20)
final return (None, 80, 4126)
attention vector shape (None, 80, 4126)
Model: "model_4"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_6 (InputLayer)        [(None, 80, 4126)]           0         []                            
                                                                                                  
 decoder_inputs (InputLayer  [(None, 10, 1500)]           0         []                            
 )                                                                                        