# **Video Summarization Of TV SUM**

**Importing necessary libraries**

In [None]:
import tensorflow as tf
import numpy as np
import os
import scipy.io as spio
from tqdm import tqdm
from keras.preprocessing import image
import keras
import math
import h5py
import json
import datetime
from ortools.algorithms import pywrapknapsack_solver

In [None]:
# Identifiers of the 50 videos used in this notebook. Each entry is used as a
# file stem when the ground-truth '.mat' and frame '.npy' files are loaded, and
# the list order determines the "video_<n>" keys of `finalFiles`.
files = ['91IHQYk1IQM', 'EE-bNr36nyA', 'JgHubY5Vw3Y', 'RBCABdttQmI', 'Bhxk-O1Y7Ho', 'XkqCExn6_Us', 'HT5vyqe0Xaw', 'iVt07TCkFM0', 'E11zDS9XGzg', 'qqR6AEXwxoQ', 'cjibtmSLxQ4', '3eYKfiOEJNs', 'AwmHb44_ouw', 'akI8YFjEmUw', 'kLxoNp-UchI', 'GsAD1KT1xo8', 'EYqVtI9YWJA', 'Se3oxnaPsz0', 'vdmoEJ5YbrQ', 'VuWGsYPqAX8', '37rzWOQsNIw', 'oDXZc0tZe04', '4wU_LUjG5Ic', 'J0nA4VgnoCo', 'i3wAGJaaktw', 'sTEELN-vY30', 'xwqBXPGE9pQ', 'PJrm840pAUI', 'gzDbaEs1Rlg', 'uGu_10sucQo', 'WG0MBPpPC6I', '-esJrBWj2d8', 'xxdtq8mxegs', 'WxtbjNsCQ8A', 'fWutDQy1nnY', 'Hl-__g2gn_A', '98MoyGZKHXc', 'byxOvuiIJV0', 'NyBmCxDoHJU', 'b626MiF1ew4', 'XzYM3PfTM4w', 'Yi4Ij2NM7U4', 'xmEERLqJ2kU', 'LRw_obCPUt0', '_xMr-HKMfVA', 'eQu1rNs0an0', '0tmA_C6XwfM', 'jcoYJXDG9sw', 'z_6gVvQb2d0', 'JKpqYvAdIsw']


In [None]:
# Map the 1-based keys "video_1", "video_2", ... to the entries of `files`,
# in list order, then show the resulting lookup table.
finalFiles = {
    "video_" + str(position): video_id
    for position, video_id in enumerate(files, start=1)
}
print(finalFiles)

In [None]:
# Sanity check: number of "video_<n>" entries (shown as the cell output).
len(finalFiles)

In [None]:
# Number of consecutive frames grouped into one clip. Each clip is one sample
# fed to the model, and the generator zero-pads a video's last clip up to this
# length.
No_of_frames = 156

**Loading Ground Truth**

In [None]:
def RetrieveGroundTruth(VideoName, buffer_length=20000):
    """Load the per-frame ground-truth scores of one video.

    Reads ``/kaggle/input/tv-sum-gt/<VideoName>.mat`` and copies its
    ``gt_score`` entry into a zero-initialised float16 buffer, so that slices
    taken past the end of the video read as zeros.

    Args:
        VideoName: File stem of the ``.mat`` file (without the extension).
        buffer_length: Minimum length of the returned buffer. The buffer is
            enlarged automatically when the file holds more scores than this,
            instead of failing with an IndexError.

    Returns:
        A tuple ``(Y, length)``: ``Y`` is the zero-padded float16 score array
        and ``length`` is the number of scores stored in the file.
    """
    GroundTruth = spio.loadmat('/kaggle/input/tv-sum-gt/' + VideoName + '.mat', squeeze_me=True)

    # squeeze_me=True collapses a single-element array to a 0-d value, which
    # has no len(); force at least one dimension before measuring it.
    scores = np.atleast_1d(GroundTruth['gt_score'])
    length = len(scores)

    print(length)

    Y = np.zeros(shape=(max(buffer_length, length),), dtype=np.float16)
    # One vectorised copy (with the same float16 cast) replaces the
    # element-by-element Python loop.
    Y[:length] = scores

    return Y, length

**Generator-3**

In [None]:
def GeneratorForVideos(VideosName):
    """Yield fixed-length clips of frames together with their scores.

    The keys in ``VideosName`` are shuffled in place. Each video is then cut
    into consecutive clips of ``No_of_frames`` frames; the last clip of a
    video is zero-padded (frames and scores) up to the full clip length.

    Every clip goes through the same float16 buffer, i.e. the arithmetic that
    was previously applied only to the padded final clip. This avoids the
    large temporary produced by dividing the raw frame slice directly and the
    throw-away zero array that used to be allocated after every clip.

    Args:
        VideosName: Mutable sequence of keys of ``finalFiles`` (for example
            ``b"video_7"``). ``bytes`` keys, as delivered by
            ``tf.data.Dataset.from_generator``, are decoded as UTF-8; ``str``
            keys are used as they are.

    Yields:
        ``(X, GY)``: ``X`` is a float16 array of shape
        ``(1, No_of_frames, 156, 156, 3)`` holding the frames divided by 255,
        and ``GY`` is a float16 array of shape ``(1, No_of_frames)`` holding
        the matching ground-truth scores.
    """
    np.random.shuffle(VideosName)

    for key in VideosName:
        if isinstance(key, bytes):
            key = key.decode('utf-8')
        VideoName = finalFiles[key]

        Y, length = RetrieveGroundTruth(VideoName)

        frames = np.load('/kaggle/input/tv-sum/Npy/' + VideoName + '.npy')

        for start in range(0, length, No_of_frames):
            # Number of real (non-padding) frames in this clip.
            valid = min(No_of_frames, length - start)

            X = np.zeros((No_of_frames, 156, 156, 3), dtype=np.float16)
            X[:valid] = frames[start:start + valid]
            X /= 255

            # Pad the scores explicitly rather than relying on the
            # ground-truth buffer being longer than the video.
            GY = np.zeros((No_of_frames,), dtype=np.float16)
            clip_scores = Y[start:start + valid]
            GY[:len(clip_scores)] = clip_scores

            # Add the leading batch axis expected by the model.
            yield np.expand_dims(X, axis=0), np.expand_dims(GY, axis=0)

In [None]:
# Load the predefined train/test splits. The context manager closes the file
# handle as soon as the JSON has been parsed (it was previously left open).
with open('/kaggle/input/splits/tvsum_splits.json') as f:
    splits = json.load(f)

In [None]:
# Take both key lists from the split stored at index 4.
train_keys, test_keys = (
    splits[4][field] for field in ('train_keys', 'test_keys')
)

In [None]:
# Key lists handed to the data generators; the split's test keys are used as
# the validation set.
partition = dict(train=train_keys, validation=test_keys)

In [None]:
# Both datasets stream single-clip batches from GeneratorForVideos and differ
# only in the list of video keys passed to the generator.
training_generator = tf.data.Dataset.from_generator(
    GeneratorForVideos,
    output_types=(tf.float16, tf.float16),
    output_shapes=((1, No_of_frames, 156, 156, 3), (1, No_of_frames)),
    args=[partition['train']],
)
validation_generator = tf.data.Dataset.from_generator(
    GeneratorForVideos,
    output_types=(tf.float16, tf.float16),
    output_shapes=((1, No_of_frames, 156, 156, 3), (1, No_of_frames)),
    args=[partition['validation']],
)

**Defining the Model**

In [None]:
class StepOneAttention(tf.keras.Model):
    """First attention stage: re-weights the feature volume along axis 1.

    An additive score ``V(tanh(W1(features) + W2(hidden)))`` is computed for
    every position of ``features`` and normalised with a softmax over axis 1.
    """

    def __init__(self, units):
        super().__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, features, hidden):
        """Return the weighted features and the attention weights.

        Args:
            features: Encoder output; presumably shaped
                (batch_size, no. of frames, height, width, channels).
            hidden: Decoder state; presumably (batch_size, hidden_size).

        Returns:
            ``(context_vector, attention_weights)`` where ``context_vector``
            is ``attention_weights * features``. No axis is summed out here.
        """
        # Insert three singleton axes after the batch axis so the projected
        # state broadcasts against the 5-D projected features.
        for _ in range(3):
            hidden = tf.expand_dims(hidden, 1)

        projected = self.W1(features) + self.W2(hidden)

        # Unnormalised score for each feature position.
        score = self.V(tf.nn.tanh(projected))

        # Normalise the scores along axis 1.
        attention_weights = tf.nn.softmax(score, axis=1)

        return attention_weights * features, attention_weights

In [None]:
class StepTwoAttention(tf.keras.Model):
    """Second attention stage: pools the features into one context vector.

    Axes 2 and 3 of the input are summed away first; the remaining entries
    along axis 1 are then scored additively against the decoder state,
    soft-maxed over axis 1 and combined into a weighted sum.
    """

    def __init__(self, units):
        super().__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, features, hidden):
        """Return the pooled context vector and the attention weights.

        Args:
            features: Output of the first attention stage; presumably shaped
                (batch_size, no. of frames, height, width, channels).
            hidden: Decoder state; presumably (batch_size, hidden_size).

        Returns:
            ``(context_vector, attention_weights)``; ``context_vector`` is the
            attention-weighted sum of the reduced features over axis 1.
        """
        # Sum out axis 2 twice in a row: the first pass removes the original
        # axis 2, the second removes what used to be axis 3.
        for _ in range(2):
            features = tf.reduce_sum(features, axis=2)

        # Give the state a singleton axis 1 so it broadcasts over that axis.
        hidden = tf.expand_dims(hidden, 1)

        energy = tf.nn.tanh(self.W1(features) + self.W2(hidden))

        # Unnormalised scores, normalised along axis 1.
        attention_weights = tf.nn.softmax(self.V(energy), axis=1)

        # Weighted sum over axis 1 yields a single vector per batch entry.
        context_vector = tf.reduce_sum(attention_weights * features, axis=1)

        return context_vector, attention_weights

In [None]:
class Decoder(tf.keras.Model):
    """One decoding step: two-stage attention, a GRU and two dense layers."""

    def __init__(self, units):
        super().__init__()
        self.units = units

        self.gru = tf.keras.layers.GRU(
            self.units,
            return_sequences=True,
            return_state=True,
            recurrent_initializer='glorot_uniform',
        )
        self.fc1 = tf.keras.layers.Dense(300, activation='relu')
        self.fc2 = tf.keras.layers.Dense(1, activation='relu')
        self.step_one_attention = StepOneAttention(self.units)
        self.step_two_attention = StepTwoAttention(self.units)

    def call(self, features, hidden):
        """Produce one score from the features and the previous state.

        Args:
            features: Encoder feature volume passed to the attention stages.
            hidden: Previous decoder state; presumably
                (batch_size, hidden_size).

        Returns:
            ``(x, state, attention_weights)``: the output of ``fc2``, the new
            GRU state, and the weights of the second attention stage (the
            first stage's weights are not returned).
        """
        # Attention lives in two separate sub-models applied back to back.
        weighted_features, _ = self.step_one_attention(features, hidden)
        context_vector, attention_weights = self.step_two_attention(
            weighted_features, hidden
        )

        # The previous state reaches the GRU as part of its input: it is
        # concatenated with the context vector along the last axis, and the
        # GRU is called without an explicit initial state.
        gru_input = tf.concat(
            [tf.expand_dims(context_vector, 1), tf.expand_dims(hidden, 1)],
            axis=-1,
        )
        output, state = self.gru(gru_input)

        x = self.fc1(output)

        # Merge the two leading axes so fc2 sees a 2-D input.
        x = tf.reshape(x, (-1, x.shape[2]))

        return self.fc2(x), state, attention_weights

In [None]:
# Single decoder instance (1024 units); multiAttention calls this same object
# for every decoding step, so all steps share its weights.
decoder = Decoder(1024)

In [None]:
def multiAttention(features):
    """Run the shared decoder ``No_of_frames`` times and join its outputs.

    Each step feeds the same ``features`` and the state returned by the
    previous step into the module-level ``decoder``. The per-step predictions
    are collected and concatenated once along the last axis, instead of
    re-concatenating a growing tensor at every step.

    Args:
        features: Feature volume produced by the convolutional encoder.

    Returns:
        The step predictions concatenated along the last axis, or a
        ``(1, 1)`` zero tensor when ``No_of_frames`` is 0.
    """
    # Initial decoder state. Its fixed leading dimension of 1 means the model
    # is built for one clip per batch.
    hidden = tf.zeros((1, 1024))

    predictions = []
    for _step in range(No_of_frames):
        prediction, hidden, _ = decoder(features, hidden)
        predictions.append(prediction)

    if not predictions:
        # Keep the original fallback value for the zero-step case.
        return tf.zeros((1, 1))
    return tf.concat(predictions, axis=-1)

In [None]:
def conv3DMultiAttentionModel():
    """Build the Conv3D encoder followed by the multi-attention decoder.

    The encoder is five Conv3D stages, each followed by a (1, 2, 2) max
    pooling; stages two to four additionally end with batch normalisation.
    The encoder output is passed through ``multiAttention`` and a final
    sigmoid dense layer with ``No_of_frames`` units.

    Returns:
        An uncompiled ``tf.keras.Model`` taking inputs of shape
        ``(No_of_frames, 156, 156, 3)`` per sample.
    """
    layers = tf.keras.layers

    # (filters, kernel size, batch normalisation after pooling?) per stage.
    # Layers are created in exactly this order.
    encoder_stages = (
        (32, (3, 3, 3), False),
        (64, (3, 3, 3), True),
        (128, (3, 3, 3), True),
        (256, (1, 1, 1), True),
        (1024, (1, 1, 1), False),
    )

    X_input = tf.keras.Input(shape=(No_of_frames, 156, 156, 3))

    X = X_input
    for filters, kernel_size, with_batch_norm in encoder_stages:
        X = layers.Conv3D(filters, kernel_size, activation='relu', padding='same')(X)
        # Pool only over the two spatial axes; axis 1 keeps its length.
        X = layers.MaxPooling3D(pool_size=(1, 2, 2))(X)
        if with_batch_norm:
            X = layers.BatchNormalization()(X)

    # Decoder with Multi-Attention
    X = multiAttention(X)

    output = layers.Dense(No_of_frames, activation='sigmoid')(X)

    return tf.keras.Model(inputs=X_input, outputs=output)

In [None]:
# Instantiate the network; it is compiled in a later cell.
model = conv3DMultiAttentionModel()

In [None]:
# Initialise the model from previously saved weights. The file name suggests
# an epoch-8 checkpoint with loss 0.014 (presumably from an earlier run;
# confirm which split it belongs to).
model.load_weights("/kaggle/input/tvsummodelsplit1/model.epoch08-loss0.014.h5")

In [None]:
# Adam with a learning rate of 1e-4. The optimizer is taken from tf.keras,
# the same namespace the model and callbacks are built from, rather than from
# the standalone keras package, so the two implementations are not mixed.
opt = tf.keras.optimizers.Adam(learning_rate=1e-4)

In [None]:
# Regress the per-frame scores with an MSE loss and report MAE alongside it.
model.compile(
    optimizer=opt,
    loss='mean_squared_error',
    metrics=['mae'],
)

In [None]:
# TensorBoard logs go to a timestamp-named directory under ../working/, with
# histograms written every epoch.
log_dir = os.path.join(
    "../working/",
    datetime.datetime.now().strftime("%Y%m%d-%H%M%S"),
)
tensorboard_callback = tf.keras.callbacks.TensorBoard(
    histogram_freq=1,
    log_dir=log_dir,
)

In [None]:
# Save the weights after every epoch (save_best_only=False). The monitored
# quantity is 'val_loss' with mode='min': the model is compiled with an MSE
# loss and an MAE metric only, so a 'val_accuracy' value is never logged.
checkpoint = tf.keras.callbacks.ModelCheckpoint(
    'model.epoch={epoch:02d}-loss={loss:.3f}.h5',
    monitor='val_loss',
    verbose=1,
    save_weights_only=True,
    save_best_only=False,
    mode='min',
)

In [None]:
# Train for 8 epochs, evaluating on the validation dataset after each epoch
# and running the checkpoint and TensorBoard callbacks.
history = model.fit(
    x=training_generator,
    validation_data=validation_generator,
    epochs=8,
    callbacks=[checkpoint, tensorboard_callback],
    verbose=2,
)

Visualization

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Pull the per-epoch curves recorded by model.fit out of the History object.
mae, mse, val_mae, val_mse = (
    history.history[key] for key in ('mae', 'loss', 'val_mae', 'val_loss')
)

# x-axis positions: one per recorded epoch, starting at 0.
epochs = range(len(mae))

In [None]:
# MAE per epoch for the training and validation data.
plt.figure(figsize=(15, 8))
for curve, curve_label in ((mae, 'train'), (val_mae, 'valid')):
    plt.plot(epochs, curve, label=curve_label)
plt.title('Training and validation MAE')
plt.legend()

In [None]:
# MSE (the training loss) per epoch for the training and validation data.
plt.figure(figsize=(15, 8))
for curve, curve_label in ((mse, 'train'), (val_mse, 'valid')):
    plt.plot(epochs, curve, label=curve_label)
plt.title('Training and validation MSE / loss')
plt.legend()