# Setup

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import random
import cv2
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from keras.models import Sequential, load_model, Model
from keras.layers import Input, average, concatenate, GlobalAveragePooling2D
from keras.layers import TimeDistributed, GlobalAveragePooling1D
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.models import Sequential, load_model, Model
from pathlib import Path
import os
import re

from keras.layers.core import Dense,Flatten
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam

In [2]:
# Dataset locations, relative to the notebook's working directory.
path = './data/hmdb51'
path_rowframes = './data/hmdb51/rawframes/'
path_annotations = './data/hmdb51/annotations/'

# Parameters shared by both streams.
img_height = img_width = 224
image_shape = (img_height, img_width)
num_classes, batch_size = 51, 32

# Temporal batch generator parameters.
opt_flow_len, num_of_snip = 10, 1

# Evaluation parameters.
fuse_method = 'average'


# Batch Generation Setting

In [None]:
class DataSet():
    """Serve validation batches for the two-stream (RGB frame + optical flow) model.

    Every sample is built from one row of the annotation file and consists of
    a static RGB frame plus a stack of ``2 * opt_flow_len`` optical-flow images
    (x and y components interleaved) with the channel axis last.

    NOTE(review): static frames are fed here as BGR scaled to [0, 1], while the
    single-video spatial prediction cell feeds RGB in the 0-255 range. Confirm
    which of the two matches how the spatial model was trained.
    NOTE(review): the flow crop is drawn with ``random.random()``, so validation
    metrics are only reproducible if the ``random`` module is seeded.
    """

    def __init__(self, 
                 num_of_snip=1, 
                 opt_flow_len=10, 
                 image_shape=(224, 224),
                 partition='val',
                 batch_size = batch_size):
        """Read the annotation table and pre-compute the number of full batches.

        Args:
            num_of_snip: number of snippets sampled per video.
            opt_flow_len: number of optical-flow frames per stacked optical
                flow (snippet).
            image_shape: target (height, width) of the arrays fed to the models.
            partition: dataset split to load; only 'val' is supported.
            batch_size: number of videos per batch.
        """
        self.opt_flow_len = opt_flow_len
        self.num_of_snip = num_of_snip
        self.image_shape = image_shape
        self.opt_flow_path = os.path.join(path_rowframes)
        self.path_annotations = path_annotations
        self.partition = partition
        self.batch_size = batch_size

        # Get data. A trailing incomplete batch is dropped by the floor division.
        self.video_list = self.find_videos_and_metadata()
        self.n_batch = len(self.video_list) // self.batch_size

    def find_videos_and_metadata(self):
        """Load the annotation table (path, num_frames_tot, class) of the split.

        Raises:
            ValueError: if the partition is not 'val'.
        """
        if self.partition != 'val':
            raise ValueError("invalid partition")
        video_list = pd.read_csv(f'{self.path_annotations}/hmdb51_val_split_1_rawframes.txt', sep=" ", header=None)
        video_list.columns = ["path", "num_frames_tot", "class"]
        return video_list

    def _check_has_batches(self):
        """Fail early, with a readable message, when no full batch can be built."""
        if self.n_batch == 0:
            raise ValueError(
                f"not enough videos ({len(self.video_list)}) to build a batch of {self.batch_size}")

    def _build_batch(self, idx):
        """Assemble batch number ``idx`` as ``([X_spatial, X_temporal], y)``."""
        spatial_batch, temporal_batch, target_batch = [], [], []
        first = idx * self.batch_size
        for _, row in self.video_list.iloc[first: first + self.batch_size].iterrows():
            X_spatial, X_temporal = self.find_frame_and_stacked_optical_flows(row)
            spatial_batch.append(X_spatial)
            temporal_batch.append(X_temporal)
            target_batch.append(np.squeeze(np.array(row['class'])))
        return [np.array(spatial_batch), np.array(temporal_batch)], np.array(target_batch)

    def val_generator(self):
        """Yield validation batches forever, cycling 0, 1, ..., n_batch - 1, 0, ...

        Raises:
            ValueError: on the first ``next()`` if there is no full batch.
        """
        self._check_has_batches()
        idx = 0
        while True:
            yield self._build_batch(idx)
            # Advance only after yielding so that the first batch is batch 0.
            idx = (idx + 1) % self.n_batch

    def val_generator1(self):
        """Return one fixed batch (index 1, or 0 when only one batch exists).

        Raises:
            ValueError: if there is no full batch.
        """
        self._check_has_batches()
        return self._build_batch(1 % self.n_batch)

    @staticmethod
    def _read_image(file_path, flags=None):
        """Read an image with OpenCV, raising instead of returning ``None``.

        Raises:
            FileNotFoundError: if the file is missing or cannot be decoded.
        """
        image = cv2.imread(file_path) if flags is None else cv2.imread(file_path, flags)
        if image is None:
            raise FileNotFoundError(f"could not read image: {file_path}")
        return image

    @staticmethod
    def _frame_file(video_dir, prefix, index):
        """Path of frame ``index`` of kind ``prefix`` ('img', 'flow_x', 'flow_y')."""
        return os.path.join(f'./{video_dir}' + f'/{prefix}_' + str("%05d" % (index)) + '.jpg')

    def find_frame_and_stacked_optical_flows(self, row):
        """Load the static frame(s) and the stacked optical flow of one video.

        Args:
            row: annotation row with the keys 'path' and 'num_frames_tot'.

        Returns:
            Tuple ``(static_frames, opt_flow_stack)``: the squeezed array of
            static frames (one per snippet) and the flow stack of the last
            snippet.

        Raises:
            ValueError: if the video has fewer frames than ``opt_flow_len``.
            FileNotFoundError: if a required frame or flow image is unreadable.
        """
        static_frames = []
        opt_flow_stack = []

        # Temporal parameters: number of admissible snippet start positions.
        total_frames = row['num_frames_tot']
        n_start_positions = total_frames - self.opt_flow_len + 1
        if n_start_positions < 1:
            raise ValueError(
                f"video {row['path']} has {total_frames} frames, fewer than opt_flow_len={self.opt_flow_len}")
        if n_start_positions < self.num_of_snip:
            # Not enough distinct start positions: reuse them cyclically.
            loop = True
            start_frame_window_len = 1
        else:
            loop = False
            # Starting-frame selection window length.
            start_frame_window_len = n_start_positions // self.num_of_snip

        # Spatial parameters: one random crop window, sized from the first RGB
        # frame and applied to every flow image of the video.
        video_dir = row['path'].replace("\\", "/")
        reference = self._read_image(self._frame_file(video_dir, 'img', 1), cv2.IMREAD_GRAYSCALE)
        # Clamp at 0 so frames smaller than the target never give a negative offset.
        top = int(max(reference.shape[0] - self.image_shape[0], 0) * random.random())
        left = int(max(reference.shape[1] - self.image_shape[1], 0) * random.random())
        right = left + self.image_shape[1]
        bottom = top + self.image_shape[0]

        # cv2.resize expects (width, height); image_shape is (height, width).
        target_size = (self.image_shape[1], self.image_shape[0])

        for i_snip in range(self.num_of_snip):
            if loop:
                start_frame = i_snip % n_start_positions + 1
            else:
                # Centre of the i-th selection window.
                start_frame = int(0.5 * start_frame_window_len + 0.5) + start_frame_window_len * i_snip

            # Static frame: scaled to [0, 1] and resized (not cropped).
            static_frame = self._read_image(self._frame_file(video_dir, 'img', start_frame))
            static_frame = static_frame / 255.0
            static_frame = cv2.resize(static_frame, target_size)
            static_frames.append(static_frame)

            # Optical-flow stack: x and y components interleaved frame by frame.
            # NOTE(review): the stack is rebuilt for every snippet, so with
            # num_of_snip > 1 only the last snippet's flows are returned -
            # confirm this is intended.
            opt_flow_stack = []
            for i_frame in range(start_frame, start_frame + self.opt_flow_len):
                for component in ('flow_x', 'flow_y'):
                    flow = self._read_image(
                        self._frame_file(video_dir, component, i_frame), cv2.IMREAD_GRAYSCALE)
                    flow = flow - np.mean(flow)  # mean subtraction
                    flow = flow[top : bottom, left : right]
                    flow = flow / 255.  # normalize pixels
                    flow = cv2.resize(flow, target_size)
                    opt_flow_stack.append(flow)

            opt_flow_stack = np.array(opt_flow_stack)
            # NOTE(review): swapaxes(0, 2) makes the array channels-last but also
            # swaps the two spatial axes, unlike predict_video_temp, which keeps
            # them in order. Left unchanged because the trained temporal model
            # may depend on it - confirm against the training pipeline.
            opt_flow_stack = np.swapaxes(opt_flow_stack, 0, 2)

        return (np.array(static_frames).squeeze(), np.array(opt_flow_stack))


In [None]:
# Build the validation dataset and materialise one fixed batch,
# `img == ([X_spatial, X_temporal], y)`, for manual inspection.
data_val = DataSet(num_of_snip, opt_flow_len, image_shape, 'val', batch_size)
img = data_val.val_generator1()

# Model fusion

In [None]:
def two_stream_fuse(spatial_model, temporal_model):
    """Combine the two streams into one model that averages their outputs.

    Args:
        spatial_model: model of the RGB-frame stream.
        temporal_model: model of the stacked optical-flow stream.

    Returns:
        A model taking ``[spatial_input, temporal_input]`` and returning the
        element-wise average of the two stream outputs.

    Note: nothing is frozen here; both streams keep whatever ``trainable``
    state they arrive with.
    """
    streams = (spatial_model, temporal_model)
    fused_output = average([stream.output for stream in streams])
    return Model(inputs=[stream.input for stream in streams], outputs=fused_output)

In [5]:
# Load the two pre-trained stream models.
# NOTE(review): absolute Google Drive paths - these presumably only resolve in a
# Colab session with Drive mounted under /content/gdrive; confirm before running elsewhere.
spatial_model = load_model('/content/gdrive/MyDrive/Deep learning project Pasinato Carbone Scuri/spatial_model_finetuned_resnet.hdf5')
temporal_model = load_model('/content/gdrive/MyDrive/Deep learning project Pasinato Carbone Scuri/temporal_model_cnn.h5')

In [None]:
# Print the layer-by-layer summary of the spatial stream.
spatial_model.summary()

In [None]:
two_stream_model.summary()

In [None]:
# Model
two_stream_model = two_stream_fuse(spatial_model, temporal_model)

In [None]:
# Compile the fused model for integer class labels, tracking top-1 and top-k accuracy.
optimizer = Adam()
two_stream_model.compile(
    optimizer=optimizer,
    loss=keras.losses.sparse_categorical_crossentropy,
    metrics=['sparse_categorical_accuracy', 'sparse_top_k_categorical_accuracy'],
)

In [None]:
# Validation dataset used by the batch generator.
data_val = DataSet(num_of_snip, opt_flow_len, image_shape, 'val', batch_size)

In [None]:
# The generator never terminates; `n_batch` steps correspond to one pass over the split.
steps = data_val.n_batch
validation_generator = data_val.val_generator()

In [None]:
# NOTE(review): this fits the fused model on the *validation* generator, so any
# metric later computed with the same generator is measured on data the model
# has been trained on - confirm this is intended.
# NOTE(review): `fit_generator` is deprecated in recent TensorFlow releases in
# favour of `fit`; verify against the installed version.
two_stream_model.fit_generator(generator=validation_generator, steps_per_epoch=steps, epochs = 10)

In [None]:
# Evaluate over exactly one pass of the validation split. The generator cycles
# endlessly, so a fixed step count larger than `steps` would count some batches
# more often than others and bias the metrics.
two_stream_model.evaluate_generator(validation_generator, verbose = 1, steps = steps)

Dictionaries mapping label number to label name, and vice versa

In [31]:
# Index -> class name, numbering the entries of the raw-frames folder in sorted order.
labels = dict(enumerate(sorted(os.listdir("/content/data/hmdb51/rawframes"))))

# Class name -> index.
labels_rev = {name: index for index, name in labels.items()}

Test

In [None]:
def _five_best_names(scores):
  """Names of the five highest-scoring classes, in ascending score order."""
  return [labels[n] for n in np.argsort(scores, axis=1)[:,-5:].tolist()[0]]

# `img` holds one batch: ([spatial inputs, temporal inputs], targets).
(batch_frames, batch_flows), batch_targets = img

for i in range(batch_size):
  # Each sample is fed alone, as a batch of size one.
  frame_input = np.expand_dims(batch_frames[i], axis=0)
  flow_input = np.expand_dims(batch_flows[i], axis=0)

  res = two_stream_model([frame_input, flow_input])
  pred = np.argmax(res)
  label = batch_targets[i]
  top5 = _five_best_names(res)

  # Per-stream predictions, to compare against the fused one.
  res_spat = spatial_model(frame_input)
  res_temp = temporal_model(flow_input)
  top5_spat = _five_best_names(res_spat)
  top5_temp = _five_best_names(res_temp)

  print(f"correct label: {labels[label]}, predicted: {labels[pred]},     top5_fused: {top5} \nspat: {top5_spat},      temp: {top5_temp}\n")

# Predict a video

#### Spatial Model prediction over all frames

In [None]:
# Frame directory of the video to classify.
# NOTE(review): `video_path` is reassigned by later cells before it is first read,
# so on a top-to-bottom run this value is never used.
video_path = "/content/data/hmdb51/rawframes/climb/Amazing_Wall_Climber_(Must_be_Seen_to_Be_Believed!)_climb_f_cm_np1_ba_bad_1"

In [15]:
# Class-index table from classInd.txt, with the index column shifted down by one
# and used as the DataFrame index.
class_ind = pd.read_csv("./data/hmdb51/annotations/classInd.txt", sep=" ", header=None)
class_ind.columns = ["class_ind", "class"]
class_ind = class_ind.assign(class_ind=class_ind["class_ind"] - 1).set_index("class_ind")

# Pristine copy kept so the table can be restored later.
class_ind_copy = class_ind.copy()

In [17]:
# Spatial-stream model used for the single-video prediction.
# NOTE(review): this loads the same file as `spatial_model`, so the weights are
# held in memory twice - consider reusing the already loaded model.
resnet_model = keras.models.load_model('/content/gdrive/MyDrive/Deep learning project Pasinato Carbone Scuri/spatial_model_finetuned_resnet.hdf5')

In [10]:
def find_paths(partition, type_frame):
    """Collect, for every video of a split, the paths of its frames of one kind.

    Args:
        partition: 'train' or 'val'; selects the split-1 annotation file.
        type_frame: filename prefix of the frames to keep (e.g. 'img').

    Returns:
        Tuple ``(paths, classes)`` of parallel lists with one entry per video:
        ``paths[i]`` is the name-sorted list of matching frame paths and
        ``classes[i]`` repeats the video's class id once per frame.

    Raises:
        ValueError: if ``partition`` is neither 'train' nor 'val'.
    """
    annotation_files = {
        'train': 'hmdb51_train_split_1_rawframes.txt',
        'val': 'hmdb51_val_split_1_rawframes.txt',
    }
    if partition not in annotation_files:
        raise ValueError("invalid partition")

    video_list = pd.read_csv(os.path.join(path_annotations, annotation_files[partition]),
                             sep=" ", header=None)
    video_list.columns = ["path", "num_frames_tot", "class"]

    paths = []
    classes = []
    for _, row in video_list.iterrows():
        # Annotation paths may use Windows separators.
        video_dir = row['path'].replace("\\", "/")
        # os.listdir returns entries in arbitrary order; sort so that the frame
        # order is deterministic.
        frame_names = sorted(name for name in os.listdir(os.path.join(f'./{video_dir}'))
                             if name.startswith(f'{type_frame}'))

        paths.append([os.path.join('./', video_dir, name) for name in frame_names])
        classes.append([row['class']] * len(frame_names))

    return (paths, classes)

In [12]:
# The per-frame class ids get their own name: binding them to `labels` would
# overwrite the index -> class-name dictionary that the prediction cells look up.
filenames_img, labels_img = find_paths(partition='val', type_frame='img')

In [40]:
# Frame directory of the video to classify (golf example).
# NOTE(review): overwritten by the next cell on a top-to-bottom run.
video_path = "/content/data/hmdb51/rawframes/golf/Natalie_Gulbis_1_golf_f_cm_np1_le_med_0"

In [55]:
# Frame directory of the video to classify (ride_bike example).
video_path = "/content/data/hmdb51/rawframes/ride_bike/Justin_lernt_Fahrrad_fahren_ride_bike_f_cm_np1_fr_med_0"

In [56]:
# Spatial-stream prediction for the video in `video_path`.

from imgaug import augmenters as iaa


def predict_video_spat(video_path):
    """Average the spatial model's class scores over every RGB frame of a video.

    Frames are converted from BGR to RGB, resized to (img_height, img_width) and
    fed in the 0-255 range (no rescaling, no mean subtraction). The authors'
    original note reports worse results without the BGR->RGB conversion and
    with mean subtraction.

    Args:
        video_path: directory containing the extracted ``img*`` frames.

    Returns:
        float64 array of shape (1, n_classes): the model's per-class scores
        averaged over all frames.

    Raises:
        FileNotFoundError: if the directory holds no ``img*`` frame or one of
            them cannot be read.
    """
    frame_names = sorted(name for name in os.listdir(video_path) if name.startswith("img"))
    if not frame_names:
        raise FileNotFoundError(f"no 'img*' frames found in {video_path}")

    rgb_frames = []
    for name in frame_names:
        frame = cv2.imread(f"{video_path}/{name}", cv2.IMREAD_UNCHANGED)
        if frame is None:
            raise FileNotFoundError(f"could not read image: {video_path}/{name}")
        rgb_frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

    # A 2-tuple passed to iaa.Resize is a sampling range, not (height, width);
    # the dict form states both dimensions explicitly.
    augmenter = iaa.Sequential([
        iaa.Resize({"height": img_height, "width": img_width})
    ])
    spatial_batch = np.array(augmenter.augment_images(rgb_frames), dtype=np.float32)

    # Any prediction failure propagates: continuing would reuse a stale or
    # undefined prediction.
    spatial_pred = resnet_model.predict(spatial_batch)

    # Mean over the frame axis, accumulated in float64.
    return np.mean(spatial_pred, axis=0, dtype=np.float64, keepdims=True)


results_spat = predict_video_spat(video_path)

#### Temporal model prediction over all frames

In [49]:
# Given a video, build one stacked optical flow per starting frame, feed each
# stack to the temporal model and average the predictions.

def predict_video_temp(video_path, opt_flow_len=10, image_size=(224, 224), model=None):
  """Average the temporal model's class scores over all flow stacks of a video.

  For every starting frame a stack of ``2 * opt_flow_len`` flow images (x and y
  interleaved) is mean-subtracted, divided by 255, resized and arranged
  channels-last before being passed to the model. Iteration stops at the first
  starting frame whose stack is incomplete.

  Args:
    video_path: directory containing ``img*``, ``flow_x_*`` and ``flow_y_*`` frames.
    opt_flow_len: number of consecutive flow frames per stack.
    image_size: size passed to ``cv2.resize`` (OpenCV's dsize order, i.e. (width, height)).
    model: model to use; defaults to the notebook-level ``temporal_model``.

  Returns:
    float64 array with the shape of a single model prediction, holding the mean
    over all complete stacks.

  Raises:
    ValueError: if no complete optical-flow stack could be built.
  """
  if model is None:
    model = temporal_model

  n_frames = len([name for name in os.listdir(video_path) if name.startswith("img")])
  n_channels = 2 * opt_flow_len

  results = None
  count = 0
  for n_frame in range(1, n_frames - opt_flow_len):
    opt_flow_stack = list()
    for i in range(opt_flow_len):
      for lettera in ["x", "y"]:
        img = cv2.imread(os.path.join(f'{video_path}' + f'/flow_{lettera}_' + str("%05d"%(n_frame + i)) + '.jpg'), cv2.IMREAD_GRAYSCALE)
        if img is None:
          # Missing flow image: the stack stays short and is rejected below.
          continue

        img = img - np.mean(img)  # mean subtraction
        img = img / 255.          # normalize pixels
        img = cv2.resize(img, image_size)
        opt_flow_stack.append(img)

    # Checked on the list so that an empty stack stops cleanly as well.
    if len(opt_flow_stack) != n_channels:
      break

    # (channels, rows, cols) -> (1, rows, cols, channels)
    batch = np.expand_dims(np.moveaxis(np.array(opt_flow_stack), 0, -1), axis=0)
    prediction = model.predict(batch)
    if results is None:
      results = np.zeros(prediction.shape, dtype=np.float64)
    results += prediction
    # Counted only after a successful prediction so the mean is not diluted
    # by the iteration that breaks out of the loop.
    count += 1

  if count == 0:
    raise ValueError(f"no complete optical-flow stack could be built from {video_path}")

  return results/count

In [57]:
# Temporal-stream scores for the video in `video_path`, averaged over its flow stacks.
results_temp = predict_video_temp(video_path)

In [51]:
# Late fusion: average the two streams' scores and pick the best class.
pred = np.argmax(0.5 * (results_spat + results_temp))
labels[pred]

'golf'

In [52]:
# For each stream, print its five best classes (ascending score) with their scores.
for elemento in [results_spat, results_temp]:
  stringa = "spat" if elemento is results_spat else "temp"
  for i in np.argsort(elemento, axis = 1)[0, -5:]:
    print(f"{stringa}: {labels[i]}: {elemento[0][i]}")

spat: handstand: 0.000320975007933783
spat: kick_ball: 0.0025262375589692157
spat: catch: 0.004053326523492693
spat: shoot_bow: 0.008194903623927845
spat: golf: 0.9838962849275565
temp: pick: 0.020356225293861437
temp: walk: 0.02042131956706206
temp: handstand: 0.050235423372526254
temp: climb: 0.06163918194350637
temp: golf: 0.6632648305169173


In [58]:
# Late fusion: average the two streams' scores and pick the best class.
pred = np.argmax(0.5 * (results_spat + results_temp))
labels[pred]

'ride_bike'

In [59]:
# For each stream, print its five best classes (ascending score) with their scores.
for elemento in [results_spat, results_temp]:
  stringa = "spat" if elemento is results_spat else "temp"
  for i in np.argsort(elemento, axis = 1)[0, -5:]:
    print(f"{stringa}: {labels[i]}: {elemento[0][i]}")

spat: dive: 0.007335681733763621
spat: ride_horse: 0.009148698536668175
spat: draw_sword: 0.011017633753251132
spat: push: 0.17185946898280235
spat: ride_bike: 0.7562838335521519
temp: climb: 0.05316095326122814
temp: ride_horse: 0.05339219504832358
temp: somersault: 0.06613920255145733
temp: cartwheel: 0.06840309912365848
temp: catch: 0.19249734054387047


Finding the label in the given path

In [None]:
# The class name is the path component that follows "rawframes".
marker = "rawframes"
init = video_path.find(marker)
if init == -1:
  raise ValueError(f"'{marker}' not found in video path: {video_path}")

# Offsets are taken from `init` rather than from a hard-coded prefix length, so
# any dataset root works; both '/' and '\\' are accepted as separators.
key = re.split(r"[\\/]", video_path[init + len(marker) + 1:], maxsplit=1)[0]
label = labels_rev[key]