In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

import torchvision
import torchvision.transforms as transforms
from torchvision.models import resnet50, ResNet50_Weights

import numpy as np
from tensorflow import keras

from tqdm import tqdm
import pandas as pd
import cv2
import os

In [2]:
# Hyper-parameter
hidden_size = 64
num_classes = 5
num_epochs = 10
batch_size = 50
learning_rate = 0.001

input_size = 300
sequence_length = 30
num_layers = 2
img_size = 224

In [3]:
!wget -q https://git.io/JGc31 -O ucf101_top5.tar.gz
!tar xf ucf101_top5.tar.gz

In [4]:
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

print(f"Total videos for training: {len(train_df)}")
print(f"Total videos for testing: {len(test_df)}")

train_df.sample(10)

Total videos for training: 594
Total videos for testing: 224


Unnamed: 0,video_name,tag
411,v_ShavingBeard_g16_c03.avi,ShavingBeard
503,v_TennisSwing_g11_c07.avi,TennisSwing
401,v_ShavingBeard_g14_c06.avi,ShavingBeard
468,v_ShavingBeard_g24_c05.avi,ShavingBeard
218,v_PlayingCello_g23_c02.avi,PlayingCello
439,v_ShavingBeard_g20_c04.avi,ShavingBeard
207,v_PlayingCello_g21_c03.avi,PlayingCello
151,v_PlayingCello_g12_c07.avi,PlayingCello
316,v_Punch_g19_c04.avi,Punch
442,v_ShavingBeard_g20_c07.avi,ShavingBeard


In [5]:
def crop_center_square(frame):
    y, x = frame.shape[0:2]
    min_dim = min(y, x)
    start_x = (x // 2) - (min_dim // 2)
    start_y = (y // 2) - (min_dim // 2)
    return frame[start_y : start_y + min_dim, start_x : start_x + min_dim]


def load_video(path, max_frames=0, resize=(img_size, img_size)):
    cap = cv2.VideoCapture(path)
    frames = []
    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            frame = crop_center_square(frame)
            frame = cv2.resize(frame, resize)
            frame = frame[:, :, [2, 1, 0]]
            frames.append(frame)

            if len(frames) == max_frames:
                break
    finally:
        cap.release()
    return np.array(frames)

In [6]:
label_processor = keras.layers.StringLookup(
    num_oov_indices=0, vocabulary=np.unique(train_df["tag"])
)
print(label_processor.get_vocabulary())

['CricketShot', 'PlayingCello', 'Punch', 'ShavingBeard', 'TennisSwing']


In [7]:
def prepare_all_videos(df, root_dir):
    num_samples = len(df)
    video_paths = df["video_name"].values.tolist()
    labels = df["tag"].values
    labels = label_processor(labels[..., None]).numpy()

    # `frame_masks` and `frame_features` are what we will feed to our sequence model.
    # `frame_masks` will contain a bunch of booleans denoting if a timestep is
    # masked with padding or not.
    #frame_masks = np.zeros(shape=(num_samples, sequence_length), dtype="bool")
    frame_length = np.zeros(num_samples)
    frame_features = np.zeros(
        shape=(num_samples, sequence_length, 3, img_size, img_size), dtype="uint8"
    )

    # For each video.
    for idx, path in tqdm(enumerate(video_paths), total=len(video_paths)):
        # Gather all its frames and add a batch dimension.
        frames = load_video(os.path.join(root_dir, path))
        frames = frames[None, ...]

        # Initialize placeholders to store the masks and features of the current video.
        temp_frame_mask = np.zeros(shape=(1, sequence_length,), dtype="bool")
        temp_frame_features = np.zeros(
            shape=(1, sequence_length, 3, img_size, img_size), dtype="uint8"
        )

        # Extract features from the frames of the current video.
        for i, batch in enumerate(frames):
            video_length = batch.shape[0]
            if video_length < sequence_length:
              print('chuy')
            length = min(sequence_length, video_length)
            # for j in range(length):
            #     temp_frame_features[i, j, :] = cnn_net(preprocess(torch.tensor(batch[None, j, :].transpose(0,3,1,2)))).detach().numpy()
            temp_frame_features[i, 0:length, :] = np.concatenate([batch[None, ii, :].transpose(0,3,1,2) for ii in range(length)],0)
            #temp_frame_mask[i, :length] = 1  # 1 = not masked, 0 = masked

        frame_features[idx,] = temp_frame_features.squeeze()
        frame_length[idx] = length
    return (frame_features, frame_length), labels

train_data, train_labels = prepare_all_videos(train_df, "train")
test_data, test_labels = prepare_all_videos(test_df, "test")

print(f"Frame features in train set: {train_data[0].shape}")
print(f"Frame length in train set: {train_data[1].shape}")

100%|██████████| 594/594 [01:53<00:00,  5.24it/s]
100%|██████████| 224/224 [00:35<00:00,  6.24it/s]

Frame features in train set: (594, 30, 3, 224, 224)
Frame length in train set: (594,)





In [8]:
class MyDataset(Dataset):
  def __init__(self, x, y, z):
    self.data = x
    self.labels = y
    self.lengths = z

  def __len__(self):
    return len(self.labels)
  
  def __getitem__(self, index):
    return self.data[index], self.labels[index], self.lengths[index]

In [9]:
train_dataset = MyDataset(train_data[0],train_labels,train_data[1])
test_dataset = MyDataset(test_data[0],test_labels,train_data[1])

In [10]:
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=2)
val_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=2)

In [11]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
class CNNLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super(CNNLSTM, self).__init__()

        self.weights = ResNet50_Weights.DEFAULT
        self.preprocess = self.weights.transforms()
        self.pretrained_cnn = resnet50(weights=self.weights)

        #self.resnet.fc = nn.Sequential(nn.Linear(self.resnet.fc.in_features, 300))
        self.pretrained_cnn.fc = nn.Linear(self.pretrained_cnn.fc.in_features, input_size)

        self.num_layers =  num_layers
        self.hidden_size = hidden_size


        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)

        # x -> (batch_size, sequence_length, input_size) because batch_size = true
        self.fc1 = nn.Linear(hidden_size, 32)
        self.fc2 = nn.Linear(32, num_classes)
        self.dropout = nn.Dropout(0.40)

    def forward(self, x):
        hidden = None
        for i in range(x.size(1)):
            with torch.no_grad():
                out = self.pretrained_cnn(self.preprocess(x[:,i,:,:,:]))
            out, hidden = self.lstm(out.unsqueeze(1), hidden)
        
        # out -> (batch_size, sequence_length, hidden_size) because batch_size = true
        out = out[:, -1, :] # only the last time step

        out = F.relu(self.fc1(out))
        out = self.fc2(self.dropout(out))

        return out

net = CNNLSTM(input_size, hidden_size, num_layers, num_classes).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(net.parameters(), lr=learning_rate)
# optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.5)

In [12]:
#net = net.float()
net.train()
for epoch in range(num_epochs):  # loop over the dataset multiple times
    print("\nStarting epoch {}".format(epoch+1))
    
    total = 0
    running_loss = 0.0

    # to make a beautiful progress bar
    loader = tqdm(enumerate(train_loader), total=len(train_loader))
    for i, data in loader:
        # get the data points
        inputs, labels,_ = data
        inputs, labels = inputs.to(device), labels.to(device)
        # zero the parameter gradients (else, they are accumulated)
        optimizer.zero_grad()

        # forward the data through the network
        outputs = net(inputs)
        # calculate the loss given the output of the network and the target labels
        loss = criterion(outputs, labels.squeeze())
        # calculate the gradients of the network w.r.t. its parameters
        loss.backward()
        # Let the optimiser take an optimization step using the calculated gradients
        optimizer.step()
        
        running_loss += loss
        total += outputs.size(0)

        loader.set_description("loss: {:.5f}".format(running_loss/total))

print('Finished Training')


Starting epoch 1


loss: 0.03233: 100%|██████████| 12/12 [00:59<00:00,  4.94s/it]


Starting epoch 2



loss: 0.02950: 100%|██████████| 12/12 [00:56<00:00,  4.73s/it]


Starting epoch 3



loss: 0.02331: 100%|██████████| 12/12 [00:57<00:00,  4.83s/it]


Starting epoch 4



loss: 0.01668: 100%|██████████| 12/12 [00:59<00:00,  4.92s/it]


Starting epoch 5



loss: 0.01104: 100%|██████████| 12/12 [00:59<00:00,  4.98s/it]


Starting epoch 6



loss: 0.00751: 100%|██████████| 12/12 [01:00<00:00,  5.05s/it]


Starting epoch 7



loss: 0.00548: 100%|██████████| 12/12 [01:01<00:00,  5.08s/it]


Starting epoch 8



loss: 0.00358: 100%|██████████| 12/12 [01:01<00:00,  5.12s/it]


Starting epoch 9



loss: 0.00271: 100%|██████████| 12/12 [01:01<00:00,  5.14s/it]


Starting epoch 10



loss: 0.00182: 100%|██████████| 12/12 [01:01<00:00,  5.13s/it]

Finished Training





In [15]:
net.eval()
class Accuracy:
    """A class to keep track of the accuracy while training"""
    def __init__(self):
        self.correct = 0
        self.total = 0
        
    def reset(self):
        """Resets the internal state"""
        self.correct = 0
        self.total = 0
        
    def update(self, output, labels):
        """
        Updates the internal state to later compute the overall accuracy
        
        output: the output of the network for a batch
        labels: the target labels
        """
        _, predicted = torch.max(output.data, 1) # predicted now contains the predicted class index/label
        
        self.total += labels.size(0)
        self.correct += (predicted == labels).sum().item() # .item() gets the number, not the tensor

    def compute(self):
        return self.correct/self.total

accuracy = Accuracy()

accuracy.reset()
# Gradients are calculated on the forward pass for every iteration.
# As we do not need gradients now, we can disable the calculation.
with torch.no_grad():
    for data in tqdm(train_loader):
        # get the data points
        inputs, labels,_ = data
        inputs, labels = inputs.to(device), labels.to(device)
        # forward the data through the network
        outputs = net(inputs)
        
        accuracy.update(outputs, labels.squeeze())

print("Training Accuracy: {:.2f}%".format(100 * accuracy.compute()))

accuracy.reset()        
with torch.no_grad():
    for data in tqdm(val_loader):
        # get the data points
        inputs, labels,_ = data
        inputs, labels = inputs.to(device), labels.to(device)
        # forward the data through the network
        outputs = net(inputs)
        
        accuracy.update(outputs, labels.squeeze())
        
print("\nTesting Accuracy: {:.2f}%".format(100 * accuracy.compute()))

100%|██████████| 12/12 [00:52<00:00,  4.34s/it]


Training Accuracy: 99.49%


100%|██████████| 5/5 [00:20<00:00,  4.12s/it]


Testing Accuracy: 88.84%



