In [1]:
import os
import numpy as np
from PIL import Image
from torch.utils import data
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.models as models
import torchvision.transforms as transforms
from tqdm import tqdm
import cv2
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Dataset locations: each split directory holds one sub-directory per clip.
train_path = '/raid/dingwanli/data/model_data_3Ds/dataset_non_bm_112f/train/'
validate_path = '/raid/dingwanli/data/model_data_3Ds/dataset_non_bm_112f/validate/'

# Directory intended for saved PyTorch models.
save_model_path = "E:/project_study/video-classification-master/Conv3D/output_model/"

# Clip geometry: maximum frames kept per clip and the size every frame is resized to.
MAX_frame = 23
img_height = img_width = 250

# Data loader batch size.
batch_size = 6

# 3D CNN parameters
fc_hidden1 = 256
fc_hidden2 = 256
dropout = 0.0  # dropout probability

# training parameters
epochs = 50
learning_rate = 1e-4
log_interval = 10  # print a progress line every `log_interval` batches

# NOTE(review): `epoch` duplicates `epochs` and the same name is reused as the
# loop variable of the training loop; kept only because other cells may read it.
epoch = 50

In [4]:
class Dataset_3DCNN(data.Dataset):
    """Dataset of image-frame sequences stored as one sub-directory per sample.

    Every entry of ``data_path`` is treated as one sample whose files are the
    frames of a clip. The label is 1 when the part of a frame file name before
    the first ``_`` is ``"A"`` and 0 otherwise.
    """

    def __init__(self, data_path, transform=None):
        """Remember the dataset root and list its sample directories once.

        Args:
            data_path: directory containing one sub-directory per sample.
            transform: optional callable applied to every PIL frame; it must
                return a tensor so that the frames can be stacked.
        """
        self.data_path = data_path
        self.transform = transform
        # List once (sorted, so the index -> sample mapping is deterministic)
        # instead of re-scanning the directory on every __len__/__getitem__ call.
        self.one_sequence_dir = sorted(os.listdir(data_path))

    def __len__(self):
        "Denotes the total number of samples"
        return len(self.one_sequence_dir)

    @staticmethod
    def _frame_sort_key(file_name):
        """Sort key for frame files based on the text after the last ``_``.

        Ordering by (length, text) places ``_2`` before ``_10`` without
        requiring the suffix to parse as an integer.
        """
        suffix = file_name.split('.')[0].split('_')[-1]
        return len(suffix), suffix

    @staticmethod
    def _select_frame_names(frame_names, max_frames):
        """Pick at most ``max_frames`` names, evenly spaced over ``frame_names``.

        Sequences that already fit are returned unchanged (as a copy). Longer
        sequences are sampled with a fractional stride, and the final pick is
        forced to the last frame.
        """
        total = len(frame_names)
        if total <= max_frames:
            return list(frame_names)

        stride = total / max_frames
        selected = []
        for step in range(max_frames):
            if int(stride * (step + 1)) >= total:
                selected.append(frame_names[-1])
            else:
                selected.append(frame_names[int(stride * step)])
        return selected

    def read_images(self, sub_path, use_transform=None):
        """Load the frames of one sample directory and stack them on dim 0.

        Args:
            sub_path: directory holding the frame image files of one sample.
            use_transform: optional callable applied to every PIL frame.

        Returns:
            Tensor with the selected frames stacked along the first dimension.
            Directories with fewer than ``MAX_frame`` files yield a shorter
            first dimension (no padding is applied).
        """
        frame_names = sorted(os.listdir(sub_path), key=self._frame_sort_key)
        selected_names = self._select_frame_names(frame_names, MAX_frame)

        frames = []
        for frame_name in selected_names:
            frame_path = os.path.join(sub_path, frame_name)
            # The context manager closes the underlying file as soon as the
            # frame has been converted, so file handles do not accumulate.
            with Image.open(frame_path) as image:
                frame = use_transform(image) if use_transform is not None else image
            frames.append(frame)

        return torch.stack(frames, dim=0)

    def __getitem__(self, index):
        """Return ``(X, y)`` for the sample at ``index``.

        ``X`` is the stacked frame tensor with its first two dimensions
        swapped; with the transform used in this notebook that turns
        (frames, channels, H, W) into (channels, frames, H, W).
        ``y`` is a float tensor of shape (1,) holding the binary label.
        """
        selected_sequence = self.one_sequence_dir[index]
        one_sequence_path = os.path.join(self.data_path, selected_sequence) + '/'

        X = self.read_images(one_sequence_path, self.transform)
        X = torch.transpose(X, 0, 1)

        # The class is encoded in the file-name prefix of the frames.
        first_file = os.listdir(one_sequence_path)[0]
        label = 1.0 if first_file.split("_")[0] == "A" else 0.0
        y = torch.tensor([label])

        return X, y


In [5]:
# Per-frame preprocessing: resize, convert to a tensor, then normalize with mean 0.5 / std 0.5.
transform = transforms.Compose([transforms.Resize([img_height, img_width]),
                                transforms.ToTensor(),
                                transforms.Normalize(mean=[0.5], std=[0.5])])

use_cuda = torch.cuda.is_available()                   # check if GPU exists
device = torch.device("cuda" if use_cuda else "cpu")   # use CPU or GPU

# batch_size and shuffle must apply on every device; only pin_memory depends on CUDA.
# (Previously the whole dict collapsed to {} on CPU, silently giving batch_size=1
# and no shuffling.)
params = {'batch_size': batch_size, 'shuffle': True, 'num_workers': 0, 'pin_memory': use_cuda}

train_set = Dataset_3DCNN(train_path, transform=transform)
valid_set = Dataset_3DCNN(validate_path, transform=transform)

train_loader = data.DataLoader(train_set, **params)
valid_loader = data.DataLoader(valid_set, **params)

In [17]:
# Pull a single batch to confirm the shapes produced by the loader.
dataiter = iter(train_loader)
# Use the built-in next(): the Python 2 style `.next()` method is not provided
# by the loader iterators of recent PyTorch releases.
images, labels = next(dataiter)
print(images.shape)
print(labels.shape)
print(labels)

torch.Size([6, 3, 23, 250, 250])
torch.Size([6, 1])
tensor([[1.],
        [1.],
        [0.],
        [0.],
        [1.],
        [0.]])


In [12]:
def conv3D_output_size(img_size, padding, kernel_size, stride):
    """Compute the output size of a 3D convolution along its three axes.

    Each argument is indexable with one entry per axis; only the first three
    entries are read. Per axis the result is
    ``floor((size + 2 * pad - (kernel - 1) - 1) / stride + 1)``.

    Returns:
        Tuple of three NumPy integers, one per axis.
    """
    def _axis_size(axis):
        # Extent left once padding is added and the kernel footprint removed.
        span = img_size[axis] + 2 * padding[axis] - (kernel_size[axis] - 1) - 1
        return np.floor(span / stride[axis] + 1).astype(int)

    return tuple(_axis_size(axis) for axis in range(3))

In [13]:
class CNN3D(nn.Module):
    """Two 3D-convolution stages followed by a three-layer fully connected head.

    Each convolution stage is Conv3d -> BatchNorm3d -> ReLU -> Dropout3d. The
    flattened features pass through fc1, fc2, a dropout and fc3, and the result
    is squashed by a sigmoid, so the output lies in (0, 1) with shape
    (batch, num_classes).

    Args:
        t_dim: number of frames per input clip.
        img_x, img_y: spatial size of each frame.
        drop_p: dropout probability used by every dropout in the network.
        fc_hidden1, fc_hidden2: widths of the two hidden fully connected layers.
        num_classes: number of output units.
    """

    def __init__(self, t_dim=23, img_x=90, img_y=120, drop_p=0.2, fc_hidden1=256, fc_hidden2=128, num_classes=1):
        super().__init__()

        # input clip dimensions
        self.t_dim, self.img_x, self.img_y = t_dim, img_x, img_y
        # head configuration
        self.fc_hidden1 = fc_hidden1
        self.fc_hidden2 = fc_hidden2
        self.drop_p = drop_p
        self.num_classes = num_classes

        # convolution hyper-parameters (stage 1, stage 2)
        self.ch1, self.ch2 = 32, 48                 # output channels
        self.k1, self.k2 = (5, 5, 5), (3, 3, 3)     # kernel sizes
        self.s1, self.s2 = (2, 2, 2), (2, 2, 2)     # strides
        self.pd1, self.pd2 = (0, 0, 0), (0, 0, 0)   # paddings

        # feature-map sizes after each stage, needed to size the first linear layer
        input_shape = (self.t_dim, self.img_x, self.img_y)
        self.conv1_outshape = conv3D_output_size(input_shape, self.pd1, self.k1, self.s1)
        self.conv2_outshape = conv3D_output_size(self.conv1_outshape, self.pd2, self.k2, self.s2)

        self.conv1 = nn.Conv3d(3, self.ch1, self.k1, stride=self.s1, padding=self.pd1)
        self.bn1 = nn.BatchNorm3d(self.ch1)
        self.conv2 = nn.Conv3d(self.ch1, self.ch2, self.k2, stride=self.s2, padding=self.pd2)
        self.bn2 = nn.BatchNorm3d(self.ch2)
        self.relu = nn.ReLU(inplace=True)
        self.drop = nn.Dropout3d(self.drop_p)
        self.pool = nn.MaxPool3d(2)  # created but not used in forward()

        out_t, out_x, out_y = self.conv2_outshape
        flat_features = self.ch2 * out_t * out_x * out_y
        self.fc1 = nn.Linear(flat_features, self.fc_hidden1)
        self.fc2 = nn.Linear(self.fc_hidden1, self.fc_hidden2)
        self.fc3 = nn.Linear(self.fc_hidden2, self.num_classes)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x_3d):
        """Map a clip batch to sigmoid scores of shape (batch, num_classes)."""
        # convolution stage 1
        feats = self.drop(self.relu(self.bn1(self.conv1(x_3d))))
        # convolution stage 2
        feats = self.drop(self.relu(self.bn2(self.conv2(feats))))

        # fully connected head on the flattened features
        flat = feats.view(feats.size(0), -1)
        hidden = F.relu(self.fc1(flat))
        hidden = F.relu(self.fc2(hidden))
        hidden = F.dropout(hidden, p=self.drop_p, training=self.training)

        return self.sigmoid(self.fc3(hidden))

In [14]:
# Build the network from the notebook configuration and move it to the target device.
cnn3d = CNN3D(t_dim=MAX_frame, img_x=img_height, img_y=img_width,
              drop_p=dropout, fc_hidden1=fc_hidden1,  fc_hidden2=fc_hidden2, num_classes=1).to(device)

# Only wrap in DataParallel when there is more than one GPU to split batches across;
# with a single GPU the wrapper has nothing to parallelise.
if torch.cuda.device_count() > 1:
    print("Using", torch.cuda.device_count(), "GPUs!")
    cnn3d = nn.DataParallel(cnn3d)

Using 4 GPUs!


In [21]:
# Shape sanity check: push a random batch of two clips through the model.
# The clip size comes from the configuration so it always matches the model.
x = torch.randn(2, 3, MAX_frame, img_height, img_width).to(device)

# Run in eval mode without gradients so the random input neither updates the
# BatchNorm running statistics nor builds an autograd graph; then restore the mode.
was_training = cnn3d.training
cnn3d.eval()
with torch.no_grad():
    logits = cnn3d(x)  # (batch, num_classes) -> (2, 1)
cnn3d.train(was_training)
print(logits.shape)

torch.Size([2, 1])


In [14]:
# Inspect the first training batch: raw tensors, model output and thresholded predictions.
model = cnn3d
for batch_idx, (X, y) in enumerate(train_loader):
    print("X.shape: ", X.size())
    print("y: ", y)
    print("y.shape: ", y.shape, end="\n\n")

    X = X.to(device)
    y = y.to(device)
    output = model(X)
    print("output： ", output)
    print("output.shape: ", output.shape, end="\n\n")

    # Scores of at least 0.5 count as the positive class.
    pred = output.ge(0.5)
    print("pred: ", pred)

    pred_squeeze = pred.squeeze()
    print("pred_squeeze: ", pred_squeeze)
    break  # only the first batch is inspected

X.shape:  torch.Size([4, 3, 23, 250, 250])
y:  tensor([[0.],
        [0.],
        [0.],
        [0.]])
y.shape:  torch.Size([4, 1])

output：  tensor([[0.0005],
        [0.0003],
        [0.0002],
        [0.0001]], device='cuda:0', grad_fn=<SigmoidBackward>)
output.shape:  torch.Size([4, 1])

pred:  tensor([[0],
        [0],
        [0],
        [0]], device='cuda:0', dtype=torch.uint8)
pred_squeeze:  tensor([0, 0, 0, 0], device='cuda:0', dtype=torch.uint8)


In [12]:
# print(y.cpu().data.squeeze())
# print(output.cpu().data.squeeze())
# y_pred = torch.max(output, 1)[1]
# print(y_pred)

In [9]:
def train(log_interval, model, device, train_loader, optimizer, epoch):
    """Run one training epoch and return the per-batch losses and accuracies.

    Args:
        log_interval: print a progress line every ``log_interval`` batches.
        model: network whose output lies in [0, 1] and has the same shape as
            the targets (it is fed to ``F.binary_cross_entropy``).
        device: device the batches are moved to.
        train_loader: iterable of ``(X, y)`` batches; must expose ``dataset``
            and ``__len__`` for the progress line.
        optimizer: optimizer stepping the model parameters.
        epoch: zero-based epoch index, used only in the progress line.

    Returns:
        ``(losses, scores)``: two lists with one float per batch.
    """
    # set model as training mode
    model.train()

    losses = []
    scores = []
    N_count = 0   # counting total trained sample in one epoch
    for batch_idx, (X, y) in enumerate(train_loader):
        X, y = X.to(device), y.to(device)

        N_count += X.size(0)

        optimizer.zero_grad()
        output = model(X)  # output size = (batch, number of classes)

        loss = F.binary_cross_entropy(output, y)
        losses.append(loss.item())

        # Batch accuracy with a 0.5 decision threshold. Flattening with
        # reshape(-1) (rather than squeeze()) keeps a 1-D shape even when the
        # batch holds a single sample, which squeeze() would collapse to 0-d.
        with torch.no_grad():
            y_pred = output.ge(0.5)
            hits = y_pred.reshape(-1).to(y.dtype).eq(y.reshape(-1))
            step_score = hits.float().mean().item()
        scores.append(step_score)

        loss.backward()
        optimizer.step()

        # show information
        if (batch_idx + 1) % log_interval == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}, Accu: {:.2f}%'.format(
                epoch + 1, N_count, len(train_loader.dataset), 100. * (batch_idx + 1) / len(train_loader), loss.item(), 100 * step_score))

    return losses, scores


def validation(model, device, optimizer, test_loader):
    """Evaluate the model on a loader and return the mean loss and accuracy.

    Args:
        model: network whose output lies in [0, 1] and has the same shape as
            the targets (it is fed to ``F.binary_cross_entropy``).
        device: device the batches are moved to.
        optimizer: not used by the evaluation; kept so existing callers that
            pass it keep working.
        test_loader: iterable of ``(X, y)`` batches exposing ``dataset``.

    Returns:
        ``(test_loss, test_score)``: summed loss divided by the dataset size,
        and the fraction of predictions (threshold 0.5) equal to the targets.
    """
    # set model as testing mode
    model.eval()

    test_loss = 0
    all_y = []
    all_y_pred = []
    with torch.no_grad():
        for X, y in test_loader:
            # distribute data to device
            X, y = X.to(device), y.to(device)

            output = model(X)

            # sum up batch loss; averaged over the dataset after the loop
            test_loss += F.binary_cross_entropy(output, y, reduction='sum').item()

            # collect targets and thresholded predictions batch by batch
            all_y.append(y)
            all_y_pred.append(output.ge(0.5))

    test_loss /= len(test_loader.dataset)

    # Concatenate once and flatten with reshape(-1): unlike squeeze(), this
    # stays 1-D even when the whole set contains a single sample.
    all_y = torch.cat(all_y, dim=0)
    all_y_pred = torch.cat(all_y_pred, dim=0)
    hits = all_y_pred.reshape(-1).to(all_y.dtype).eq(all_y.reshape(-1))
    test_score = hits.float().mean().item()

    # show information
    print('\nTest set ({:d} samples): Average loss: {:.4f}, Accuracy: {:.2f}%\n'.format(len(all_y), test_loss, 100* test_score))

    return test_loss, test_score

In [22]:
optimizer = torch.optim.Adam(cnn3d.parameters(), lr=learning_rate)   # optimize all cnn parameters

# Directory receiving the per-epoch result arrays. Create it before training so a
# missing directory cannot make np.save fail only after a full epoch has run.
# NOTE(review): this is a Windows-style path while the dataset paths are POSIX;
# on a POSIX system it resolves relative to the working directory — confirm.
results_dir = 'E:/project_study/video-classification-master/Conv3D/outputs/'
os.makedirs(results_dir, exist_ok=True)

# record training process
epoch_train_losses = []
epoch_train_scores = []
epoch_test_losses = []
epoch_test_scores = []

# start training
for epoch in range(epochs):
    # train, test model
    train_losses, train_scores = train(log_interval, cnn3d, device, train_loader, optimizer, epoch)
    epoch_test_loss, epoch_test_score = validation(cnn3d, device, optimizer, valid_loader)

    # save results
    epoch_train_losses.append(train_losses)
    epoch_train_scores.append(train_scores)
    epoch_test_losses.append(epoch_test_loss)
    epoch_test_scores.append(epoch_test_score)

    # save all train test results; rewritten every epoch so an interrupted run
    # still leaves the history collected so far on disk
    A = np.array(epoch_train_losses)
    B = np.array(epoch_train_scores)
    C = np.array(epoch_test_losses)
    D = np.array(epoch_test_scores)
    np.save(os.path.join(results_dir, '3DCNN_epoch_training_losses.npy'), A)
    np.save(os.path.join(results_dir, '3DCNN_epoch_training_scores.npy'), B)
    np.save(os.path.join(results_dir, '3DCNN_epoch_test_loss.npy'), C)
    np.save(os.path.join(results_dir, '3DCNN_epoch_test_score.npy'), D)



In [23]:
# Learning curves: the training series uses the last batch of every epoch,
# the test series uses the per-epoch validation value.
fig = plt.figure(figsize=(10, 4))
epoch_axis = np.arange(1, epochs + 1)

# left panel: loss
ax_loss = fig.add_subplot(1, 2, 1)
ax_loss.plot(epoch_axis, A[:, -1])  # train loss (on epoch end)
ax_loss.plot(epoch_axis, C)         # test loss (on epoch end)
ax_loss.set_title("model loss")
ax_loss.set_xlabel('epochs')
ax_loss.set_ylabel('loss')
ax_loss.legend(['train', 'test'], loc="upper left")

# right panel: accuracy
ax_acc = fig.add_subplot(1, 2, 2)
ax_acc.plot(epoch_axis, B[:, -1])   # train accuracy (on epoch end)
ax_acc.plot(epoch_axis, D)          # test accuracy (on epoch end)
ax_acc.set_title("training scores")
ax_acc.set_xlabel('epochs')
ax_acc.set_ylabel('accuracy')
ax_acc.legend(['train', 'test'], loc="upper left")

# write the figure to disk, then display it
title = "./fig_UCF101_3DCNN.png"
fig.savefig(title, dpi=600)
plt.show()