In [1]:
import numpy as np
import os
import json
from moviepy.editor import VideoFileClip
from collections import defaultdict
import pickle

In [2]:
dataset_folder = 'D:\Dataset'
labels_folder = os.path.join(dataset_folder, 'labels')
videos_folder = os.path.join(dataset_folder, 'videos')

In [3]:
classes = ['normal', 'anomaly']

def class_labels_into_one_hot(labels):
    label = np.zeros(2, dtype=np.float32)
    if not labels:
        label[0] = 1
    else:
        label[1] = 1
    return label

In [4]:
def get_number_from_filename(filename):
    return int(file.split('_')[-1].split('.')[0])

def parse_time(string_time):
    splitted_string = string_time.split(':')
    minutes = int(splitted_string[0])
    seconds = int(splitted_string[1])
    return minutes, seconds

In [None]:
#load the dataset and preprocess it
dataset = defaultdict(dict)
for (dirpath, dirnames, filenames) in os.walk(labels_folder):
    for file in filenames:
        with open(os.path.join(labels_folder, file)) as f:
            number = get_number_from_filename(file)
            dataset[number]['labels'] = json.load(f)['labels']
            
video_length = []
for (dirpath, dirnames, filenames) in os.walk(videos_folder):
    for file in filenames:
        clip = VideoFileClip(os.path.join(videos_folder, file))
        number = get_number_from_filename(file)
        dataset[number]['duration'] = clip.duration
        dataset[number]['fps'] = clip.fps
        labels_in_frames = []
        for label in dataset[number]['labels']:
            start_time_min, start_time_sec = parse_time(label[1])
            end_time_min, end_time_sec = parse_time(label[2])
            start_frame = (start_time_min * 60 + start_time_sec) * clip.fps
            end_frame = (end_time_min * 60 + end_time_sec) * clip.fps
            labels_in_frames.append((label[0], start_frame, end_frame))
        dataset[number]['labels_in_frames'] = labels_in_frames

In [22]:
with open(os.path.join(labels_folder) + 'unifed_labels.pkl', 'wb') as f:
    pickle.dump(dataset, f)

In [5]:
#load the preprocessed dataset

with open(os.path.join(labels_folder) + 'unifed_labels2.pkl', 'rb') as f:
    loaded_dataset = pickle.load(f)

print(loaded_dataset)

{1: {'labels': [['robbery', '00:08', '00:11']], 'duration': 25.3, 'fps': 30.0, 'labels_in_frames': [('robbery', 240.0, 330.0)]}, 10: {'labels': [['fighting', '00:06', '00:14']], 'duration': 94.58, 'fps': 30.0, 'labels_in_frames': [('fighting', 180.0, 420.0)]}, 100: {'labels': [['arrest', '00:03', '01:02']], 'duration': 83.4, 'fps': 30.0, 'labels_in_frames': [('arrest', 90.0, 1860.0)]}, 1000: {'labels': [], 'duration': 59.59, 'fps': 30.0, 'labels_in_frames': []}, 1001: {'labels': [], 'duration': 27.53, 'fps': 30.0, 'labels_in_frames': []}, 1002: {'labels': [], 'duration': 128.81, 'fps': 30.0, 'labels_in_frames': []}, 1003: {'labels': [], 'duration': 94.12, 'fps': 30.0, 'labels_in_frames': []}, 1004: {'labels': [], 'duration': 16.09, 'fps': 30.0, 'labels_in_frames': []}, 1005: {'labels': [], 'duration': 17.0, 'fps': 30.0, 'labels_in_frames': []}, 1006: {'labels': [], 'duration': 49.9, 'fps': 30.0, 'labels_in_frames': []}, 1007: {'labels': [], 'duration': 21.16, 'fps': 30.0, 'labels_in_fr

In [6]:
#filter irellevent categories
for video, atrr in loaded_dataset.items():
    atrr['labels'] = [label for label in atrr['labels'] if label[0] != 'pouring gas' and label[0] != 'gun pointing']
    atrr['labels_in_frames'] = [label for label in atrr['labels_in_frames'] if label[0] != 'pouring gas' and label[0] != 'gun pointing']


In [7]:
counter_dict = defaultdict(int)
for video, attr in loaded_dataset.items():
    if not attr['labels']:
        counter_dict['normal'] += 1
    else:
        for label in attr['labels']:
            counter_dict[label[0]] += 1

print(counter_dict)

defaultdict(<class 'int'>, {'robbery': 139, 'fighting': 141, 'arrest': 33, 'normal': 678, 'burglary': 85, 'explosion': 40, 'stealing': 160, 'arsen': 57, 'shooting': 62, 'vandalism': 70})


In [8]:
video_times_per_category = defaultdict(int)
for video, attr in loaded_dataset.items():
    if not attr['labels']:
        video_times_per_category['normal'] += attr['duration']
    else:
        for label in attr['labels_in_frames']:
            fps = attr['fps']
            start_frame = int(label[1])
            end_frame = int(label[2])
            duration = (label[2] - label[1]) / fps
            video_times_per_category[label[0]] += duration
            
print(video_times_per_category)

defaultdict(<class 'int'>, {'robbery': 5457.0, 'fighting': 3934.0, 'arrest': 2391.0, 'normal': 263930.5199999997, 'burglary': 10111.0, 'explosion': 257.0, 'stealing': 6239.0, 'arsen': 4625.0, 'shooting': 217.0, 'vandalism': 1493.0})


In [9]:
#separate data to test and train
from sklearn.model_selection import train_test_split

train_indexes, test_indexes = train_test_split(range(len(loaded_dataset)), test_size=0.2)

train = {key : value['labels_in_frames'] for key,value in loaded_dataset.items() if key in train_indexes}
test = {key : value['labels_in_frames'] for key,value in loaded_dataset.items() if key in test_indexes}


print(train)

{1: [('robbery', 240.0, 330.0)], 10: [('fighting', 180.0, 420.0)], 100: [('arrest', 90.0, 1860.0)], 1000: [], 1001: [], 1002: [], 1003: [], 1004: [], 1005: [], 1006: [], 1007: [], 1008: [], 1009: [], 101: [('fighting', 1260.0, 1320.0)], 1011: [], 1013: [], 1014: [], 1015: [], 1016: [], 1017: [], 1018: [], 1019: [], 1020: [], 1021: [], 1022: [], 1023: [], 1024: [], 1025: [], 1027: [], 1028: [], 1029: [], 103: [('arrest', 1590.0, 7950.0)], 1030: [], 1031: [], 1032: [], 1033: [], 1034: [], 1035: [], 1036: [], 1038: [], 1039: [], 104: [('arrest', 0.0, 330.0)], 1040: [], 1041: [], 1042: [], 1046: [], 1047: [], 1048: [], 1049: [], 105: [('arrest', 720.0, 2490.0)], 1050: [], 1051: [], 1052: [], 1054: [], 1056: [], 1057: [], 1058: [], 106: [('arrest', 2310.0, 5460.0)], 1060: [], 1062: [], 1063: [], 1064: [], 1065: [], 1066: [], 1067: [], 1069: [], 107: [('fighting', 570.0, 870.0)], 1070: [], 1071: [], 1073: [], 1074: [], 1075: [], 1076: [], 1077: [], 1079: [], 108: [('arrest', 900.0, 3390.0)],

In [10]:
#define the nn
import torch.nn as nn
import torch.nn.functional as F
import torchvision.models as models


class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.resnet101 = models.resnet101(pretrained=True)
        for param in self.resnet101.parameters():
            param.requires_grad = False
        in_features = self.resnet101.fc.in_features
        self.resnet101.fc = nn.Linear(in_features, 1024)
        self.fc2 = nn.Linear(1024, 512)
        self.batch = nn.BatchNorm1d(512)
        self.dropout = nn.Dropout(0.2)
        self.fc3 = nn.Linear(512 , 256)
        self.fc4 = nn.Linear(256, 2)
        self.sigm = nn.Sigmoid()

    def forward(self, x):
        x = self.resnet101(x)
        x = self.fc2(x)
        x = self.batch(x)
        x = self.dropout(x)
        x = self.fc3(x)
        x = self.fc4(x)
        return self.sigm(x)

In [13]:
import torch
from torch.utils.data import IterableDataset, DataLoader
from torchvision import transforms
import cv2

class MyIterableDataset(torch.utils.data.IterableDataset):
    def __init__(self, video_folder, dict_labels):
        super(MyIterableDataset).__init__()
        self._video_folder = video_folder
        self._dict_labels = dict_labels
        self._transform = transforms.Compose([transforms.ToPILImage(),
                                             transforms.Resize((299,299)),
                                             transforms.ToTensor(),
                                             transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])])
        
    def _generate_labels(self, labels_in_frames, video_frame_index):
        labels = []
        for label in labels_in_frames:
            if int(label[1]) <= video_frame_index <= int(label[2]):
                labels.append(label[0])
        return class_labels_into_one_hot(labels)
        
    def __iter__(self):
        for video_num, labels in self._dict_labels.items():
            cap = cv2.VideoCapture(os.path.join(videos_folder, 'video_{}.mp4'.format(video_num)))
            index = 0
            while True:
                ret, frame = cap.read()
                if not ret:
                    break
                frame_labels = self._generate_labels(labels, index)
                index += 1
                yield self._transform(frame), frame_labels

In [14]:
import torch
#define the loss function
import torch.optim as optim
    
dataset = MyIterableDataset(videos_folder, train)

training_generator = torch.utils.data.DataLoader(dataset, batch_size=128)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Assuming that we are on a CUDA machine, this should print a CUDA device:

print(device)


net = Net()
net.to(device)

criterion = nn.BCELoss()
optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)
net.train(True)

#start training

for epoch in range(2):  # loop over the dataset multiple times

    running_loss = 0.0
    for i, data in enumerate(training_generator, 0):
        # get the inputs; data is a list of [inputs, labels]
        inputs, labels = data
        inputs, labels = inputs.to(device), labels.to(device)

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # print statistics
        running_loss += loss.item()
        if i % 2000 == 0:    # print every 2000 mini-batches
            print('[%d, %5d] loss: %.3f' %
                  (epoch + 1, i + 1, running_loss / 2000))
            running_loss = 0.0

print('Finished Training')

cuda:0
[1,     1] loss: 0.000


KeyboardInterrupt: 