In [None]:
!python --version

Python 3.7.12


### Dataset Paths

In [None]:
import os
from pathlib import Path

In [None]:
# Mount Google Drive at /content/drive; every dataset path used below lives
# under this mount point. `google.colab` is only importable inside Colab.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Root of the project on the mounted Drive
PROJECT_DIR = "/content/drive/MyDrive/Mini_Project_II"

# Training split: clip folder, then annotation CSV (both under Dataset/Training)
TRAINING_CLIPS, TRAINING_CSV = (
    os.path.join(PROJECT_DIR, 'Dataset', 'Training', leaf)
    for leaf in (
        'TrainingDataset',
        'Dataset_ComicMischief_Training_Scene_Binary_Annotations.csv',
    )
)

# Test split: clip folder, then scene CSV (both under Dataset/Test)
TEST_CLIPS, TEST_CSV = (
    os.path.join(PROJECT_DIR, 'Dataset', 'Test', leaf)
    for leaf in (
        'TestDataset',
        'Dataset_ComicMischief_Test_Scenes.csv',
    )
)

### Verifying the Training Dataset Against the Downloaded Videos

In [None]:
import pandas as pd

In [None]:
# Load the training scene annotations from TRAINING_CSV; the trailing
# expression displays the frame's (rows, columns).
data = pd.read_csv(TRAINING_CSV)
data.shape

(998, 7)

In [None]:
data.columns

Index(['Video ID', 'Video URL', 'Scene_ID', 'Original Video Codec',
       'Original Video Resolution', 'Original Video Avg Framerate',
       'Presence of Comic Mischief Content in Scene'],
      dtype='object')

In [None]:
# Remove annotation rows whose clip file is not present in TRAINING_CLIPS.
# Clip files are named "<Video ID>.<Scene_ID zero-padded to 2 digits>.mp4".
# The labels are collected first and dropped in a single step afterwards, so
# the frame is never mutated while it is being iterated.
missing_labels = []
for index, val in data.iterrows():
    filename = f"{val['Video ID']}.{int(val['Scene_ID']):02d}.mp4"

    if not os.path.exists(os.path.join(TRAINING_CLIPS, filename)):
        # filename[:-4] strips the ".mp4" suffix for the log line
        print(f'{filename[:-4]} not found. Removing entry from data')
        missing_labels.append(index)

# Rebind instead of drop(inplace=True); re-running the cell stays idempotent.
data = data.drop(index=missing_labels)

data.shape

tt0493405.00 not found. Removing entry from data
tt0493405.01 not found. Removing entry from data
tt0493405.02 not found. Removing entry from data
kCppUtS9vLk.00 not found. Removing entry from data
kCppUtS9vLk.01 not found. Removing entry from data
EbcfiIeH63M.00 not found. Removing entry from data
EbcfiIeH63M.01 not found. Removing entry from data
EbcfiIeH63M.02 not found. Removing entry from data
pxxPznV38Hk.00 not found. Removing entry from data
pxxPznV38Hk.01 not found. Removing entry from data
pxxPznV38Hk.02 not found. Removing entry from data
1yhNm_8q07g.00 not found. Removing entry from data
1yhNm_8q07g.01 not found. Removing entry from data
1yhNm_8q07g.02 not found. Removing entry from data


(984, 7)

### Verifying Frame Rate with Downloaded Videos

In [None]:
import cv2 as cv

In [None]:
# Verifying Frame Rate
# Compare the FPS OpenCV reports for each clip with the CSV's
# 'Original Video Avg Framerate', both rounded to 2 decimals, and collect the
# clip names (without ".mp4") that disagree.

errorFrameRate = []

for index, val in data.iterrows():
    filename = f"{val['Video ID']}.{int(val['Scene_ID']):02d}.mp4"
    filePath = os.path.join(TRAINING_CLIPS, filename)

    # Getting frameRate; release the capture even if the query fails so that
    # handles are not leaked across the loop.
    cap = cv.VideoCapture(filePath)
    try:
        frameRate = cap.get(cv.CAP_PROP_FPS)
    finally:
        cap.release()

    video_fps = round(frameRate, 2)
    csv_fps = round(val['Original Video Avg Framerate'], 2)
    if video_fps != csv_fps:
        errorFrameRate.append(filename[:-4])
        print('Frame Rate Not Matched with actual : {}'.format(filename))
        print('FrameRate from Video: {} | FramRate from data: {}'.format(video_fps, csv_fps))
        print('-'*20)

Frame Rate Not Matched with actual : tt3844362.02.mp4
FrameRate from Video: 23.97 | FramRate from data: 23.98
--------------------
Frame Rate Not Matched with actual : tt2380564.00.mp4
FrameRate from Video: 29.95 | FramRate from data: 29.96
--------------------
Frame Rate Not Matched with actual : tt2380564.01.mp4
FrameRate from Video: 29.97 | FramRate from data: 29.96
--------------------
Frame Rate Not Matched with actual : tt2380564.02.mp4
FrameRate from Video: 29.97 | FramRate from data: 29.96
--------------------
Frame Rate Not Matched with actual : tt2230358.01.mp4
FrameRate from Video: 23.97 | FramRate from data: 23.98
--------------------
Frame Rate Not Matched with actual : tt1714203.02.mp4
FrameRate from Video: 23.97 | FramRate from data: 23.98
--------------------
Frame Rate Not Matched with actual : tt1156398.02.mp4
FrameRate from Video: 23.97 | FramRate from data: 23.98
--------------------
Frame Rate Not Matched with actual : tt0368343.01.mp4
FrameRate from Video: 29.91 |

In [None]:
print(errorFrameRate)

['tt3844362.02', 'tt2380564.00', 'tt2380564.01', 'tt2380564.02', 'tt2230358.01', 'tt1714203.02', 'tt1156398.02', 'tt0368343.01', 'Z9QTS4doveY.01', 'xbEpWP5aWpA.01', 'eZjgg2lTYpE.02', 'al-TxOuSqc8.00', 'al-TxOuSqc8.01']


In [None]:
len(errorFrameRate)

13

### Calculating the Number of Training Images

In [None]:
# Count the decodable frames of every training clip.
# image_count[i] is the number of frames successfully read from the i-th row's
# clip; a clip that cannot be opened contributes 0 instead of raising a
# NameError or silently reusing the previous clip's count.
image_count = []
for index, val in data.iterrows():
    filename = f"{val['Video ID']}.{int(val['Scene_ID']):02d}.mp4"
    filePath = os.path.join(TRAINING_CLIPS, filename)

    cap = cv.VideoCapture(filePath)
    frames_read = 0
    try:
        while cap.isOpened():
            ret, _ = cap.read()
            if not ret:
                break
            frames_read += 1
    finally:
        # Always free the decoder handle before moving on to the next clip.
        cap.release()

    image_count.append(frames_read)
    # Progress indicator: number of clips processed so far
    print(len(image_count))

sum(image_count)

In [None]:
sum(image_count)

1594045

## Preprocessing Video data

In [None]:
# Folder that receives the pre-processed batch files.
TRAIN_PRE_PROCESSED_DATA = os.path.join(PROJECT_DIR,'Dataset','Training','PreProcessed')
# makedirs(exist_ok=True) also creates missing parent folders and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(TRAIN_PRE_PROCESSED_DATA, exist_ok=True)

In [None]:
from tqdm import tqdm
import math
import numpy as np

In [None]:
# Side length (pixels) that load_video resizes every sampled frame to.
IMG_SIZE = 224
# Number of annotation rows written into each pre-processed batch file.
# NOTE(review): 984 rows / 41 = 24 equal batches, which looks like the reason
# for this value — confirm.
BATCH_SIZE = 41

In [None]:
def crop_center_square(frame):
    """Return the centred square crop of ``frame``.

    The first two axes of ``frame`` are treated as rows and columns; the
    square's side equals the smaller of the two. Any further axes are left
    untouched.

    Args:
        frame: array-like object exposing ``.shape`` and 2-D slicing.

    Returns:
        The slice ``frame[top:top+side, left:left+side]``.
    """
    height, width = frame.shape[:2]
    side = min(height, width)

    # Offset of the square's top-left corner from the frame's top-left corner
    top = height // 2 - side // 2
    left = width // 2 - side // 2

    rows = slice(top, top + side)
    cols = slice(left, left + side)
    return frame[rows, cols]

In [None]:
def load_video(path):
    """Decode a clip and return a sub-sampled, normalised stack of frames.

    Every frame whose index is a multiple of ``floor(fps)`` is kept (about one
    frame per second of video). Each kept frame is centre-cropped to a square,
    resized to ``IMG_SIZE x IMG_SIZE`` and has its channel order reversed
    (OpenCV decodes to BGR, so this yields RGB).

    Args:
        path: path of the video file to read.

    Returns:
        ``np.array(frames) / 255.0`` — a float array with one entry per kept
        frame. If no frame could be read (e.g. the file failed to open) the
        result is an empty array of shape ``(0,)``.
    """
    frames = []
    cap = cv.VideoCapture(path)
    try:
        # Sampling stride in frames. Clamp to 1: a reported FPS below 1 (or 0
        # when the backend cannot determine it) would otherwise make the
        # modulo below divide by zero.
        step = max(math.floor(cap.get(cv.CAP_PROP_FPS)), 1)

        while cap.isOpened():
            # Index of the frame about to be decoded (must be read BEFORE
            # cap.read(), which advances the position).
            frameId = cap.get(cv.CAP_PROP_POS_FRAMES)
            ret, frame = cap.read()

            if not ret:
                break

            if frameId % step == 0:
                frame = crop_center_square(frame)
                frame = cv.resize(frame, (IMG_SIZE, IMG_SIZE))
                frame = frame[:, :, [2, 1, 0]]  # reverse channel order
                frames.append(frame)
    finally:
        cap.release()

    # Scale by 1/255 — presumably 8-bit pixel values, giving a [0, 1] range.
    frames = np.array(frames) / 255.0

    return frames

In [None]:
def _stack_clips(clips):
    """Pack per-clip frame arrays into one array that ``np.save`` accepts.

    Clips of identical shape are stacked into a regular array. Clips of
    differing shape (e.g. different frame counts) are placed in an explicit
    1-D object array, because implicit ragged-array creation is deprecated in
    NumPy and rejected by newer releases.
    """
    if len({clip.shape for clip in clips}) <= 1:
        return np.array(clips)

    packed = np.empty(len(clips), dtype=object)
    for pos, clip in enumerate(clips):
        packed[pos] = clip
    return packed


def preprocess_video_data(df,save_to=''):
    """Pre-process every clip referenced by ``df`` and save the batch to disk.

    For each row the clip ``<Video ID>.<Scene_ID, 2 digits>.mp4`` is loaded
    from ``TRAINING_CLIPS`` with ``load_video`` and paired with the row's
    'Presence of Comic Mischief Content in Scene' value.

    Two arrays are written back-to-back into ``save_to`` with ``np.save``:
    first the clips, then the labels. Reading them back therefore takes two
    consecutive ``np.load`` calls on the same open file handle; the clip array
    needs ``allow_pickle=True`` when it was stored as an object array.

    Args:
        df: DataFrame with 'Video ID', 'Scene_ID' and the label column.
        save_to: destination file path (required in practice; the empty
            default cannot be opened).

    Returns:
        None. Prints a confirmation once the file has been written.
    """
    X = []
    y = []
    for idx, val in tqdm(df.iterrows(), total=df.shape[0]):
        filename = f"{val['Video ID']}.{int(val['Scene_ID']):02d}.mp4"
        filePath = os.path.join(TRAINING_CLIPS, filename)

        X.append(load_video(filePath))
        y.append(val['Presence of Comic Mischief Content in Scene'])

    # The with-block closes the file; no explicit close() is needed.
    with open(save_to, 'wb') as fp:
        np.save(fp, _stack_clips(X))
        np.save(fp, y)

    print('-'*10)
    print(f'Written {save_to} success!')
    print('-'*10)

In [None]:
# Generating batch processed data frames
# Split `data` into consecutive chunks of at most BATCH_SIZE rows. `.iloc`
# makes the positional slicing explicit: the index has gaps after the
# missing-clip rows were dropped, so label-based slicing would be wrong here.
temp_df_list = [
    data.iloc[start:start + BATCH_SIZE]
    for start in range(0, data.shape[0], BATCH_SIZE)
]

len(temp_df_list)

24

In [28]:
# Generating batch processed data
# One output file per chunk: train_0.npy, train_1.npy, ... inside
# TRAIN_PRE_PROCESSED_DATA.
for batch_no, batch_df in enumerate(temp_df_list):
    batch_path = os.path.join(TRAIN_PRE_PROCESSED_DATA, f'train_{batch_no}.npy')
    preprocess_video_data(batch_df, save_to=batch_path)

100%|██████████| 41/41 [03:44<00:00,  5.49s/it]
  return array(a, dtype, copy=False, order=order, subok=True)


----------
Written /content/drive/MyDrive/Mini_Project_II/Dataset/Training/PreProcessed/train_0.npy success!
----------


100%|██████████| 41/41 [02:39<00:00,  3.89s/it]


----------
Written /content/drive/MyDrive/Mini_Project_II/Dataset/Training/PreProcessed/train_1.npy success!
----------


100%|██████████| 41/41 [02:10<00:00,  3.18s/it]


----------
Written /content/drive/MyDrive/Mini_Project_II/Dataset/Training/PreProcessed/train_2.npy success!
----------


100%|██████████| 41/41 [02:11<00:00,  3.22s/it]


----------
Written /content/drive/MyDrive/Mini_Project_II/Dataset/Training/PreProcessed/train_3.npy success!
----------


100%|██████████| 41/41 [04:05<00:00,  5.98s/it]


----------
Written /content/drive/MyDrive/Mini_Project_II/Dataset/Training/PreProcessed/train_4.npy success!
----------


100%|██████████| 41/41 [04:37<00:00,  6.78s/it]


----------
Written /content/drive/MyDrive/Mini_Project_II/Dataset/Training/PreProcessed/train_5.npy success!
----------


100%|██████████| 41/41 [04:10<00:00,  6.10s/it]


----------
Written /content/drive/MyDrive/Mini_Project_II/Dataset/Training/PreProcessed/train_6.npy success!
----------


100%|██████████| 41/41 [04:54<00:00,  7.18s/it]


----------
Written /content/drive/MyDrive/Mini_Project_II/Dataset/Training/PreProcessed/train_7.npy success!
----------


100%|██████████| 41/41 [04:44<00:00,  6.95s/it]


----------
Written /content/drive/MyDrive/Mini_Project_II/Dataset/Training/PreProcessed/train_8.npy success!
----------


100%|██████████| 41/41 [05:04<00:00,  7.43s/it]


----------
Written /content/drive/MyDrive/Mini_Project_II/Dataset/Training/PreProcessed/train_9.npy success!
----------


100%|██████████| 41/41 [05:14<00:00,  7.67s/it]


----------
Written /content/drive/MyDrive/Mini_Project_II/Dataset/Training/PreProcessed/train_10.npy success!
----------


100%|██████████| 41/41 [05:17<00:00,  7.76s/it]


----------
Written /content/drive/MyDrive/Mini_Project_II/Dataset/Training/PreProcessed/train_11.npy success!
----------


100%|██████████| 41/41 [05:20<00:00,  7.81s/it]


----------
Written /content/drive/MyDrive/Mini_Project_II/Dataset/Training/PreProcessed/train_12.npy success!
----------


100%|██████████| 41/41 [05:12<00:00,  7.62s/it]


----------
Written /content/drive/MyDrive/Mini_Project_II/Dataset/Training/PreProcessed/train_13.npy success!
----------


100%|██████████| 41/41 [05:22<00:00,  7.86s/it]


----------
Written /content/drive/MyDrive/Mini_Project_II/Dataset/Training/PreProcessed/train_14.npy success!
----------


100%|██████████| 41/41 [05:06<00:00,  7.47s/it]


----------
Written /content/drive/MyDrive/Mini_Project_II/Dataset/Training/PreProcessed/train_15.npy success!
----------


100%|██████████| 41/41 [04:15<00:00,  6.24s/it]


----------
Written /content/drive/MyDrive/Mini_Project_II/Dataset/Training/PreProcessed/train_16.npy success!
----------


100%|██████████| 41/41 [05:42<00:00,  8.36s/it]


----------
Written /content/drive/MyDrive/Mini_Project_II/Dataset/Training/PreProcessed/train_17.npy success!
----------


100%|██████████| 41/41 [04:40<00:00,  6.83s/it]


----------
Written /content/drive/MyDrive/Mini_Project_II/Dataset/Training/PreProcessed/train_18.npy success!
----------


100%|██████████| 41/41 [05:29<00:00,  8.03s/it]


----------
Written /content/drive/MyDrive/Mini_Project_II/Dataset/Training/PreProcessed/train_19.npy success!
----------


100%|██████████| 41/41 [05:20<00:00,  7.81s/it]


----------
Written /content/drive/MyDrive/Mini_Project_II/Dataset/Training/PreProcessed/train_20.npy success!
----------


100%|██████████| 41/41 [05:35<00:00,  8.18s/it]


----------
Written /content/drive/MyDrive/Mini_Project_II/Dataset/Training/PreProcessed/train_21.npy success!
----------


100%|██████████| 41/41 [04:51<00:00,  7.11s/it]


----------
Written /content/drive/MyDrive/Mini_Project_II/Dataset/Training/PreProcessed/train_22.npy success!
----------


100%|██████████| 41/41 [04:39<00:00,  6.81s/it]


----------
Written /content/drive/MyDrive/Mini_Project_II/Dataset/Training/PreProcessed/train_23.npy success!
----------
