# Preprocessing the data
execute only if u need to

In [21]:
'''
Das Notebook basiert auf diesem Turoial:
https://www.analyticsvidhya.com/blog/2019/09/step-by-step-deep-learning-tutorial-video-classification-python/
'''

''

In [1]:
# imports
import cv2     # for capturing videos
import math   # for mathematical operations
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
from keras.preprocessing import image   # for preprocessing the images
import numpy as np    # for mathematical operations
from keras.utils import np_utils
#from skimage.transform import resize   # for resizing images
from sklearn.model_selection import train_test_split
from glob import glob
from tqdm import tqdm

In [12]:
# preprocessing training labels
labels_df = pd.read_csv('material/labels.csv', sep=';')

# deleting videos with changing perspectives
labels_df = labels_df.drop(labels_df.loc[labels_df['Wechsel'] == True].index)
labels_df = labels_df.reset_index()

# droping unused columns:
labels_df.drop(columns=['index','Wechsel','ID'], inplace=True)
labels_df.rename(columns={'Camera Position (Side/Front/Upwards/Downwards)':'labels'}, inplace=True)

print(labels_df.shape)
labels_df.head()

(101, 2)


Unnamed: 0,NameOfFile,labels
0,Video_1.mp4,side
1,Video_2.mp4,upwards
2,Video_3.mp4,front
3,Video_4.mp4,side
4,Video_5.mp4,downwards


In [10]:
# storing the frames from training videos
for i in tqdm(range(labels_df.shape[0])):
    count = 0
    videoFile = labels_df['NameOfFile'][i]
    cap = cv2.VideoCapture('material/raw_training_videos/'+videoFile)   # capturing the video from the given path
    frameRate = cap.get(5) #frame rate
    #print(frameRate)
    x=1
    while(cap.isOpened()):
        frameId = cap.get(1) #current frame number
        #print(frameId)
        ret, frame = cap.read()
        if (ret != True):
            break
        #if (frameId % math.floor(frameRate) == 0):
            # storing the frames in a new folder named train_frames
        else:
            filename ='material/train_frames/' + videoFile +"_frame%d.jpg" % count;count+=1
            cv2.imwrite(filename, frame)
    cap.release()

100%|██████████| 101/101 [00:43<00:00,  2.32it/s]


In [21]:
# getting the names of all the images
images = glob("material/train_frames/*.jpg")
train_image = []
train_class = []
for i in tqdm(range(len(images))):
    # creating the image name
    name = images[i].split('/')[2].split('_f')[0]
    #print(name)
    train_image.append(images[i].split('/')[2])
    # creating the class of image
    train_class.append(labels_df['labels'].loc[labels_df['NameOfFile'] == name].values[0])
    
# storing the images and their class in a dataframe
train_data = pd.DataFrame()
train_data['image'] = train_image
train_data['class'] = train_class

# converting the dataframe into csv file 
train_data.to_csv('material/train_frames.csv',header=True, index=False)

100%|██████████| 8300/8300 [00:01<00:00, 5651.36it/s]


# Training the model starts here

In [22]:
#imports
import keras
from keras.models import Sequential
from keras.applications.vgg16 import VGG16
from keras.layers import Dense, InputLayer, Dropout, Flatten
from keras.layers import Conv2D, MaxPooling2D, GlobalMaxPooling2D
from keras.preprocessing import image
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
from sklearn.model_selection import train_test_split

In [23]:
# read in csv with labels for frames
train = pd.read_csv('material/train_frames.csv')
train.head()

Unnamed: 0,image,class
0,Video_92.mp4_frame60.jpg,side
1,Video_68.mp4_frame6.jpg,side
2,Video_18.mp4_frame9.jpg,side
3,Video_16.mp4_frame54.jpg,side
4,Video_39.mp4_frame76.jpg,side


In [25]:
# reading in all the frames and saving them to a numpy array

# creating an empty list
train_image = []

# for loop to read and store frames
for i in tqdm(range(train.shape[0])):
    # loading the image and keeping the image size (480, 640, 3)
    '''
    TO DO:
    finde heraus was die ideale input größe der bilder ist. 
    '''
    img = image.load_img('material/train_frames/'+train['image'][i], target_size=(480, 640, 3))
    # converting it to array
    img = image.img_to_array(img)
    # normalizing the pixel value
    '''
    TO DO:
    finde heraus was die optimale normalisierung der Bild daten ist
    '''
    img = img/480
    # appending the image to the train_image list
    train_image.append(img)
    
# converting the list to numpy array
X = np.array(train_image)

# shape of the array
X.shape

100%|██████████| 8300/8300 [00:43<00:00, 192.24it/s]


(8300, 480, 640, 3)

In [26]:
# creating train and test set

# separating the target
y = train['class']

# creating the training and validation set
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2, stratify = y)

In [28]:
# creating dummies of target variable for train and validation set
y_train = pd.get_dummies(y_train)
y_test = pd.get_dummies(y_test)

In [29]:
y_train

Unnamed: 0,downwards,front,side,upwards
3222,0,1,0,0
6280,0,0,1,0
722,0,1,0,0
3089,1,0,0,0
2868,1,0,0,0
...,...,...,...,...
3109,0,0,1,0
384,0,0,1,0
1981,0,0,1,0
7873,0,0,1,0


### Defining the architecture of the model

In [31]:
# creating the base model of pre-trained VGG16 model
'''
TO DO: 
finde heraus wie ein Model performt, welches von "scratch" creiert wird
'''
base_model = VGG16(weights='imagenet', include_top=False)

In [32]:
# extracting features for training frames
X_train = base_model.predict(X_train)
X_train.shape

(6640, 15, 20, 512)

In [33]:
# extracting features for validation frames
X_test = base_model.predict(X_test)
X_test.shape

(1660, 15, 20, 512)

In [34]:
# reshaping the training as well as validation frames in single dimension
X_train = X_train.reshape(6640, 15*20*512)
X_test = X_test.reshape(1660, 15*20*512)

In [35]:
# normalizing the pixel values
max = X_train.max()
X_train = X_train/max
X_test = X_test/max

In [36]:
# shape of images
X_train.shape

(6640, 153600)

In [37]:
#defining the model architecture
model = Sequential()
model.add(Dense(1024, activation='relu', input_shape=(153600,)))
model.add(Dropout(0.5))
model.add(Dense(512, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(256, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(4, activation='softmax'))

In [38]:
# defining a function to save the weights of best model
from keras.callbacks import ModelCheckpoint
mcp_save = ModelCheckpoint('weight_v2.hdf5', save_best_only=True, monitor='val_loss', mode='min')

In [39]:
# compiling the model
model.compile(loss='categorical_crossentropy',optimizer='Adam',metrics=['accuracy'])

### Model wird hier trainiert

In [40]:
# training the model 
'''
TO DO: optimale epochen zahl herausfinden. Führen 200 epochen zu overfitting?
'''
model.fit(X_train, y_train, epochs=25, validation_data=(X_test, y_test), callbacks=[mcp_save], batch_size=128)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<tensorflow.python.keras.callbacks.History at 0x7f5e702e5460>

# Evaluating our Video Classification Model

## Defining Model architecture and loading weights

In [1]:
# imports
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv2D, MaxPooling2D
from keras.preprocessing import image
import numpy as np
import pandas as pd
from tqdm import tqdm
from keras.applications.vgg16 import VGG16
import cv2
import math
import os
from glob import glob
from scipy import stats as s

In [2]:
# base pretrained model
base_model = VGG16(weights='imagenet', include_top=False)

In [3]:
#defining the model architecture
model = Sequential()
model.add(Dense(1024, activation='relu', input_shape=(153600,)))
model.add(Dropout(0.5))
model.add(Dense(512, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(256, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(4, activation='softmax'))

In [4]:
# loading the trained weights
model.load_weights("weight_v2.hdf5")

In [5]:
# compiling the model
model.compile(loss='categorical_crossentropy',optimizer='Adam',metrics=['accuracy'])

In [6]:
# creating data Frame with test video names
videos = glob("material/raw_test_videos"+"/*mp4")

videoNames = []
for i in range(len(videos)):
    videoNames.append(videos[i].split('/')[2])

test = pd.DataFrame()
test['video_name'] = videoNames
test = test[:-1]
test_videos = test['video_name']
test.head()

Unnamed: 0,video_name
0,_8Vy3dlHg2w_00118.mp4
1,_8Vy3dlHg2w_00126.mp4
2,_8Vy3dlHg2w_00115.mp4
3,_8Vy3dlHg2w_00108.mp4
4,_8Vy3dlHg2w_00110.mp4


In [7]:
# creating the tags
train = pd.read_csv('material/test_labels.csv', sep=';')
y = train['label']
y = pd.get_dummies(y)

In [8]:
test_videos.shape

(23,)

## Generating predictions for test videos

In [9]:
# creating two lists to store predicted and actual tags
predict = []
actual = []

# for loop to extract frames from each test video
for i in tqdm(range(test_videos.shape[0])):
    count = 0
    videoFile = test_videos[i]
    cap = cv2.VideoCapture('material/raw_test_videos/'+videoFile)   # capturing the video from the given path
    frameRate = cap.get(5) #frame rate
    x=1
    # removing all other files from the temp folder
    files = glob('temp/*')
    for f in files:
        os.remove(f)
    while(cap.isOpened()):
        frameId = cap.get(1) #current frame number
        ret, frame = cap.read()
        if (ret != True):
            break
        #if (frameId % math.floor(frameRate) == 0):
            # storing the frames of this particular video in temp folder
        else:
            filename ='temp/' + "_frame%d.jpg" % count;count+=1
            cv2.imwrite(filename, frame)
    cap.release()
    
    # reading all the frames from temp folder
    images = glob("temp/*.jpg")
    
    prediction_images = []
    for i in range(len(images)):
        img = image.load_img(images[i], target_size=(480, 640, 3))
        img = image.img_to_array(img)
        img = img/480
        prediction_images.append(img)
    
    # converting all the frames for a test video into numpy array
    prediction_images = np.array(prediction_images)
    # extracting features using pre-trained model
    prediction_images = base_model.predict(prediction_images)
    # converting features in one dimensional array
    prediction_images = prediction_images.reshape(prediction_images.shape[0], 15*20*512)
    # predicting tags for each array
    prediction = model.predict_classes(prediction_images)
    # appending the mode of predictions in predict list to assign the tag to the video
    predict.append(y.columns.values[s.mode(prediction)[0][0]])
    # appending the actual tag of the video
    actual.append(train['label'].loc[train['NameOfFile'] == videoFile].values[0])

  0%|          | 0/23 [00:00<?, ?it/s]

Instructions for updating:
Please use instead:* `np.argmax(model.predict(x), axis=-1)`,   if your model does multi-class classification   (e.g. if it uses a `softmax` last-layer activation).* `(model.predict(x) > 0.5).astype("int32")`,   if your model does binary classification   (e.g. if it uses a `sigmoid` last-layer activation).


100%|██████████| 23/23 [01:34<00:00,  4.12s/it]


In [10]:

# checking the accuracy of the predicted tags
from sklearn.metrics import accuracy_score
accuracy_score(predict, actual)*100

100.0

In [11]:
predict

['side',
 'front',
 'front',
 'downwards',
 'side',
 'downwards',
 'side',
 'side',
 'downwards',
 'front',
 'downwards',
 'downwards',
 'side',
 'side',
 'side',
 'side',
 'front',
 'front',
 'side',
 'front',
 'side',
 'side',
 'front']

In [12]:
actual

['side',
 'front',
 'front',
 'downwards',
 'side',
 'downwards',
 'side',
 'side',
 'downwards',
 'front',
 'downwards',
 'downwards',
 'side',
 'side',
 'side',
 'side',
 'front',
 'front',
 'side',
 'front',
 'side',
 'side',
 'front']