# Identify Dance Move: Transfer Learning

## Part 1: Load Videos, Preprocess, and Extract Features

In [19]:
from tensorflow import keras
import tensorflow as tf
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

import cv2
import os
import pickle

This notebook follows the Keras video classification tutorial: https://keras.io/examples/vision/video_classification/

In [25]:
# Side length, in pixels, of the square frames produced by `load_video` and
# expected by the feature extractor's input layer.
IMG_SIZE = 224
# NOTE(review): BATCH_SIZE is not referenced by any cell visible in this
# notebook (`fit` is called without `batch_size`) -- confirm whether it is needed.
BATCH_SIZE = 64
# Number of epochs the sequence model is trained for.
EPOCHS = 50

# Number of frames per video that are fed to the sequence model; shorter videos
# are zero-padded and flagged through the frame mask.
MAX_SEQ_LENGTH = 20
# Length of the feature vector stored for each frame.
NUM_FEATURES = 2048

In [3]:
def crop_center_square(frame):
    """Return the largest centered square region of an image array.

    Args:
        frame: Array whose first two axes are (height, width); any trailing
            axes (e.g. colour channels) are kept untouched.

    Returns:
        A view of `frame` with height and width both equal to the shorter
        of the two original sides.
    """
    height, width = frame.shape[:2]
    side = height if height < width else width
    half_side = side // 2

    # Offsets of the square's top-left corner relative to the full frame.
    top = height // 2 - half_side
    left = width // 2 - half_side

    rows = slice(top, top + side)
    cols = slice(left, left + side)
    return frame[rows, cols]


def load_video(path, max_frames=0, resize=(IMG_SIZE, IMG_SIZE)):
    """Decode a video file into an array of square, resized RGB frames.

    Args:
        path: Location of the video file handed to `cv2.VideoCapture`.
        max_frames: Stop after this many frames have been collected. The
            default of 0 never matches the collected count, so the whole
            video is read.
        resize: (width, height) passed to `cv2.resize` for every frame.

    Returns:
        `np.array` of the collected frames in reading order; empty when no
        frame could be read.
    """
    capture = cv2.VideoCapture(path)
    collected = []
    try:
        ok, image = capture.read()
        while ok:
            square = crop_center_square(image)
            scaled = cv2.resize(square, resize)
            # Reverse the channel axis (OpenCV yields BGR, the model wants RGB).
            collected.append(scaled[:, :, [2, 1, 0]])

            if len(collected) == max_frames:
                break
            ok, image = capture.read()
    finally:
        # Always free the capture handle, even if decoding raised.
        capture.release()
    return np.array(collected)

In [4]:
def build_feature_extractor():
    """Wrap an ImageNet-pretrained InceptionV3 as a per-frame feature extractor.

    The network is created without its classification head and with global
    average pooling, and InceptionV3's own `preprocess_input` is applied to
    the raw input inside the returned model.

    Returns:
        A `keras.Model` named "feature_extractor" that takes
        (IMG_SIZE, IMG_SIZE, 3) frames and returns the pooled features.
    """
    frame_shape = (IMG_SIZE, IMG_SIZE, 3)

    backbone = keras.applications.InceptionV3(
        include_top=False,
        weights="imagenet",
        pooling="avg",
        input_shape=frame_shape,
    )

    raw_frames = keras.Input(frame_shape)
    scaled_frames = keras.applications.inception_v3.preprocess_input(raw_frames)
    return keras.Model(raw_frames, backbone(scaled_frames), name="feature_extractor")

feature_extractor = build_feature_extractor()

In [10]:
# Each sub-folder of `data` is one dance move (class). Keep only directories so
# that stray files (e.g. macOS `.DS_Store`) never become classes and the cell
# does not fail when such a file is absent; `all_combined` is skipped because
# it is used as the flat video folder, not as a class.
# The order returned by `os.listdir` is kept; it later defines the label indices.
moves = [
    entry
    for entry in os.listdir('data')
    if entry != 'all_combined' and os.path.isdir(os.path.join('data', entry))
]
print(moves)

['Stick and Roll', 'Brooklyn', 'Charleston', 'Monastery']


In [11]:
# Lookup layer that turns a move name into its integer index in `moves`;
# no out-of-vocabulary slot is reserved, so indices start at 0.
label_processor = keras.layers.StringLookup(vocabulary=moves, num_oov_indices=0)
print(label_processor.get_vocabulary())

['Stick and Roll', 'Brooklyn', 'Charleston', 'Monastery']


  return bool(asarray(a1 == a2).all())


In [13]:
# Walk every move folder and record each video file name together with the
# move it belongs to (one row per video).

filenames = []
labels = []

for move in moves:
    # List the folder once and ignore hidden files such as `.DS_Store`,
    # which are not videos and would otherwise become empty samples.
    videos = [
        name
        for name in os.listdir(os.path.join('data', move))
        if not name.startswith('.')
    ]
    filenames.extend(videos)
    labels.extend([move] * len(videos))

df = pd.DataFrame({'video_name': filenames, 'tag': labels})
print(df.head(5))

                                 video_name             tag
0                              IMG_4742.MOV  Stick and Roll
1  2B780CEE-3D09-4820-BEFD-CFDFF074F444.mov  Stick and Roll
2                              IMG_4753.MOV  Stick and Roll
3  2B93BBED-F225-4523-9DC6-31ABB7347F2F.mov  Stick and Roll
4  24FD3C61-03D2-4BAA-9F27-BE0A1743C71A.mov  Stick and Roll


In [14]:
# Hold out 20% of the videos for testing; the fixed `random_state` makes the
# split repeatable.
# NOTE(review): the split is not stratified by `tag`, so class proportions may
# differ between the two sets -- confirm this is acceptable for a small dataset.
train_df, test_df = train_test_split(df, test_size = 0.2, random_state = 150)

In [15]:
# Preview the training split; the row index is each video's position in `df`.
train_df

Unnamed: 0,video_name,tag
20,IMG_9221.mov,Brooklyn
39,IMG_4754.MOV,Monastery
51,5D8BB37D-643F-4976-9104-BBDD2E7B1CAC.mov,Monastery
21,1D94BB45-0578-489C-A52F-418B6F1279F7.mov,Brooklyn
31,IMG_4702.MOV,Charleston
15,IMG_4749.MOV,Stick and Roll
4,24FD3C61-03D2-4BAA-9F27-BE0A1743C71A.mov,Stick and Roll
43,IMG_4687.MOV,Monastery
32,698334CC-5E67-4CE3-8C7F-A1390D0488B9.mov,Charleston
3,2B93BBED-F225-4523-9DC6-31ABB7347F2F.mov,Stick and Roll


In [16]:
def prepare_all_videos(df, root_dir):
    """Extract per-frame features and padding masks for every video in `df`.

    Args:
        df: DataFrame with a "video_name" column (file names inside
            `root_dir`) and a "tag" column (class names passed to
            `label_processor`).
        root_dir: Directory that contains the video files.

    Returns:
        A tuple `((frame_features, frame_masks), labels)` where
        `frame_features` is a float32 array of shape
        (num_samples, MAX_SEQ_LENGTH, NUM_FEATURES), `frame_masks` is a bool
        array of shape (num_samples, MAX_SEQ_LENGTH) that is True for real
        frames and False for padding, and `labels` is the numpy output of
        `label_processor` with one row per video.
    """
    num_samples = len(df)
    video_paths = df["video_name"].values.tolist()
    labels = df["tag"].values
    labels = label_processor(labels[..., None]).numpy()

    # `frame_masks` and `frame_features` are what we feed to the sequence model.
    # Both start zero-filled, so videos shorter than MAX_SEQ_LENGTH end up
    # zero-padded with the padded timesteps marked False in the mask.
    frame_masks = np.zeros(shape=(num_samples, MAX_SEQ_LENGTH), dtype="bool")
    frame_features = np.zeros(
        shape=(num_samples, MAX_SEQ_LENGTH, NUM_FEATURES), dtype="float32"
    )

    for idx, path in enumerate(video_paths):
        print('Working on video {}'.format(idx))
        # Only the first MAX_SEQ_LENGTH frames are ever used, so stop decoding
        # there instead of reading the whole video into memory.
        frames = load_video(os.path.join(root_dir, path), max_frames=MAX_SEQ_LENGTH)
        length = min(MAX_SEQ_LENGTH, len(frames))
        if length == 0:
            # Unreadable or empty video: its row stays all-zero and fully masked.
            print('Warning: no frames could be read from {}'.format(path))
            continue

        # One batched forward pass per video instead of one call per frame.
        frame_features[idx, :length] = feature_extractor.predict(
            frames[:length], verbose=0
        )
        frame_masks[idx, :length] = True  # True = not masked, False = masked

    return (frame_features, frame_masks), labels

Working on video 0


2022-08-01 20:33:56.633849: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


Working on video 1
Working on video 2
Working on video 3
Working on video 4
Working on video 5
Working on video 6
Working on video 7
Working on video 8
Working on video 9
Working on video 10
Working on video 11
Working on video 12
Working on video 13
Working on video 14
Working on video 15
Working on video 16
Working on video 17
Working on video 18
Working on video 19
Working on video 20
Working on video 21
Working on video 22
Working on video 23
Working on video 24
Working on video 25
Working on video 26
Working on video 27
Working on video 28
Working on video 29
Working on video 30
Working on video 31
Working on video 32
Working on video 33
Working on video 34
Working on video 35
Working on video 36
Working on video 37
Working on video 38
Working on video 39
Working on video 40
Working on video 41
Working on video 42
Working on video 43
Working on video 0
Working on video 1
Working on video 2
Working on video 3
Working on video 4
Working on video 5
Working on video 6
Working on video

In [None]:
# Run feature extraction for both splits. Videos are read from the flat
# "data/all_combined" folder.
# NOTE(review): the file names in `df` were listed from the per-move folders,
# so this assumes every one of them also exists in "data/all_combined" -- confirm.
train_data, train_labels = prepare_all_videos(train_df, "data/all_combined")
test_data, test_labels = prepare_all_videos(test_df, "data/all_combined")

# Each `*_data` value is a (frame_features, frame_masks) tuple.
print(f"Frame features in train set: {train_data[0].shape}")
print(f"Frame masks in train set: {train_data[1].shape}")

In [22]:
# Persist the extracted features and labels so the slow extraction step does
# not have to be repeated. Files are written inside `with` blocks so every
# handle is flushed and closed, and the target folder is created if missing.
os.makedirs('arrays', exist_ok=True)
for array_name, array_value in [
    ('train_data', train_data),
    ('train_labels', train_labels),
    ('test_data', test_data),
    ('test_labels', test_labels),
]:
    with open('arrays/{}'.format(array_name), 'wb') as array_file:
        pickle.dump(array_value, array_file)

In [17]:
# Show the raw (frame_features, frame_masks) tuple for the training split.
print(train_data)

(array([[[0.00000000e+00, 5.22811860e-02, 1.47548243e-01, ...,
         2.74500363e-02, 1.09732188e-01, 3.82891387e-01],
        [2.09405139e-01, 9.99020506e-03, 3.90151590e-01, ...,
         4.21903649e-04, 7.39076734e-02, 5.30389488e-01],
        [2.19072402e-01, 2.83084740e-03, 3.36810708e-01, ...,
         0.00000000e+00, 6.72419816e-02, 9.39447761e-01],
        ...,
        [2.17002407e-01, 1.50484383e-01, 4.12761539e-01, ...,
         1.38557166e-01, 8.90455619e-02, 1.12989771e+00],
        [3.31145018e-01, 7.40275607e-02, 4.81291592e-01, ...,
         1.10463314e-01, 5.28983623e-02, 9.85677779e-01],
        [1.19318552e-01, 1.15119860e-01, 5.43458700e-01, ...,
         1.42023310e-01, 1.12287290e-01, 1.08493936e+00]],

       [[3.56361777e-01, 5.56771457e-01, 8.96400690e-01, ...,
         1.81419230e+00, 2.15969586e+00, 6.47249222e-01],
        [4.58724469e-01, 4.61036682e-01, 6.91684365e-01, ...,
         1.90414500e+00, 2.03265429e+00, 4.22077209e-01],
        [3.56523395e-01,

In [18]:
# Show the integer class index of every training video, as produced by
# `label_processor` (one row per video).
print(train_labels)

[[1]
 [3]
 [3]
 [1]
 [2]
 [0]
 [0]
 [3]
 [2]
 [0]
 [2]
 [0]
 [0]
 [3]
 [2]
 [3]
 [1]
 [0]
 [0]
 [0]
 [0]
 [1]
 [0]
 [3]
 [3]
 [1]
 [0]
 [3]
 [3]
 [3]
 [2]
 [2]
 [3]
 [3]
 [2]
 [2]
 [2]
 [2]
 [0]
 [3]
 [3]
 [3]
 [2]
 [2]]


## Part 2: Fit Models

In [23]:
# Builds the recurrent classifier that consumes the extracted frame features.
def get_sequence_model():
    """Create and compile the GRU-based classifier over frame-feature sequences.

    The model takes two inputs, a (MAX_SEQ_LENGTH, NUM_FEATURES) feature
    sequence and a (MAX_SEQ_LENGTH,) boolean mask, and outputs a softmax
    over the classes in `label_processor`'s vocabulary.

    Returns:
        A compiled `keras.Model` (Adam optimizer, sparse categorical
        cross-entropy loss, accuracy metric).
    """
    num_classes = len(label_processor.get_vocabulary())

    features_in = keras.Input((MAX_SEQ_LENGTH, NUM_FEATURES))
    mask_in = keras.Input((MAX_SEQ_LENGTH,), dtype="bool")

    # The mask is passed to the first GRU so padded timesteps are skipped; see
    # https://keras.io/api/layers/recurrent_layers/gru/
    first_gru = keras.layers.GRU(16, return_sequences=True)
    hidden = first_gru(features_in, mask=mask_in)
    hidden = keras.layers.GRU(8)(hidden)
    hidden = keras.layers.Dropout(0.4)(hidden)
    hidden = keras.layers.Dense(8, activation="relu")(hidden)
    class_probabilities = keras.layers.Dense(num_classes, activation="softmax")(
        hidden
    )

    model = keras.Model(inputs=[features_in, mask_in], outputs=class_probabilities)
    model.compile(
        optimizer="adam",
        loss="sparse_categorical_crossentropy",
        metrics=["accuracy"],
    )
    return model


# Trains a fresh sequence model and reports its accuracy on the test split.
def run_experiment():
    """Train the sequence model, restore its best weights, and evaluate it.

    Training uses the notebook-level `train_data` / `train_labels` with 30% of
    them held out for validation. A checkpoint callback stores the weights
    whenever the monitored validation loss improves, and those weights are
    reloaded before evaluating on `test_data` / `test_labels`.

    Returns:
        `(history, model)`: the Keras training history and the model carrying
        the restored checkpoint weights.
    """
    weights_path = "/tmp/video_classifier"
    best_weights_saver = keras.callbacks.ModelCheckpoint(
        weights_path, save_weights_only=True, save_best_only=True, verbose=1
    )

    model = get_sequence_model()
    train_inputs = [train_data[0], train_data[1]]
    history = model.fit(
        train_inputs,
        train_labels,
        validation_split=0.3,
        epochs=EPOCHS,
        callbacks=[best_weights_saver],
    )

    # Swap the last-epoch weights for the best ones seen during training.
    model.load_weights(weights_path)
    test_inputs = [test_data[0], test_data[1]]
    _loss, accuracy = model.evaluate(test_inputs, test_labels)
    print(f"Test accuracy: {round(accuracy * 100, 2)}%")

    return history, model

In [26]:
# Train and evaluate; the training history is discarded and only the model
# (with the best checkpointed weights loaded) is kept.
_, sequence_model = run_experiment()

Epoch 1/50
Epoch 1: val_loss improved from inf to 1.61814, saving model to /tmp/video_classifier
Epoch 2/50
Epoch 2: val_loss improved from 1.61814 to 1.61755, saving model to /tmp/video_classifier
Epoch 3/50
Epoch 3: val_loss improved from 1.61755 to 1.56347, saving model to /tmp/video_classifier
Epoch 4/50
Epoch 4: val_loss improved from 1.56347 to 1.55585, saving model to /tmp/video_classifier
Epoch 5/50
Epoch 5: val_loss did not improve from 1.55585
Epoch 6/50
Epoch 6: val_loss did not improve from 1.55585
Epoch 7/50
Epoch 7: val_loss did not improve from 1.55585
Epoch 8/50
Epoch 8: val_loss did not improve from 1.55585
Epoch 9/50
Epoch 9: val_loss did not improve from 1.55585
Epoch 10/50
Epoch 10: val_loss did not improve from 1.55585
Epoch 11/50
Epoch 11: val_loss did not improve from 1.55585
Epoch 12/50
Epoch 12: val_loss did not improve from 1.55585
Epoch 13/50
Epoch 13: val_loss did not improve from 1.55585
Epoch 14/50
Epoch 14: val_loss did not improve from 1.55585
Epoch 15/5

Epoch 31/50
Epoch 31: val_loss did not improve from 1.55585
Epoch 32/50
Epoch 32: val_loss did not improve from 1.55585
Epoch 33/50
Epoch 33: val_loss did not improve from 1.55585
Epoch 34/50
Epoch 34: val_loss did not improve from 1.55585
Epoch 35/50
Epoch 35: val_loss did not improve from 1.55585
Epoch 36/50
Epoch 36: val_loss did not improve from 1.55585
Epoch 37/50
Epoch 37: val_loss did not improve from 1.55585
Epoch 38/50
Epoch 38: val_loss did not improve from 1.55585
Epoch 39/50
Epoch 39: val_loss did not improve from 1.55585
Epoch 40/50
Epoch 40: val_loss did not improve from 1.55585
Epoch 41/50
Epoch 41: val_loss did not improve from 1.55585
Epoch 42/50
Epoch 42: val_loss did not improve from 1.55585
Epoch 43/50
Epoch 43: val_loss did not improve from 1.55585
Epoch 44/50
Epoch 44: val_loss did not improve from 1.55585
Epoch 45/50
Epoch 45: val_loss did not improve from 1.55585
Epoch 46/50
Epoch 46: val_loss did not improve from 1.55585
Epoch 47/50
Epoch 47: val_loss did not i

## Part 3: Register the model 
Setting up MLflow remotely so that it can be shared requires a SQL database instance plus object storage such as S3, which may be too costly for this project.

For our use case, we can just pickle the resulting model.

After we register the model, it's ready to be consumed by an application, in our case the web UI.

## Part 4: Consume the model

This phase will happen outside of this notebook: the UI will pull our trained model, process users' video uploads, feed them through the model, and display the results.