<a href="https://colab.research.google.com/github/doyeon16/test/blob/master/deep_learning_hand_gesture_recognition_01_data_download%2Bpytorch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Hand Gesture Datasets

This notebook/colab downloads hand gesture datasets and stores them as pickle files.

For more information, please take a look at: https://github.com/guillaumephd/deep_learning_hand_gesture_recognition/issues/1

Note: a (very minor) bug present in the third step described in that issue, when calling `train_test_split` function, is corrected in this notebook.

In [0]:
# ---------------------------------------------------------
# Step 1. Download hand gesture datasets
# ---------------------------------------------------------
download_shrec_17 = False
download_dhg = True
download_online_dhg = False

# --------------------------
# SHREC2017 dataset
#     http://www-rech.telecom-lille.fr/shrec2017-hand/
# --------------------------
if download_shrec_17:
  !mkdir dataset_shrec2017
  !wget http://www-rech.telecom-lille.fr/shrec2017-hand/HandGestureDataset_SHREC2017.tar.gz -O SHREC2017.tar.gz
  !tar -xzf SHREC2017.tar.gz -C dataset_shrec2017
# --------------------------
# DHG14/28 dataset
#     http://www-rech.telecom-lille.fr/DHGdataset/
# --------------------------
# Note: you should register on http://www-rech.telecom-lille.fr/DHGdataset/ before downloading the dataset
if download_dhg:
  !mkdir dataset_dhg1428
  !wget http://www-rech.telecom-lille.fr/DHGdataset/DHG2016.zip
  !unzip DHG2016.zip -d dataset_dhg1428
# --------------------------
# Online DHG dataset
#     http://www-rech.telecom-lille.fr/shrec2017-hand/
# --------------------------
if download_online_dhg:
  !mkdir dataset_onlinedhg
  !wget http://www-rech.telecom-lille.fr/shrec2017-hand/OnlineDHG.zip
  !unzip OnlineDHG.zip -d dataset_onlinedhg

In [0]:
# ---------------------------------------------------------
# Step 2. Utils
# ---------------------------------------------------------
import glob
import numpy
import pickle
from scipy import ndimage as ndimage
from sklearn.model_selection import train_test_split


def resize_gestures(input_gestures, final_length=100):
    """
    Resize the time series by interpolating them to the same length

    Input:
        - input_gestures: list of numpy.ndarray tensors.
              Each tensor represents a single gesture.
              Gestures can have variable durations.
              Each tensor has a shape: (duration, channels)
              where duration is the duration of the individual gesture
                    channels = 44 = 2 * 22 if recorded in 2D and
                    channels = 66 = 3 * 22 if recorded in 3D 
    Output:
        - output_gestures: one numpy.ndarray tensor.
              The output tensor has a shape: (records, final_length, channels)
              where records = len(input_gestures)
                   final_length is the common duration of all gestures
                   channels is the same as above 
    """
    # please use python3. if you still use python2, important note: redefine the classic division operator / by importing it from the __future__ module
    output_gestures = numpy.array([numpy.array([ndimage.zoom(x_i.T[j], final_length / len(x_i), mode='reflect') for j in range(numpy.size(x_i, 1))]).T for x_i in input_gestures])
    return output_gestures


def load_gestures(dataset='dhg', root='/content/dataset_dhg1428', version_x='3D', version_y='both', resize_gesture_to_length=100):
    """
    Get the 3D or 2D pose gestures sequences, and their associated labels.

    Ouput:
        - a tuple of (gestures, labels) or (gestures, labels_14, labels_28)
              where gestures is either a numpy.ndarray tensor or
                                       a list of numpy.ndarray tensors,
                                       depending on if the gestures have been resized or not.
              Each tensor represents a single gesture.
              Gestures can have variable durations.
              Each tensor has a shape: (duration, channels) where channels is either 44 (= 2 * 22) or 66 (=3 * 22)
    """

    # SHREC 2017 (on Google Colab):
    # root = '/content/dataset_shrec2017/HandGestureDataset_SHREC2017'
    # DHG 14/28 (on Google Colab):
    # root = '/content/dataset_dhg1428'
    if dataset == 'dhg':
      assert 'dataset_dhg' in root
    if dataset == 'shrec':
      assert 'dataset_shrec' in root
    
    if version_x == '3D':
        if dataset == 'dhg':
            pattern = root + '/gesture_*/finger_*/subject_*/essai_*/skeleton_world.txt'
        elif dataset == 'shrec':
            pattern = root + '/gesture_*/finger_*/subject_*/essai_*/skeletons_world.txt'
    else:
        if dataset == 'dhg':
            pattern = root + '/gesture_*/finger_*/subject_*/essai_*/skeleton_image.txt'
        elif dataset == 'shrec':
            pattern = root + '/gesture_*/finger_*/subject_*/essai_*/skeletons_image.txt'

    gestures_filenames = sorted(glob.glob(pattern))
    gestures = [numpy.genfromtxt(f) for f in gestures_filenames]
    if resize_gesture_to_length is not None:
        gestures = resize_gestures(gestures, final_length=resize_gesture_to_length)

    labels_14 = [int(filename.split('/')[-5].split('_')[1]) for filename in gestures_filenames]
    labels_28 = [int(filename.split('/')[-4].split('_')[1]) for filename in gestures_filenames]
    labels_28 = [labels_14[idx_gesture] if n_fingers_used == 1 else 14 + labels_14[idx_gesture] for idx_gesture, n_fingers_used in enumerate(labels_28)]

    if version_y == '14' or version_y == 14:
        return gestures, labels_14
    elif version_y == '28' or version_y == 28:
        return gestures, labels_28
    elif version_y == 'both':
        return gestures, labels_14, labels_28


def write_data(data, filepath):
    """Save the dataset to a file. Note: data is a dict with keys 'x_train', ..."""
    with open(filepath, 'wb') as output_file:
        pickle.dump(data, output_file)


def load_data(filepath='./shrec_data.pckl'):
    """
    Returns hand gesture sequences (X) and their associated labels (Y).
    Each sequence has two different labels.
    The first label  Y describes the gesture class out of 14 possible gestures (e.g. swiping your hand to the right).
    The second label Y describes the gesture class out of 28 possible gestures (e.g. swiping your hand to the right with your index pointed, or not pointed).
    """
    file = open(filepath, 'rb')
    data = pickle.load(file, encoding='latin1')  # <<---- change to 'latin1' to 'utf8' if the data does not load
    file.close()
    return data['x_train'], data['x_test'], data['y_train_14'], data['y_train_28'], data['y_test_14'], data['y_test_28']

In [0]:
# ---------------------------------------------------------
# Step 3. Save the dataset(s) you need
# ---------------------------------------------------------
# Example: 3D version of the SHREC17 and DHG gesture datasets, with gestures resized to 100 timesteps
gestures, labels_14, labels_28 = load_gestures(dataset='dhg',
                                               root='/content/dataset_dhg1428',
                                               version_x='3D',
                                               version_y='both',
                                               resize_gesture_to_length=100)
# Split the dataset into train and test sets if you want:
x_train, x_test, y_train_14, y_train_28, y_test_14, y_test_28 = train_test_split(gestures, labels_14, labels_28, test_size=0.30)

# Save the dataset
data = {
    'x_train': x_train,
    'x_test': x_test,
    'y_train_14': y_train_14,
    'y_train_28': y_train_28,
    'y_test_14': y_test_14,
    'y_test_28': y_test_28
}
write_data(data, filepath='dhg_data.pckl')

In [28]:
print(gestures.shape)
print(len(labels_14))
print(len(labels_28))
print(type(x_train))
print(type(y_train_14))
print(type(y_train_28))
print(x_train.shape)
print(x_test.shape)
print(len(y_train_14))
#print(y_test_14.shape)
print(len(y_train_28))
#print(y_test_28.shape)

(2800, 100, 66)
2800
2800
<class 'numpy.ndarray'>
<class 'list'>
<class 'list'>
(1960, 100, 66)
(840, 100, 66)
1960
840


In [29]:
# ---------------------------------------------------------
# Step 4. Optional: copy to google drive, if you're in a Google Colab
# ---------------------------------------------------------
try:

  # Connect Google Colab instance to Google Drive
  from google.colab import drive
  drive.mount('/gdrive')

  # Save your dataset on Google Drive
  !cp dhg_data.pckl /gdrive/My\ Drive/dhg_data.pckl

  # Load your dataset from Google Drive
  # !cp /gdrive/My\ Drive/dhg_data.pckl dhg_data.pckl

except:
  print("You're not in a Google Colab!")

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).


In [0]:
# ---------------------------------------------------------
# Step 5. Use the dataset(s)
# ---------------------------------------------------------
x_train, x_test, y_train_14, y_train_28, y_test_14, y_test_28 = load_data('dhg_data.pckl')

In [31]:
print(x_train.shape)
print(x_test.shape)
print(len(y_train_14))
#print(y_test_14.shape)
print(len(y_train_28))
#print(y_test_28.shape)

(1960, 100, 66)
(840, 100, 66)
1960
840


In [0]:
from __future__ import unicode_literals, print_function, division
import sys
if sys.version_info.major < 3:
    print('You are using python 2, but you should rather use python 3.')
    print('    If you still want to use python 2, ensure you import:')
    print('    >> from __future__ import unicode_literals, print_function, division')

import numpy
import pickle
import torch
import itertools
import time
import math
from torch.utils.data import Dataset, DataLoader

In [0]:
# (bonus) plot acc with tensorboard
#   Command to start tensorboard if installed (requires tensorflow):
#   $  tensorboard --logdir ./runs
try:
    from tensorboardX import SummaryWriter
except:
    # tensorboardX is not installed, just fail silently
    class SummaryWriter():
        def __init__(self):
            pass
        def add_scalar(self, tag, scalar_value, global_step=None, walltime=None):
            pass

In [0]:
n_classes = 14
duration = 100
n_channels = 66
learning_rate = 1e-3

In [0]:
#create model


class HandGestureNet(torch.nn.Module):
    """
    [Devineau et al., 2018] Deep Learning for Hand Gesture Recognition on Skeletal Data

    Summary
    -------
        Deep Learning Model for Hand Gesture classification using pose data only (no need for RGBD)
        The model computes a succession of [convolutions and pooling] over time independently on each of the 66 (= 22 * 3) sequence channels.
        Each of these computations are actually done at two different resolutions, that are later merged by concatenation
        with the (pooled) original sequence channel.
        Finally, a multi-layer perceptron merges all of the processed channels and outputs a classification.
    
    TL;DR:
    ------
        input ------------------------------------------------> split into n_channels channels [channel_i]
            channel_i ----------------------------------------> 3x [conv/pool/dropout] low_resolution_i
            channel_i ----------------------------------------> 3x [conv/pool/dropout] high_resolution_i
            channel_i ----------------------------------------> pooled_i
            low_resolution_i, high_resolution_i, pooled_i ----> output_channel_i
        MLP(n_channels x [output_channel_i]) -------------------------> classification

    Article / PDF:
    --------------
        https://ieeexplore.ieee.org/document/8373818

    Please cite:
    ------------
        @inproceedings{devineau2018deep,
            title={Deep learning for hand gesture recognition on skeletal data},
            author={Devineau, Guillaume and Moutarde, Fabien and Xi, Wang and Yang, Jie},
            booktitle={2018 13th IEEE International Conference on Automatic Face \& Gesture Recognition (FG 2018)},
            pages={106--113},
            year={2018},
            organization={IEEE}
        }
    """
    
    def __init__(self, n_channels=66, n_classes=14, dropout_probability=0.2):

        super(HandGestureNet, self).__init__()
        
        self.n_channels = n_channels
        self.n_classes = n_classes
        self.dropout_probability = dropout_probability

        # Layers ----------------------------------------------
        self.all_conv_high = torch.nn.ModuleList([torch.nn.Sequential(
            torch.nn.Conv1d(in_channels=1, out_channels=8, kernel_size=7, padding=3),
            torch.nn.ReLU(),
            torch.nn.AvgPool1d(2),

            torch.nn.Conv1d(in_channels=8, out_channels=4, kernel_size=7, padding=3),
            torch.nn.ReLU(),
            torch.nn.AvgPool1d(2),

            torch.nn.Conv1d(in_channels=4, out_channels=4, kernel_size=7, padding=3),
            torch.nn.ReLU(),
            torch.nn.Dropout(p=self.dropout_probability),
            torch.nn.AvgPool1d(2)
        ) for joint in range(n_channels)])

        self.all_conv_low = torch.nn.ModuleList([torch.nn.Sequential(
            torch.nn.Conv1d(in_channels=1, out_channels=8, kernel_size=3, padding=1),
            torch.nn.ReLU(),
            torch.nn.AvgPool1d(2),

            torch.nn.Conv1d(in_channels=8, out_channels=4, kernel_size=3, padding=1),
            torch.nn.ReLU(),
            torch.nn.AvgPool1d(2),

            torch.nn.Conv1d(in_channels=4, out_channels=4, kernel_size=3, padding=1),
            torch.nn.ReLU(),
            torch.nn.Dropout(p=self.dropout_probability),
            torch.nn.AvgPool1d(2)
        ) for joint in range(n_channels)])

        self.all_residual = torch.nn.ModuleList([torch.nn.Sequential(
            torch.nn.AvgPool1d(2),
            torch.nn.AvgPool1d(2),
            torch.nn.AvgPool1d(2)
        ) for joint in range(n_channels)])

        self.fc = torch.nn.Sequential(
            torch.nn.Linear(in_features=9 * n_channels * 12, out_features=1936),  # <-- 12: depends of the sequences lengths (cf. below)
            torch.nn.ReLU(),
            torch.nn.Linear(in_features=1936, out_features=n_classes)
        )

        # Initialization --------------------------------------
        # Xavier init
        for module in itertools.chain(self.all_conv_high, self.all_conv_low, self.all_residual):
            for layer in module:
                if layer.__class__.__name__ == "Conv1d":
                    torch.nn.init.xavier_uniform_(layer.weight, gain=torch.nn.init.calculate_gain('relu'))
                    torch.nn.init.constant_(layer.bias, 0.1)

        for layer in self.fc:
            if layer.__class__.__name__ == "Linear":
                torch.nn.init.xavier_uniform_(layer.weight, gain=torch.nn.init.calculate_gain('relu'))
                torch.nn.init.constant_(layer.bias, 0.1)

    def forward(self, input):
        """
        This function performs the actual computations of the network for a forward pass.

        Arguments
        ---------
            input: a tensor of gestures of shape (batch_size, duration, n_channels)
                   (where n_channels = 3 * n_joints for 3D pose data)
        """

        # Work on each channel separately
        all_features = []

        for channel in range(0, self.n_channels):
            input_channel = input[:, :, channel]

            # Add a dummy (spatial) dimension for the time convolutions
            # Conv1D format : (batch_size, n_feature_maps, duration)
            input_channel = input_channel.unsqueeze(1)

            high = self.all_conv_high[channel](input_channel)
            low = self.all_conv_low[channel](input_channel)
            ap_residual = self.all_residual[channel](input_channel)

            # Time convolutions are concatenated along the feature maps axis
            output_channel = torch.cat([
                high,
                low,
                ap_residual
            ], dim=1)
            all_features.append(output_channel)

        # Concatenate along the feature maps axis
        all_features = torch.cat(all_features, dim=1)
        
        # Flatten for the Linear layers
        all_features = all_features.view(-1, 9 * self.n_channels * 12)  # <-- 12: depends of the initial sequence length (100).
        # If you have shorter/longer sequences, you probably do NOT even need to modify the modify the network architecture:
        # resampling your input gesture from T timesteps to 100 timesteps will (surprisingly) probably actually work as well!

        # Fully-Connected Layers
        output = self.fc(all_features)

        return output

In [0]:
# -------------
# Network instantiation
# -------------
model = HandGestureNet(n_channels=n_channels, n_classes=n_classes)

In [37]:
#data load
# We load a gesture dataset:
#
#   x.shape should be (dataset_size, duration, channel)
#   y.shape should be (dataset_size, 1)


# If you want to use the DHG dataset, go to: https://colab.research.google.com/drive/1ggYG1XRpJ50gVgJqT_uoI257bspNogHj
use_dhg_dataset = True

if use_dhg_dataset:
    # ------------------------
    # DHG Dataset
    # ------------------------
    try:
        # Connect Google Colab instance to Google Drive
        from google.colab import drive
        drive.mount('/gdrive')
        # Load the dataset (you already have created in the other notebook) from Google Drive
        !cp /gdrive/My\ Drive/dhg_data.pckl dhg_data.pckl
    except:
        print("You're not in a Google Colab!")

    def load_data(filepath='./shrec_data.pckl'):
        """
        Returns hand gesture sequences (X) and their associated labels (Y).
        Each sequence has two different labels.
        The first label  Y describes the gesture class out of 14 possible gestures (e.g. swiping your hand to the right).
        The second label Y describes the gesture class out of 28 possible gestures (e.g. swiping your hand to the right with your index pointed, or not pointed).
        """
        file = open(filepath, 'rb')
        data = pickle.load(file, encoding='latin1')  # <<---- change to 'latin1' to 'utf8' if the data does not load
        file.close()
        return data['x_train'], data['x_test'], data['y_train_14'], data['y_train_28'], data['y_test_14'], data['y_test_28']

    x_train, x_test, y_train_14, y_train_28, y_test_14, y_test_28 = load_data('dhg_data.pckl')
    y_train_14, y_test_14 = numpy.array(y_train_14), numpy.array(y_test_14)
    y_train_28, y_test_28 = numpy.array(y_train_28), numpy.array(y_test_28)
    if n_classes == 14:
        y_train = y_train_14
        y_test = y_test_14
    elif n_classes == 28:
        y_train = y_train_28
        y_test = y_test_28

else:
    # ------------------------
    # Custom Dataset
    # ------------------------
    # On the left bar of this colaboratory notebook there is a section called "Files".
    # Upload your files there and use a path like "/content/each_file_you_just_uploaded" to load your data
    # 
    # For now, for the sake of demonstration purposes, let's create fake data
    x_train = numpy.random.randn(2000, duration, n_channels)
    y_train = numpy.random.random_integers(n_classes, size=2000)

    x_test = numpy.random.randn(1000, duration, n_channels)
    y_test = numpy.random.random_integers(n_classes, size=1000)

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).


In [0]:
class GestureDataset(Dataset):
 
    def __init__(self, x, y):
        self.x = x
        self.y = y
 
    def __len__(self):
        return len(self.x)
 
    def __getitem__(self, i):
        return self.x[i], self.y[i]

In [0]:
# ------------------------
# Create pytorch datasets and dataloaders:
# ------------------------
# Convert from numpy to torch format
x_train, x_test = torch.from_numpy(x_train), torch.from_numpy(x_test)
y_train, y_test = torch.from_numpy(y_train), torch.from_numpy(y_test)

# Ensure the label values are between 0 and n_classes-1
if y_train.min() > 0:
  y_train = y_train - 1
if y_test.min() > 0:
  y_test = y_test - 1

# Ensure the data type is correct
x_train, x_test = x_train.float(), x_test.float()
y_train, y_test = y_train.long(), y_test.long()

# Create the datasets
train_dataset = GestureDataset(x=x_train, y=y_train)
test_dataset = GestureDataset(x=x_test, y=y_test)

# Pytorch dataloaders are used to group dataset items into batches
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=4)
test_dataloader  = DataLoader(test_dataset,  batch_size=32, shuffle=True, num_workers=4)

In [0]:
def time_since(since):
    now = time.time()
    s = now - since
    m = math.floor(s / 60)
    s -= m * 60
    return '{:02d}m {:02d}s'.format(int(m), int(s))


def get_accuracy(model, x, y_ref):
    """Get the accuracy of the pytorch model on a batch"""
    acc = 0.
    model.eval()
    with torch.no_grad():
        predicted = model(x)
        _, predicted = predicted.max(dim=1)
        acc = 1.0 * (predicted == y_ref).sum().item() / y_ref.shape[0]
    return acc

In [0]:
#training model

# -----------------------------------------------------
# Loss function & Optimizer
# -----------------------------------------------------
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)

In [0]:
# -------------
# Training
# -------------


def train(model, criterion, optimizer, dataloader,
          x_train, y_train, x_test, y_test,
          force_cpu=False, num_epochs=5):
    
    # use a GPU (for speed) if you have one
    device = torch.device("cuda") if torch.cuda.is_available() and not force_cpu else torch.device("cpu")
    model = model#.to(device)
    #x_train, y_train = x_train.to(device), y_train.to(device)
    #x_test, y_test = x_test.to(device), y_test.to(device)

    # (bonus) log accuracy values to visualize them in tensorboard:
    writer = SummaryWriter()
    
    # Training starting time
    start = time.time()

    print('[INFO] Started to train the model.')
    print('Training the model on {}.'.format('GPU' if device == torch.device('cuda') else 'CPU'))
    
    for ep in range(num_epochs):

        # Ensure we're still in training mode
        model.train()

        current_loss = 0.0

        for idx_batch, batch in enumerate(dataloader):

            # Move data to GPU, if available
            x, y = batch
            #x, y = x.to(device), y.to(device)

            # zero the gradient parameters
            optimizer.zero_grad()

            # forward
            y_pred = model(x)

            # backward + optimize
            # backward
            loss = criterion(y_pred, y)
            loss.backward()
            # optimize
            optimizer.step()
            # for an easy access
            current_loss += loss.item()
        
        train_acc = get_accuracy(model, x_train, y_train)
        test_acc = get_accuracy(model, x_test, y_test)
        
        writer.add_scalar('data/accuracy_train', train_acc, ep)
        writer.add_scalar('data/accuracy_test', test_acc, ep)
        print('Epoch #{:03d} | Time elapsed : {} | Loss : {:.4e} | Accuracy_train : {:.2f}% | Accuracy_test : {:.2f}% '.format(
                ep + 1, time_since(start), current_loss, 100 * train_acc, 100 * test_acc))

    print('[INFO] Finished training the model. Total time : {}.'.format(time_since(start)))

In [43]:
print(x_train.shape)
print(x_test.shape)
print(type(y_train))
print(len(y_train))
print(type(y_test))
print(len(y_test))

torch.Size([1960, 100, 66])
torch.Size([840, 100, 66])
<class 'torch.Tensor'>
1960
<class 'torch.Tensor'>
1960


In [47]:
# Please adjust the training epochs count, and the other hyperparams (lr, dropout, ...), for a non-overfitted training according to your own needs.
# tip: use tensorboard to display the accuracy (see cells above for tensorboard usage)

num_epochs = 20

train(model=model, criterion=criterion, optimizer=optimizer, dataloader=train_dataloader,
      x_train=x_train, y_train=y_train, x_test=x_test, y_test=y_test, num_epochs=num_epochs)

[INFO] Started to train the model.
Training the model on CPU.


RuntimeError: ignored

In [0]:
#get trained model

# Reminder: first redefine/load the HandGestureNet class before you use it, if you want to use it elsewhere
model = HandGestureNet(n_channels=n_channels, n_classes=n_classes)
model.load_state_dict(torch.load('gesture_pretrained_model.pt'))
model.eval()

# make predictions
with torch.no_grad():
    demo_gesture_batch = torch.randn(32, duration, n_channels)
    predictions = model(demo_gesture_batch)
    _, predictions = predictions.max(dim=1)
    print("Predicted gesture classes: {}".format(predictions.tolist()))