In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.utils.rnn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.nn.utils import weight_norm
import cv2
import os
import itertools
import math
from tqdm import tqdm
from datetime import datetime
from sklearn import metrics

In [None]:
"""
There are 96 feature files in each fold, but they seem identical - different features for the same frame.
There are 100 label files of each kind (gesture, right tool and left tool), 300 in total.
There are 100 videos from each angle (frame collections): top view and side view, 200 in total.
There are 100 kinematics files and 100 kinematics numpy files.


There is a concern about using an architecture without pretraining, but the features we are supplied with might overcome that.

* It is possible that the kinematics are not aligned;
  we need clean data and a data loader.
"""

In [None]:
# Root directory of the dataset; every path below is built relative to it.
DATA_ROOT = "/datashare/APAS"

# Number of consecutive (under-sampled) frames grouped into one clip.
clipSize = 30
# Mini-batch size handed to the DataLoaders.
BATCH_SIZE = 20
# Step used by underSample(); 1 keeps every frame.
UNDER_SAMPLE_RATE = 1

In [None]:
# Human-readable gesture name -> gesture code.
gestures = {"no gesture" : "G0",
"needle passing" : "G1",
"pull the suture": "G2",
"Instrument tie": "G3",
"Lay the knot" : "G4",
"Cut the suture" :"G5"}

# Human-readable tool name -> tool code.
tool_usage ={"no tool in hand" : "T0",
 "needle_driver": "T1",
 "forceps": "T2",
 "scissors":"T3"}


def _read_code_mapping(path):
    """Read a mapping file into a ``{int id: code}`` dictionary.

    Each non-blank line must hold exactly two whitespace-separated tokens:
    an integer id followed by a code. Blank lines (e.g. a trailing newline
    at the end of the file) are skipped instead of crashing the unpacking.

    Raises:
        ValueError: if a non-blank line does not have exactly two tokens or
            its first token is not an integer.
    """
    mapping = {}
    with open(path, 'r') as f:
        for line in f:
            tokens = line.split()
            if not tokens:
                continue
            number, code = tokens
            mapping[int(number)] = code
    return mapping


gestures_mapping = _read_code_mapping(os.path.join(DATA_ROOT, "mapping_gestures.txt"))
tools_mapping = _read_code_mapping(os.path.join(DATA_ROOT, "mapping_tools.txt"))

# Make every dictionary bidirectional: after this the dictionaries map the
# G#/T# code to either its name or its number, and back.
gestures = {**gestures, **{code: name for name, code in gestures.items()}}
tool_usage = {**tool_usage, **{code: name for name, code in tool_usage.items()}}
gestures_mapping = {**gestures_mapping, **{code: number for number, code in gestures_mapping.items()}}
tools_mapping = {**tools_mapping, **{code: number for number, code in tools_mapping.items()}}

gestures_mapping

We will load the list of videos that have features.
We then load all the fold definitions and check that every file they list exists.

In [None]:
# Video name (file name up to the first '.') -> feature file name in fold0.
usedVideos = {}
for vid in os.listdir(os.path.join(DATA_ROOT, "features", "fold0")):
    usedVideos[vid.split('.')[0]] = vid
print(len(usedVideos))


def _fold_entry_to_video(line):
    """Turn one line of a fold file into a video name.

    Surrounding whitespace is removed and a literal ".csv" suffix is dropped.
    ``str.strip('.csv\n')`` must not be used for this: its argument is a set
    of characters, so it would also eat leading/trailing 'c', 's', 'v' and
    '.' characters that belong to the video name itself.
    """
    name = line.strip()
    if name.endswith(".csv"):
        name = name[:-len(".csv")]
    return name


# Fold name (file name up to the first '.') -> list of video names in the fold.
folds = {}
for fold in os.listdir(os.path.join(DATA_ROOT,"folds")):
    with open(os.path.join(DATA_ROOT,"folds", fold)) as f:
        # Blank lines are ignored so they do not become empty video names.
        vids = [_fold_entry_to_video(x) for x in f if x.strip()]
    for vid in vids:
        # Report fold entries that have no feature file.
        if vid not in usedVideos:
            print(vid)
    folds[fold.split('.')[0]] = vids

#folds

We will load the gestures labels and check for validity

In [None]:
# Names of videos that failed at least one sanity check (may hold duplicates).
badVideos = []
# Video name -> list of [start, end, gesture code] segments (all strings).
goldGestures = {}
gestures_dir = os.path.join(DATA_ROOT, "transcriptions_gestures")
for vid in os.listdir(gestures_dir):
    with open(os.path.join(gestures_dir, vid)) as f:
        # Whitespace-tolerant parsing: blank lines are skipped and trailing
        # spaces / '\r' no longer break the three-way unpacking below or
        # corrupt the label token.
        gesturesSeq = [x.split() for x in f if x.strip()]
    vidName = vid.split('.')[0]
    if vidName not in usedVideos:
        print(vidName, " should not be used, missing features")
        badVideos.append(vidName)
    for i, segment in enumerate(gesturesSeq):
        start, end, ges = segment
        if ges not in gestures:
            print(vidName, segment, "not known gesture")
            badVideos.append(vidName)
        if i == 0:
            # The first segment has to begin at frame 0.
            if int(start) != 0:
                print(vidName, segment, "doesnt start from 0")
                badVideos.append(vidName)
        else:
            if int(end) <= int(start):
                print(vidName, segment, "bad order")
                badVideos.append(vidName)
            # Every later segment has to start right after the previous one ends.
            if int(start) != int(gesturesSeq[i-1][1]) + 1:
                print(vidName, segment, "part missing")
                badVideos.append(vidName)
    goldGestures[vidName] = gesturesSeq

print(badVideos)
#goldGestures

We will do the same for the tool labels.

In [None]:
# Video name -> list of [start, end, tool code] segments for the left hand.
goldLeft = {}
left_tools_dir = os.path.join(DATA_ROOT, "transcriptions_tools_left_new")
for vid in os.listdir(left_tools_dir):
    with open(os.path.join(left_tools_dir, vid)) as f:
        # Whitespace-tolerant parsing: blank lines are skipped and trailing
        # spaces / '\r' no longer break the three-way unpacking below or
        # corrupt the label token.
        leftSeq = [x.split() for x in f if x.strip()]
    vidName = vid.split('.')[0]
    if vidName not in usedVideos:
        print(vidName, " should not be used, missing features")
        badVideos.append(vidName)
    for i, segment in enumerate(leftSeq):
        start, end, tool = segment
        if tool not in tool_usage:
            # These are tool labels, so the message says "tool" (it used to
            # say "gesture", copied from the gesture check).
            print(vidName, segment, "not known tool")
            badVideos.append(vidName)
        if i == 0:
            # The first segment has to begin at frame 0.
            if int(start) != 0:
                print(vidName, segment, "doesnt start from 0")
                badVideos.append(vidName)
        else:
            if int(end) <= int(start):
                print(vidName, segment, "bad order")
                badVideos.append(vidName)
            # Every later segment has to start right after the previous one ends.
            if int(start) != int(leftSeq[i-1][1]) + 1:
                print(vidName, segment, "part missing")
                badVideos.append(vidName)
    goldLeft[vidName] = leftSeq
print(badVideos)
#goldLeft

In [None]:
# Video name -> list of [start, end, tool code] segments for the right hand.
goldRight = {}
right_tools_dir = os.path.join(DATA_ROOT, "transcriptions_tools_right_new")
for vid in os.listdir(right_tools_dir):
    with open(os.path.join(right_tools_dir, vid)) as f:
        # Whitespace-tolerant parsing: blank lines are skipped and trailing
        # spaces / '\r' no longer break the three-way unpacking below or
        # corrupt the label token.
        rightSeq = [x.split() for x in f if x.strip()]
    vidName = vid.split('.')[0]
    if vidName not in usedVideos:
        print(vidName, " should not be used, missing features")
        badVideos.append(vidName)
    for i, segment in enumerate(rightSeq):
        start, end, tool = segment
        if tool not in tool_usage:
            # These are tool labels, so the message says "tool" (it used to
            # say "gesture", copied from the gesture check).
            print(vidName, segment, "not known tool")
            badVideos.append(vidName)
        if i == 0:
            # The first segment has to begin at frame 0.
            if int(start) != 0:
                print(vidName, segment, "doesnt start from 0")
                badVideos.append(vidName)
        else:
            if int(end) <= int(start):
                print(vidName, segment, "bad order")
                badVideos.append(vidName)
            # Every later segment has to start right after the previous one ends.
            if int(start) != int(rightSeq[i-1][1]) + 1:
                print(vidName, segment, "part missing")
                badVideos.append(vidName)
    goldRight[vidName] = rightSeq

print(badVideos)
#goldRight

We will check that there are kinematics for each video.

In [None]:
# Index the kinematics files by video name (file name up to the first '.').
kinematicsFiles = {}
for vid in os.listdir(os.path.join(DATA_ROOT,"kinematics_npy")):
    vidName = vid.partition('.')[0]
    # Print kinematics files whose video has no feature file.
    if not (vidName in usedVideos):
        print(vidName)
    kinematicsFiles[vidName] = vid
print(len(kinematicsFiles))


We will ignore the videos with broken labels

In [None]:
# Collapse duplicates: the validation cells append a video once per failed check.
badVideos = set(badVideos)

def delete_multiple_keys(dict, keysToDelete):
    """Remove every key listed in ``keysToDelete`` from ``dict`` in place.

    Keys that are not present in the dictionary are silently ignored.
    Nothing is returned.
    """
    for unwanted in keysToDelete:
        # pop() with a default never raises for a missing key.
        dict.pop(unwanted, None)

# Drop every flagged video from all per-video dictionaries.
delete_multiple_keys(usedVideos, badVideos)
delete_multiple_keys(goldGestures, badVideos)
delete_multiple_keys(goldLeft, badVideos)
delete_multiple_keys(goldRight, badVideos)
delete_multiple_keys(kinematicsFiles, badVideos)

# Fold sizes before filtering.
print([len(folds[fold]) for fold in folds.keys()])

# Rebuild each fold list rather than calling list.remove() while iterating
# over the same list: removing during iteration skips the element that
# follows every removed one, so consecutive bad videos would survive.
for fold in folds.keys():
    folds[fold] = [vid for vid in folds[fold] if vid not in badVideos]

print(len(usedVideos))
print(len(goldGestures))
print(len(goldLeft))
print(len(goldRight))
print(len(kinematicsFiles))
# Fold sizes after filtering.
print([len(folds[fold]) for fold in folds.keys()])

We will load the features and kinematics and check synchronization between them and the labels.
We will also transform the labels into per-frame vectors.

In [None]:
# Load the visual features of one fold: video name -> array loaded from disk.
features = {}
foldName = "fold0"
foldFeatures = {}
for vid in usedVideos.keys():
    foldFeatures[vid] = np.load(os.path.join(DATA_ROOT, "features", foldName, usedVideos[vid]))
features[foldName] = foldFeatures
print(len(foldFeatures))

# Load the kinematics through the kinematicsFiles index (video name ->
# kinematics file name) instead of re-using the *feature* file name, which
# only worked as long as both directories used identical file names.
kinematicFeatures = {}
for vid in usedVideos.keys():
    kinematicFeatures[vid] = np.load(os.path.join(DATA_ROOT, "kinematics_npy", kinematicsFiles[vid]))
print(len(kinematicFeatures))

def segmentsToVector(segments):
    """Expand labelled segments into one label per frame.

    Args:
        segments: iterable of ``(start, end, label)`` triples; ``start`` and
            ``end`` are inclusive frame indices (anything ``int()`` accepts).

    Returns:
        A list holding ``label`` repeated ``end - start + 1`` times for each
        segment, in order. An empty ``segments`` yields an empty list (it
        previously raised because ``end`` was never bound).

    Prints a warning when the resulting length disagrees with the last
    segment's end index, i.e. the segments do not tile ``0..end`` exactly.
    """
    vec = []
    last_end = None
    for start, end, label in segments:
        first, last_end = int(start), int(end)
        # A negative repeat count gives [], like the empty range it replaces.
        vec.extend([label] * (last_end - first + 1))
    if last_end is not None and last_end + 1 != len(vec):
        print("opps", last_end, len(vec))
    return vec

# Per-frame label vectors for every kept video, one dictionary per label kind.
goldGesturesVectors, goldRightVectors, goldLeftVectors = {}, {}, {}
for vid in usedVideos:
    # Same per-video order as before: gestures, right hand, left hand.
    for label_vectors, label_segments in (
        (goldGesturesVectors, goldGestures),
        (goldRightVectors, goldRight),
        (goldLeftVectors, goldLeft),
    ):
        label_vectors[vid] = segmentsToVector(label_segments[vid])

#print(goldGesturesVectors[vid])
#print(goldRightVectors[vid])

In [None]:
# One row per video:
# [name, feature shape, last gesture end, last right-tool end, last left-tool end, kinematics shape]
vid_features_gest_right_left_kinematics = [
    [
        name,
        np.shape(feats),
        int(goldGestures[name][-1][1]),
        int(goldRight[name][-1][1]),
        int(goldLeft[name][-1][1]),
        np.shape(kinematicFeatures[name]),
    ]
    for name, feats in foldFeatures.items()
]
# For each row show: the row itself, whether all five lengths agree, whether
# the features and the three label tracks agree, and the max-min spread.
[
    [row, len(set(lengths)) == 1, len(set(lengths[:4])) == 1, max(lengths) - min(lengths)]
    for row, lengths in (
        (entry, (entry[1][1], entry[2], entry[3], entry[4], entry[5][1]))
        for entry in vid_features_gest_right_left_kinematics
    )
]

As we can see, the data is still problematic: there is no good alignment between the features, the labels and the kinematics.
We will take the shortest length for each video and cut the data accordingly.
We will also under-sample the data:

In [None]:
# (video name, smallest of the five per-video lengths) for every video.
shortestLengths = [
    (name, min(feat_shape[1], ges_end, right_end, left_end, kin_shape[1]))
    for name, feat_shape, ges_end, right_end, left_end, kin_shape in vid_features_gest_right_left_kinematics
]

def underSample(array):
    """Return a list with every UNDER_SAMPLE_RATE-th element of ``array``, starting at index 0."""
    kept = []
    for position in range(0, len(array), UNDER_SAMPLE_RATE):
        kept.append(array[position])
    return kept

#shortestLengths
# Cut every per-video sequence to the common length, then under-sample it.
# NOTE: this cell rewrites its inputs in place (including the transposes), so
# it must be run exactly once per kernel session.
for i, (vid, shortestLength) in enumerate(shortestLengths):
    goldGesturesVectors[vid] = underSample(goldGesturesVectors[vid][0:shortestLength])
    goldRightVectors[vid] = underSample(goldRightVectors[vid][0:shortestLength])
    goldLeftVectors[vid] = underSample(goldLeftVectors[vid][0:shortestLength])
    # Transposed so the first axis is indexed by frame before slicing.
    kinematicFeatures[vid] = underSample(np.transpose(kinematicFeatures[vid])[0:shortestLength])
    foldFeatures[vid] = underSample(np.transpose(foldFeatures[vid])[0:shortestLength])
    # Record the number of frames actually kept. The former
    # int(shortestLength/UNDER_SAMPLE_RATE) + 1 over-counted by one whenever
    # shortestLength was divisible by the rate (always, for a rate of 1).
    shortestLengths[i] = (vid, len(goldGesturesVectors[vid]))

#print(np.shape(goldGesturesVectors[vid]))
#print(np.shape(goldRightVectors[vid]))
#print(np.shape(goldLeftVectors[vid]))
#print(np.shape(kinematicFeatures[vid]))
#print(np.shape(foldFeatures[vid]))

In [None]:
# Display the (video name, recorded sequence length) pairs.
shortestLengths

Feature Engineering
We will calculate the movement of each hand, the distance between the hands, and how those distances change between consecutive frames.

In [None]:
# There are 15 measurements per sensor, there are 6 sensors, 3 for each hand
# we will guess the first is left hand the second is the right hand - confirmed by looking at the data, right hand moves and sensors 4,5,6 change position
# NOTE(review): the comment above says 15 measurements per sensor, but the
# stride actually used to slice a frame is 6 - confirm against the layout of
# the kinematics files.
NUMBER_OF_MEASUREMENTS = 6 # should be 15
# Number of leading values taken from each sensor's slice.
NUMBER_OF_INTERESTING_MEASUREMENTS = 6 # 3 pos and 3 angles
# The first LOCATION_AXIS values of a sensor slice are used for distances.
LOCATION_AXIS = 3
NUMBER_OF_SENSORS_PER_HAND = 3

# Video name -> list of per-frame feature rows, one list per hand.
rightKinematicsFeatures = {}
leftKinematicsFeatures = {}

def flattArray(array):
    """Flatten a sequence of arrays/scalars into a single ``(1, N)`` row.

    Args:
        array: non-empty sequence whose items are NumPy arrays or scalars.

    Returns:
        A 2-D NumPy array of shape ``(1, N)`` holding all values of all
        items, in order.

    Raises:
        IndexError: if ``array`` is empty (same exception type as before).

    All pieces are joined with one ``np.concatenate`` call instead of growing
    the result pairwise, which was quadratic in the number of items. A
    sequence holding a single scalar is now supported as well (it used to
    fail on ``len()`` of the scalar).
    """
    if len(array) == 0:
        raise IndexError("flattArray() needs at least one element")
    # np.ravel turns scalars into 1-element vectors and flattens any array.
    flat = np.concatenate([np.ravel(piece) for piece in array])
    return np.reshape(flat, (1, flat.size))

# Build, for every video and every frame, one (1, N) feature row per hand.
# The order in which values are appended to `left` / `right` defines the
# feature layout, so the statements below must not be reordered.
for vid in usedVideos.keys():
    print(vid)
    rightKinematicsFeatures[vid] = []
    leftKinematicsFeatures[vid] = []
    #print(np.shape(kinematicFeatures[vid]))
    for i in range(0, len(kinematicFeatures[vid])):
        # Slice frame i into one array per sensor (2 * NUMBER_OF_SENSORS_PER_HAND sensors).
        samples = [np.array(kinematicFeatures[vid][i][x*NUMBER_OF_MEASUREMENTS:(x*NUMBER_OF_MEASUREMENTS) + NUMBER_OF_INTERESTING_MEASUREMENTS]) for x in range(0,NUMBER_OF_SENSORS_PER_HAND*2)]
        # The first half of the sensors is treated as the left hand, the second half as the right hand.
        left = samples[0:NUMBER_OF_SENSORS_PER_HAND]
        right = samples[NUMBER_OF_SENSORS_PER_HAND:NUMBER_OF_SENSORS_PER_HAND*2]
        # adding difference between the sensor on each hand:
        # adding distance as well
        # Distance between the first and the last sensor of the hand.
        left.append(np.linalg.norm(left[0][0:LOCATION_AXIS] - left[-1][0:LOCATION_AXIS]))
        right.append(np.linalg.norm(right[0][0:LOCATION_AXIS] - right[-1][0:LOCATION_AXIS]))
        # For each pair of neighbouring sensors: element-wise difference, then distance.
        # Indices j and j-1 still address the raw sensor arrays because new
        # values are only ever appended at the end of the list.
        for j in range(1,NUMBER_OF_SENSORS_PER_HAND):
            left.append(left[j] - left[j-1])
            left.append(np.linalg.norm(left[j][0:LOCATION_AXIS] - left[j-1][0:LOCATION_AXIS]))
            right.append(right[j] - right[j-1])
            right.append(np.linalg.norm(right[j][0:LOCATION_AXIS] - right[j-1][0:LOCATION_AXIS]))
        # adding difference between the hands:
        # adding distance as well
        # The same left-minus-right difference (and its norm) is stored for both hands.
        for j in range(0,NUMBER_OF_SENSORS_PER_HAND):
            diff = left[j] - right[j]
            left.append(diff)
            right.append(diff)
            dist= np.linalg.norm(diff[0:LOCATION_AXIS])
            left.append(dist)
            right.append(dist)
        # converting to numpy array
        left = flattArray(left)
        right = flattArray(right)
        # adding speeds - difference from last sample
        if i == 0:
            # No previous frame: the speed half of the row is all zeros.
            left = np.concatenate((left,np.zeros((np.shape(left)))), axis=1)
            right = np.concatenate((right,np.zeros((np.shape(right)))), axis=1)
        else:
            # Subtract the first np.size(left) values of the previous frame's
            # stored row, i.e. its non-speed half.
            left = np.concatenate((left,left - leftKinematicsFeatures[vid][i-1][0][0:np.size(left)]), axis=1)
            right = np.concatenate((right,right - rightKinematicsFeatures[vid][i-1][0][0:np.size(right)]), axis=1)
        leftKinematicsFeatures[vid].append(left)
        rightKinematicsFeatures[vid].append(right)
        # Experiment
        # option 1: adding 1 0 to left and 0 1 to right for training a single network for both hands
        # option 2: training two networks one for right and one for left hand
        #print(np.shape(samples))
        #print(vid,samples)


In [None]:
# Shape of the right-hand feature list of one video; `vid` is whatever value
# the preceding loop over usedVideos left behind.
np.shape(rightKinematicsFeatures[vid])

Now we have a usable data set. We will transform it into a data set of clips, each containing 30 frames and a single label per clip.
We will create a data set class; the class needs to implement __getitem__(self, index),
meaning we can hold the data however we want but need to return a sample by index.
A list of tuples of np.array is a good idea.

In [None]:
# Video name -> list of clips. Each clip is a list of six slices in the order
# baseDataset indexes them: gestures, right tool, left tool, left kinematics,
# right kinematics, visual features.
TheDataSet = {}
for vid, shortestLength in shortestLengths:
    print(np.shape(leftKinematicsFeatures[vid]))
    print(np.shape(rightKinematicsFeatures[vid]))
    streams = [
        goldGesturesVectors[vid],
        goldRightVectors[vid],
        goldLeftVectors[vid],
        leftKinematicsFeatures[vid],
        rightKinematicsFeatures[vid],
        foldFeatures[vid],
    ]
    # Derive the clip boundaries from the real sequence lengths instead of
    # the recorded `shortestLength`, so every emitted clip is guaranteed to
    # hold exactly clipSize frames in all six streams and a final clip that
    # fits exactly is kept.
    usable = min(len(stream) for stream in streams)
    TheDataSet[vid] = [
        [stream[i:i + clipSize] for stream in streams]
        for i in range(0, usable - clipSize + 1, clipSize)
    ]
    #print(len(TheDataSet[vid]))

# Split the clips by video according to the chosen fold definition.
trainData, validData, testData = {}, {}, {}
foldNumber = 0
for vid in usedVideos:
    if vid in folds["test " + str(foldNumber)]:
        bucket = testData
    elif vid in folds["valid " + str(foldNumber)]:
        bucket = validData
    else:
        # Everything that is neither test nor validation is used for training.
        bucket = trainData
    bucket[vid] = TheDataSet[vid]
for bucket in (trainData, validData, testData):
    print(len(bucket))

In [None]:
class baseDataset(Dataset):
    """Index-addressable collection of the clips of several videos.

    ``targets`` maps a video name to a list of clips; each clip is a sequence
    whose positions 0..5 hold: gesture labels, right-tool labels, left-tool
    labels, left kinematics, right kinematics and visual features. The clips
    of all videos are concatenated, in ``targets`` order, into ``self.data``.
    """

    def __init__(self, targets, clip_size=30):
        self.targets = targets
        self.clip_size = clip_size
        # Positions of the individual streams inside a clip.
        (self.y_gesture, self.y_right, self.y_left,
         self.leftKinematics, self.rightKinematics, self.features) = range(6)
        self.data = [clip for vid in targets.keys() for clip in targets[vid]]

    def __getitem__(self, index):
        """Return the clip at ``index`` as a 6-tuple of NumPy arrays.

        The three label streams are each reduced to a single value: the most
        frequent label of the clip, translated through ``gestures_mapping``
        or ``tools_mapping``.
        """
        clip = self.data[index]
        gesture = self.convertLabels(clip[self.y_gesture], gestures_mapping)
        right_tool = self.convertLabels(clip[self.y_right], tools_mapping)
        left_tool = self.convertLabels(clip[self.y_left], tools_mapping)
        return (
            np.array(gesture),
            np.array(right_tool),
            np.array(left_tool),
            np.array(clip[self.leftKinematics]),
            np.array(clip[self.rightKinematics]),
            np.array(clip[self.features]),
        )

    def __len__(self):
        """Total number of clips over all videos."""
        return len(self.data)

    def convertLabels(self, vec, map):
        """Return ``map[label]`` for the most frequent label in ``vec``.

        Ties go to the label that appears first in ``vec``. For an empty
        ``vec`` the lookup key is ``None``.
        """
        tally = {}
        for label in vec:
            tally[label] = tally.get(label, 0) + 1
        # max() returns the first maximal key in insertion order.
        winner = max(tally, key=tally.get) if tally else None
        return map[winner]


In [None]:
# Wrap each split in a baseDataset.
trainDataset, validDataset, testDataset = (
    baseDataset(split_clips, clipSize) for split_clips in (trainData, validData, testData)
)

# Smoke test: print the shape of every element of one training sample.
t = trainDataset[20]
for i in t:
    print(np.shape(i))

Working with the gpu

In [None]:
# Show whether PyTorch can see a CUDA device.
torch.cuda.is_available()

In [None]:
# Only the training data is shuffled. Validation and test loaders keep a
# fixed order so their batch composition - and therefore the mean of the
# per-batch losses used for checkpoint selection - is identical on every run.
trainLoader = DataLoader(trainDataset, batch_size=BATCH_SIZE, shuffle=True)
validLoader = DataLoader(validDataset, batch_size=BATCH_SIZE, shuffle=False)
testLoader = DataLoader(testDataset, batch_size=BATCH_SIZE, shuffle=False)

In [None]:
class TinyModel(torch.nn.Module):
    """Two-layer fully connected classifier over a flattened clip.

    Args:
        in_features: size of the flattened input vector. Defaults to
            ``108 * clipSize`` (the value that used to be hard-coded).
        hidden_features: width of the hidden layer (default 200).
        num_classes: number of output classes (default 4).

    ``forward`` returns raw, unnormalised class scores (logits) of shape
    ``(batch, num_classes)``. ``torch.nn.CrossEntropyLoss`` applies
    log-softmax itself, so feeding it softmax outputs normalises twice and
    flattens the gradients. Use ``self.softmax(model(x))`` when
    probabilities are needed; the arg-max of the output is unchanged.
    """

    def __init__(self, in_features=None, hidden_features=200, num_classes=4):
        super(TinyModel, self).__init__()
        if in_features is None:
            in_features = 108 * clipSize

        self.linear1 = torch.nn.Linear(in_features, hidden_features)
        self.activation = torch.nn.ReLU()
        self.linear2 = torch.nn.Linear(hidden_features, num_classes)
        # Kept for callers that want probabilities; dim is explicit because
        # the implicit-dimension form is deprecated.
        self.softmax = torch.nn.Softmax(dim=1)

    def forward(self, x):
        """Map ``(batch, in_features)`` inputs to ``(batch, num_classes)`` logits."""
        x = self.linear1(x)
        x = self.activation(x)
        return self.linear2(x)

# One independent classifier instance per hand.
leftTinymodel = TinyModel()
rightTinymodel = TinyModel()

In [None]:
class TemporalBlock(nn.Module):
    """Two weight-normalised 1-D convolutions, each followed by ReLU and dropout.

    Both convolutions share ``kernel_size``, ``stride`` and ``dilation`` and
    use a fixed padding of 1, so the output length generally differs from
    the input length.
    """

    def __init__(self, n_inputs, n_outputs, kernel_size, stride, dilation, dropout=0.2):
        super().__init__()
        shared = dict(stride=stride, dilation=dilation, padding=1)
        self.conv1 = weight_norm(nn.Conv1d(n_inputs, n_outputs, kernel_size, **shared))
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout)
        self.conv2 = weight_norm(nn.Conv1d(n_outputs, n_outputs, kernel_size, **shared))
        # ReLU and dropout hold no parameters, so one instance of each is
        # shared by both stages and by the final activation in forward().
        self.net = nn.Sequential(
            self.conv1, self.relu, self.dropout,
            self.conv2, self.relu, self.dropout,
        )
        self.init_weights()

    def init_weights(self):
        """Draw the ``weight`` tensors of both convolutions from N(0, 0.01)."""
        for conv in (self.conv1, self.conv2):
            conv.weight.data.normal_(0, 0.01)

    def forward(self, x):
        """Run the block and apply a final ReLU to its output."""
        return self.relu(self.net(x))


class TemporalConvNet(nn.Module):
    """Stack of TemporalBlocks whose dilation doubles at every level.

    Level ``i`` uses dilation ``2 ** i`` and maps ``num_channels[i-1]``
    channels (``num_inputs`` for the first level) to ``num_channels[i]``.
    """

    def __init__(self, num_inputs, num_channels, kernel_size=2, dropout=0.2):
        super().__init__()
        # Input width of every level: the network input, then each previous output.
        widths_in = [num_inputs] + list(num_channels[:-1])
        blocks = [
            TemporalBlock(width_in, width_out, kernel_size, stride=1, dilation=2 ** level, dropout=dropout)
            for level, (width_in, width_out) in enumerate(zip(widths_in, num_channels))
        ]
        self.network = nn.Sequential(*blocks)

    def forward(self, x):
        """Pass ``x`` through all blocks in order."""
        return self.network(x)

class SingleMaTcnKinModel(torch.nn.Module):
    """TCN feature extractor followed by a two-layer classification head.

    The head expects the flattened TCN output to have ``FLAT_FEATURES``
    values per sample.
    """

    # Size of the flattened TCN output fed to the first linear layer.
    # presumably 64 channels x 16 time steps for 30-frame clips with the
    # padding/dilation used by the blocks - TODO confirm.
    FLAT_FEATURES = 1024

    def __init__(self):
        super(SingleMaTcnKinModel, self).__init__()
        self.tcn = TemporalConvNet( num_inputs=108, num_channels=[128,64,32, 64], kernel_size=2, dropout=0.2)
        self.linear1 = torch.nn.Linear(self.FLAT_FEATURES, 200)
        self.activation = torch.nn.ReLU()
        self.linear2 = torch.nn.Linear(200, 4)
        # dim is explicit: the implicit-dimension form is deprecated and
        # warns at call time. For the 2-D input used here it is the same
        # dimension the implicit form selected.
        # NOTE(review): if this model is trained with CrossEntropyLoss, the
        # softmax should be dropped from `fullnet`, since that loss expects
        # raw logits - confirm against the training code.
        self.softmax = torch.nn.Softmax(dim=1)
        self.fullnet = nn.Sequential(self.linear1, self.activation, self.linear2, self.softmax)

    def forward(self, x):
        """Return class probabilities of shape ``(batch, 4)``."""
        x = self.tcn(x)
        # Flatten everything but the batch axis; the linear layer rejects any
        # size other than FLAT_FEATURES.
        return self.fullnet(torch.reshape(x, (x.shape[0], -1)))

In [None]:
# Instantiate the TCN-based model (the training cells shown in this notebook use leftTinymodel instead).
leftModel = SingleMaTcnKinModel()

In [None]:
# Cross-entropy loss over the class scores.
loss_fn = torch.nn.CrossEntropyLoss()
# Adam with default hyper-parameters; only leftTinymodel's parameters are optimised.
optimizer = torch.optim.Adam(leftTinymodel.parameters())

In [None]:
def train_one_epoch(epoch_index, report_every=100):
    """Train ``leftTinymodel`` for one pass over ``trainLoader``.

    Uses the module-level ``optimizer`` and ``loss_fn``. The left-hand
    kinematics of each batch are flattened to ``(batch, 108 * clipSize)``
    and classified against the left-tool label.

    Args:
        epoch_index: zero-based epoch number, used only for the printed
            global step.
        report_every: number of batches per reporting window (default 100,
            the previously hard-coded value).

    Returns:
        The mean batch loss of the last completed reporting window. If the
        epoch had fewer than ``report_every`` batches, the mean over all
        batches seen is returned (it used to be 0.0, which made the
        training loss look perfect). 0.0 is returned only for an empty
        loader.
    """
    running_loss = 0.
    last_loss = 0.
    batches_in_window = 0
    window_completed = False

    # enumerate() gives the batch index needed for intra-epoch reporting.
    for i, data in enumerate(trainLoader):
        # Every batch is (gesture, right tool, left tool, left kin, right kin, features).
        ges, right, left, leftkin, rightkin, fea = data
        # Zero the gradients for every batch.
        optimizer.zero_grad()

        # Make predictions for this batch
        outputs = leftTinymodel(torch.reshape(leftkin, (np.shape(leftkin)[0], 108 * clipSize)).to(torch.float32))
        # Compute the loss and its gradients
        loss = loss_fn(outputs, left.to(torch.long))
        loss.backward()

        # Adjust learning weights
        optimizer.step()

        # Gather data and report
        running_loss += loss.item()
        batches_in_window += 1
        if i % report_every == report_every - 1:
            last_loss = running_loss / report_every # loss per batch
            print('  batch {} loss: {}'.format(i + 1, last_loss))
            tb_x = epoch_index * len(trainLoader) + i + 1
            print('Loss/train', last_loss, tb_x)
            running_loss = 0.
            batches_in_window = 0
            window_completed = True

    # Short epoch: no window was ever completed, so report what was seen.
    if not window_completed and batches_in_window:
        last_loss = running_loss / batches_in_window

    return last_loss

In [None]:
# Run-level state is initialised here so that more epochs can be added to the same run.
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
epoch_number = 0

EPOCHS = 500

best_vloss = 1_000_000.

# torch.save() does not create missing directories.
os.makedirs("models", exist_ok=True)

for epoch in range(EPOCHS):
    print('EPOCH {}:'.format(epoch_number + 1))

    # Make sure gradient tracking is on, and do a pass over the data
    leftTinymodel.train(True)
    avg_loss = train_one_epoch(epoch_number)

    # Switch to evaluation mode for reporting
    leftTinymodel.train(False)

    running_vloss = 0.0
    num_vbatches = 0
    # No gradients are needed for validation. Without no_grad() every batch
    # loss kept its autograd graph alive while being summed as a tensor.
    with torch.no_grad():
        for vdata in validLoader:
            vges, vright, vleft, vleftkin, vrightkin, vfea = vdata
            voutputs = leftTinymodel(torch.reshape(vleftkin, (np.shape(vleftkin)[0], 108 * clipSize)).to(torch.float32))
            vloss = loss_fn(voutputs, vleft.to(torch.long))
            running_vloss += vloss.item()
            num_vbatches += 1

    # With an empty validation loader the loss is +inf, so no checkpoint is
    # saved (a stale or undefined batch index was used before).
    avg_vloss = running_vloss / num_vbatches if num_vbatches else float("inf")
    print('LOSS train {} valid {}'.format(avg_loss, avg_vloss))

    # Log the running loss averaged per batch
    # for both training and validation
    print('Training vs. Validation Loss',
                    { 'Training' : avg_loss, 'Validation' : avg_vloss },
                    epoch_number + 1)

    # Track best performance, and save the model's state
    if avg_vloss < best_vloss:
        best_vloss = avg_vloss
        model_path = 'model_{}_{}'.format(timestamp, epoch_number)
        torch.save(leftTinymodel.state_dict(), os.path.join("models", model_path))

    epoch_number += 1

We would like to analyze the performance of our models

saved_model = TinyModel()
saved_model.load_state_dict(torch.load(PATH))

In [None]:
def compareToolUsage(predictions, groundTruth):
    """Print and return classification scores of ``predictions`` against ``groundTruth``.

    Returns:
        ``[precision, recall, f1_micro, f1_macro, accuracy]`` where precision
        and recall use weighted averaging.
    """
    print("segmentation performance")
    # (caption, scorer) pairs, in the order they are printed and returned.
    scorers = (
        ("weighted precision: ", lambda: metrics.precision_score(groundTruth, predictions, average="weighted")),
        ("weighted recall: ", lambda: metrics.recall_score(groundTruth, predictions, average="weighted")),
        ("f1_micro: ", lambda: metrics.f1_score(groundTruth, predictions, average="micro")),
        ("f1_macro: ", lambda: metrics.f1_score(groundTruth, predictions, average="macro")),
        ("accuracy: ", lambda: metrics.accuracy_score(groundTruth, predictions)),
    )
    scores = []
    for caption, compute in scorers:
        value = compute()
        print(caption, value)
        scores.append(value)
    return scores

