# Processing the sign language dataset

Import the basics here. As it happens, we only need numpy.

In [1]:
import numpy as np

Here we import the packages required for the get_data() function.

In [None]:
import os
import re

The data in the sign language dataset comes in the form of a multivariate time series. There are 22 variables (pitch, yaw, roll, etc. for each of the two hands). The sampling rate is estimated to be in the region of 100Hz. Each sign varies in length, so the number of recorded data points varies from sign to sign.

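The get_data() function below iterates over the subdirectories of `tctodd`, loads every `.tsd` file it finds, and takes the label of each sign from the file name (the part before the trailing `-<number>`).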

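Sklearn requires that all data is in the form [n × m], where n is the number of observations and m is the number of features. Because of the varied length of each sign, the number of features is different from sign to sign. To fix this, the interpolate() function below stretches each of the 22 variables to a fixed 135 points, so every sign ends up with 22 × 135 = 2970 features.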



In [None]:
def get_data():
    """Load every sign recording under ``./tctodd`` into a feature matrix.

    Walks each subdirectory of ``<cwd>/tctodd``, reads every file whose name
    ends in ``tsd`` as a tab-delimited numeric table, flattens it column by
    column (``order='F'``), and passes it through ``interpolate()`` so that
    every recording becomes one fixed-length row of 2970 values.

    The label of a recording is the part of its file name that precedes the
    first ``-<digit>``.

    Directories and files are visited in sorted order so the row order is
    reproducible between runs.

    Returns:
        tuple: ``(x, y)`` where ``x`` is a float array of shape
        ``(n_recordings, 2970)`` and ``y`` is a 1-D array holding the label
        of each row. When no recordings are found, ``x`` has shape
        ``(0, 2970)`` and ``y`` is empty.

    Raises:
        ValueError: if a ``tsd`` file name does not contain ``-<digit>`` and
            therefore yields no label.
    """
    n_features = 2970  # 22 variables x 135 interpolated points per variable
    root = os.path.join(os.getcwd(), 'tctodd')

    # Collect rows in lists and build the arrays once at the end; growing an
    # array with np.append copies the whole buffer on every call.
    rows = []
    labels = []

    for i in sorted(os.listdir(root)):
        subdir = os.path.join(root, i)
        # Skips stray files such as '.DS_Store' that sit next to the folders.
        if not os.path.isdir(subdir):
            continue
        for fn in sorted(os.listdir(subdir)):
            if not fn.endswith("tsd"):
                continue
            match = re.search('(.+?)-[0-9]', fn)
            if match is None:
                raise ValueError('cannot extract a sign label from file name %r' % fn)
            data_current = np.loadtxt(os.path.join(subdir, fn), delimiter='\t').ravel(order='F')
            rows.append(interpolate(data_current))
            labels.append(match.group(1))

    if not rows:
        return np.empty((0, n_features)), np.empty(0)
    return np.concatenate(rows).reshape(-1, n_features), np.array(labels)

In [None]:
def interpolate(data, n_vars=22, n_points=135):
    """Resample a flattened multivariate recording to a fixed length.

    ``data`` is treated as ``n_vars`` equally long series laid end to end
    (variable 0 first, then variable 1, ...). Each series is stretched or
    shrunk to exactly ``n_points`` values:

    * If a series has at most ``n_points`` samples, the samples are dropped,
      in their original order, onto randomly chosen *distinct* positions of
      an ``n_points``-long scaffold, and the gaps are filled by linear
      interpolation. Gaps before the first or after the last sample take the
      nearest sample's value.
    * If a series has more than ``n_points`` samples, ``n_points`` of them
      are kept at random, preserving their order.

    Positions are drawn with ``np.random``, so the result depends on the
    global NumPy random state.

    Args:
        data: 1-D array-like of length ``n_vars * samples_per_variable``.
            Trailing elements that do not fill a whole sample for every
            variable are ignored.
        n_vars: number of variables concatenated in ``data``.
        n_points: number of output points per variable.

    Returns:
        numpy.ndarray: 1-D float array of length ``n_vars * n_points``.

    Raises:
        ValueError: if ``data`` holds fewer than ``n_vars`` elements.
    """
    data = np.asarray(data, dtype=float)
    # Floor division keeps this an int under Python 3 so it can be used in
    # slices and as a sample count.
    dim = data.shape[0] // n_vars
    if dim == 0:
        raise ValueError('data must contain at least one sample for each of the %d variables' % n_vars)

    holder = []
    for i in range(n_vars):
        current_var = data[i * dim:(i + 1) * dim]

        if dim <= n_points:
            scaffold = np.full(n_points, np.nan)
            # replace=False: every sample needs its own slot, otherwise
            # duplicate positions silently overwrite earlier samples.
            randpts = np.sort(np.random.choice(n_points, dim, replace=False))
            scaffold[randpts] = current_var
        else:
            keep = np.sort(np.random.choice(dim, n_points, replace=False))
            scaffold = current_var[keep]

        # Fill every remaining NaN from the surrounding known points.
        nans = np.isnan(scaffold)
        if nans.any():
            known = ~nans
            scaffold[nans] = np.interp(np.flatnonzero(nans), np.flatnonzero(known), scaffold[known])

        holder.append(scaffold)

    return np.concatenate(holder)

In [None]:
# train_test_split lives in sklearn.model_selection; the old
# sklearn.cross_validation module was removed in scikit-learn 0.20.
try:
    from sklearn.model_selection import train_test_split
except ImportError:  # very old scikit-learn without model_selection
    from sklearn.cross_validation import train_test_split

In [None]:
# Build the feature matrix (one row per recording) and the matching labels.
x,y = get_data()
# Hold out part of the data for evaluation. No split size or random_state is
# passed, so the library defaults apply -- NOTE(review): the split is
# presumably different on every run; confirm whether that is intended.
X_train, X_test, y_train, y_test = train_test_split(x, y)

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Learn the scaling parameters from the training split only, then apply
# that same fitted scaler to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)  # scaled training features
X_test_transformed = scaler.transform(X_test)  # test features, scaled with the training fit

In [None]:
from sklearn.decomposition import PCA

In [None]:
pca = PCA(100) # instantiate the model, keeping 100 components
pca.fit(X_train_scaled) # Fit the model to the scaled training data
# Project the *scaled* splits: the components were learned from scaled
# features, so the raw X_train / X_test would be on a different scale.
X_train_PCA = pca.transform(X_train_scaled) # transform the training data
X_test_PCA = pca.transform(X_test_transformed) # transform the testing data

In [None]:
from sklearn.svm import LinearSVC

In [None]:
svm = LinearSVC(C=0.1)
svm.fit(X_train_PCA, y_train)
# Print both scores explicitly: a notebook cell only displays its last bare
# expression, so the training score was previously computed and thrown away.
print('train accuracy: %.3f' % svm.score(X_train_PCA, y_train))
print('test accuracy: %.3f' % svm.score(X_test_PCA, y_test))