# Interpolator script

This serves to fill missing values into the sign lang data since each example is not the same size

In [36]:
import numpy as np
import re
import random
from scipy.interpolate import interp1d
import pickle
from sklearn.preprocessing import MinMaxScaler

In [3]:
with open('./data/pickled/x.pickle', 'rb') as f:
    x = pickle.load(f)

with open('./data/pickled/y.pickle', 'rb') as f:
    y = pickle.load(f)

In [4]:
# This is the version of the interpolator that uses the numpy.interp linear interpolator

def npinterpolate(data):
    
    def nanfinder(x):
        return np.isnan(x), lambda z: z.nonzero()[0]
    
    #np.random.seed(26)
    x_interp = []
    
    
    for row in range(data.shape[0]): # For each of 2565 rows
        holder = []
        dim = np.int(np.count_nonzero(~np.isnan(data[row]))/22) # return the number of observations in this example
        for i in range(1,23): # For each of 22 variables
            scaffold = np.array([np.nan]*136) # (136,) array of nans
            current_var = data[row,i*dim-dim:i*dim] # return array with current var of size (dim,)
            randpts = np.sort(random.sample(range(136),dim)) # return array of random points of size (dim,)
            scaffold[randpts] = current_var[:]
            nans,x = nanfinder(scaffold)
            scaffold[nans] = np.interp(x(nans),x(~nans),scaffold[~nans])
            holder.extend(scaffold.tolist()) # long list of values, 1D
        x_interp.extend(holder) # even longer list of values, 1D
    return np.array(x_interp).reshape(data.shape)

In [5]:
# This is the version of the interpolator script that uses the scipy interpolator with kind = slinear

def scipyinterpolate(data):
    x_interp = []
    for row in range(data.shape[0]): # For each of 2565 rows
        holder = []
        dim = np.int(np.count_nonzero(~np.isnan(data[row]))/22)
        for i in range(1,23): # For each of 22 variables
            scaffold = np.full(2992,np.nan)
            old_x = data[row,i*dim-dim:i*dim]
            old_y= [1]+np.sort(random.sample(range(2,136),dim-2)).tolist()+[136]
            new_y = [x for x in range(2,136) if np.in1d(range(2,136),old_y).tolist()[range(2,136).index(x)] == False]
            f = interp1d(old_y,old_x,kind=1)
            new_x = f(new_y)
            scaffold[[x-1 for x in old_y]] = old_x[:]
            scaffold[[x-1 for x in new_y]] = new_x[:]
            holder.extend(scaffold) # even longer list of values, 1D
        x_interp.extend(holder)
    return np.array(x_interp).reshape(data.shape)

In [7]:
x_interp = npinterpolate(x)
#x_scipyinterp = scipyinterpolate(x)

In [11]:
x_interp = MinMaxScaler().fit_transform(x_interp)

In [12]:
with open('./data/pickled/x_interp.pickle', 'wb') as f:
    pickle.dump(x_interp, f, pickle.HIGHEST_PROTOCOL)
    
#np.save('data/x_npinterp.npy',x_npinterp)
#np.save('x_scipyinterp.npy',x_scipyinterp)

In [8]:
# Find min / max / mean sizes of data

size = []
for i in range(x.shape[0]):
    size.append(np.count_nonzero(~np.isnan(x[i]))/22)
np.max(size)

136.0

In [41]:
# Without  ravel / padding with np.nans / etc
# Just gives the raw values. Each would be a different size.
# Initialize empty numpy arrays
x = np.array(0)
y = np.array(0)
data = np.empty(0)

for root, dirs, files in os.walk('data/tctodd'):
    for i,fn in enumerate(files):
        if fn.endswith("tsd"):
            vals = np.loadtxt(os.path.join(root,fn),delimiter='\t') # m features X 22 meausrements / ravel makes it one line
            x[i] = vals
            y[i] = re.search('(.+?)-[0-9]',fn).group(1)

IndexError: list assignment index out of range

In [38]:
data.shape

(3232878,)

In [23]:
y_normed = {k: (data-np.mean(data))/np.std(data) for k, data in my_time_series.items()}

maxlength = max(my_time_series)

x_interped = {k: np.interp(np.linspace(0, 1, maxlength), np.linspace(0, 1, k), data) for k, data in y_normed.items()}

[plot(data) for data in x_interped.values()]

array([ 1125.07194785  +0.j        ,    29.38434614 -38.32643892j,
        -130.68569558-350.2620491j , ...,    63.63042299-103.20299939j,
        -130.68569558+350.2620491j ,    29.38434614 +38.32643892j])

In [24]:
np.linspace(0, 1, 10)

array([ 0.        ,  0.11111111,  0.22222222,  0.33333333,  0.44444444,
        0.55555556,  0.66666667,  0.77777778,  0.88888889,  1.        ])