Date: 2016-08-23

The goals of this notebook are to:
- work out code to convert the existing Matlab data structures into pandas DataFrames.
- make the data structures more 'flat' (*denormalize* in database terms) so that they are easier to read.

This first cell converts the *tasks* data to a single pandas DataFrame.

In [15]:
%matplotlib inline
%reload_ext autoreload
%autoreload 3
import os
import sys
import collections
import scipy.io
import numpy as np
import matplotlib.pyplot as plt  
import seaborn as sns
import pandas as pd

sys.path.append('../src/')
import data_filter as df

# Setup
# Field names are given as an ordered list: a set literal has no guaranteed
# iteration order, so the positional field order of the namedtuple could
# differ from one interpreter run to the next.
Animal = collections.namedtuple('Animal', ['directory', 'short_name'])
num_days = 8
days = range(1, num_days + 1)
animals = {'HPa': Animal(directory='HPa_direct', short_name='HPa')}

# Get all epochs
# One (animal, day) key per loaded task structure, in the same order as
# `tasks`, so the day number stays correct when several animals are listed.
animal_days = [(animal, day) for animal in animals for day in days]
tasks = [(df.get_data_structure(animals[animal], day, 'task', 'task'), animal)
         for animal, day in animal_days]
epochs = [(epoch, animal) for day, animal in tasks for epoch in day]  # flatten

# Convert into pandas dataframes
# Every field of each epoch struct becomes a column, except 'linearcoord'.
# The comparison is an exact match: `name not in 'linearcoord'` would be a
# substring test and would also drop fields named e.g. 'linear' or 'coord'.
ndata = [{name: epoch[0][name][0][0][0]
          for name in epoch[0].dtype.names
          if name != 'linearcoord'}
         for epoch in epochs]
df1 = pd.DataFrame(ndata)

# Index columns (animal, day, epoch) for each row of df1. The day comes from
# the (animal, day) key rather than from the position in `tasks`, which would
# keep counting past `num_days` for a second animal.
day_epoch_ind = [{'animal': animal, 'day': day, 'epoch_ind': epoch_ind + 1}
                 for (animal, day), (task, task_animal) in zip(animal_days, tasks)
                 for epoch_ind, epoch in enumerate(task)]

df2 = pd.DataFrame(day_epoch_ind)

# `join_axes` was removed from pd.concat; reindexing on df1's index is the
# documented replacement and keeps the same rows.
epochs_df = (pd
             .concat([df2, df1], axis=1)
             .reindex(df1.index)
             .set_index(['animal', 'day', 'epoch_ind'])
             .assign(environment=lambda x: pd.Categorical(x['environment']))
             .assign(type=lambda x: pd.Categorical(x['type']))
            )

# DataFrame.info() prints its report itself and returns None, so it is called
# directly instead of being wrapped in print().
epochs_df.info()
print('\n')
print(epochs_df)
print('\n')

# Check accessing by Multi-dimensional index (animal HPa, days 6 and 8)
print(epochs_df.loc[(['HPa'], [6, 8]), :])
print('\n')

# The environment filter is evaluated on the already-selected rows (callable
# indexer) so the boolean mask always has the same index as the frame it masks.
print(epochs_df
      .loc[(['HPa'], [6, 8]), :]
      .loc[lambda selected: selected.environment == 'wtr1'])
print('\n')


# Get back the index values
epoch_ind = tuple(epochs_df
                  .loc[(['HPa'], [6, 8]), :]
                  .loc[lambda selected: selected.environment == 'wtr1'].index)
print(epoch_ind)

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 42 entries, (HPa, 1, 1) to (HPa, 8, 5)
Data columns (total 2 columns):
environment    33 non-null category
type           42 non-null category
dtypes: category(2)
memory usage: 484.0+ bytes
None


                     environment   type
animal day epoch_ind                   
HPa    1   1            presleep  sleep
           2                 lin    run
           3                 NaN   rest
           4                wtr1    run
           5                 NaN   rest
           6                wtr1    run
           7           postsleep  sleep
       2   1            presleep  sleep
           2                wtr1    run
           3                 NaN   rest
           4                wtr1    run
           5           postsleep  sleep
       3   1            presleep  sleep
           2                wtr1    run
           3                 NaN   rest
           4                wtr1    run
           5           postsleep 

Now do the same thing with the tetrode info. This is a little trickier, because each epoch has its own set of tetrodes. It might be better to have one DataFrame for each day-epoch (a dictionary of DataFrames).

In [16]:
def get_tetrode_info(animal):
    '''Build the path of an animal's Matlab tetinfo file.

    The path has the form
    ``<parent of the current working directory>/Raw-Data/<directory>/<short_name>tetinfo.mat``.
    Only the string is assembled here; the file itself is not opened.

    Parameters
    ----------
    animal : object with ``directory`` and ``short_name`` attributes

    Returns
    -------
    str
        Path of the tetinfo ``.mat`` file for ``animal``.
    '''
    # Absolute path of the directory one level above the working directory
    parent_dir = os.path.abspath(os.path.pardir)
    raw_data_dir = '{working_dir}/Raw-Data'.format(working_dir=parent_dir)

    path_template = '{data_dir}/{animal.directory}/{animal.short_name}tetinfo.mat'
    return path_template.format(data_dir=raw_data_dir, animal=animal)

def convert_to_dict(struct_array):
    '''Turn a numpy structured array into a plain ``{field name: value}`` dict.

    For every field in ``struct_array.dtype.names`` the element at position
    ``[0, 0]`` is taken, its first entry is selected, and length-one
    dimensions are removed with ``np.squeeze``.

    Parameters
    ----------
    struct_array : numpy structured array
        Presumably a Matlab struct as returned by ``scipy.io.loadmat``
        (that is how the notebook calls this function).

    Returns
    -------
    dict
        One squeezed value per field name.
    '''
    fields = {}
    for field_name in struct_array.dtype.names:
        cell = struct_array[field_name][0, 0]
        fields[field_name] = np.squeeze(cell[0])
    return fields

# Path of each animal's tetinfo file, paired with the animal's key
tetrode_file_names = [(get_tetrode_info(animals[animal]), animal) for animal in animals]

# Raw loadmat results get their own name, so that `tetrode_info` only ever
# refers to the flattened per-tetrode records built below.
tetrode_mat = [(scipy.io.loadmat(file_name), animal)
               for file_name, animal in tetrode_file_names]

# Walk the nested 'tetinfo' arrays (the loop names suggest a
# day -> epoch -> tetrode nesting) and emit one record per tetrode:
# (field dict, animal, day, epoch, tetrode number), all numbered from 1.
tetrode_info = [(convert_to_dict(tetrode[0]), animal, day_ind + 1, epoch_ind + 1, tetrode_ind + 1)
                for info, animal in tetrode_mat
                for day_ind, day in enumerate(info['tetinfo'].T)
                for epoch_ind, epoch in enumerate(day[0].T)
                for tetrode_ind, tetrode in enumerate(epoch[0].T)]

tet_df = pd.DataFrame([info[0] for info in tetrode_info])
day_epoch_ind = [{'animal': info[1], 'day': info[2], 'epoch_ind': info[3], 'tetrode_number': info[4]}
                 for info in tetrode_info]
day_epoch_df = pd.DataFrame(day_epoch_ind)

# `join_axes` was removed from pd.concat; reindexing on day_epoch_df's index
# is the documented replacement and keeps the same rows.
tetrode_df = (pd
              .concat([day_epoch_df, tet_df], axis=1)
              .reindex(day_epoch_df.index)
              .set_index(['animal', 'day', 'epoch_ind', 'tetrode_number'])  # set multi-index to identify rows
              .assign(numcells=lambda x: x['numcells'].astype(int))  # convert numcells to integer type
              .assign(depth=lambda x: x['depth'].astype(int))  # convert depth to integer type
              .assign(area=lambda x: pd.Categorical(x['area']))  # convert area to categorical type
             )

# DataFrame.info() prints its report itself and returns None, so it is called
# directly instead of being wrapped in print().
tetrode_df.info()
print('\n')

# Check accessing by Multi-dimensional index

# Get all electrodes from HPa, day 8, epoch 2,4
print(tetrode_df.loc[(['HPa'], [8], [2, 4]), :])
print('\n')

# Now only electrodes with cells recorded on them.
# The column axis (`, :`) is spelled out so the tuple is unambiguously a
# row (MultiIndex) selector and not a (row, column, ...) indexer.
print(tetrode_df
      .loc[tetrode_df.numcells > 0]
      .loc[('HPa', 8, [2, 4]), :])
print('\n')

print(tetrode_df
      .loc[tetrode_df.numcells > 0]
      .loc[('HPa', 8, [2, 4]), :].index.values)

tetrode_df.head()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 840 entries, (HPa, 1, 1, 1) to (HPa, 8, 5, 20)
Data columns (total 4 columns):
area        840 non-null category
depth       840 non-null int64
descrip     462 non-null object
numcells    840 non-null int64
dtypes: category(1), int64(2), object(1)
memory usage: 27.1+ KB
None


                                     area  depth descrip  numcells
animal day epoch_ind tetrode_number                               
HPa    8   2         1                CA1    113  riptet        12
                     2                CA1    121     NaN         0
                     3                CA1     90  CA1Ref         0
                     4                CA1    116  riptet        15
                     5                CA1    116  riptet         0
                     6                CA1    110  riptet         0
                     7                CA1    114  riptet         0
                     8               iCA1    114  riptet         0
  

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,area,depth,descrip,numcells
animal,day,epoch_ind,tetrode_number,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
HPa,1,1,1,CA1,114,riptet,7
HPa,1,1,2,CA1,124,,0
HPa,1,1,3,CA1,90,CA1Ref,0
HPa,1,1,4,CA1,117,riptet,13
HPa,1,1,5,CA1,119,riptet,1


Now let's experiment with filtering the epochs (say we only want sessions on the first w-track, *wtr1*) and then using the result to get the tetrode keys.

In [3]:
# Keep only the epochs whose environment is the first w-track ('wtr1')
epochs_df[epochs_df['environment'] == 'wtr1']

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,environment,type
animal,day,epoch_ind,Unnamed: 3_level_1,Unnamed: 4_level_1
HPa,1,4,wtr1,run
HPa,1,6,wtr1,run
HPa,2,2,wtr1,run
HPa,2,4,wtr1,run
HPa,3,2,wtr1,run
HPa,3,4,wtr1,run
HPa,4,2,wtr1,run
HPa,4,4,wtr1,run
HPa,5,2,wtr1,run
HPa,5,4,wtr1,run
