In [299]:
import os
import sys
sys.path.insert(0, os.path.abspath(r'D:\Code Repos\prey_capture'))


import panel as pn
import holoviews as hv
from holoviews import opts, dim
from holoviews.operation import histogram
hv.extension('bokeh')
from bokeh.resources import INLINE

import paths
import functions_bondjango as bd
import functions_misc as fm
import functions_plotting as fp
import functions_loaders as fl
from snakemake_scripts.classify_batch import reverse_roll_shuffle, chunk_shuffle
import pandas as pd
import numpy as np
import processing_parameters
import importlib
from pprint import pp as pprint
from itertools import product

import scipy.signal as ss
import scipy.stats as stat
import sklearn.preprocessing as preproc

import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.optimizers import Adam

In [5]:
# set up the figure config
importlib.reload(fp)
importlib.reload(processing_parameters)
# define the target saving path
save_path = os.path.join(paths.figures_path, 'GLM_NN')

# define the printing mode
save_mode = True
# define the target document
target_document = 'poster'
# set up the figure theme
fp.set_theme()
# load the label dict
label_dict = processing_parameters.label_dictionary
variable_list = processing_parameters.variable_list

In [6]:
# load the data

importlib.reload(processing_parameters)
importlib.reload(fl)

# get the paths from the database using search_list
all_paths, all_queries = fl.query_search_list()
# print(all_paths)

data_list = []
# load the data
for path, queries in zip(all_paths, all_queries):
    
    data, _, _  = fl.load_preprocessing(path, queries)
    data_list.append(data)

# print(all_paths)
print(data_list)



[[     mouse_snout_x  mouse_snout_y  mouse_barl_x  mouse_barl_y  mouse_barr_x  \
0        37.593005      38.788323     36.170868     36.588643     38.363729   
1        38.555963      39.096029     37.202837     37.211424     38.986265   
2        38.134375      39.480823     37.133764     37.999894     39.278974   
3        37.492098      39.225254     36.267561     37.802667     38.937473   
4        35.571300      38.191011     35.687719     36.439891     36.953608   
..             ...            ...           ...           ...           ...   
135      21.291103       2.393621     21.878647     -0.739680     22.447907   
136      21.430820       2.393598     22.636849     -0.791401     22.448212   
137      21.858535       2.182355     23.286332     -1.270482     22.570760   
138      22.173580       1.495951     23.307002     -1.270496     22.705392   
139      22.416743       1.495883     23.453139     -1.201538     22.747579   

     mouse_barr_y  mouse_head_x  mouse_head_y    

In [7]:
def maxmin(array_in):
    return (array_in-np.nanmin(array_in))/(np.nanmax(array_in)-np.nanmin(array_in))

def basis_predictors(variable, basis_number, kernel, kernel_spacing, total_length,label):
    # initialize the output dataframe
    out_frame = pd.DataFrame()
    # generate the displaced basis functions
    for idx2 in np.arange(basis_number):
        # generate the sizes of the before and after padding of the kernel
        back = int(kernel_spacing*idx2)
        front = int(total_length-kernel.shape[0]-back)
        # generate the full kernel
        if back == 0:
            current_kernel = np.concatenate((kernel, np.zeros(front)))
        elif idx2 == basis_number-1:
            current_kernel = np.concatenate((np.zeros(back), kernel))
        else:
            current_kernel = np.concatenate((np.zeros(back), kernel, np.zeros(front)))

        # convolve with the data
        vector = np.convolve(variable, current_kernel, 'same')
        # normalize to 0-1
        vector = maxmin(vector)
        # if the vector was all zeros, it'll turn into nans so remove
        vector[np.isnan(vector)] = 0

        # generate the field in the new data frame
        out_frame[label+'_'+str(idx2)] = vector
        
    return out_frame

In [59]:
# assemble the feature and calcium matrices

# set up the feature and calcium matrices
# list the radial features in the dataset
radial_features = ['cricket_0_delta_heading', 'cricket_0_visual_angle', 'mouse_heading', 
                   'cricket_0_delta_head', 'cricket_0_heading', 'head_direction']
# define the design matrix
feature_list = variable_list

# define the frame rate (fps)
frame_rate = 10
# define the width of the kernel (s), multiplied to convert to frames
sigma = 1*frame_rate
# calculate the kernel
kernel = ss.gaussian(sigma*5, sigma)
# define the number of basis functions per regressor
basis_number = 9
# define the kernel spacing (in s)
kernel_spacing = 0.2*frame_rate
# get the total length of the kernel
total_length = kernel_spacing*(basis_number-1) + kernel.shape[0]
# # get the start positions of the basis functions (assume sigma defines the interval)
# basis_starts = [int(el) for el in np.arange(-sigma*((basis_number-1)/2), 
#                                        sigma*((basis_number-1)/2)+1, sigma)]
# allocate memory for the output
feature_trials = []
# allocate memory for a data frame without the encoding model features
feature_raw_trials = []
# allocate memory for the calcium
calcium_trials = []
# allocate a list for the mouse/day pairs
pairs_list = []
# get the number of trials
trial_number = len(data_list[0])
# get the features
for idx, el in enumerate(data_list[0]):
    # get the intersection of the labels
    label_intersect = [feat for feat in feature_list if feat in el.columns]
    
    if len(label_intersect) != len(feature_list):
        continue
    # get the features of interest
    target_features = el.loc[:, feature_list]
    # save the original features for simpler calculations
    feature_raw_trials.append(target_features.copy())
    # get the original columns
    original_columns = target_features.columns
    
    # turn the radial variables into linear ones
    # for all the columns
    for label in original_columns:
        # calculate head speed
        if label == 'head_direction':
            # get the head direction
            head = target_features[label].copy().to_numpy()
            # get the angular speed and acceleration of the head
            speed = np.concatenate(([0], np.diff(ss.medfilt(head, 21))), axis=0)
            acceleration = np.concatenate(([0], np.diff(head)), axis=0)
            # add to the features
            target_features['head_speed'] = speed
            target_features['head_acceleration'] = acceleration
        # check if the feature is radial
        if label in radial_features:
            # get the feature
            rad_feature = target_features[label].copy().to_numpy()
            # convert to radians
            rad_feature = np.deg2rad(rad_feature)
            # perform angular decomposition (assume unit circle)
            x = np.cos(rad_feature)
            y = np.sin(rad_feature)
            # replace the original column by the extracted ones
            target_features[label+'_x'] = x
            target_features[label+'_y'] = y
            # drop the original column
            target_features.drop(labels=label, axis=1, inplace=True)
        # check if the label is a speed and calculate acceleration
        if 'speed' in label:
            # get the speed
            speed = target_features[label].copy().to_numpy()
            # calculate the acceleration with the smoothed speed
            acceleration = np.concatenate(([0], np.diff(ss.medfilt(speed, 21))), axis=0)
            # add to the features
            target_features[label.replace('speed', 'acceleration')] = acceleration
    
    # Generate the gaussian convolved and displaced regressors
    # allocate an empty dataframe for the outputs
    new_dataframe = pd.DataFrame()
    # for all the regressors
    for label in target_features:
        # get the variable
        variable = target_features[label].to_numpy().copy()
        # Remove nans
        variable[np.isnan(variable)] = 0

        # get the basis function-based predictors
        out_frame = basis_predictors(variable, basis_number, kernel, kernel_spacing, total_length,label)
        # add to the dataframe
        new_dataframe = pd.concat((new_dataframe, out_frame), axis=1)

#     # add a constant factor
#     constant = np.ones(new_dataframe.shape[0])
#     new_dataframe['constant'] = constant
    # add a trial factor
#     new_dataframe['trial'] = idx*np.ones(vector.shape[0])
#     # for all the trials
#     for trial in np.arange(trial_number):
#         new_dataframe['trial_'+str(trial)] = np.zeros(vector.shape[0])
#         if trial == idx:
#             new_dataframe['trial_'+str(trial)] += 1

    # replace the old dataframe with the new one
    target_features = new_dataframe
        
    # store the columns
    resulting_columns = target_features.columns
    # turn the dataframe into an array
    target_features = target_features.to_numpy()

    # store the array
    feature_trials.append(target_features)
    
    # get the calcium data
    cells = [cell for cell in el.columns if 'cell' in cell]
    cells = el.loc[:, cells].to_numpy()

    # store
    calcium_trials.append(cells)
    # store the mouse and date
    pairs_list.append([el.loc[0, 'mouse'], el.loc[0, 'datetime'][:10]])

print(f'Time by features: {feature_trials[0].shape}')
print(f'Time by ROIs: {calcium_trials[0].shape}')
print(resulting_columns)

# calculate the unique pairs for mouse and date
unique_pairs = np.unique(pairs_list, axis=0)
print(unique_pairs)

Time by features: (140, 135)
Time by ROIs: (140, 26)
Index(['mouse_speed_0', 'mouse_speed_1', 'mouse_speed_2', 'mouse_speed_3',
       'mouse_speed_4', 'mouse_speed_5', 'mouse_speed_6', 'mouse_speed_7',
       'mouse_speed_8', 'mouse_x_0',
       ...
       'cricket_0_visual_angle_x_8', 'cricket_0_visual_angle_y_0',
       'cricket_0_visual_angle_y_1', 'cricket_0_visual_angle_y_2',
       'cricket_0_visual_angle_y_3', 'cricket_0_visual_angle_y_4',
       'cricket_0_visual_angle_y_5', 'cricket_0_visual_angle_y_6',
       'cricket_0_visual_angle_y_7', 'cricket_0_visual_angle_y_8'],
      dtype='object', length=135)
[['MM_200129_a' '2020-03-02']
 ['MM_200129_a' '2020-03-04']
 ['MM_200129_a' '2020-03-05']
 ['MM_200129_a' '2020-03-06']
 ['MM_200129_a' '2020-03-10']
 ['MM_200129_a' '2020-03-11']
 ['MM_200129_a' '2020-03-12']
 ['MM_200129_a' '2020-03-13']]


In [26]:
# plot an example feature matrix and corresponding calcium

raster = hv.Raster(feature_trials[0].T)
raster.opts(width=800, height=600, xlabel='Frames', ylabel='Features', tools=['hover'], cmap='Reds')

raster1 = hv.Raster(calcium_trials[0].T)
raster1.opts(width=800, height=300, xlabel='Frames', ylabel='Cells', tools=['hover'], cmap='Viridis')
(raster+raster1).cols(1).opts(shared_axes=False)

In [10]:
print(calcium_trials[0].shape, feature_trials[0].shape)


(140, 26) (140, 136)


In [411]:
def train_test_glm_nn(current_features, current_calcium, scaler=None, activation='relu', l1=0.01, l2=0.01, loss='mse', learning_rate=0.001, validation_split=0.3,
                     batch_size=100, epochs=200, test_train_shuffle=True, verbose=0, sample_shuffle=None, scale_calcium=False):
    """Train a GLM-NN for the given data"""
    
    # scale the features
    if scaler is not None:
        current_features = [scaler().fit_transform(el) for el in current_features]
    # scale the calcium
    if scale_calcium:
        current_calcium = [el/np.max(el) for el in current_calcium]
    # concatenate them
    current_features = np.vstack(current_features)
    current_calcium = np.vstack(current_calcium)
    # get the trial data
    X = current_features.copy()

    # get the calcium
    y = current_calcium.copy()
    # shuffle if needed
    if sample_shuffle is not None:
        y = sample_shuffle(y)
    
    # get the number of output features
    output_features = y.shape[1]
    # define the optimizer and network parameters
    model = tf.keras.Sequential()
    model.add(layers.Dense(output_features, activation='relu'))
    model.add(layers.ActivityRegularization(l1=0.02, l2=0.03))
    # compile the model with the Adam optimizer
    model.compile(optimizer=Adam(learning_rate=learning_rate), loss='mse')

    # train the model
    history = model.fit(X, y, validation_split=validation_split, batch_size=batch_size, epochs=epochs, shuffle=test_train_shuffle, verbose=verbose)
    
    # calculate performance
    predictions = model.predict(X)
    correlations_per_cell = [stat.spearmanr(predictions[:, el], y[:, el])[0] for el in np.arange(predictions.shape[1])]
    average_correlation = np.nanmean(correlations_per_cell)
    performance = [correlations_per_cell, average_correlation]
    # extract the weights
    weights = model.layers[0].get_weights()[0]
    # extract the history
    history = history.history
    
    return performance, weights, history, X, y, model

In [413]:
# define network parameters
kwargs = {
    'learning_rate': 0.001,
    'validation_split': 0.3,
    'batch_size': 100,
    'epochs': 100,
    'test_train_shuffle': False,
    'verbose': 0,
    'scaler': preproc.StandardScaler,
    'l1': 5,
    'l2': 1,
    'scale_calcium': True,
    'sample_shuffle': None,
#     'sample_shuffle': reverse_roll_shuffle,
}

In [414]:
%%time
# calculate the fits for all the trials

# allocate memory for the performances
glm_performances = []
# allocate memory for the weights
glm_weights = []
# allocate memory for the losses
glm_loss = []

# for all the pairs
for mouse, day in unique_pairs[:1]:
    print(mouse, day)
    # find the corresponding trials
    trial_idx = [el for el in np.arange(len(feature_trials)) if (mouse == pairs_list[el][0]) & (day == pairs_list[el][1])]
    
    # for all the trials
    current_features = [feature_trials[el] for el in trial_idx]
    current_calcium = [calcium_trials[el] for el in trial_idx]
    
    # train the net
    performance, weights, loss, X, y, model = train_test_glm_nn(current_features, current_calcium, **kwargs)
    # save the output
    glm_performances.append(performance)
    glm_weights.append(weights)
    glm_loss.append(loss)


MM_200129_a 2020-03-02
Wall time: 4.52 s


In [415]:
# plot lossess and mse

plot_list = []
# for all the days
for idx, (mouse, day) in enumerate(unique_pairs):
    if idx + 1 > len(glm_loss):
        continue
    loss = glm_loss[idx]['loss']
    val_loss = glm_loss[idx]['val_loss']
    
    performance = glm_performances[idx][1]
    # assemble the title
    title = f'{mouse} {day} {performance:0.3f}'

    loss_plot = hv.Curve(loss, label='Train Loss')
    loss_plot.opts(width=400, height=400, tools=['hover'], xlabel='Epochs', ylabel='Loss', title=title)
    val_loss_plot = hv.Curve(val_loss, label='Val Loss')

    overlay = (loss_plot*val_loss_plot).opts(show_legend=True)
    plot_list.append(overlay)
    
layout = hv.Layout(plot_list).cols(4)

layout

In [417]:
# plot all predictions and all real data
# predict the full traces

predictions = model.predict(X)

# calculate the correlations per cell and average
correlations_per_cell = [stat.spearmanr(predictions[:, el], y[:, el])[0] for el in np.arange(predictions.shape[1])]
print(f'{correlations_per_cell}', f'Average correlation: {np.mean(correlations_per_cell):0.3f}')

# print(predictions.shape)
real = hv.Raster(y.T)
# real.opts(width=1000, height=400)
pred = hv.Raster(predictions.T)

(real+pred).cols(1).opts(opts.Raster(tools=['hover'], width=1000, height=400, cmap='Inferno', ylabel='Cells', xlabel='Frames'))

[-0.03705686660279633, 0.23743475397470204, 0.039704081317511554, 0.11234208419377185, 0.061795512755575134, 0.14867917088138122, 0.2438412568978257, 0.12172277599098483, -0.05925087659365878, 0.10866733054448817, 0.13549777586416917, 0.0008231699907000104, -0.032226486855870526] Average correlation: 0.083


In [400]:
raster = hv.Raster(X.T)
raster.opts(width=1000, height=600, tools=['hover'], cmap='Reds', colorbar=True, ylabel='Regressors', xlabel='Frames')
raster

In [268]:
# plot the weights
weights = model.layers[0].get_weights()[0]

raster = hv.Raster(weights.T)
raster.opts(width=800, height=600, tools=['hover'], cmap='RdBu', colorbar=True)
raster

In [130]:
# plot a defined prediction

# define the target cell
target_cell = 1

# get the real cell
real_cell = y[:, target_cell]
# get the prediction
predicted_cell = predictions[:, target_cell]

real_plot = hv.Curve(real_cell)
real_plot.opts(width=1000, height=400, tools=['hover'])
predicted_plot = hv.Curve(predicted_cell)

real_plot*predicted_plot

In [None]:
# Plot the kernels for each variable

In [368]:
%%time
# do a grid search on regularization

# define the parameters to grid search on
# l1_list = [0.001, 0.005, 0.01, 0.05, 0.1]
# l2_list = [0.001, 0.005, 0.01, 0.05, 0.1]
l1_list = [0.01, 0.05, 0.1, 0.5, 1, 5]
l2_list = [0.1, 1, 10, 100, 1000]
# l1_list = [0.01]
# l2_list = [0.01]

# define the number of iterations
number_iterations = 10

# define the base parameters
params = {
    'learning_rate': 0.001,
    'validation_split': 0.3,
    'batch_size': 100,
    'epochs': 100,
    'test_train_shuffle': False,
    'verbose': 0,
    'scaler': preproc.StandardScaler,
}

# allocate the output
performance_list = []
# set up the for loop
for l1, l2 in product(l1_list, l2_list):
    print(l1, l2)
    # add the regularizations to the parameters
    params['l1'] = l1
    params['l2'] = l2
    
    # for all the iterations
    for el in np.arange(number_iterations):
        
        # train the real net
        params['sample_shuffle'] = None
        performance, _, _, _, _, _ = train_test_glm_nn(current_features, current_calcium, **params)
        # save the output
        performance_list.append([l1, l2, performance[1], el, False])
        # train the shuffle net
        params['sample_shuffle'] = reverse_roll_shuffle
        performance, _, _, _, _ = train_test_glm_nn(current_features, current_calcium, **params)
        # save the output
        performance_list.append([l1, l2, performance[1], el, True])
# create a dataframe with the output
performance_df = pd.DataFrame(performance_list, columns=['l1', 'l2', 'performance', 'iteration', 'shuffle'])
    

0.01 0.1
0.01 1
0.01 10
0.01 100
0.01 1000
0.05 0.1
0.05 1
0.05 10
0.05 100
0.05 1000
0.1 0.1
0.1 1
0.1 10
0.1 100
0.1 1000
0.5 0.1
0.5 1
0.5 10
0.5 100
0.5 1000
1 0.1
1 1
1 10
1 100
1 1000
5 0.1
5 1
5 10
5 100
5 1000
Wall time: 42min 53s


In [371]:
# plot the results of the grid search

# average across iterations
average_results = performance_df.groupby(['l1', 'l2', 'performance', 'shuffle'], as_index=False).mean()

# allocate memory for the output matrices
real_data = np.zeros((len(l1_list), len(l2_list)))
shuffle_data = np.zeros((len(l1_list), len(l2_list)))
# run through the rows
for idx, row in average_results.iterrows():
    # get the indexes of the l1 and l2 values
    l1_idx = np.argwhere(row['l1'] == np.array(l1_list))[0][0]
    l2_idx = np.argwhere(row['l2'] == np.array(l2_list))[0][0]
    
    # assign the performance to the corresponding matrix
    if row['shuffle']:
        shuffle_data[l1_idx, l2_idx] = row['performance']
    else:
        real_data[l1_idx, l2_idx] = row['performance']
        
# plot
real_plot = hv.Raster(real_data).opts(title='Real')
shuffle_plot = hv.Raster(shuffle_data).opts(title='Shuffle')
# print(real_data)
yticks = [(idx + 0.5, el) for idx, el in enumerate(l1_list)]
xticks = [(idx + 0.5, el) for idx, el in enumerate(l2_list)]

layout = real_plot+shuffle_plot
layout.cols(2).opts(opts.Raster(width=600, height=600, cmap='Reds', colorbar=True, tools=['hover'], xticks=xticks, yticks=yticks, xrotation=45, xlabel='L2', ylabel='L1'))
layout


    

In [343]:
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 15178317776443214509
]
