In [74]:
# imports
import os
import sys
sys.path.insert(0, os.path.abspath(r'D:\Code Repos\prey_capture'))

import panel as pn
import holoviews as hv
from bokeh.io import export_svgs, export_png
from holoviews import opts, dim
from holoviews.operation import histogram
hv.extension('bokeh')
from bokeh.resources import INLINE

import importlib
import processing_parameters
import datetime

import functions_bondjango as bd
import functions_plotting as fp
import pandas as pd
import numpy as np
from pprint import pprint
import paths
import random
import scipy.stats as stat
import umap
from scipy.signal import medfilt

pd.options.mode.chained_assignment = None  # default='warn'


# define the target saving path
# save_path = os.path.join(paths.figures_path, 'TC_aggregate')

In [18]:
# importlib.reload(processing_parameters)
# get the search string
# search_string = processing_parameters.search_string

# # get the paths from the database
# all_path = bd.query_database('analyzed_data', search_string)
# input_path = [el['analysis_path'] for el in all_path if '_preproc' in el['slug']]

# assemble the output path
# out_path = os.path.join(paths.analysis_path, 'test_latentconsolidate.hdf5')
# pprint(input_path)

In [56]:
# Load the behavior/calcium traces and the latents of every preprocessed file.
# Each file that has both matched calcium data and latents contributes one
# (file path, behavior + latents dataframe, cell matches) tuple to pre_data.
# %%time
importlib.reload(processing_parameters)

# define the threshold for matched cells
# NOTE(review): not used anywhere in this cell - confirm whether it is still needed
match_threshold = 10
# set up the exclusion counter
exclude_counter = 0
include_counter = 0
# allocate memory for the data (before the try block so the name exists on every path)
pre_data = []
# defaults for the summary printed at the end, so it also works when no file has matched cells
cell_idx = np.zeros(0, dtype=bool)
unique = np.zeros((1, 0), dtype=bool)
counts = np.zeros(1, dtype=int)
# get the data paths
try:
    data_path = snakemake.input[0]
except NameError:
    # get the search list
    search_list = processing_parameters.search_list

    # allocate a list for all paths (need to preload to get the dates)
    all_paths = []
    # for all the search strings
    for search_string in search_list:

        # query the database for data to plot, keeping only the preprocessing files
        data_all = bd.query_database('analyzed_data', search_string)
        data_all = [el for el in data_all if '_preproc' in el['slug']]
        data_path = [el['analysis_path'] for el in data_all]
        all_paths.append(data_path)
    # get the dates present (first 10 characters of the file name); flattening with a
    # comprehension also works when the search list or a query result is empty
    data_dates = np.unique([os.path.basename(el)[:10] for path_group in all_paths for el in path_group])
    print(f'Dates present: {data_dates}')
    # now load the files
    for data_path in all_paths:
        # load the calcium data
        beh_data = []
        # for all the files
        for files in data_path:
            # open read-only: the default mode ('a') would create or lock the file for writing
            with pd.HDFStore(files, mode='r') as h:
                beh_data.append(h['full_traces'])
                # list the keys once instead of walking the file for every test
                store_keys = h.keys()
                if '/matched_calcium' not in store_keys:
                    continue

                # get the cell matches
                cell_matches = h['cell_matches']

                # perform only if there are more files
                if len(data_dates) > 1:
                    match_dates = [el for el in data_dates if el in cell_matches.columns]
                    # get only the days present in the search
                    cell_matches = cell_matches[match_dates]

                    # get the unique cell combinations
                    # unique contains the unique patterns followed by cells across days
                    # inverse indicates which pattern is followed by each cell
                    # count contains the number of times each pattern is found
                    unique, inverse, counts = np.unique(~np.isnan(cell_matches.to_numpy()), axis=0,
                                                        return_counts=True, return_inverse=True)
                    # remove the single day and no day cases
                    days_per_pattern = np.sum(unique, axis=1)
                    counts[days_per_pattern == 0] = 0
                    counts[days_per_pattern == 1] = 0

                    # get an index vector with only the most popular pattern
                    # (regardless of how many cells share it)
                    # NOTE(review): if every pattern was zeroed above, argmax falls back to pattern 0
                    cell_idx = np.array(inverse == np.argmax(counts))

                    cell_matches = cell_matches.iloc[cell_idx, :]
                else:
                    counts = 1
                    # reduce the (n_cells, 1) mask to one boolean per cell; a 2-D mask
                    # is not a valid row indexer for iloc
                    cell_idx = (~np.isnan(cell_matches[data_dates].to_numpy())).all(axis=1)
                    cell_matches = cell_matches.iloc[cell_idx, :]
                    unique = np.array([[1]])

                # concatenate the latents
                if '/latents' not in store_keys:
                    exclude_counter += 1
                    continue

                # get the data
                behavior = h['matched_calcium']
                latents = h['latents']
                # get the delta frames between latent and behavior
                delta_frames = behavior.shape[0] - latents.shape[0]

#                 # alternative: pad the latents instead of trimming the behavior
#                 padding = pd.DataFrame(np.zeros((int(delta_frames/2), len(latents.columns))), columns=latents.columns)
#                 # pad latents due to the VAME calculation window
#                 latents = pd.concat([padding, latents, padding], axis=0).reset_index(drop=True)

                # trim the behavior to a centered window with as many frames as the latents.
                # Slicing by start/length (instead of [d/2:-d/2]) keeps all frames when
                # delta_frames is 0 and drops the odd extra frame from the end.
                trim_start = max(delta_frames, 0) // 2
                behavior = behavior.iloc[trim_start:trim_start + latents.shape[0], :].reset_index(drop=True)

                dataframe = pd.concat([behavior, latents], axis=1)

                # store
                pre_data.append((files, dataframe, cell_matches))
                include_counter += 1

# summary (the cell counts refer to the last file that contained matched cells)
print(f'Number of matched cells: {np.sum(cell_idx)}')
print(f'Number of matched trials: {unique[np.argmax(counts)].sum()}')
print(f'Number of trials without latents: {exclude_counter}')
print(f'Number of trials with latents: {include_counter}')

Dates present: ['07_31_2020' '08_01_2020' '08_03_2020' '08_04_2020' '08_05_2020'
 '08_06_2020' '08_07_2020' '08_08_2020' '08_09_2020' '08_10_2020'
 '08_11_2020' '08_12_2020' '08_13_2020' '08_14_2020' '08_15_2020'
 '08_16_2020' '08_17_2020' '08_18_2020' '08_19_2020' '08_20_2020'
 '08_21_2020' '08_29_2020' '08_30_2020' '08_31_2020' '09_01_2020'
 '09_02_2020' '09_03_2020' '09_04_2020' '09_05_2020' '09_06_2020'
 '09_07_2020' '09_08_2020']
Number of matched cells: 6
Number of matched trials: 2
Number of trials without latents: 0
Number of trials with latents: 215


In [6]:
def normalize_rows(data_in):
    """Min-max scale each row of an array to the [0, 1] range, in place.

    NaNs are ignored when computing the row minimum and maximum and stay NaN
    in the output. Rows with no spread (all finite values equal) are mapped to
    0 instead of being divided by zero.

    Parameters
    ----------
    data_in : np.ndarray
        Array with at least 2 dimensions; it is iterated along its first axis
        and overwritten with the scaled values.

    Returns
    -------
    np.ndarray
        The same array object that was passed in.
    """

    for idx, row in enumerate(data_in):
        row_min = np.nanmin(row)
        row_range = np.nanmax(row) - row_min
        # a constant row has zero range: dividing by it would turn the row into NaN (0/0),
        # so divide by 1 instead, which leaves the shifted values at 0
        data_in[idx, :] = (row - row_min) / (row_range if row_range > 0 else 1)
    return data_in

In [151]:
# visualize latents
# index (into pre_data) of the trial to display
target_trial = 11

# unpack the file path and the dataframe of the selected trial
trial_path, current_trial = pre_data[target_trial][:2]
print(trial_path)
print(current_trial.shape)
print(current_trial.columns[:50])

# the time column becomes the horizontal axis of the trial image
x = current_trial['time_vector']

# behavioral variables to show, followed by every latent column of the trial
target_columns = ['cricket_0_mouse_distance', 'cricket_0_delta_heading', 
                  'mouse_speed', 'cricket_0_speed', 'mouse_heading', 'mouse_x', 'mouse_y', 'cricket_0_x', 'cricket_0_y']
target_columns.extend(el for el in current_trial.columns if 'latent' in el)
# leave only the target columns
current_trial = current_trial[target_columns]
y = np.arange(len(current_trial.columns))

pprint(current_trial.columns)
# (position, name) pairs used as tick labels
y_labels = list(enumerate(target_columns))

# one row per variable: replace NaNs with 0, then rescale every row
current_trial = current_trial.to_numpy().T
current_trial = np.where(np.isnan(current_trial), 0, current_trial)
current_trial = normalize_rows(current_trial)

# shared options of both images
image_opts = dict(frame_width=600, frame_height=600, tools=['hover'], colorbar=True)

# visualize latents with respect to other variables
trial_plot = hv.Image((x, y, current_trial))
trial_plot.opts(yticks=y_labels, **image_opts)

# correlation between all pairs of displayed variables
latent_correlation = np.corrcoef(current_trial)
correlation_plot = hv.Image((y, y, latent_correlation))
correlation_plot.opts(yticks=y_labels, xticks=y_labels, xrotation=45, cmap='RdBu', clim=(-1, 1),
                      **image_opts)

(trial_plot+correlation_plot).opts(shared_axes=False).cols(1)



J:\Drago Guggiana Nilo\Prey_capture\AnalyzedData\09_07_2020_14_45_29_miniscope_DG_200701_a_succ_preproc.hdf5
(99, 182)
Index(['mouse_snout_x', 'mouse_snout_y', 'mouse_barl_x', 'mouse_barl_y',
       'mouse_barr_x', 'mouse_barr_y', 'mouse_head_x', 'mouse_head_y',
       'mouse_x', 'mouse_y', 'mouse_body2_x', 'mouse_body2_y', 'mouse_body3_x',
       'mouse_body3_y', 'mouse_base_x', 'mouse_base_y', 'mouse_heading',
       'mouse_speed', 'mouse_acceleration', 'head_direction', 'cricket_0_x',
       'cricket_0_y', 'cricket_0_heading', 'cricket_0_speed',
       'cricket_0_acceleration', 'cricket_0_mouse_distance',
       'cricket_0_delta_heading', 'cricket_0_size', 'cricket_0_delta_head',
       'cricket_0_quadrant', 'cricket_0_visual_angle', 'cricket_0_head_x',
       'cricket_0_head_y', 'time_vector', 'mouse', 'datetime', 'cell_0',
       'cell_1', 'cell_2', 'cell_3', 'cell_4', 'cell_5', 'cell_6', 'cell_7',
       'cell_8', 'cell_9', 'cell_10', 'cell_11', 'cell_12', 'cell_13'],
      dtype

In [58]:
# generate 2D TCs with latents as x and y and a behavioral variable as the Z
# (one map per trial: mean of the behavior inside each 2D bin of the two latents)

# define the target behavioral variable
target_behavior = 'mouse_speed'
# define the latents of interest
first_latent = 'latent_0'
second_latent = 'latent_1'

# define the bins
bins = 10
# allocate a list for the plots
plot_list = []
# for all the trials
for trial in pre_data:
    # get just the dataframe
    trial = trial[1]
    # skip if the behavior is not there
    if target_behavior not in trial.columns:
        continue
    # get the variables of interest as copies, so the cleaning below
    # never writes back into the dataframes stored in pre_data
    feature_0 = trial.loc[:, first_latent].copy()
    feature_1 = trial.loc[:, second_latent].copy()
    behavior = trial.loc[:, target_behavior].copy()
    # remove nans and infs
    for series in (feature_0, feature_1, behavior):
        series[np.isnan(series) | np.isinf(series)] = 0

    # get the histogram
    current_tc, x_edge, y_edge, bin_number = stat.binned_statistic_2d(
        feature_0, feature_1, behavior, statistic='mean', bins=bins)

    # plot and store
    # binned_statistic_2d returns an (x bins, y bins) array while hv.Image expects
    # rows along y and columns along x, hence the transpose
    plot = hv.Image((x_edge, y_edge, np.array(current_tc).T), kdims=[first_latent, second_latent])
    plot.opts(tools=['hover'], cmap='Spectral')
    plot_list.append(plot)

hv.Layout(plot_list).cols(5)
    
    

In [59]:
# generate a single map for all trials
# (mean of the behavior inside each 2D bin of the two latents, pooling every trial)

# define the target behavioral variable
target_behavior = 'mouse_speed'
# define the latents of interest
first_latent = 'latent_0'
second_latent = 'latent_1'

# define the bins
bins = 10
# allocate memory for the accumulated trials
feature_0_list = []
feature_1_list = []
behavior_list = []
# for all the trials
for trial in pre_data:
    # get just the dataframe
    trial = trial[1]
    # skip if the behavior is not there
    if target_behavior not in trial.columns:
        continue
    # get the variables of interest as copies, so the cleaning below
    # never writes back into the dataframes stored in pre_data
    feature_0 = trial.loc[:, first_latent].copy()
    feature_1 = trial.loc[:, second_latent].copy()
    behavior = trial.loc[:, target_behavior].copy()
    # remove nans and infs
    for series in (feature_0, feature_1, behavior):
        series[np.isnan(series) | np.isinf(series)] = 0

    # store the variables in a list
    feature_0_list.append(feature_0)
    feature_1_list.append(feature_1)
    behavior_list.append(behavior)

# concatenate
feature_0 = pd.concat(feature_0_list, axis=0)
feature_1 = pd.concat(feature_1_list, axis=0)
behavior = pd.concat(behavior_list, axis=0)
# get the histogram
current_tc, x_edge, y_edge, bin_number = stat.binned_statistic_2d(
    feature_0, feature_1, behavior, statistic='mean', bins=bins)

# plot and store
# binned_statistic_2d returns an (x bins, y bins) array while hv.Image expects
# rows along y and columns along x, hence the transpose
plot = hv.Image((x_edge, y_edge, np.array(current_tc).T), kdims=[first_latent, second_latent])
plot.opts(tools=['hover'], cmap='Spectral')

In [10]:
# helper that keeps only the day part of a "day time" string
def separate_day_time(input_string):
    """Return the text before the first space of input_string.

    For a 'day time' string this is the day; a string without any space is
    returned whole.
    """
    day, _, _ = input_string.partition(' ')
    return day

In [157]:
# concatenate the data
# Stack every trial into one dataframe: cell columns are dropped, the datetime is
# reduced to its day, and a trial index plus the distance to the arena center are added.

# point used as the arena center for the distance computation
center_point = [20, 20]

# per-trial dataframes, concatenated after the loop
trial_frames = []
# initialize a trial counter
trial_counter = 0
# for all the trials
for entry in pre_data:
    # the dataframe is the second element of each pre_data tuple
    trial = entry[1]
    # remove the cells (any column with 'cell' in its name)
    not_cells = [el for el in trial.columns if 'cell' not in el]
    trial = trial.loc[:, not_cells]

    # get distance to the center from the mouse position
    mouse_position = trial.loc[:, ['mouse_x', 'mouse_y']].to_numpy()
    distance_to_center = np.linalg.norm(mouse_position - center_point, axis=1)

    # convert the date, add the trial number and the distance in one step
    trial = trial.assign(
        datetime=trial['datetime'].apply(separate_day_time),
        trial_idx=trial_counter,
        center_distance=distance_to_center,
    )
    trial_counter += 1

    # store in the output list
    trial_frames.append(trial)

# concatenate the list
full_df = pd.concat(trial_frames, axis=0)

print(full_df.shape)

(63499, 48)


In [61]:
# generate averages of each latent over time
# (per-date mean of every latent, with the standard error as a shaded band)

# get a list of the latents
latent_list = [el for el in full_df.columns if 'latent' in el]

# get the latent variable along with the date
current_variable = full_df.loc[:, latent_list + ['datetime']]
# group by the date once, then take the mean and the standard error per date
by_date = current_variable.groupby(['datetime'])
current_mean = by_date.mean()
current_sem = by_date.sem()
# get the dates
date_list = np.array(current_mean.index)
print(date_list)
mean_list = []
sem_list = []
# options shared by the curve and its error band
line_opts = dict(width=600, xrotation=45)
# for all the latents
for latent in latent_list:
    y_mean = current_mean[latent]
    y_sem = current_sem[latent]

    # mean across the dates
    curve = hv.Curve((date_list, y_mean), vdims=[latent], kdims=['Date']).opts(**line_opts)
    # +/- sem around the mean
    spread = hv.Spread((date_list, y_mean, y_sem), vdims=[latent, 'Error'], kdims=['Date']).opts(**line_opts)
    mean_list.append(curve*spread)

means = hv.Layout(mean_list).cols(3)
means

['2020-08-03' '2020-08-04' '2020-08-05' '2020-08-06' '2020-08-07'
 '2020-08-08' '2020-08-09' '2020-08-10' '2020-08-11' '2020-08-12'
 '2020-08-13' '2020-08-14' '2020-08-15' '2020-08-16' '2020-08-17'
 '2020-08-18' '2020-08-19' '2020-08-20' '2020-08-21' '2020-08-29'
 '2020-08-30' '2020-08-31' '2020-09-01' '2020-09-02' '2020-09-03'
 '2020-09-04' '2020-09-05' '2020-09-06' '2020-09-07' '2020-09-08']


In [192]:
# group by date and generate the TC
# (one map per date: mean of the behavior inside each 2D bin of the two latents)

# define the target behavioral variable
target_behavior = 'mouse_x'
# define the latents of interest
first_latent = 'latent_2'
second_latent = 'latent_3'

# define the bins (set here so the cell does not depend on another cell having run)
bins = 10

# get a list of dates
date_list = np.unique(full_df['datetime'])
# allocate a plot list
plot_list = []

# for all the dates
for date in date_list:
    # get the data corresponding to this date
    data_idx = full_df.loc[:, 'datetime'].to_numpy() == date
    current_data = full_df.iloc[data_idx, :]
    # get the variables of interest as copies, so the cleaning below stays local
    feature_0 = current_data.loc[:, first_latent].copy()
    feature_1 = current_data.loc[:, second_latent].copy()
    behavior = current_data.loc[:, target_behavior].copy()
    # remove nans and infs
    for series in (feature_0, feature_1, behavior):
        series[np.isnan(series) | np.isinf(series)] = 0

    # generate the TC
    current_tc, x_edge, y_edge, bin_number = stat.binned_statistic_2d(
        feature_0, feature_1, behavior, statistic='mean', bins=bins)

    # plot and store
    # binned_statistic_2d returns an (x bins, y bins) array while hv.Image expects
    # rows along y and columns along x, hence the transpose
    im = hv.Image((x_edge, y_edge, np.array(current_tc).T), kdims=[first_latent, second_latent])
    im.opts(tools=['hover'], cmap='Spectral', title=date)
    plot_list.append(im)

hv.Layout(plot_list).cols(5)

In [152]:
%%time
# UMAP embedding of the VAME data

# compile the data
# compiled_latent = np.vstack(latent_list)
compiled_latent = full_df.loc[:, [el for el in full_df.columns if 'latent' in el]].to_numpy()
# embed using UMAP
# original parameters 0.5 and 10
# 0.1 and 30 also works
# 0.05 and 30 works too
reducer = umap.UMAP(min_dist=0.1, n_neighbors=20)
embedded_data = reducer.fit_transform(compiled_latent[:, :])

# # save the embedding
# np.save(os.path.join(target_folder, 'UMAP_result'), embedded_data)

# # generate the model name
# model_name = os.path.join(target_folder, 'UMAP_model.pk')
# # save the estimator
# with open(model_name, 'wb') as file:
#     pk.dump(reducer, file)

Wall time: 24.5 s


In [208]:
# plot the UMAP results
# (scatter of the embedding, colored by the median-filtered cricket-mouse distance)

# get the labels as a column vector and smooth them with a 21-sample median filter
compiled_labels = full_df['cricket_0_mouse_distance'].to_numpy()[:, np.newaxis]
compiled_labels = medfilt(compiled_labels, kernel_size=[21, 1])

# define the sampling ratio (only every sampling_ratio-th point is plotted)
sampling_ratio = 10
subsample = slice(None, None, sampling_ratio)

# columns: UMAP dimension 1, UMAP dimension 2, label
umap_data = np.hstack((embedded_data[subsample, :], compiled_labels[subsample, :]))

print(umap_data.shape)

umap_plot = hv.Scatter(umap_data, vdims=['Dim 2','parameter'], kdims=['Dim 1'])
umap_plot.opts(color='parameter', colorbar=True, cmap='Spectral', tools=['hover'], alpha=0.5,
               width=1200, height=1000, size=5)
umap_plot


(6350, 3)


In [202]:
# add a trajectory on top of the map
# (path of one trial through the embedding, red dot = first frame, blue dot = last frame)

# define the target trial
target_trial = 150

# rows of the embedding that belong to the target trial
trial_mask = (full_df['trial_idx'] == target_trial).to_numpy()
traj_data = embedded_data[trial_mask, :]

trajectory = hv.Curve((traj_data[:, 0], traj_data[:, 1]))
# markers for the first and the last point of the trajectory
first_x, first_y = traj_data[0, 0], traj_data[0, 1]
last_x, last_y = traj_data[-1, 0], traj_data[-1, 1]
start = hv.Scatter((first_x, first_y)).opts(color='Red', size=10)
end = hv.Scatter((last_x, last_y)).opts(color='Blue', size=10)

umap_plot*trajectory*start*end

In [190]:
# plot aggregated trajectories in UMAP space
# (one semi-transparent curve per trial, drawn over the UMAP scatter)

# trial label of every embedded row, and the distinct trials present
trial_labels = full_df['trial_idx'].to_numpy()
trial_list = np.unique(trial_labels)

# allocate a list for the trials
trial_plots = []
# for all the trials
for trial in trial_list:
    # get the corresponding coordinates
    traj_data = embedded_data[trial_labels == trial, :]
    # keep every 10th point of the trajectory
    thinned = traj_data[::10]

    # generate the curve and store it
    curve = hv.Curve((thinned[:, 0], thinned[:, 1])).opts(width=1200, height=1000, alpha=0.2)
    trial_plots.append(curve)

# generate the figure
overlay = umap_plot*hv.Overlay(trial_plots)
overlay.opts({'Curve': dict(color=hv.Palette('Spectral'))})
overlay