In [1]:
import os
import sys
sys.path.insert(0, os.path.abspath(r'D:\Code Repos\prey_capture'))


import panel as pn
import holoviews as hv
from holoviews import opts, dim
hv.extension('bokeh')
from bokeh.resources import INLINE

import paths
import importlib
import functions_plotting as fp
import functions_bondjango as bd
import processing_parameters
import numpy as np
import pandas as pd
import h5py
import scipy.stats as stat
import datetime
import umap

In [2]:
importlib.reload(fp)
importlib.reload(processing_parameters)
# set up the figure theme
fp.set_theme()
label_dict = processing_parameters.label_dictionary

In [3]:
# load the desired files and their associated regressions

# # define the target variable
# target_variable = 'cricket_0_mouse_distance'

# load the latents for each file with their attributes
# %%time
# Load the desired files
importlib.reload(processing_parameters)

# load the constants from the regression calculation
time_shifts = processing_parameters.time_shifts
shift_dict = {el: idx for idx, el in enumerate(time_shifts)}
shift_number = len(time_shifts)
shuffles = processing_parameters.regression_shuffles

# load the variable list
variable_list = processing_parameters.variable_list
# assemble the dataframe columns
reals = ['real_'+str(el) for el in time_shifts]
shuffle_means = ['smean_'+str(el) for el in time_shifts]
shuffle_sems = ['ssem_'+str(el) for el in time_shifts]
columns = reals + shuffle_means + shuffle_sems + ['mouse', 'day']

# get the search list
search_list = processing_parameters.search_list

# allocate a list for all paths (need to preload to get the dates)
all_paths = []
all_results = []
# for all the search strings
for search_string in search_list:

    # query the database for data to plot
    data_all = bd.query_database('analyzed_data', search_string)
#         data_all = [el for el in data_all if 'preproc' in el['slug']]
    data_path = [el['analysis_path'] for el in data_all if '_combinedanalysis' in el['slug']]
    data_result = [el['result'] for el in data_all if '_combinedanalysis' in el['slug']]
    all_paths.append(data_path)
    all_results.append(data_result)
# get the dates present
data_dates = np.unique([os.path.basename(el)[:10] for el in np.concatenate(all_paths)])
print(f'Dates present: {data_dates}')

# allocate memory for the resulting dataframe
data = {}
weights = {}
day_list = []
animal_list = []
joint_list = []
# for all the list items
for idx0, data_path in enumerate(all_paths):

    # for all the files
    for idx1, files in enumerate(data_path):
        
        # if a habi trial, skip
#         if 'habi' in files:
#             continue
        
        # get the animal and date from the slug
        name_parts = os.path.basename(files).split('_')
        animal = '_'.join(name_parts[7:10])
        day_s = '_'.join(name_parts[:3])
        day = datetime.datetime.strptime(day_s, '%m_%d_%Y')
        # skip if the animal and day are already evaluated, 
        # since the CC is the same for the whole day
        if animal+'_'+day_s in joint_list:
            continue
        else:
            animal_list.append(animal)
            day_list.append(day)
            joint_list.append(animal+'_'+day_s)
        # assemble the preproc path
        files_preproc = files.replace('_combinedanalysis', '_preproc')
        # open the file
        with pd.HDFStore(files_preproc, 'r') as preproc:
            if '/cell_matches' in preproc.keys():
                # get the matches
                cell_matches = preproc['cell_matches']
                
                print(animal, day_s, files_preproc)
                # get the idx for this file
                current_matches = cell_matches[datetime.datetime.strftime(day, '%m_%d_%Y')].to_numpy()
                current_idx = np.argsort(current_matches).astype(float)
                # remove the nan entries
                current_idx = current_idx[~np.isnan(np.sort(current_matches))]
        
        # load the data and the cell matches (wasteful, but cleaner I think)
        with h5py.File(files, 'r') as h:
            
            # for all the target variables
            for target_variable in variable_list:
                # create an empty list only if it's the same time this variable runs
                if target_variable not in data.keys():
                    data[target_variable] = []
                    weights[target_variable] = []
                # allocate memory for the real and shuffled regressions
                real_array = np.zeros((shift_number, 1))
                shuffle_array = np.zeros((shift_number, shuffles))
                real_weight = []
                shuffle_weight = []
#                 print(files)
                if 'regression' not in h.keys():
                    continue

                # for all the keys (will iterate through shifts and reps for shuffle)
                for key in h['/regression'].keys():

                    # skip if it's not a cc key or is not the target variable
                    if (target_variable not in key):
                        continue
                    # get the time shift and shuffle
                    key_parts = key.split('_')
                    shift = int([el[5:] for el in key_parts if 'shift' in el][0])
                    if 'cc' in key:

                        if 'real' in key_parts:
                             # save the values
                            real_array[shift_dict[shift]] = np.array(h['/regression/'+key])
                        else:
                            shuffle = int([el[7:] for el in key_parts if 'shuffle' in el][0])
                            shuffle_array[shift_dict[shift], shuffle-1] = np.array(h['/regression/'+key])
                    elif ('coefficients' in key and shift == 0):
                        if 'real' in key_parts:
                            real_weight = np.array(h['/regression/'+key])
                        else:
                            shuffle_weight.append(np.array(h['/regression/'+key]))    
                    else:
                        continue
                # average the shuffles and get the sem
                shuffle_mean = np.mean(shuffle_array, axis=1)
                shuffle_sem = stat.sem(shuffle_array, axis=1)
                # add the columns to the main list
                data[target_variable].append(list(real_array[:, 0]) + list(shuffle_mean) + list(shuffle_sem) + [animal, day])
                # take only the non time shift (need to check)
                if isinstance(real_weight, list):
                    continue
                shuffle_weight_mean = np.mean(shuffle_weight, axis=0)
                shuffle_weight_sem = stat.sem(shuffle_weight, axis=0)

                # store the weights
#                 weights[target_variable].append(list(real_weight) + list(shuffle_weight_mean) + list(shuffle_weight_sem) + [animal, day])
                
                
                temp_df = pd.DataFrame(np.vstack((real_weight, shuffle_weight_mean, shuffle_weight_sem)).T, columns=['weight', 'shuffle_mean', 'shuffle_sem'])
                temp_df['match_id'] = current_idx 
                temp_df['animal'] = animal
                temp_df['day'] = day
#                 print(temp_df)
#                 raise ValueError
                weights[target_variable].append(temp_df)

            
# for all the variables once more
for target_variable in variable_list:
    # turn the overall list into a dataframe
    data[target_variable] = pd.DataFrame(data[target_variable], columns=columns)
    # turn the weights into a dictionary
#     weights[target_variable] = {(el[-2], el[-1]): el[:-2] for el in weights[target_variable]}
    weights[target_variable] = pd.concat(weights[target_variable], axis=0)

    print(f'Shape of the data dictionary: {data[target_variable].shape}')
    print(f'Shape of the weights dataframe: {weights[target_variable].shape}')

#             dataframe = pd.concat([behavior, latents], axis=1)
#                 # add the results to the dataframe
#                 dataframe.loc[:, 'result'] = all_results[idx0][idx1]

                # store
#                 pre_data.append((files, dataframe))
#                 include_counter += 1

                    
# print(f'Number of matched trials: {unique[np.argmax(counts)].sum()}')
# print(f'Number of trials without latents: {exclude_counter}')
# print(f'Number of trials with latents: {include_counter}')

Dates present: ['03_02_2020' '03_04_2020' '03_05_2020' '03_06_2020' '03_10_2020'
 '03_11_2020' '03_12_2020' '03_13_2020' '03_19_2021' '03_22_2021'
 '03_23_2021' '03_24_2021' '03_25_2021' '03_26_2021' '03_29_2021'
 '03_30_2021' '03_31_2021' '04_01_2021' '04_02_2021' '04_05_2021'
 '04_06_2021' '04_07_2021' '04_08_2021' '04_09_2021' '04_12_2021'
 '04_13_2021' '04_14_2021' '04_15_2021' '04_16_2021' '04_21_2021'
 '04_22_2021' '04_23_2021' '04_26_2021' '04_27_2021' '04_28_2021'
 '04_29_2021' '04_30_2021' '05_03_2021' '05_04_2021' '05_05_2021'
 '05_06_2021' '05_07_2021' '08_03_2020' '08_04_2020' '08_05_2020'
 '08_06_2020' '08_07_2020' '08_08_2020' '08_09_2020' '08_10_2020'
 '08_11_2020' '08_12_2020' '08_13_2020' '08_14_2020' '08_15_2020'
 '08_16_2020' '08_17_2020' '08_18_2020' '08_19_2020' '08_20_2020'
 '08_21_2020' '08_29_2020' '08_30_2020' '08_31_2020' '09_01_2020'
 '09_02_2020' '09_03_2020' '09_04_2020' '09_05_2020' '09_06_2020'
 '09_07_2020' '09_08_2020' '12_07_2019' '12_09_2019' '12_10_2

DG_200701_a 08_16_2020 J:\Drago Guggiana Nilo\Prey_capture\AnalyzedData\08_16_2020_15_35_26_miniscope_DG_200701_a_succ_preproc.hdf5
DG_200701_a 09_02_2020 J:\Drago Guggiana Nilo\Prey_capture\AnalyzedData\09_02_2020_16_55_43_miniscope_DG_200701_a_succ_preproc.hdf5
DG_200701_a 08_17_2020 J:\Drago Guggiana Nilo\Prey_capture\AnalyzedData\08_17_2020_15_34_07_miniscope_DG_200701_a_succ_injured_preproc.hdf5
DG_200701_a 08_21_2020 J:\Drago Guggiana Nilo\Prey_capture\AnalyzedData\08_21_2020_15_42_56_miniscope_DG_200701_a_succ_preproc.hdf5
DG_200617_b 08_17_2020 J:\Drago Guggiana Nilo\Prey_capture\AnalyzedData\08_17_2020_16_25_53_miniscope_DG_200617_b_succ_injured_preproc.hdf5
DG_200701_a 08_11_2020 J:\Drago Guggiana Nilo\Prey_capture\AnalyzedData\08_11_2020_15_33_47_miniscope_DG_200701_a_succ_preproc.hdf5
DG_200617_b 08_21_2020 J:\Drago Guggiana Nilo\Prey_capture\AnalyzedData\08_21_2020_16_03_54_miniscope_DG_200617_b_succ_preproc.hdf5
MM_200129_a 03_05_2020 J:\Drago Guggiana Nilo\Prey_capture\A

In [102]:
# count the cells per animal

animal_counts = weights[target_variable].groupby(['animal', 'day'])[['day']].count()
# print(animal_counts)

for el in np.arange(animal_counts.shape[0]):
    print(animal_counts.iloc[el, :])

day    15
Name: (DG_200617_b, 2020-08-06 00:00:00), dtype: int64
day    29
Name: (DG_200617_b, 2020-08-07 00:00:00), dtype: int64
day    21
Name: (DG_200617_b, 2020-08-08 00:00:00), dtype: int64
day    29
Name: (DG_200617_b, 2020-08-09 00:00:00), dtype: int64
day    31
Name: (DG_200617_b, 2020-08-10 00:00:00), dtype: int64
day    36
Name: (DG_200617_b, 2020-08-11 00:00:00), dtype: int64
day    9
Name: (DG_200617_b, 2020-08-12 00:00:00), dtype: int64
day    25
Name: (DG_200617_b, 2020-08-13 00:00:00), dtype: int64
day    15
Name: (DG_200617_b, 2020-08-15 00:00:00), dtype: int64
day    31
Name: (DG_200617_b, 2020-08-16 00:00:00), dtype: int64
day    11
Name: (DG_200617_b, 2020-08-18 00:00:00), dtype: int64
day    21
Name: (DG_200617_b, 2020-08-19 00:00:00), dtype: int64
day    64
Name: (DG_200701_a, 2020-08-05 00:00:00), dtype: int64
day    163
Name: (DG_200701_a, 2020-08-14 00:00:00), dtype: int64
day    93
Name: (DG_200701_a, 2020-08-17 00:00:00), dtype: int64
day    83
Name: (DG_20070

In [4]:
# plot the average kernel per animal compared to shuffle

overlay_list = []
# for all the target variables
for target_variable in variable_list:
    # average across days for each animal
    averages = data[target_variable].groupby(['mouse',], as_index=False)[reals+shuffle_means+shuffle_sems].mean()
    sems = data[target_variable].groupby(['mouse',], as_index=False)[reals+shuffle_means+shuffle_sems].sem().fillna(0)

    # allocate a list for the plots
    plot_list = []
    # for all the mice
    for idx, (mouse, df)  in enumerate(averages.groupby(['mouse'])):
        real_plot = hv.Curve((time_shifts, df.loc[:, reals].to_numpy().flatten()))
        real_plot.opts(width=400, height=400, title=target_variable)
        real_sem = hv.Spread((time_shifts, df.loc[:, reals].to_numpy().flatten(), sems.loc[idx, reals].to_numpy().flatten()))
        real_sem.opts(color='red')
        shuffle_plot = hv.Curve((time_shifts, df.loc[:, shuffle_means].to_numpy().flatten()))
        shuffle_plot.opts(color='black')
        shuffle_error = hv.Spread((time_shifts, df.loc[:, shuffle_means].to_numpy().flatten(), sems.loc[idx, shuffle_means].to_numpy().flatten()))
        shuffle_error.opts(color='black')

        plot_list.append(real_plot*real_sem*shuffle_plot*shuffle_error)
    

    overlay_list.append(hv.Overlay(plot_list))

hv.Layout(overlay_list).cols(3)


In [5]:
# Average across mice and time

overlay_list = []
# for all the target variables
for target_variable in variable_list:
    # average across days for each animal
#     averages = data[target_variable].groupby(, as_index=False)[reals+shuffle_means+shuffle_sems].mean()
    averages = data[target_variable].loc[:, reals+shuffle_means+shuffle_sems].mean(axis=0)
    sems = data[target_variable].loc[:, reals+shuffle_means+shuffle_sems].sem(axis=0)
    
    real_plot = hv.Curve((time_shifts, averages.loc[reals].to_numpy().flatten()))
    real_sem = hv.Spread((time_shifts, averages.loc[reals].to_numpy().flatten(), sems.loc[reals].to_numpy().flatten()))
    real_plot.opts(width=400, height=400, title=target_variable)
    shuffle_plot = hv.Curve((time_shifts, averages.loc[shuffle_means].to_numpy().flatten()))
    shuffle_error = hv.Spread((time_shifts, averages.loc[shuffle_means].to_numpy().flatten(), sems.loc[shuffle_means].to_numpy().flatten()))

    overlay_list.append(real_plot*real_sem*shuffle_plot*shuffle_error)
    


hv.Layout(overlay_list).cols(3)
    

In [26]:
# plot the distributions per variable

importlib.reload(processing_parameters)
variable_list = processing_parameters.variable_list
label_dict = processing_parameters.label_dictionary

# define the target time point
tpoint = 0
real_list = []
# shuffle_list = []

# def format_whisker()

# define the variables to include
include_variables = variable_list
# for all the variables
for target_variable in include_variables:
    # get only the 0 lag value for each day
    real_data = data[target_variable].loc[:, ['real_'+str(tpoint)]]
    real_data['Feature'] = label_dict[target_variable]
    real_list.append(real_data)
    
    shuffle_data = data[target_variable].loc[:, ['smean_'+str(tpoint)]]
    shuffle_data = shuffle_data.rename({'smean_'+str(tpoint): 'real_'+str(tpoint)}, axis=1)
    shuffle_data['Feature'] = label_dict[target_variable]+' shuffle'
    real_list.append(shuffle_data)
    
    x_mwu = real_data.loc[:, 'real_'+str(tpoint)].to_numpy()
    x_mwu = x_mwu[~np.isnan(x_mwu)]
    y_mwu = shuffle_data.loc[:, 'real_'+str(tpoint)].to_numpy()
    y_mwu = y_mwu[~np.isnan(y_mwu)]
    test = stat.mannwhitneyu(x_mwu, y_mwu)
    print(test)

# get the number of plots
number_plots = len(real_list)
# get the half index
half_index = int(np.ceil(number_plots/2)) + 1
# generate 2 plots with half of the variables each
violin0 = hv.BoxWhisker(pd.concat(real_list, axis=0), ['Feature'], ['real_'+str(tpoint)])
violin0.opts(width=2000, height=800, xrotation=45, ylabel='CC (a.u.)')
violin0
# violin1 = hv.BoxWhisker(pd.concat(real_list[half_index:], axis=0), ['Feature'], ['real_'+str(tpoint)])
# violin1.opts(width=100, height=800, xrotation=45, ylabel='CC (a.u.)')

# (violin0+violin1).cols(1).opts(shared_axes=False)
# violin0
# violin1 = hv.Violin(pd.concat(shuffle_list, axis=0), ['Feature'], ['smean_'+str(tpoint)])
# violin1.opts(width=800, height=800, xrotation=45)

# (violin0*violin1)

MannwhitneyuResult(statistic=4766.0, pvalue=1.9832262234348574e-16)
MannwhitneyuResult(statistic=2961.0, pvalue=0.24665486377290435)
MannwhitneyuResult(statistic=3673.0, pvalue=7.973157728575959e-05)
MannwhitneyuResult(statistic=3309.0, pvalue=0.011694100840252073)
MannwhitneyuResult(statistic=3154.0, pvalue=0.055566228803801423)
MannwhitneyuResult(statistic=2205.0, pvalue=0.0946768080134388)
MannwhitneyuResult(statistic=3388.0, pvalue=0.004647151904965428)
MannwhitneyuResult(statistic=3299.0, pvalue=0.01306130243209943)
MannwhitneyuResult(statistic=2781.0, pvalue=0.6497302468981812)
MannwhitneyuResult(statistic=3378.0, pvalue=0.00524844044759598)
MannwhitneyuResult(statistic=2505.0, pvalue=0.5336257358705493)
MannwhitneyuResult(statistic=2857.0, pvalue=0.45224653927421365)
MannwhitneyuResult(statistic=2992.0, pvalue=0.20047605305712002)
MannwhitneyuResult(statistic=3451.0, pvalue=0.002089764645699112)
MannwhitneyuResult(statistic=3146.0, pvalue=0.05968648826465379)


In [7]:
# plot the CC over time

# initialize a list for the plots
time_plot = []

# define the target timepoint
tpoint = 10
# for all the variables
for target_variable in variable_list:
    temp_list = []
    # for all the mice
    for mouse, df_ori in data[target_variable].groupby(['mouse']):
        # copy to not mess the original dataframe
        df = df_ori.copy()
        df = df.sort_values(['day'], axis=0).reset_index(drop=True)
        # get the delta time
        delta_time = [(el-df['day'][0]).days for el in df['day']]

        real_plot = hv.Curve((delta_time, df.loc[:, 'real_'+str(tpoint)]))
        real_plot.opts(title=target_variable, width=400, height=400, xlabel='Time (days)', ylabel='CC (a.u.)')
        shuffle_plot = hv.Curve((delta_time, df.loc[:, 'smean_'+str(tpoint)]))
        shuffle_error = hv.Spread((delta_time, df.loc[:, 'smean_'+str(tpoint)], df.loc[:, 'ssem_'+str(tpoint)]))
        
        

        temp_list.append(real_plot*shuffle_plot*shuffle_error)
#         temp_list.append(real_plot)
    time_plot.append(hv.Overlay(temp_list))

hv.Layout(time_plot).cols(3)
    
    
    

In [8]:
# plot the weight distributions
# put all the cells in a single dataframe with variables vs cells
# print(weights['cricket_0_mouse_distance'].keys())

# need to iterate through the variables, and concatenate the weights in order per day

# allocate memory for the output dataframe
output_df = []

# for all the variables
for idx, target_feature in enumerate(weights.keys()):
    # get the data
    current_feature = weights[target_feature]
    temp_list = []
    # iterate through the items
    for mouse, day in current_feature.keys():
#         print(mouse)
#         print(day)
#         print(current_feature[mouse, day])
        # build a data frame from the day, mouse and weights
        df = pd.DataFrame(current_feature[mouse, day], columns=[target_feature])
        if idx == 0:
#             df['mouse'] = mouse
#             df['day'] = day
#             df['feature'] = target_feature
            df.insert(0, 'mouse', mouse)
            df.insert(0, 'day', day)        
        # store and continue accumulating
        temp_list.append(df)
    
    temp_df = pd.concat(temp_list, axis=0).reset_index(drop=True)
    output_df.append(temp_df)
# concatenate into a single dataframe
output_df = pd.concat(output_df, axis=1)


# eliminate rows with nans
output_df = output_df.iloc[~np.any(np.isnan(output_df.drop(['mouse', 'day'], axis=1).to_numpy()), axis=1), :]
print(output_df.columns)
print(output_df.shape)
    

Index(['day', 'mouse', 'mouse_speed', 'mouse_x', 'mouse_angular_speed',
       'cricket_0_mouse_distance', 'cricket_0_delta_heading', 'cricket_0_x',
       'cricket_0_visual_angle', 'hunt_trace', 'cricket_0_direction',
       'cricket_0_loom', 'cricket_0_delta_visual', 'motifs', 'latent_0',
       'latent_1', 'latent_2'],
      dtype='object')
(36789, 17)


In [None]:
# new loading scheme



In [94]:
# recalculate the day column
output_df = []

# for all the variables
for target_feature in weights.keys():
    # get the dataframe for this feature
    current_feature = weights[target_feature]
    
    # get only the weights and store in the output dataframe
    output_df.append(current_feature.loc[:, ['weight']].to_numpy())    
# convert to dataframe
output_df = pd.DataFrame(np.hstack(output_df), columns=weights.keys())
print(output_df.shape)



mouse_speed (12425, 1)
mouse_x (12425, 1)
mouse_angular_speed (12425, 1)
cricket_0_mouse_distance (12425, 1)
cricket_0_delta_heading (12425, 1)
cricket_0_x (12425, 1)
cricket_0_visual_angle (12425, 1)
hunt_trace (12425, 1)
cricket_0_direction (12425, 1)
cricket_0_loom (12425, 1)
cricket_0_delta_visual (12425, 1)
motifs (12425, 1)
latent_0 (12425, 1)
latent_1 (12425, 1)
latent_2 (12425, 1)
(12425, 15)


In [95]:
# calculate the correlation matrix for the variables

feature_matrix = output_df.loc[:, variable_list]
correlation_matrix, pvalue_matrix = stat.spearmanr(feature_matrix)

print(correlation_matrix.shape)


(15, 15)


In [96]:
ticks = [(idx+0.5, el) for idx, el in enumerate(variable_list)]

raster = hv.Raster(correlation_matrix)
raster.opts(width=800, height=600, yticks=ticks, xticks=ticks, xrotation=45, colorbar=True, cmap='RdBu', clim=(-1, 1), tools=['hover'])

In [12]:
%%time
# perform a umap decomposition of the weights across variables

# format the data
umap_data = output_df.drop(['mouse', 'day'], axis=1).to_numpy()

# run the decomposition
reducer = umap.UMAP(min_dist=0.5, n_neighbors=30)
embedded_data = reducer.fit_transform(umap_data)

Wall time: 29.1 s


In [13]:
# plot the decomposition

# define the interval between points
interv = 5

umap_list = []
# target_key = 'cricket_0_mouse_distance'
# for all the variables
for target_key in variable_list+['mouse', 'day']:
    # if target_key in ['mouse', 'day']:
    counts, raw_labels = np.unique(output_df.loc[:, target_key].to_numpy(), return_inverse=True)
    raw_labels = (raw_labels - raw_labels.min())/(raw_labels.max() - raw_labels.min())
    # else:
    #     counts, raw_labels = np.unique(output_df.loc[:, target_key].to_numpy(), return_inverse=True)
    #     raw_labels = output_df.loc[:, target_key].to_numpy()

    compiled_labels = np.expand_dims(raw_labels, axis=1)

    umap_data = np.concatenate((embedded_data,compiled_labels),axis=1)

    compiled_labels = compiled_labels[::interv]
    umap_data = umap_data[::interv, :]


    umap_plot = hv.Scatter(umap_data, vdims=['Dim 2', target_key], kdims=['Dim 1'])
    umap_plot.opts(color=target_key, colorbar=True, cmap='Spectral', size=3, title=target_key)

    umap_plot.opts(height=600, width=800)
    umap_list.append(umap_plot)

hv.Layout(umap_list).cols(2)