In [None]:
# imports
import os
import sys
sys.path.insert(0, os.path.abspath(r'C:\Users\mmccann\repos\bonhoeffer\prey_capture'))


import panel as pn
import holoviews as hv
from holoviews import opts, dim
from holoviews.operation import histogram
hv.extension('bokeh')
from bokeh.resources import INLINE

import paths
import functions_bondjango as bd
import functions_misc as fm
import functions_plotting as fp
import pandas as pd
import numpy as np
import sklearn.mixture as mix
import sklearn.decomposition as decomp
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_predict, train_test_split
from sklearn import svm, datasets
from sklearn import preprocessing
import sklearn.linear_model as lin
import sklearn.metrics as smet
import scipy.signal as ss
import scipy.stats as stat
import scipy.optimize as opt
from sklearn.neighbors import KernelDensity as kds

import random
# import functions_data_handling as fd
# import functions_vame as fv
import importlib
import processing_parameters
# import PSID
# from PSID.evaluation import evalPrediction
# import sklearn.cross_decomposition as cros
# import umap
np.seterr(divide='ignore', invalid='ignore')


In [None]:
def maxmin(array_in):
    """Rescale the input to the [0, 1] range (min-max normalisation).

    The extrema are found with the NaN-aware numpy reductions, so NaN entries
    do not affect the scaling and stay NaN in the output. A constant input has
    zero range and therefore yields NaN (0/0).
    """
    # lowest and highest non-NaN values
    lowest = np.nanmin(array_in)
    highest = np.nanmax(array_in)
    # shift to zero, then divide by the full range
    return (array_in - lowest) / (highest - lowest)

In [None]:
%%time
# Load the desired files and keep, for each one, the cells matched across the days in the search
importlib.reload(processing_parameters)

# define the threshold for matched cells
match_threshold = 10
# allocate memory for the data (done up front so the summary at the end works in every branch)
pre_data = []
# get the data paths
try:
    data_path = snakemake.input[0]
except NameError:
    # get the search list
    search_list = processing_parameters.search_string

    # allocate a list for all paths (need to preload to get the dates)
    all_paths = []
    # for all the search strings
    for search_string in search_list:

        # query the database for data to plot
        data_all = bd.query_database('analyzed_data', search_string)
        data_all = [el for el in data_all if 'preproc' in el['slug']]
        data_path = [el['analysis_path'] for el in data_all if '_preproc' in el['slug']]
        all_paths.append(data_path)
    # get the dates present (flattened by hand: np.concatenate raises when there is nothing to join)
    data_dates = np.unique([os.path.basename(file_path)[:10]
                            for path_group in all_paths for file_path in path_group])
    print(f'Dates present: {data_dates}')
    # now load the files
    for data_path in all_paths:
        # load the calcium data
        beh_data = []
        # for all the files
        for files in data_path:
            # load the data; open read-only, since the HDFStore default mode ('a') opens the
            # file for writing and would create it if the path were wrong
            with pd.HDFStore(files, mode='r') as h:
                beh_data.append(h['full_traces'])
                if '/matched_calcium' in h.keys():
                    # get the cell matches
                    cell_matches = h['cell_matches']

                    # perform only if there are more files
                    if len(data_dates) > 1:
                        match_dates = [el for el in data_dates if el in cell_matches.columns]
                        # get only the days present in the search
                        cell_matches = cell_matches[match_dates]

                        # get the unique cell combinations
                        # unique contains the unique patterns followed by cells across days
                        # inverse indicates which pattern is followed by each cell
                        # count contains the number of times each pattern is found
                        unique, inverse, counts = np.unique(~np.isnan(cell_matches.to_numpy()), axis=0,
                                                            return_counts=True, return_inverse=True)
                        # remove the single day and no day cases
                        days_in_pattern = np.sum(unique, axis=1)
                        counts[days_in_pattern == 0] = 0
                        counts[days_in_pattern == 1] = 0

                        # get an index vector with only the most popular pattern
                        # (regardless of how many cells share it); ravel so the mask is
                        # one boolean per cell whatever shape numpy returns for inverse
                        cell_idx = np.ravel(inverse) == np.argmax(counts)

                        cell_matches = cell_matches.iloc[cell_idx, :]
                    else:
                        counts = 1
                        # collapse the (cells x dates) NaN mask to one boolean per cell,
                        # which is what row selection with iloc needs
                        cell_idx = ~np.isnan(cell_matches[data_dates].to_numpy()).any(axis=1)
                        cell_matches = cell_matches.iloc[cell_idx, :]
                        unique = np.array([[1]])
                    # concatenate the latents
                    dataframe = pd.concat([h['matched_calcium'], h['latents']], axis=1)
                    # store
                    pre_data.append((files, dataframe, cell_matches))

# summary for the last file loaded; the variables only exist if at least one file had matches
if pre_data:
    print(f'Number of matched cells: {np.sum(cell_idx)}')
    print(f'Number of matched trials: {unique[np.argmax(counts)].sum()}')
else:
    print('No files with matched calcium data were loaded')

In [None]:
# Leave only common cells across all datasets
# For every loaded file: split behavior/latent columns from the calcium columns, rename the
# cells to a common scheme, clip each trace below its baseline and store the result in data.

# allocate memory for the cleaned up data
data = []
trial_dates = []

# for all the normal trials
for file_path, current_df, current_matches in pre_data:
    # get the date
    current_date = os.path.basename(file_path)[:10]
    # get the corresponding indexes
    current_idx = current_matches[current_date].to_numpy()
    # skip the day if the match column contains nans (the sum is nan in that case)
    if np.isnan(np.sum(current_idx)):
        continue
    # store the date
    trial_dates.append(current_date)
    # split the columns in cells and everything else
    labels = list(current_df.columns)
    cells = [label for label in labels if 'cell' in label]
    not_cells = [label for label in labels if 'cell' not in label]
    # get the non-cell data
    non_cell_data = current_df[not_cells]
    # get the current calcium data (explicit copy, so the edits below never touch pre_data
    # and do not trigger chained-assignment warnings)
    cell_data = current_df[cells].copy()
    # remove the non matched cells
#     cell_data = cell_data.iloc[:, current_idx]
    # rename the cell fields
    cell_data.columns = ['cell_' + str(number) for number in np.arange(cell_data.shape[1])]
    # normalize the single trial activity
#     cell_data = (cell_data-cell_data.mean())/cell_data.std()
#     cell_data = cell_data/cell_data.std()
#     cell_data = (cell_data-cell_data.min())/(cell_data.max()-cell_data.min())

    # calculate a baseline for all cells
    for name in cell_data.columns:
        single = cell_data[name]
        # skip if there are only zeros
        if np.sum(single) == 0:
            continue
        # the baseline is defined on the positive samples only; without any there is
        # nothing to take a percentile of
        positive = single[single > 0]
        if positive.shape[0] == 0:
            continue
        # get the baseline
        baseline = np.percentile(positive, 8)
        # get the dF/F
#         single = (single-baseline)/baseline
        # clip the trace (mask returns a new series instead of editing in place)
        cell_data[name] = single.mask(single < baseline, 0)

    # remove the nans after normalization
    cell_data = cell_data.fillna(0)
    # assemble a new data frame with only the matched cells and the rest of the data
    data.append(pd.concat((non_cell_data, cell_data), axis=1))

print(data[0].shape)
print(data[0].columns)


In [None]:
# set up the feature and calcium matrices
# For every trial in data: select the behavioral/latent features, derive speeds and
# accelerations, median-filter the original features and collect the calcium matrix.

# list the radial features in the dataset
radial_features = ['cricket_0_delta_heading', 'cricket_0_visual_angle', 'mouse_heading', 
                   'cricket_0_delta_head', 'cricket_0_heading', 'head_direction']
# define the design matrix
feature_list = ['mouse_speed', 'cricket_0_speed', 'mouse_x', 'mouse_y', 'cricket_0_x', 'cricket_0_y',
                'cricket_0_delta_heading', 'cricket_0_mouse_distance', 'cricket_0_visual_angle',
               'mouse_heading', 'cricket_0_delta_head', 'cricket_0_heading', 'head_direction',
               'latent_0', 'latent_1', 'latent_2', 'latent_3', 'latent_4',
               'latent_5', 'latent_6', 'latent_7', 'latent_8', 'latent_9']
# feature_list = ['mouse_speed']

# define the frame rate (fps)
frame_rate = 10
# define the width of the kernel (s), multiplied to convert to frames
sigma = 1*frame_rate
# calculate the kernel (window functions live in scipy.signal.windows; the
# scipy.signal.gaussian alias is gone from current SciPy releases)
kernel = ss.windows.gaussian(sigma*5, sigma)
# define the number of basis functions per regressor
basis_number = 9
# define the kernel spacing (in s)
kernel_spacing = 0.2*frame_rate
# get the total length of the kernel
total_length = kernel_spacing*(basis_number-1) + kernel.shape[0]
# # get the start positions of the basis functions (assume sigma defines the interval)
# basis_starts = [int(el) for el in np.arange(-sigma*((basis_number-1)/2), 
#                                        sigma*((basis_number-1)/2)+1, sigma)]
# allocate memory for the output
feature_trials = []
# allocate memory for a data frame without the encoding model features
feature_raw_trials = []
# allocate memory for the calcium
calcium_trials = []
# remember which trials are kept, to keep the per-trial lists aligned afterwards
kept_trials = []
# get the number of trials
trial_number = len(data)
# get the features
for idx, el in enumerate(data):
    # skip the trials that do not have all the features
    missing_features = [feat for feat in feature_list if feat not in el.columns]
    if len(missing_features) > 0:
        print(f'Skipping trial {idx}, missing features: {missing_features}')
        continue
    kept_trials.append(idx)
    # get the features of interest (copied, since new columns are added below)
    target_features = el.loc[:, feature_list].copy()
    # get the original columns (the derived ones added below are not iterated)
    original_columns = list(target_features.columns)

    # for all the columns
    for label in original_columns:
        # calculate head speed
        if label == 'head_direction':
            # get the head direction
            head = target_features[label].copy().to_numpy()
            # get the angular speed from the smoothed head direction
            speed = np.concatenate(([0], np.diff(ss.medfilt(head, 21))), axis=0)
            # the acceleration is the change in speed (differencing the raw angle
            # would only give an unsmoothed speed)
            acceleration = np.concatenate(([0], np.diff(speed)), axis=0)
            # add to the features
            target_features['head_speed'] = speed
            target_features['head_acceleration'] = acceleration
#         # check if the feature is radial
#         if label in radial_features:
#             # get the feature
#             rad_feature = target_features[label].copy().to_numpy()
#             # convert to radians
#             rad_feature = np.deg2rad(rad_feature)
#             # perform angular decomposition (assume unit circle)
#             x = np.cos(rad_feature)
#             y = np.sin(rad_feature)
#             # replace the original column by the extracted ones
#             target_features[label+'_x'] = x
#             target_features[label+'_y'] = y
#             # drop the original column
#             target_features.drop(labels=label, axis=1, inplace=True)
        # check if the label is a speed and calculate acceleration
        if 'speed' in label:
            # get the speed
            speed = target_features[label].copy().to_numpy()
            # calculate the acceleration with the smoothed speed
            acceleration = np.concatenate(([0], np.diff(ss.medfilt(speed, 21))), axis=0)
            # add to the features
            target_features[label.replace('speed', 'acceleration')] = acceleration
        # smooth the feature
        target_features[label] = ss.medfilt(target_features[label], 21)

    # store the features
    feature_raw_trials.append(target_features)

    # get the calcium data
    cells = [cell for cell in el.columns if 'cell' in cell]
    cells = el.loc[:, cells].to_numpy()

    # store
    calcium_trials.append(cells)

# drop the skipped trials from the upstream lists too, so that data, trial_dates,
# feature_raw_trials and calcium_trials can keep being indexed together
if len(kept_trials) != len(data):
    data = [data[kept] for kept in kept_trials]
    trial_dates = [trial_dates[kept] for kept in kept_trials]

print(f'Time by features: {feature_raw_trials[0].shape}')
print(f'Time by ROIs: {calcium_trials[0].shape}')
print(feature_raw_trials[0].columns)

In [None]:
# combine trials for single days
# NOTE: this cell replaces the per-trial lists with per-day ones, so it must run exactly
# once after the feature cell; the check below catches a second run or misaligned lists.

if not (len(feature_raw_trials) == len(calcium_trials) == len(trial_dates)):
    raise ValueError(f'Per-trial lists are not aligned with trial_dates '
                     f'({len(feature_raw_trials)} feature sets, {len(calcium_trials)} calcium sets, '
                     f'{len(trial_dates)} dates); re-run the feature cell before merging')

# get the unique dates
unique_dates = np.unique(trial_dates)
print(unique_dates)
# allocate memory for the output
merged_features = []
merged_calcium = []
# for all the unique dates
for dates in unique_dates:
    # find the trials recorded on that date (exact match, not substring)
    day_trials = [idx for idx, trial_date in enumerate(trial_dates) if trial_date == dates]
    # accumulate the trials for that date
    day_features = [feature_raw_trials[idx] for idx in day_trials]
    day_calcium = [calcium_trials[idx] for idx in day_trials]

    # concatenate along time and store
    merged_features.append(pd.concat(day_features))
    merged_calcium.append(np.vstack(day_calcium))
# replace the original list
feature_raw_trials = merged_features
calcium_trials = merged_calcium
    

In [None]:
%%time
# Calculate the half and full tuning of each cell
# For every trial and feature this produces:
#   tc_all  - tuning curves for the first and second half of the trial (per cell)
#   tc_full - tuning curve over the whole trial (per cell)
#   tc_resp - per cell, [relative distance of the TC peak to the shuffle threshold,
#             flag set when any bin exceeds the threshold]
# A tuning curve is the summed activity per feature bin divided by the number of
# samples in that bin; bins with fewer than 3 samples are zeroed.

# define the number of calcium shuffles
shuffle_number = 100
# define the confidence interval cutoff
percentile = 95
# define the number of bins for the TCs
bins = 10
# allocate memory for the output
tc_all = []
tc_full = []
tc_resp = []
# get the number of features
feature_number = feature_raw_trials[0].shape[1]
# for all the trials
for idx, trial in enumerate(feature_raw_trials):
    # get the calcium data
    current_calcium = calcium_trials[idx]
    # get the number of cells
    cell_number = current_calcium.shape[1]
    # convert the features once per trial instead of once per feature
    trial_array = trial.to_numpy()
    # length of each half (the last sample is dropped for odd lengths)
    half_bound = int(np.floor(trial_array.shape[0]/2))
    # allocate memory for the trial TCs
    tc_trial = {}
    tc_trial_full = {}
    tc_trial_resp = {}
    # for all the features
    for feature in np.arange(feature_number):
        # get the current feature
        current_feature = trial_array[:, feature]
        # get the name
        feature_name = trial.columns[feature]
        # allocate a list for the 2 halves
        tc_half = []
        # exclude nan values
        keep_vector_full = ~np.isnan(current_feature)
        counts_feature = current_feature[keep_vector_full]
        # get the counts, and keep the bin edges so that the halves, the full TC and
        # the shuffles are all binned identically
        feature_counts, bin_edges, _ = stat.binned_statistic(counts_feature, counts_feature,
                                                             statistic='count', bins=bins)
        # zero the positions with less than 3 counts
        feature_counts[feature_counts<3] = 0
        # for first and second half
        for half in np.arange(2):
            # get the half vector
            half_vector = np.arange(half_bound) + half_bound*half
            half_feature = current_feature[half_vector]
            # exclude nan values
            keep_vector = ~np.isnan(half_feature)
            keep_feature = half_feature[keep_vector]
            # occupancy of this half: the summed activity of a half has to be divided
            # by the samples of that same half, not by those of the whole trial
            half_counts = stat.binned_statistic(keep_feature, keep_feature,
                                                statistic='count', bins=bin_edges)[0]
            half_counts[half_counts<3] = 0

            # allocate a list for the cells
            tc_cell = []
            # for all the cells
            for cell in np.arange(cell_number):
                # get the current cell
                half_cell = current_calcium[half_vector, cell]
                keep_cell = half_cell[keep_vector]

                # calculate the TC
                current_tc = stat.binned_statistic(keep_feature, keep_cell,
                                                   statistic='sum', bins=bin_edges)[0]
                # normalize the TC
                norm_tc = current_tc/half_counts
                # remove nans and infs (0/0 and x/0 in the zeroed bins)
                norm_tc[~np.isfinite(norm_tc)] = 0
                # store
                tc_cell.append(norm_tc)
            # store the cells
            tc_half.append(tc_cell)
        # allocate memory for the full tc per cell
        tc_cell_full = []
        tc_cell_resp = np.zeros((cell_number, 2))
        # calculate the full TC
        for cell in np.arange(cell_number):
            keep_cell = current_calcium[keep_vector_full, cell]
            full_tc = stat.binned_statistic(counts_feature, keep_cell,
                                            statistic='sum', bins=bin_edges)[0]
            full_tc = full_tc/feature_counts
            full_tc[~np.isfinite(full_tc)] = 0
            # allocate memory for the shuffles
            shuffle_array = np.zeros((shuffle_number, bins))
            # generate the shuffles
            for shuffle in np.arange(shuffle_number):
                # randomize the calcium activity with respect to the feature
                random_cell = keep_cell.copy()
                np.random.shuffle(random_cell)
                tc_random = stat.binned_statistic(counts_feature, random_cell,
                                                  statistic='sum', bins=bin_edges)[0]
                tc_random = tc_random/feature_counts
                tc_random[~np.isfinite(tc_random)] = 0
                shuffle_array[shuffle, :] = tc_random
            # get the threshold from the pooled bins of all shuffles
            resp_threshold = np.percentile(np.abs(shuffle_array.flatten()), percentile)
            # threshold the TC
#             full_tc[np.abs(full_tc)<resp_threshold] = 0
            # fill up the responsivity matrix
            tc_cell_resp[cell, 0] = np.abs(np.max(np.abs(full_tc)) - resp_threshold)/resp_threshold
            tc_cell_resp[cell, 1] = np.sum(np.abs(full_tc)>resp_threshold) > 0
            # store
            tc_cell_full.append(full_tc)
        # store the halfs and fulls
        tc_trial[feature_name] = tc_half
        tc_trial_full[feature_name] = tc_cell_full
        tc_trial_resp[feature_name] = tc_cell_resp
    # store the trials
    tc_all.append(tc_trial)
    tc_full.append(tc_trial_full)
    tc_resp.append(tc_trial_resp)



In [None]:
%%time
# calculate consistency
# For every trial, feature and cell: correlate the tuning curves of the two halves and
# compare the value against a null built by pairing the first half with the second
# half of randomly drawn cells. Output per feature is a (cells x 2) array holding
# [correlation, passes-the-percentile flag].

# define the number of shuffles
shuffle_number = 100
# define the percentile
percentile = 95
# allocate memory for the output
tc_cons = []
# get the number of features
feature_number = feature_raw_trials[0].shape[1]
# for all the trials
for idx, trial in enumerate(feature_raw_trials):
    # get the number of cells
    cell_number = calcium_trials[idx].shape[1]
    # allocate memory for the trial TCs
    tc_trial = {}
    # for all the features
    for feature in range(feature_number):
        # get the name
        feature_name = trial.columns[feature]
        # get the two halves
        halves = tc_all[idx][feature_name]
        first_halves = halves[0]
        second_halves = halves[1]
        # allocate an array for the correlations and tests
        tc_half = np.zeros([cell_number, 2])

        # calculate the real and shuffle correlation
        for cell in range(cell_number):
            # get the current cell first and second half
            current_first = first_halves[cell]
            current_second = second_halves[cell]
            # real correlation
            real_correlation = np.corrcoef(current_first, current_second)[1][0]

            # null distribution: first half against the second half of a random cell
            shuffle_array = np.zeros([shuffle_number, 1])
            for shuffle in range(shuffle_number):
                random_second = second_halves[random.randrange(cell_number)]
                shuffle_array[shuffle] = np.corrcoef(current_first, random_second)[1][0]
            # turn nans into 0
            shuffle_array = np.where(np.isnan(shuffle_array), 0, shuffle_array)
            # get the confidence interval
            conf_interval = np.percentile(shuffle_array, percentile)
            # store the correlation and whether it passes the criterion
            tc_half[cell, 0] = real_correlation
            tc_half[cell, 1] = real_correlation > conf_interval

        # store for the variable
        tc_trial[feature_name] = tc_half
    # store for the trial
    tc_cons.append(tc_trial)

In [None]:
# %%time
# # calculate responsivity
# # define the number of shuffles
# shuffle_number = 1000
# # define the percentile
# percentile = 80
# # allocate memory for the output
# tc_resp = []
# # get the number of features
# feature_number = feature_raw_trials[0].shape[1]
# # for all the trials
# for idx, trial in enumerate(feature_raw_trials):

#     # get the number of cells
#     cell_number = calcium_trials[idx].shape[1]
#     # allocate memory for the trial TCs
#     tc_trial = {}
#     # for all the features
#     for feature in np.arange(feature_number):

#         # get the name
#         feature_name = trial.columns[feature]
#         # allocate an array for the correlations and tests
#         tc_resp_cell = np.zeros([cell_number, 2])
#         # get the current TCs
#         trial_tcs = np.array(tc_full[idx][feature_name])
#         # build a distribution with the data for the whole trial
#         null_estimator = kds().fit(trial_tcs.flatten().reshape(-1, 1))
# #         null_estimator = kds().fit(trial_tcs)
#         # allocate memory for the ref cosim
#         ref_cosim = np.zeros([shuffle_number, 1])
#         # get the unity line
#         unity_line = np.ones_like(trial_tcs[:1, :])
#         # generate samples from the distribution
#         for shuffle in np.arange(shuffle_number):
#             # generate a sample TC
#             sample_tc = null_estimator.sample(10).reshape(1, -1)
# #             sample_tc = null_estimator.sample(1)
#             # calculate the ref cosim and save
# #             ref_cosim[shuffle] = 1 - np.abs(smet.pairwise.cosine_similarity(sample_tc, unity_line)[0][0])
#         # get the threshold
#         cosim_threshold = np.percentile(ref_cosim, percentile)
#         # calculate the real and shuffle responsivity
#         for cell in np.arange(cell_number):    

#             # calculate the real cosim
#             current_tc = trial_tcs[cell:cell+1, :]
#             # calculate the cosine similarity to the unity line
#             real_cosim = 1 - np.abs(smet.pairwise.cosine_similarity(current_tc, unity_line)[0][0])
#             # determine if it passes the threshold
#             test_cosim = real_cosim > cosim_threshold
#             # store both values
#             tc_resp_cell[cell, 0] = real_cosim
#             tc_resp_cell[cell, 1] = test_cosim
# #             freq, edges = np.histogram(ref_cosim.flatten())
# #             hist = hv.Histogram((edges, freq)).opts(title=str(real_cosim))
# #             print(smet.pairwise.euclidean_distances(current_tc.T))
#             hist = hv.Image((smet.pairwise.euclidean_distances(current_tc.T)))
#             hist.opts(tools=['hover'])
#             raise ValueError
            
#         # store the feature
#         tc_trial[feature_name] = tc_resp_cell
#     # store the trial
#     tc_resp.append(tc_trial)
            

In [None]:
# %%time
# plot the results
# One scatter per feature of consistency against responsivity, with the cells that pass
# both criteria overlaid on the ones that do not.

# get the number of features
feature_number = feature_raw_trials[0].shape[1]
plot_list = []

# generate maps to identify the trial and cell of every stacked row; they only depend
# on the number of cells per trial, so they are built once instead of once per feature
reference_feature = feature_raw_trials[0].columns[0]
cells_per_trial = [el[reference_feature].shape[0] for el in tc_cons]
trial_map = np.repeat(np.arange(len(cells_per_trial)), cells_per_trial).astype(int)
cell_map = np.hstack([np.arange(count) for count in cells_per_trial]).astype(int)

# for all features
for feature in np.arange(feature_number):
    # get the name
    feature_name = feature_raw_trials[0].columns[feature]
    # collect data across trials
    across_cons = np.vstack([el[feature_name] for el in tc_cons])
    across_resp = np.vstack([el[feature_name] for el in tc_resp])

    # plot
    both_pass = (across_cons[:, 1]==1) & (across_resp[:, 1]==1)
    both_plot = hv.Scatter((across_cons[both_pass, 0], across_resp[both_pass, 0]), 
                      kdims=['Consistency'], vdims=['Responsivity'])
    both_plot.opts(title=feature_name, shared_axes=False)
    none_plot = hv.Scatter((across_cons[~both_pass, 0], across_resp[~both_pass, 0]), 
                      kdims=['Consistency'], vdims=['Responsivity'])
    plot_list.append(both_plot*none_plot)
hv.Layout(plot_list).opts(shared_axes=False)
    
    

In [None]:
def plot_map(feature_0, feature_1, current_calcium, target_features, cmap='kbc_r', bins=10):
    """Build a 2D map of the mean activity as a function of two features.

    Samples where either feature is NaN are dropped. The activity is summed per 2D
    bin and divided by the bin occupancy; bins with fewer than 3 samples, and any
    non-finite result, are set to 0.

    Parameters:
        feature_0, feature_1: per-sample values of the two features.
        current_calcium: per-sample activity, indexed like the features.
        target_features: passed to hv.Image as its kdims.
        cmap: colormap applied to the image.
        bins: bin specification forwarded to scipy's binned_statistic_2d.

    Returns:
        (plot, x_edge, y_edge, counts): the hv.Image, the bin edges along each
        feature, and the occupancy map after zeroing the bins below 3 samples.
    """
    # keep only the samples where both features are defined
    valid = ~(np.isnan(feature_0) | np.isnan(feature_1))
    feature_0 = feature_0[valid]
    feature_1 = feature_1[valid]
    current_calcium = current_calcium[valid]

    # occupancy of every 2D bin
    counts = stat.binned_statistic_2d(feature_0, feature_1, feature_1,
                                      statistic='count', bins=bins)[0]
    # summed activity per bin, plus the edges used for the axes
    binned = stat.binned_statistic_2d(feature_0, feature_1, current_calcium,
                                      statistic='sum', bins=bins)
    values, x_edge, y_edge = binned[0], binned[1], binned[2]

    # discard poorly sampled bins, then normalize by occupancy
    counts[counts < 3] = 0
    norm_map = values/counts
    # 0/0 gives nan and x/0 gives inf; both are mapped to 0
    norm_map[~np.isfinite(norm_map)] = 0

    plot = hv.Image((x_edge, y_edge, norm_map.T), kdims=target_features)
    plot.opts(tools=['hover'], cmap=cmap)
    return plot, x_edge, y_edge, counts

In [None]:
# Plot examples
# Select the cells that pass both the consistency and responsivity criteria for the first
# target feature, rank them by consistency and plot their full tuning curves.

# define the target variables (first is used for the thresholds)
target_features = ['cricket_0_mouse_distance', 'cricket_0_delta_heading']

# define the number of cells to plot
example_number = 20
# get the pass vector
across_cons_0 = np.vstack([el[target_features[0]] for el in tc_cons])
across_resp_0 = np.vstack([el[target_features[0]] for el in tc_resp])
both_pass_0 = (across_cons_0[:, 1]==1) & (across_resp_0[:, 1]==1)

across_cons_1 = np.vstack([el[target_features[1]] for el in tc_cons])
across_resp_1 = np.vstack([el[target_features[1]] for el in tc_resp])
both_pass_1 = (across_cons_1[:, 1]==1) & (across_resp_1[:, 1]==1)

both_pass = (both_pass_0)# & (both_pass_1)

# maps from every stacked row to its trial and cell; built here so the cell does not
# depend on variables left over from the plotting cell
cells_per_trial = [el[target_features[0]].shape[0] for el in tc_cons]
trial_map = np.repeat(np.arange(len(cells_per_trial)), cells_per_trial).astype(int)
cell_map = np.hstack([np.arange(count) for count in cells_per_trial]).astype(int)

# find the top cells
pass_cells = np.argwhere(both_pass).flatten()
# sort based on the consistency of the target feature, highest first
# (using the values of the same feature that defined the pass vector)
sort_idx = np.argsort(across_cons_0[both_pass, 0])
pass_cells = pass_cells[sort_idx[::-1]]
# get the corresponding trial and cell idx
trial_idx = trial_map[pass_cells]
cell_idx = cell_map[pass_cells]
print(f'Trials with target cells: {trial_idx}')
print(f'Target cells: {cell_idx}')
# define the number to plot based on the available cells
plot_number = min(example_number, pass_cells.shape[0])
print(f'Number of cells found: {pass_cells.shape[0]}')
# allocate a list for the map
plot_list = []

# for each cell
for cell in np.arange(plot_number):
#     # get the calcium activity
#     current_calcium = calcium_trials[trial_idx[cell]][:, cell_idx[cell]]
#     # get the values for both properties
#     feature_0 = feature_raw_trials[trial_idx[cell]].loc[:, target_features[0]].to_numpy()
#     feature_1 = feature_raw_trials[trial_idx[cell]].loc[:, target_features[1]].to_numpy()
    
#     plot, x_edge, y_edge, counts = \
#         plot_map(feature_0, feature_1, current_calcium, target_features, bins=20)
    # get the full TC
    current_tc = tc_full[trial_idx[cell]][target_features[0]][cell_idx[cell]]
    plot = hv.Curve((current_tc))
    plot.opts(color='red')
    plot_list.append(plot)

# count_plot = hv.Image((x_edge, y_edge, np.log10(counts.T)), kdims=target_features)
# count_plot.opts(cmap='viridis')
# plot_list.append(count_plot)
hv.Layout(plot_list).opts(shared_axes=True).cols(3)

In [None]:
# plot all maps on a day
# Full tuning curve (first target feature) of every cell in one trial; the cells selected
# in the examples cell are drawn in red, the rest in blue.

# define the target variables 
# target_features = ['mouse_speed','latent_0']
# define the target trial
target_trial = 0
# get the number of cells
cell_number = calcium_trials[target_trial].shape[1]
# pair every selected cell with its own trial: testing the trial and the cell
# separately would also highlight cells whose index only passed in another trial
selected_pairs = set(zip(np.asarray(trial_idx).tolist(), np.asarray(cell_idx).tolist()))
# allocate the plot list
plot_list = []
# for each cell
for cell in np.arange(cell_number):
#     # get the calcium activity
#     current_calcium = calcium_trials[target_trial][:, cell]
#     # get the values for both properties
#     feature_0 = feature_raw_trials[target_trial].loc[:, target_features[0]].to_numpy()
#     feature_1 = feature_raw_trials[target_trial].loc[:, target_features[1]].to_numpy()

    # define the colormap based on the the criteria
    if (target_trial, int(cell)) in selected_pairs:
        cmap = 'cet_linear_kry_5_98_c75_r'
        color = 'red'
    else:
        cmap = 'kbc_r'
        color = 'blue'
    # get the full TC
    current_tc = tc_full[target_trial][target_features[0]][cell]
    plot = hv.Curve((current_tc))
    plot.opts(color=color)
#     plot, x_edge, y_edge, counts = \
#         plot_map(feature_0, feature_1, current_calcium, target_features, cmap=cmap, bins=20)
    plot_list.append(plot)
# count_plot = hv.Image((x_edge, y_edge, np.log10(counts.T)), kdims=target_features)
# count_plot.opts(cmap='viridis')
# plot_list.append(count_plot)
hv.Layout(plot_list).opts(shared_axes=True).cols(3)

In [None]:
# visualize the two halves
# Overlay the first- and second-half tuning curves (first target feature) of every cell in
# one trial. The title shows consistency_responsivity; selected cells are drawn in red.

# define the target variables 
# target_features = ['mouse_speed','latent_0']
# define the target trial
target_trial = 0
# get the number of cells
cell_number = calcium_trials[target_trial].shape[1]
# get the interval length
interval = int(np.floor(calcium_trials[target_trial].shape[0]/2))
# pair every selected cell with its own trial: testing the trial and the cell
# separately would also highlight cells whose index only passed in another trial
selected_pairs = set(zip(np.asarray(trial_idx).tolist(), np.asarray(cell_idx).tolist()))
# allocate the plot list
plot_list = []
# for each cell
for cell in np.arange(cell_number):
#     # get the calcium activity
#     current_calcium_all = calcium_trials[target_trial][:, cell]
#     # get the values for both properties
#     feature_0_all = feature_raw_trials[target_trial].loc[:, target_features[0]].to_numpy()
#     feature_1_all = feature_raw_trials[target_trial].loc[:, target_features[1]].to_numpy()
    # for both halves
#     for half in np.arange(2):
#         selection_vector = np.array(np.arange(interval)) + half*interval
#         feature_0 = feature_0_all[selection_vector]
#         feature_1 = feature_1_all[selection_vector]
#         current_calcium = current_calcium_all[selection_vector]
        # define the colormap based on the the criteria
#         if (target_trial in trial_idx) & (cell in cell_idx):
#             cmap = 'cet_linear_kry_5_98_c75_r'
#         else:
#             cmap = 'kbc_r'
#         plot, x_edge, y_edge, counts = \
#             plot_map(feature_0, feature_1, current_calcium, target_features, cmap=cmap, bins=20)
    # get the half tuning curves
    first_half = tc_all[target_trial][target_features[0]][0][cell]
    second_half = tc_all[target_trial][target_features[0]][1][cell]
    if (target_trial, int(cell)) in selected_pairs:
        color = 'red'
    else:
        color = 'blue'
    current_cons = np.round(tc_cons[target_trial][target_features[0]][cell, 0], 2)
    current_resp = np.round(tc_resp[target_trial][target_features[0]][cell, 0], 2)
    first_plot = hv.Curve((first_half))
    first_plot.opts(color=color, title=str(current_cons)+'_'+str(current_resp))
    second_plot = hv.Curve((second_half)).opts(color=color)

    plot = first_plot*second_plot
    plot_list.append(plot)
# count_plot = hv.Image((x_edge, y_edge, np.log10(counts.T)), kdims=target_features)
# count_plot.opts(cmap='viridis')
# plot_list.append(count_plot)
hv.Layout(plot_list).opts(shared_axes=False).cols(2)

In [None]:
# Generate 2D "tuning curves"
# For each group of trials this builds, for a pair of features: an occupancy map (log10 of
# the sample count per 2D bin), one activity map per ROI (summed activity divided by the
# occupancy) and the average map across ROIs. The images are collected in meta_list and
# the per-ROI normalized maps in hist_list.
# NOTE(review): the bare raise below stops the cell before anything else runs, so all of
# the code after it is currently unreachable; presumably a guard to stop "Run All" here -
# confirm before removing it.
raise ValueError
# define trial groups
n_groups = 2
# number of entries of calcium_trials per group (entries left over by the floor are unused)
interval = np.floor(len(calcium_trials)/n_groups)
temp_vector = np.array(np.arange(interval))
# consecutive index ranges into calcium_trials / feature_raw_trials, one per group
trial_groups = [(temp_vector+idx*interval).astype(int) for idx in np.arange(n_groups)]

# define the features to use
tc_features = ['cricket_0_mouse_distance', 'cricket_0_delta_heading']
# tc_features = ['mouse_speed', 'latent_0']
# tc_features = ['latent_1', 'latent_0']
# tc_features = ['cricket_0_x', 'cricket_0_y']
# tc_features = ['mouse_speed', 'cricket_0_mouse_distance']
# tc_features = ['mouse_y', 'mouse_x']
# tc_features = ['cricket_0_visual_angle', 'cricket_0_delta_head']

print(feature_raw_trials[0].columns)
# allocate the meta plot list
meta_list = []
hist_list = []
# define the font size
fontsize = {
    'ticks': 11,
    'labels': 13
}

# define the plot labels
label_x = tc_features[1]
label_y = tc_features[0]
# for all the trial groups
# NOTE(review): the loop variable reuses the name trial_idx, which the example-plot cells
# also use for the selected cells; running this cell overwrites that value.
for trial_idx in trial_groups:
    # allocate the plot list
    tc_list = []
    hist_temp_list = []
    
    # gather the features and the calcium of the group, concatenated along time
    feature_raw = [feature_raw_trials[el] for el in trial_idx]
    calcium_matrix = np.concatenate([calcium_trials[el] for el in trial_idx], axis=0)
    # get the relevant features (nans replaced by 0, then median filtered)
    feature0 = pd.concat([el[tc_features[0]] for el in feature_raw])
    feature0[np.isnan(feature0)] = 0
    feature0 = ss.medfilt(feature0, 21)
    feature1 = pd.concat([el[tc_features[1]] for el in feature_raw])
    feature1[np.isnan(feature1)] = 0
    feature1 = ss.medfilt(feature1, 21)

    print(calcium_matrix.shape)
    roi_number = calcium_matrix.shape[1]


    # occupancy: number of samples per 2D bin
    # (the empty values list is presumably ignored for 'count' - verify against scipy)
    st_base, x_edge, y_edge, idx = \
        stat.binned_statistic_2d(feature0, feature1, [], bins=20, statistic='count')
    # empty bins are set to 1 so the divisions below are defined (and log10 gives 0)
    st_base[st_base==0] = 1
    plot_st_base = np.log10(st_base)
    plot_st_base[np.isinf(plot_st_base)] = 0
    # st_base = (st_base-np.nanmin(st_base))/(np.nanmax(st_base)-np.nanmin(st_base))
#     im_plot = hv.Image((plot_st_base), kdims=[tc_features[0], tc_features[1]], bounds=[0, 10, 0, 20])
    im_plot = hv.Image((y_edge, x_edge, plot_st_base), kdims=[label_x, label_y])
    im_plot.opts(width=250, tools=['hover'], fontsize=fontsize, xrotation=45)#, clim=(0, 0.2))

    # the occupancy map is the first panel of the group
    tc_list.append(im_plot)
    # st_base = 1
    # store the matrices for averaging
    st_store = []
    # for all the cells
    for cells in np.arange(roi_number)[:]:
        # get the calcium
        current_calcium = calcium_matrix[:, cells]
    #     current_calcium = ss.medfilt(pd.concat([el['mouse_speed'] for el in feature_raw]), 21)

        # build 2d distribution
        st, x_edge, y_edge, idx = \
            stat.binned_statistic_2d(feature0, feature1, current_calcium, bins=20, statistic='sum')
    #     st[np.isnan(st)] = 0
    #     st = (st-np.nanmin(st))/(np.nanmax(st)-np.nanmin(st))
    #     st[np.isnan(st)] = 0
        # zero the bins with fewer than 3 samples, then normalize by the occupancy
        st[st_base<3] = 0
        st_plot = (st/st_base)
        st_plot[np.isnan(st_plot)] = 0
        im_plot = hv.Image((y_edge, x_edge, st_plot), kdims=[label_x, label_y])
        im_plot.opts(width=250, tools=['hover'], cmap='viridis', fontsize=fontsize, xrotation=45)#, clim=(0, 0.2))
        tc_list.append(im_plot)
        # keep the thresholded sums for the across-ROI average
        st_store.append(st)
        
        # store the actual map too
        hist_temp_list.append(st_plot)

    # average map across ROIs, normalized by the occupancy; last panel of the group
    plot_ave = np.mean(st_store, axis=0)/st_base
    # plot_ave[np.isinf(plot_ave)] = 0
    im_plot = hv.Image((y_edge, x_edge, plot_ave), kdims=[label_x, label_y])
    im_plot.opts(width=250, tools=['hover'], fontsize=fontsize, xrotation=45)#, clim=(0, 0.2))

    tc_list.append(im_plot)
    meta_list.append(tc_list)
    hist_list.append(hist_temp_list)
# reorder the list

# top_half = [el for idx, el in enumerate(tc_list) if idx < len(tc_list)/2]
# bottom_half = [el for idx, el in enumerate(tc_list) if idx >= len(tc_list)/2]
# tc_list = [val for pair in zip(top_half, bottom_half) for val in pair]
# lists = [l1, l2, ...]
# [val for tup in zip(*lists) for val in tup]
meta_list = [val for tup in zip(*meta_list) for val in tup]
# create the layout
hv.Layout(meta_list).opts(shared_axes=False)


In [None]:
# Correlate, cell by cell, the tuning maps of the two trial groups stored in
# hist_list and compare the distribution against a cell-identity shuffle.


def map_correlation(map_a, map_b):
    """Pearson correlation between two flattened maps.

    Returns 0 when the correlation is undefined (NaN), as happens when one of
    the maps is constant.
    """
    corr_coef = np.corrcoef(map_a.flatten(), map_b.flatten())[1][0]
    return 0 if np.isnan(corr_coef) else corr_coef


# real correlations: same cell, group 0 vs group 1
correlation_array = np.array([map_correlation(one, two)
                              for one, two in zip(hist_list[0], hist_list[1])])

# define the number of shuffles
shuffle_number = 100
# allocate memory for the shuffle results (cells x shuffles)
shuffle_array = np.zeros((correlation_array.shape[0], shuffle_number))
# for all the shuffles
for shuffle in np.arange(shuffle_number):
    # shuffle the cell order of both groups independently so that pairs are
    # made of unrelated cells; each list is sampled with its own length
    list_0 = random.sample(hist_list[0], len(hist_list[0]))
    list_1 = random.sample(hist_list[1], len(hist_list[1]))
    shuffle_array[:, shuffle] = [map_correlation(one, two)
                                 for one, two in zip(list_0, list_1)]

# cumulative distributions; the density is multiplied by the bin width so the
# curve is a cumulative probability that ends at 1
freq, bins = np.histogram(correlation_array, density=True, bins=20)
bin_centers = bins[:-1] + np.diff(bins)/2
hist_ori = hv.Curve((bin_centers, np.cumsum(freq*np.diff(bins))),
                    kdims=['Correlation'], vdims=['Probability'], label='Data')
hist_ori.opts(fontsize=fontsize)

# same bin edges as the real data so both curves are directly comparable
# (shuffled values outside that range are not counted)
freq, bins = np.histogram(shuffle_array.flatten(), density=True, bins=bins)
bin_centers = bins[:-1] + np.diff(bins)/2
hist_shuffle = hv.Curve((bin_centers, np.cumsum(freq*np.diff(bins))),
                        kdims=['Correlation'], vdims=['Probability'], label='Shuffle')


(hist_ori*hist_shuffle)

In [None]:
%%time
# Calculate the half and full tuning of each cell
# For every trial and feature pair this produces:
#   tc_all  - per-half 2D tuning curves (list of 2 halves, each a list of cells)
#   tc_full - 2D tuning curve over the whole trial, one per cell
#   tc_resp - (cells x 2) array: responsivity ratio and pass/fail flag
importlib.reload(processing_parameters)

# define the pairs to quantify
target_pairs = [['cricket_0_mouse_distance', 'cricket_0_delta_heading'], ['mouse_x', 'mouse_y']]
# target_pairs = [['mouse_speed', 'cricket_0_speed']]
# get the number of pairs
pair_number = len(target_pairs)
# define the number of calcium shuffles
shuffle_number = 100
# define the confidence interval cutoff
percentile = 95
# define the number of bins for the TCs
bin_number = 10
# allocate memory for the output
tc_all = []
tc_full = []
tc_resp = []
# get the number of features
feature_number = feature_raw_trials[0].shape[1]


def normalize_tc(tc_sum, counts):
    """Turn summed activity per bin into mean activity per bin.

    tc_sum and counts are 2D arrays of equal shape. Bins where the division is
    not finite (zero count, which includes bins masked by zeroing their count)
    are set to 0.
    """
    tc = tc_sum/counts
    tc[~np.isfinite(tc)] = 0
    return tc


# for all the trials
for idx, trial in enumerate(feature_raw_trials):
    # get the calcium data
    current_calcium = calcium_trials[idx]
    # get the number of cells
    cell_number = current_calcium.shape[1]
    # allocate memory for the trial TCs
    tc_trial = {}
    tc_trial_full = {}
    tc_trial_resp = {}
    # for all the feature pairs
    for pair_idx in np.arange(pair_number):
        feature_names = target_pairs[pair_idx]
        # concatenate for the dict (also the key into processing_parameters.tc_params)
        feature_name = '|'.join(feature_names)
        current_feature_0 = trial.loc[:, feature_names[0]].to_numpy()
        current_feature_1 = trial.loc[:, feature_names[1]].to_numpy()
        # get the bin ranges from the parameters file
        bin_ranges = processing_parameters.tc_params[feature_name]
        # calculate the bin edges based on the ranges
        bins = [np.linspace(el[0], el[1], num=bin_number+1) for el in bin_ranges]

        # exclude samples where either feature is nan
        keep_vector_full = (~np.isnan(current_feature_0)) & (~np.isnan(current_feature_1))
        counts_feature_0 = current_feature_0[keep_vector_full]
        counts_feature_1 = current_feature_1[keep_vector_full]
        # whole-trial occupancy (number of samples per bin)
        feature_counts = stat.binned_statistic_2d(counts_feature_0, counts_feature_1, counts_feature_0,
                                                  statistic='count', bins=bins)[0]
        # zero the positions with less than 3 counts; dividing by them yields
        # inf/nan, which normalize_tc turns into 0, so these bins are masked
        feature_counts[feature_counts<3] = 0

        # allocate a list for the 2 halves
        tc_half = []
        # length of each half (the last sample is dropped for odd lengths)
        half_bound = int(np.floor(current_feature_0.shape[0]/2))
        # for first and second half
        for half in np.arange(2):
            # get the half vector
            half_vector = np.arange(half_bound) + half_bound*half
            half_feature_0 = current_feature_0[half_vector]
            half_feature_1 = current_feature_1[half_vector]
            # exclude nan values
            keep_vector = (~np.isnan(half_feature_0)) & (~np.isnan(half_feature_1))
            keep_feature_0 = half_feature_0[keep_vector]
            keep_feature_1 = half_feature_1[keep_vector]

            # occupancy of this half only: the half sums must be divided by the
            # samples that actually contributed to them. Dividing by the
            # whole-trial counts would weight each half map by how the
            # occupancy is split between the halves.
            half_counts = stat.binned_statistic_2d(keep_feature_0, keep_feature_1, keep_feature_0,
                                                   statistic='count', bins=bins)[0]
            # keep the whole-trial minimum-occupancy mask
            half_counts[feature_counts==0] = 0

            # allocate a list for the cells
            tc_half_cells = []
            # for all the cells
            for cell in np.arange(cell_number):
                # get the current cell
                half_cell = current_calcium[half_vector, cell]
                keep_cell = half_cell[keep_vector]

                # calculate the TC (summed activity per bin)
                current_tc = stat.binned_statistic_2d(keep_feature_0, keep_feature_1, keep_cell,
                                                      statistic='sum', bins=bins)[0]
                # normalize the TC and store
                tc_half_cells.append(normalize_tc(current_tc, half_counts))
            # store the cells
            tc_half.append(tc_half_cells)

        # allocate memory for the full tc per cell
        tc_cell_full = []
        tc_cell_resp = np.zeros((cell_number, 2))
        # calculate the full TC
        for cell in np.arange(cell_number):
            keep_cell = current_calcium[keep_vector_full, cell]
            tc_cell = stat.binned_statistic_2d(counts_feature_0, counts_feature_1,
                                               keep_cell, statistic='sum', bins=bins)[0]
            tc_cell = normalize_tc(tc_cell, feature_counts)
            # allocate memory for the shuffles
            shuffle_array = np.zeros((shuffle_number, bin_number, bin_number))
            # generate the shuffles
            for shuffle in np.arange(shuffle_number):
                # randomize the calcium activity relative to the features
                random_cell = keep_cell.copy()
                np.random.shuffle(random_cell)
                tc_random = stat.binned_statistic_2d(counts_feature_0, counts_feature_1,
                                                     random_cell, statistic='sum', bins=bins)[0]
                shuffle_array[shuffle, :, :] = normalize_tc(tc_random, feature_counts)
            # threshold: percentile of the absolute values of all bins of all shuffles pooled
            resp_threshold = np.percentile(np.abs(shuffle_array.flatten()), percentile)
            # responsivity: mean of the 3 largest absolute bins relative to the threshold
            tc_cell_resp[cell, 0] = np.mean(np.sort(np.abs(tc_cell), axis=None)[-3:])/resp_threshold
            # pass flag: more than 3 bins exceed the threshold
            tc_cell_resp[cell, 1] = np.sum(np.abs(tc_cell)>resp_threshold) > 3
            # store
            tc_cell_full.append(tc_cell)
        # store the halfs and fulls
        tc_trial[feature_name] = tc_half
        tc_trial_full[feature_name] = tc_cell_full
        tc_trial_resp[feature_name] = tc_cell_resp
    # store the trials
    tc_all.append(tc_trial)
    tc_full.append(tc_trial_full)
    tc_resp.append(tc_trial_resp)

In [None]:
%%time
# Split-half consistency: correlate the first-half and second-half tuning
# curve of every cell and test the value against a null distribution obtained
# by shuffling the bins of the second half.

# shuffles per cell, and percentile of the null distribution used as cutoff
shuffle_number = 100
percentile = 95
# output, one dict (feature pair -> cells x 2 array) per trial
tc_cons = []
# number of feature columns
feature_number = feature_raw_trials[0].shape[1]

for idx, trial in enumerate(feature_raw_trials):
    # number of cells recorded in this trial
    cell_number = calcium_trials[idx].shape[1]
    # results of this trial, keyed by feature pair
    tc_trial = {}

    for pair_idx in np.arange(pair_number):
        # key used for this pair in the tuning curve dicts
        feature_names = target_pairs[pair_idx]
        feature_name = '|'.join(feature_names)
        # column 0: split-half correlation, column 1: passes the test (1/0)
        tc_half = np.zeros([cell_number, 2])
        # half tuning curves computed in the tuning cell
        halves = tc_all[idx][feature_name]

        for cell in np.arange(cell_number):
            # flattened maps of the two halves
            current_first = halves[0][cell].flatten()
            current_second = halves[1][cell].flatten()
            # observed correlation between halves (NaN if a half is constant)
            real_correlation = np.corrcoef(current_first, current_second)[1, 0]

            # null distribution: first half against bin-shuffled second half
            shuffle_array = np.zeros([shuffle_number, 1])
            for shuffle in np.arange(shuffle_number):
                random_second = current_second.copy()
                np.random.shuffle(random_second)
                shuffle_array[shuffle] = np.corrcoef(current_first, random_second)[1, 0]
            # undefined shuffle correlations count as 0
            np.putmask(shuffle_array, np.isnan(shuffle_array), 0)

            # cutoff taken from the null distribution
            conf_interval = np.percentile(shuffle_array, percentile)
            # a cell passes when its correlation exceeds a strictly positive
            # cutoff (which also makes the correlation itself positive)
            passes = real_correlation > conf_interval > 0
            tc_half[cell, 0] = real_correlation
            tc_half[cell, 1] = passes

        # store for the feature pair
        tc_trial[feature_name] = tc_half
    # store for the trial
    tc_cons.append(tc_trial)

In [None]:
# Scatter consistency against responsivity for every feature pair, separating
# the cells that pass both criteria from the rest, and remember which
# (trial, cell) each row belongs to for the plotting cell that follows.

plot_list = []
map_dict = {}
for pair_idx in np.arange(pair_number):
    # name of the pair, taken from the keys of the first trial
    feature_name = list(tc_cons[0])[pair_idx]

    # stack the per-trial (cells x 2) arrays into one array over all trials
    across_cons = np.vstack([trial_cons[feature_name] for trial_cons in tc_cons])
    across_resp = np.vstack([trial_resp[feature_name] for trial_resp in tc_resp])

    # for every stacked row: index of its trial and index of the cell within that trial
    cells_per_trial = [trial_cons[feature_name].shape[0] for trial_cons in tc_cons]
    trial_map = np.repeat(np.arange(len(cells_per_trial)), cells_per_trial).astype(int)
    cell_map = np.concatenate([np.arange(count) for count in cells_per_trial]).astype(int)

    # rows whose flag column is set for both criteria
    both_pass = np.logical_and(across_cons[:, 1] == 1, across_resp[:, 1] == 1)

    # passing cells and remaining cells as two overlaid scatters
    both_plot = hv.Scatter((across_cons[both_pass, 0], across_resp[both_pass, 0]),
                           kdims=['Consistency'], vdims=['Responsivity'])
    both_plot.opts(title=feature_name, shared_axes=False)
    none_plot = hv.Scatter((across_cons[~both_pass, 0], across_resp[~both_pass, 0]),
                           kdims=['Consistency'], vdims=['Responsivity'])
    plot_list.append(both_plot*none_plot)

    # keep the maps and the pass vector for plotting later
    map_dict[feature_name] = [trial_map, cell_map, both_pass]
    print(f'Number of cells passing the thresholds for {feature_name}: {np.sum(both_pass)}')
hv.Layout(plot_list).opts(shared_axes=False)

In [None]:
# plot the individual TCs of one trial: first half, second half and full map
# of every cell, with the full map colored by whether the cell passes both the
# consistency and the responsivity criterion

# define the target trial
target_trial = 0
# define the target feature
target_feature = 'cricket_0_mouse_distance|cricket_0_delta_heading'
# target_feature = 'mouse_speed|cricket_0_speed'
# target_feature = 'mouse_x|mouse_y'
# get the maps and cell vector
trial_map = map_dict[target_feature][0]
cell_map = map_dict[target_feature][1]
both_pass = map_dict[target_feature][2]
# get the number of cells
cell_number = calcium_trials[target_trial].shape[1]

# get the half TCs
half_0 = tc_all[target_trial][target_feature][0]
half_1 = tc_all[target_trial][target_feature][1]
fulls = tc_full[target_trial][target_feature]

# get the list of (trial, cell) pairs that pass the threshold
trial_idx = trial_map[both_pass]
cell_idx = cell_map[both_pass]
# cells of the target trial that pass; the trial and the cell have to match on
# the same row, testing the two memberships separately would also highlight
# cells that only pass in some other trial
passing_cells = set(cell_idx[trial_idx == target_trial].tolist())

# allocate the plot list
plot_list = []
# for each cell
for cell in np.arange(cell_number):
    # get the responsivity and consistency values (shown in the title)
    current_cons = np.round(tc_cons[target_trial][target_feature][cell, 0], 2)
    current_resp = np.round(tc_resp[target_trial][target_feature][cell, 0], 2)

    # define the colormap based on the criteria
    if int(cell) in passing_cells:
        cmap = 'cet_linear_kry_5_98_c75_r'
    else:
        cmap = 'kbc_r'

    half_plot_0 = hv.Image(half_0[cell])
    half_plot_1 = hv.Image(half_1[cell])
    full_plot = hv.Image(fulls[cell])
    # title is "<consistency>_<responsivity>"
    full_plot.opts(title=str(current_cons)+'_'+str(current_resp), cmap=cmap)
    plot_list.append(half_plot_0)
    plot_list.append(half_plot_1)
    plot_list.append(full_plot)

# one row per cell: half 0, half 1, full
hv.Layout(plot_list).opts(shared_axes=False).cols(3)

In [None]:
# Cumulative distribution of the map correlations against the shuffle.
# NOTE(review): this repeats the histogram code that closes the map-correlation
# cell. `shuffle_array` is also assigned by the tuning-curve and consistency
# cells, so on a top-to-bottom run the array read here is the one left by the
# most recently executed of those cells - confirm that is the intended null.

# the density is multiplied by the bin width so the curve is a cumulative
# probability that ends at 1
freq, bins = np.histogram(correlation_array, density=True, bins=20)
bin_centers = bins[:-1] + np.diff(bins)/2
hist_ori = hv.Curve((bin_centers, np.cumsum(freq*np.diff(bins))),
                    kdims=['Correlation'], vdims=['Probability'], label='Data')
hist_ori.opts(fontsize=fontsize)

# same bin edges as the real data (shuffled values outside that range are not counted)
freq, bins = np.histogram(shuffle_array.flatten(), density=True, bins=bins)
bin_centers = bins[:-1] + np.diff(bins)/2
hist_shuffle = hv.Curve((bin_centers, np.cumsum(freq*np.diff(bins))),
                        kdims=['Correlation'], vdims=['Probability'], label='Shuffle')


(hist_ori*hist_shuffle)