In [None]:
# imports
import os
import sys
sys.path.insert(0, os.path.abspath(r'D:\Code Repos\prey_capture'))

import functions_plotting as fp
import functions_loaders as fl
import paths
import processing_parameters
import importlib

import numpy as np
import pandas as pd
import PSID
from PSID.evaluation import evalPrediction
import scipy.signal as signal
import sklearn.preprocessing as preproc
import sklearn.cross_decomposition as cross
import umap

import panel as pn
import holoviews as hv
from holoviews import opts, dim
hv.extension('bokeh')

In [None]:
# figure configuration
# refresh the project modules so that edits on disk are picked up without a kernel restart
importlib.reload(fp)
importlib.reload(processing_parameters)

# apply the shared figure theme
fp.set_theme()

# printing mode and target document
save_mode, target_document = True, 'paper'
# folder where the figures of this notebook are saved
save_path = os.path.join(paths.figures_path, 'PSID_vis')

# label dictionary and variable list taken from the processing parameters
label_dict, variable_list = (
    processing_parameters.label_dictionary,
    processing_parameters.variable_list,
)

In [None]:
# data loading
# reload first so the current search parameters and loader code are the ones in use
importlib.reload(processing_parameters)
importlib.reload(fl)

# ask the database (through search_list) for the matching paths and their queries
all_paths, all_queries = fl.query_search_list()

# load every path in order, keeping only the first of the three items the loader returns
data_list = [
    data
    for data, _, _ in (
        fl.load_preprocessing(path, queries)
        for path, queries in zip(all_paths, all_queries)
    )
]

print(f'Number of trials: {len(data_list)}')


In [None]:
# indices of the entries to evaluate: ten consecutive values starting at 57
test_trials = np.arange(10) + 57

print(test_trials)

In [None]:
%%time
# run PSID on the data: fit one model per (date, mouse) pair

# fraction of every trial, taken from its beginning, that is held out for testing
test_perc = 0.3

# behavioral variables to decode; identical for every pair, so defined once up front
target_behavior = variable_list

# get the unique dates and mice
unique_dates_mice = np.unique([(el.loc[0, 'datetime'][:10], el.loc[0, 'mouse']) for el in data_list[0]], axis=0)

# allocate a list to store the psid objects, one [pair, model, per-trial calcium scalers] entry per pair
psid_list = []

# for all the pairs
for pair in unique_dates_mice:
    # get the relevant trials
    # NOTE(review): this is a substring match, so a mouse name contained in another mouse's name
    # would also select that mouse's trials - confirm the naming scheme rules this out
    target_trials = [el for el in data_list[0] if (pair[0] in el.loc[0, 'datetime']) & (pair[1] in el.loc[0, 'mouse'])]

    # allocate memory for the training and test sets
    ca_train = []
    ca_test = []
    beh_train = []
    beh_test = []
    # for all the trials
    for trial in target_trials:

        # get the cell data (every column with "cell" in its name)
        cells = [el for el in trial.columns if 'cell' in el]
        calcium_data = np.array(trial[cells].copy())
        # replace missing values with zeros
        calcium_data[np.isnan(calcium_data)] = 0

        try:
            # get the parameter
            beh_data = trial[target_behavior].to_numpy()

            # smooth the parameter with a 21-frame median filter along time
            beh_data = signal.medfilt(beh_data, (21, 1))
        except KeyError:
            # trials lacking any of the target behaviors are left out
            continue

        # skip trials without frames or with fewer than 3 cells
        if (calcium_data.shape[0] == 0) or (calcium_data.shape[1] < 3):
            continue

        # get the threshold index
        threshold_idx = int(calcium_data.shape[0]*(test_perc))
        # split the data: the first test_perc of the frames is the test set, the rest is for training
        ca_train.append(calcium_data[threshold_idx:, :])
        ca_test.append(calcium_data[:threshold_idx, :])
        beh_train.append(beh_data[threshold_idx:, :])
        beh_test.append(beh_data[:threshold_idx, :])

    # skip if empty arrays
    if len(ca_train) == 0:
        continue

    # scale each trial separately, fitting the scalers on the training portion only
    # (ca_scaler_list therefore has one entry per trial that passed the filters above, in order)
    ca_scaler_list = [preproc.StandardScaler().fit(el) for el in ca_train]
    beh_scaler_list = [preproc.StandardScaler().fit(el) for el in beh_train]

    ca_train = [scaler.transform(el) for scaler, el in zip(ca_scaler_list, ca_train)]
    ca_test = [scaler.transform(el) for scaler, el in zip(ca_scaler_list, ca_test)]
    beh_train = [scaler.transform(el) for scaler, el in zip(beh_scaler_list, beh_train)]
    beh_test = [scaler.transform(el) for scaler, el in zip(beh_scaler_list, beh_test)]

    # train the PSID model
    idSys = PSID.PSID(ca_train, beh_train, nx=20, n1=10, i=20)
    # idSys = PSID.PSID(ca_train, beh_train, nx=1, n1=1, i=20) # for cricket distance
    # idSys = PSID.PSID(ca_train, beh_train, nx=20, n1=10, i=35)

    # store the element
    psid_list.append([pair, idSys, ca_scaler_list])

    # allocate memory for the predictions
    beh_pred = []
    ca_pred = []
    latent_pred = []
    # predict each trial
    for trial in ca_test:
        beh_p, ca_p, latent_p = idSys.predict(trial)
        beh_pred.append(beh_p)
        ca_pred.append(ca_p)
        latent_pred.append(latent_p)

    # pool the held-out data and the predictions of all trials
    combo_beh_test = np.vstack(beh_test)
    combo_beh_pred = np.vstack(beh_pred)

    combo_ca_test = np.vstack(ca_test)
    combo_ca_pred = np.vstack(ca_pred)

    # evaluate with the 'CC' measure (the variable names still say R2)
    R2TrialBased_beh = evalPrediction(combo_beh_test, combo_beh_pred, 'CC')
    R2TrialBased_ca = evalPrediction(combo_ca_test, combo_ca_pred, 'CC')

    # count strictly positive values only, as the message says ("!= 0" also counted negative and NaN values)
    print('Number of cells that have a larger than 0 CC:', np.sum(R2TrialBased_ca > 0))
    print('Mean Ca CC:', np.nanmean(R2TrialBased_ca))
    print('CC of behavior:', R2TrialBased_beh)

In [None]:
%%time
# run the model on all experiments

# allocate memory for the predictions
final_beh = []
final_ca = []
final_latent = []
final_pairs = []
final_scaled_beh = []
# for all the pairs
for pair in unique_dates_mice[test_trials]:
    # get the trials
    target_trials = [el for el in data_list[0] if (pair[0] in el.loc[0, 'datetime']) & (pair[1] in el.loc[0, 'mouse'])]

    # get the index of the corresponding psid element (None if no model was fitted for this pair)
    idx = next(
        (model_idx for model_idx, el in enumerate(psid_list)
         if (el[0][0] == pair[0]) and (el[0][1] == pair[1])),
        None,
    )
    # see if the pair was calculated
    if idx is None:
        continue

    # get the corresponding psid element
    idSys = psid_list[idx][1]
    scalers = psid_list[idx][2]

    # The scalers were stored only for the trials that passed the filters during training, so
    # they cannot be indexed with the position in target_trials. Apply the same filters here
    # and count the usable trials separately to keep trial and scaler aligned.
    scaler_idx = 0
    # predict each trial
    for trial in target_trials:

        # get the cell data (every column with "cell" in its name)
        cells = [el for el in trial.columns if 'cell' in el]
        calcium_data = np.array(trial[cells].copy())

        # skip trials lacking any of the target behaviors (they were skipped in training too)
        try:
            beh_data = trial[variable_list].to_numpy()
        except KeyError:
            continue

        # skip trials without frames or with fewer than 3 cells
        if (calcium_data.shape[0] == 0) or (calcium_data.shape[1] < 3):
            continue

        # take the scaler that was fitted on this trial
        scaler = scalers[scaler_idx]
        scaler_idx += 1

        # replace missing values with zeros, then scale the data
        calcium_data[np.isnan(calcium_data)] = 0
        calcium_data = scaler.transform(calcium_data)

        # predict and store
        beh_p, ca_p, latent_p = idSys.predict(calcium_data)
        final_beh.append(beh_p)
        final_ca.append(ca_p)
        final_latent.append(latent_p)
        final_pairs.append(pair)
        # get the behavior (stored as read from the trial: neither scaled nor filtered, despite the list name)
        final_scaled_beh.append(beh_data)

In [None]:
# plot one trial: latent trajectory next to the behavior trajectory, both colored by frame number
layout_list = []

# trial and latent dimensions to show
target_trial = 15
dim0 = 0
dim1 = 1

# options shared by both panels
scatter_style = dict(width=700, height=600, color='Param', cmap='Spectral', size=5, colorbar=True)

# latent panel
x = final_latent[target_trial][:, [dim0]]
y = final_latent[target_trial][:, [dim1]]
color = np.arange(x.shape[0])[:, np.newaxis]
plot_array = np.hstack((x, y, color))
plot = hv.Scatter(plot_array, kdims=['x'], vdims=['y', 'Param'])
plot.opts(xlabel=f'Dim {dim0}', ylabel=f'Dim {dim1}', **scatter_style)
layout_list.append(plot)

# behavior panel: look up the columns of the two parameters in the variable list
param0 = 'cricket_0_delta_heading'
param1 = 'cricket_0_mouse_distance'
param_idx0 = [position for position, name in enumerate(variable_list) if name == param0]
param_idx1 = [position for position, name in enumerate(variable_list) if name == param1]
x = final_scaled_beh[target_trial][:, param_idx0].reshape((-1, 1))
y = final_scaled_beh[target_trial][:, param_idx1].reshape((-1, 1))
color = np.arange(x.shape[0])[:, np.newaxis]
plot_array = np.hstack((x, y, color))
plot = hv.Scatter(plot_array, kdims=['x'], vdims=['y', 'Param'])
plot.opts(xlabel=param0, ylabel=param1, **scatter_style)
layout_list.append(plot)

hv.Layout(layout_list).cols(2).opts(shared_axes=False)

In [None]:
# shapes of the predicted latents, calcium and behavior of one example trial
t_trial = 67
print(*(prediction[t_trial].shape for prediction in (final_latent, final_ca, final_beh)))

In [None]:
# print a random permutation of the element indices of transformed_data
# (np.random.shuffle works in place and returns None, so printing its result only ever showed "None")
# NOTE: transformed_data is created by the matrix-collection cell further down, which has to run first
print(np.random.permutation(np.size(transformed_data)))

In [None]:
# Collect one matrix per fitted PSID model, standardize it and flatten it into a column vector.
# The CCA alignment against a template model is currently disabled (commented out below), so the
# matrices are passed through unchanged and cca_score stays empty.
# Outputs used by later cells: target_metric, second_dim, cca_score and transformed_data.

# --- disabled: raster plot of the Cz matrix of the first 10 models ---
# %%time
# plot_list = []
# # for all the models
# for model in psid_list[:10]:
#     # get the idsys element
#     idsys = model[1]
    
#     # get the target matrix
#     target_matrix = idsys.Cz
    
#     plot = hv.Raster(target_matrix)
    
#     plot_list.append(plot)
# layout = hv.Layout(plot_list)
# layout

# take the C attribute of every stored model (el[1] is the PSID object stored in psid_list)
target_metric = [el[1].C for el in psid_list]
# target_metric = [el[1].A.reshape((-1, 1)) for el in psid_list]
# target_metric = [el.T for el in final_latent]

# standardize every matrix on its own (column-wise)
target_metric = [preproc.StandardScaler().fit_transform(el) for el in target_metric]

# target_metric = pd.DataFrame(target_metric)

# define the template trial
# (hard-coded index: raises an IndexError when fewer than 68 models are available)
template_idx = 67
print(target_metric[template_idx].shape)

# number of columns of the template matrix, used later to unflatten the vectors
second_dim = target_metric[template_idx].shape[1]
# define the number of dimensions
dimension_number = 10
# cca trials
cca_list = []
# stays empty while the CCA below is disabled
cca_score = []
# for all the trials
for idx, trial in enumerate(target_metric):
# for [(day, mouse), _, _] in psid_list:

#     if idx == template_trial:
#         cca_list.append(trial)
#         continue
#     if trial.shape[1] < dimension_number:
#         continue
    # template_trial is only needed by the disabled CCA code
    template_trial = target_metric[template_idx]#[:10, :10]
    trial = trial#[:10, :10]
    
#     np.random.shuffle(trial.ravel())
#     trial = trial.reshape((-1, second_dim), order='F')
    
#     cca_trial = cross.CCA(n_components=dimension_number).fit(template_trial, trial)
#     cca_score.append(cca_trial.score(template_trial, trial))
# #     print(cca_trial.x_rotations_.shape, cca_trial.y_rotations_.shape, trial.shape)
#     transformed_data = np.dot(np.dot(trial, cca_trial.y_rotations_), cca_trial.x_rotations_.T)

    # with the CCA disabled the matrix is used as is
    transformed_data = trial
#     print(cca_trial[0].shape, cca_trial[1].shape)
#     print(transformed_data.shape)
#     raise ValueError
    # flatten into a column vector (default row-major order)
    cca_list.append(transformed_data.reshape([-1, 1]))
# from here on target_metric holds the flattened column vectors
target_metric = cca_list
cca_score = np.array(cca_score)
# shape of the last matrix processed in the loop
print(transformed_data.shape)
# target_metric = [(el - el.min(axis=0))/(el.max(axis=0) - el.min(axis=0)) for el in target_metric]
# print([el.shape for el in target_metric])
# target_metric[np.isnan(target_metric)] = 0
# print(target_metric[0].min(axis=0).shape)
# raise ValueError
print(f'Number of trials: {len(target_metric)}')



In [None]:
# histogram of the positive CCA scores
freq, bins = np.histogram(cca_score[cca_score>0], bins=20)
print(np.sum(cca_score>0))
# np.histogram returns one more bin edge than counts, so the two arrays cannot be paired up as
# bars; hv.Histogram takes the (edges, frequencies) pair directly
plot = hv.Histogram((bins, freq))
plot.opts(width=600, xrotation=45, tools=['hover'])
plot

In [None]:
# plot the aligned matrices (first 20 entries of target_metric)
plotting_array = []

for trial in target_metric[:20]:
    # Every entry was flattened row by row into a column vector, so it has to be unflattened
    # row by row as well (the default C order). Using order='F' here would scramble the rows
    # and columns of the plotted matrix.
    reshaped_matrix = trial.reshape((-1, second_dim))
    plot = hv.Raster(reshaped_matrix)
    plot.opts(width=800, height=400, cmap='Spectral', tools=['hover'], xrotation=90, ylabel='', xlabel='')
    plotting_array.append(plot)
layout = hv.Layout(plotting_array).cols(2).opts(shared_axes=False)
layout


In [None]:
def gini2(array, bins=30):
    """Calculate the Gini coefficient according to de Oliveira and Kim et al.

    The absolute values of the input are binned into equal-width bins. With f_i the fraction of
    samples in bin i, v_i the center of bin i and S_i the cumulative sum of f_i * v_i, the
    coefficient is 1 - sum_i f_i * (S_{i-1} + S_i) / S_n.

    Parameters
    ----------
    array : array_like
        Input values; only their absolute values are used. Multi-dimensional input is flattened.
    bins : int or sequence, optional
        Bin specification handed to np.histogram (default 30 equal-width bins).

    Returns
    -------
    float
        The Gini coefficient, or NaN when the input is empty or the cumulative sum is zero.
    """
    # only the magnitudes matter for the coefficient
    magnitudes = np.abs(np.asarray(array, dtype=float)).ravel()
    if magnitudes.size == 0:
        return np.nan

    # bin the data (np.histogram gives the same per-bin counts and edges as a binned 'count' statistic)
    counts, bin_edges = np.histogram(magnitudes, bins=bins)

    # get the fractions
    fractions = counts/counts.sum()
    # weight the fractions by the bin centers and accumulate
    values = (bin_edges[1:] + bin_edges[:-1])/2
    s = np.cumsum(fractions * values)
    # the same cumulative sum shifted by one bin, starting at zero
    s0 = np.concatenate(([0], s[:-1]), axis=0)

    # the coefficient is undefined when the total is zero
    total = s[-1]
    if total == 0:
        return np.nan

    # calculate the coefficient
    gini_coefficient = 1 - np.sum(fractions*(s0 + s))/total

    return gini_coefficient

In [None]:
# Gini coefficient of every flattened entry in target_metric
gini_array = [gini2(trial.flatten()) for trial in target_metric]
    

In [None]:
# UMAP on the collected matrices
# stack all entries of target_metric along the first axis and zero out the missing values
compiled_target = np.concatenate(target_metric, axis=0)
np.copyto(compiled_target, 0, where=np.isnan(compiled_target))

# fit the embedding
reducer1 = umap.UMAP(n_neighbors=20, min_dist=0.01)
embedded_data1 = reducer1.fit_transform(compiled_target)

print(f'Number of samples: {embedded_data1.shape[0]}')

In [None]:
# scatter plot of the embedded matrices
umap_plot = hv.Scatter(embedded_data1, kdims=['Dim 1'], vdims=['Dim 2'])
umap_plot.opts(
    colorbar=True, cmap='Spectral', tools=['hover'], alpha=1,
    width=1200, height=1000, size=5,
)

In [None]:
# embed the latents: stack the predicted latents of all trials and reduce them with UMAP
compiled_latent = np.vstack(final_latent)
reducer = umap.UMAP(n_neighbors=20, min_dist=0.1)
embedded_data = reducer.fit_transform(compiled_latent)



In [None]:
# plot the UMAP results, colored by trial number

# one frame counter column and one trial index column per trial
frames = [np.arange(el.shape[0])[:, np.newaxis] for el in final_scaled_beh]
trial = [np.full((el.shape[0], 1), float(idx)) for idx, el in enumerate(final_scaled_beh)]

# behavior, frame counter and trial index side by side, then all trials stacked in one table
behavior = [np.hstack(columns) for columns in zip(final_scaled_beh, frames, trial)]
behavior = pd.DataFrame(np.vstack(behavior), columns=variable_list+['time', 'trial'])

# get the labels: the trial index as a column vector, median filtered over 21 frames
compiled_labels = behavior['trial'].to_numpy(copy=True).reshape((-1, 1))
compiled_labels = signal.medfilt(compiled_labels, kernel_size=[21, 1])

# define the sampling ratio (only every n-th frame is plotted)
sampling_ratio = 10

umap_data = np.hstack((embedded_data[::sampling_ratio, :], compiled_labels[::sampling_ratio, :]))

print(umap_data.shape)

umap_plot = hv.Scatter(umap_data, vdims=['Dim 2','parameter'], kdims=['Dim 1'])
umap_plot.opts(
    color='parameter', colorbar=True, cmap='Spectral', tools=['hover'], alpha=1,
    width=1200, height=1000, size=5,
)
umap_plot
