In [None]:
# imports
# import pixiedust
import logging
# raise the threshold of these `param.*` loggers so that only CRITICAL records get through
for _param_logger_name in (
    "param.Dimension",
    "param.ParameterizedMetaclass",
    "param.SpreadPlot",
    "param.CurvePlot",
    "param.AdjointLayout",
    "param.HoloMap",
    "param.OverlayPlot",
    "param.BarPlot",
    "param.ErrorPlot",
    "param.RasterPlot",
    "param.Layout",
    "param.PointPlot",
):
    logging.getLogger(_param_logger_name).setLevel(logging.CRITICAL)

import os
import sys
sys.path.insert(0, os.path.abspath(r'D:\Code Repos\prey_capture'))

import panel as pn
import holoviews as hv
from holoviews import opts, dim
hv.extension('bokeh')
from bokeh.resources import INLINE

import paths
import functions_bondjango as bd
import pandas as pd
import numpy as np
import functions_plotting as fp
import functions_data_handling as fd
import functions_kinematic as fk
from scipy.stats import sem
import sklearn.decomposition as decomp
import umap
import sklearn.mixture as mix
from scipy.stats import sem
import pickle as pk
import itertools as it
import processing_parameters
from scipy.signal import medfilt

import importlib
from pprint import pprint as pp

# # define the name to be used for the saved figures
# save_name = 'acrossTrials'
# line_width = 5
# folder holding the stored UMAP output: 'UMAP_result.npy' and 'UMAP_model.pk'
# are read from here further down in the notebook
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere
target_folder = r'J:\Drago Guggiana Nilo\Prey_capture\temp_VAME\mouse_15'

In [None]:
%%time
# load the data

# Load the desired files
# (reload so that edits to processing_parameters are picked up without restarting the kernel)
importlib.reload(processing_parameters)

# define the threshold for matched cells
match_threshold = 10
# get the data paths
try:
    data_path = snakemake.input[0]
except NameError:
    # no snakemake object in scope: query the database with the configured search strings
    # get the search list
    search_list = processing_parameters.search_list
    # allocate memory for the data
    beh_data = []
    ca_data = []

    beh_motif = []
    ca_motif = []

    beh_latents = []
    ca_latents = []

    # source file of every loaded entry, index-aligned with the ca_* / beh_* lists
    ca_paths = []
    beh_paths = []
    # files that were left out because one of the expected tables was missing
    skipped_paths = []

    # allocate a list for all paths (need to preload to get the dates)
    all_paths = []
    # for all the search strings
    for search_string in search_list:

        # query the database for data to plot
        data_all = bd.query_database('analyzed_data', search_string)
        data_all = [el for el in data_all if 'preproc' in el['slug']]
        data_path = [el['analysis_path'] for el in data_all if '_preproc' in el['slug']]
        all_paths.append(data_path)
    # now load the files
    for data_path in all_paths:
        # for all the files
        for files in data_path:
            # load the data
            with pd.HDFStore(files) as h:
                # a file with a matched_calcium table is treated as a calcium file,
                # anything else as a behavior-only file
                has_calcium = '/matched_calcium' in h.keys()
                try:
                    # read every table BEFORE storing any of them: a KeyError half-way
                    # through must not leave the motif/latent/data lists with different lengths
                    motifs = h['motifs']
                    latents = h['latents']
                    traces = h['matched_calcium'] if has_calcium else h['full_traces']
                except KeyError:
                    # incomplete file: remember it and move on
                    skipped_paths.append(files)
                    continue

            if has_calcium:
                ca_motif.append(motifs)
                ca_latents.append(latents)
                ca_data.append(traces)
                ca_paths.append(files)
            else:
                beh_motif.append(motifs)
                beh_latents.append(latents)
                beh_data.append(traces)
                beh_paths.append(files)

    print(f'Number of calcium files: {len(ca_data)}')
    print(f'Number of behavior only files: {len(beh_data)}')
    if skipped_paths:
        print(f'Number of skipped files (missing tables): {len(skipped_paths)}')

In [None]:
%%time
# create a UMAP embedding of the latents

# compile the data: stack the per-file latents row-wise, calcium files first and
# behavior-only files after. A single vstack over the joined lists gives the same
# result as stacking each group separately, and also works when one of the two
# groups is empty. A column-count mismatch between files now raises instead of
# silently dropping every behavior-only file from the embedding.
compiled_latent = np.vstack(list(ca_latents) + list(beh_latents))
# discard the first column before embedding
compiled_latent = compiled_latent[:, 1:]

# embed using UMAP
# original parameters 0.5 and 10
# 0.1 and 30 also works
# 0.05 and 30 works too
# NOTE(review): no random_state is passed, so the embedding is not reproducible
# from one run to the next - consider fixing a seed
reducer = umap.UMAP(min_dist=0.9, n_neighbors=10)
embedded_data = reducer.fit_transform(compiled_latent)

# # save the embedding
# np.save(os.path.join(target_folder, 'UMAP_result'), embedded_data)

# # generate the model name
# model_name = os.path.join(target_folder, 'UMAP_model.pk')
# # save the estimator
# with open(model_name, 'wb') as file:
#     pk.dump(reducer, file)

In [None]:
# Read back a stored embedding and the pickled UMAP model saved in target_folder.
# This replaces the `embedded_data` and `reducer` values produced by the fit above.
model_name = os.path.join(target_folder, 'UMAP_model.pk')
embedded_data = np.load(os.path.join(target_folder, 'UMAP_result.npy'))

# unpickling executes code stored in the file: only load model files you created yourself
with open(model_name, 'rb') as model_handle:
    reducer = pk.load(model_handle)

In [None]:
# plot the embedding

# plot the UMAP clusters
print(ca_motif[0].shape)

# one label array per file, calcium files first and behavior-only files after.
# A plain list is used on purpose: np.hstack on the two lists would first try to
# turn each list of per-file tables into a single array, which fails for files of
# different length on recent NumPy, joins along the wrong axis when the files
# happen to share a length, and breaks when one of the lists is empty.
label_list = [np.asarray(el) for el in list(ca_motif) + list(beh_motif)]
print(len(label_list))
# get the labels: stack all files into a single (n_samples, 1) column
# compiled_labels = np.expand_dims(np.hstack(label_list).T, axis=1)
compiled_labels = np.vstack(label_list)

# print(motif_sort)
# compiled_labels = motif_revsort[compiled_labels]

# define the sampling ratio (1 keeps every sample)
sampling_ratio = 1

# columns: the embedding coordinates followed by the motif label
umap_data = np.concatenate((embedded_data[::sampling_ratio, :],
                            compiled_labels[::sampling_ratio, :]), axis=1)

# leave the samples labelled as motif 5 or 8 out of the plot
plot_data = umap_data[(umap_data[:, 2] != 5) & (umap_data[:, 2] != 8), :]

print(umap_data.shape)


umap_plot = hv.Scatter(plot_data, vdims=['Dim 2', 'cluster'], kdims=['Dim 1'])
print(umap_plot)
umap_plot.opts(color='cluster', colorbar=True, cmap='Spectral', tools=['hover'])
umap_plot.opts(opts.Scatter(width=800, height=600))
umap_plot

In [None]:
# collect one behavioral variable per file, trimmed at both ends, as a single column

# name of the column to extract
target_field = 'mouse_speed'

# number of samples removed from each end of every trace
time_window = 30 // 2

# one trimmed 1D array per file, calcium files first and behavior-only files after
feature_list = []

for dataframe in ca_data + beh_data:
    try:
        # median-filter the raw column with a 41-sample kernel
        data = medfilt(dataframe.loc[:, target_field].to_numpy(), 41)
    except KeyError:
        # column not present in this file: use zeros of the trimmed length instead
        trimmed = np.zeros(dataframe.iloc[time_window:-time_window, 0].shape[0])
    else:
        trimmed = data[time_window:-time_window]
    feature_list.append(trimmed)

print(len(feature_list))
# join all files and add a trailing axis so the result is a column
features = np.hstack(feature_list)[:, np.newaxis]

print(features.shape)
print(compiled_labels.shape)
    


In [None]:
# plot the UMAP embedding with a chosen feature

# keep the samples whose feature value is below 50 and whose motif is neither 5 nor 8
selection_vector = (features<50) & (compiled_labels!=5) & (compiled_labels!=8)
# selection_vector = np.ones((embedded_data.shape[0], 1)) == 1
# selection_vector = (features > 0)
print(selection_vector.shape)
print(embedded_data.shape)
# selection_vector is a column, so its first (only) column is used as the row mask
cropped_data = embedded_data[selection_vector[:, 0], :]

feature_labels = features[selection_vector[:, 0], :]
# define the sampling ratio (1 keeps every sample)
sampling_ratio = 1

# columns: the embedding coordinates followed by the feature value
umap_feature = np.concatenate((cropped_data[::sampling_ratio, :],
                            feature_labels[::sampling_ratio, :]), axis=1)

# umap_feature = umap_feature[umap_data[:, 2]!= 5, :]
# umap_feature = umap_feature[umap_data[:, 2]!= 8, :]

print(umap_feature.shape)


# the last column holds the values of `target_field`, not motif ids, so the colour
# dimension is named after the feature (it was labelled 'cluster' before, which
# mislabelled the colorbar and the hover tool)
umap_plot = hv.Scatter(umap_feature, vdims=['Dim 2', target_field], kdims=['Dim 1'])
print(umap_plot)
umap_plot.opts(color=target_field, colorbar=True, cmap='Spectral', tools=['hover'])
umap_plot.opts(opts.Scatter(width=800, height=600))
umap_plot

# freq, bins = np.histogram(features, bins=10)
# hv.Bars((bins, freq)).opts(width=600)

In [None]:
# Calculate and plot the average usage
# compiled_usage = np.vstack(usage_list)
# motif_number = beh_latents[0].shape[1]
# number of motifs = number of columns of the usage matrix (hard-coded)
motif_number = 15#np.unique(compiled_labels).shape[0]
print(motif_number)
# flatten the per-search-string path lists into one array of file paths
results_list = np.hstack(all_paths)
print(len(results_list))
print(len(label_list))
# The succ/fail split further down pairs usage_all[idx] with results_list[idx], so
# both sequences must describe the same files. Files skipped while loading make
# them differ in length, which would silently shift every label: fail loudly instead.
# NOTE(review): label_list is built as calcium files followed by behavior-only files,
# while results_list follows the database query order - confirm that both orders agree.
if len(results_list) != len(label_list):
    raise ValueError(f'{len(results_list)} file paths but {len(label_list)} label arrays: '
                     'paths and labels cannot be matched by position')
# allocate memory for the output usages (one row per file, one column per motif)
usage_all = np.zeros((len(label_list), motif_number))

# for all the files
for idx, labels in enumerate(label_list):
    # get the unique numbers and their counts
#     unique_nums, unique_counts = np.unique(motif_revsort[labels], return_counts=True)
    unique_nums, unique_counts = np.unique(labels, return_counts=True)
    # fill in the corresponding indexes in the matrix; the motif ids are cast to int
    # because labels stored as floats cannot be used as column indices
    usage_all[idx, unique_nums.astype(int)] = unique_counts

# average
average_usage = np.mean(usage_all, axis=0)
sem_usage = sem(usage_all, axis=0)

# plot
def motif_usage_plot(data_in, std_in, axis_limits):
    """Build a bar chart of motif usage overlaid with error bars.

    data_in: one bar height per motif.
    std_in: error magnitude per motif, drawn around the bar heights.
    axis_limits: number of x positions; the error bars sit at 0 .. axis_limits - 1.

    Returns the holoviews overlay of the bars and the error bars.
    """
    # x positions of the error bars
    motif_positions = np.arange(axis_limits)

    usage_bars = hv.Bars(data_in, kdims=['Motif'], vdims=['Fraction'])
    # default size and y range; callers may override them with .opts() on the result
    usage_bars.opts(width=600, height=400, ylim=(0, 150))

    usage_errors = hv.ErrorBars((motif_positions, data_in, std_in))
    return usage_bars * usage_errors

def _normalized_usage(usages, paths, keyword):
    """Average the usage rows whose file path contains `keyword`, scaled by the peak mean.

    usages: per-file usage rows (one row per file, one column per motif).
    paths: file paths, index-aligned with `usages`.
    keyword: substring that selects the files of interest.

    Returns (selected_rows, scaled_mean, scaled_sem), where both the mean and the
    SEM are divided by the largest value of the mean.
    Raises ValueError when no path contains `keyword`.
    """
    selected = np.array([el for idx, el in enumerate(usages) if keyword in paths[idx]])
    if selected.shape[0] == 0:
        # averaging an empty selection would only feed NaNs into the plot
        raise ValueError(f"no file path contains '{keyword}': cannot compute its motif usage")
    average = np.mean(selected, axis=0)
    peak = np.max(average)
    return selected, average / peak, sem(selected, axis=0) / peak


# calculate the succ and fail averages (each normalized to its own most used motif)
succ_usages, succ_average, succ_std = _normalized_usage(usage_all, results_list, 'succ')

succ_plot = motif_usage_plot(succ_average, succ_std, motif_number).opts(ylim=(0, 1.2))

fail_usages, fail_average, fail_std = _normalized_usage(usage_all, results_list, 'fail')

fail_plot = motif_usage_plot(fail_average, fail_std, motif_number).opts(ylim=(0, 1.2))

# stack both plots in a single column, each with its own axes
img = succ_plot+fail_plot
img.opts(shared_axes=False).cols(1)
img
