In [None]:
# imports
# suppress holoviews warning. Using warnings module didn't work
import logging
logging.getLogger("param.Dimension").setLevel(logging.CRITICAL)
logging.getLogger("param.ParameterizedMetaclass").setLevel(logging.CRITICAL)
logging.getLogger("param.HistogramPlot").setLevel(logging.CRITICAL)
logging.getLogger("param.AdjointLayout").setLevel(logging.CRITICAL)
logging.getLogger("param.OverlayPlot").setLevel(logging.CRITICAL)
logging.getLogger("param.HoloMap").setLevel(logging.CRITICAL)
logging.getLogger("param.CurvePlot").setLevel(logging.CRITICAL)
logging.getLogger("param.Layout").setLevel(logging.CRITICAL)
logging.getLogger("param.LayerPlot").setLevel(logging.CRITICAL)
logging.getLogger("param.RasterPlot").setLevel(logging.CRITICAL)
import os
import sys
sys.path.insert(0, os.path.abspath(r'D:\Code Repos\prey_capture'))


import panel as pn
import holoviews as hv
from holoviews import opts, dim
hv.extension('bokeh')
from bokeh.resources import INLINE

import paths
import functions_bondjango as bd
import functions_misc as fm
import functions_plotting as fp
import pandas as pd
import numpy as np
import sklearn.mixture as mix
import sklearn.decomposition as decomp
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_predict, train_test_split
from sklearn import svm, datasets
from sklearn import preprocessing
import sklearn.linear_model as lin
import sklearn.metrics as smet
import scipy.signal as ss

import random
import functions_data_handling as fd
import importlib
import processing_parameters


In [None]:
# Load the desired files
importlib.reload(processing_parameters)
# get the data paths
try:
    # when run through snakemake the input is a single file path; wrap it in a
    # list so the loading loop below iterates over files and not over characters
    data_path = [snakemake.input[0]]
except NameError:
    # not running under snakemake: look the files up in the database instead
    # define the search string
    search_string = processing_parameters.search_string
    # query the database for data to plot
    data_all = bd.query_database('analyzed_data', search_string)
    # keep only the entries whose slug marks them as preprocessing outputs
    data_path = [el['analysis_path'] for el in data_all if 'preproc' in el['slug']]
    print(data_path)

# load the behavior and calcium data (done for both branches above, so that
# `data` is always defined for the cells below)
beh_data = []
data = []
# for all the files
for files in data_path:
    # open read-only: the files are only read here and must not be created or modified
    with pd.HDFStore(files, mode='r') as h:
        beh_data.append(h['full_traces'])
        # not every file contains matched calcium data
        if '/matched_calcium' in h.keys():
            data.append(h['matched_calcium'])

# fail with an explicit message instead of the generic pd.concat error
if len(data) == 0:
    raise ValueError('No matched_calcium tables found in: ' + str(data_path))
# concatenate the data
data = pd.concat(data)

In [None]:
# Classify a given variable: decode the (binned) target behavior from the
# calcium activity with a cross-validated linear SVM

# define the target variable
target_behavior = 'cricket_0_mouse_distance'
# if truthy, the rounded behavior values are used directly as class labels
# instead of percentile bins
direct_label = 0
# define the number of bins
bin_number = 5
# shuffle the labels before training (label-shuffle control).
# NOTE(review): the original cell always shuffled, so True keeps that behavior;
# set to False to decode the real labels
shuffle_labels = True

images = {}
scores = []

prediction = {}

# get the table
sub_data = data
# get the available columns and split them into cell and non-cell columns
labels = list(sub_data.columns)
cells = [el for el in labels if 'cell' in el]
not_cells = [el for el in labels if 'cell' not in el]
# get the cell data
calcium_data = np.array(sub_data[cells].copy())
# replace NaNs with 0 so the scaler and the classifier can run
calcium_data[np.isnan(calcium_data)] = 0

# standardize every column (cell)
calcium_data = preprocessing.StandardScaler().fit_transform(calcium_data)

# get the target behavior
distance = sub_data.loc[:, target_behavior].to_numpy()
print(distance)

# remove repeated values in a row: keep a frame only if its value differs from
# the previous frame (the padded first entry is 0, so the first frame is dropped)
keep_vector = np.pad(np.diff(distance)!=0, (1, 0), mode='constant', constant_values=0)

distance = distance[keep_vector]
calcium_data = calcium_data[keep_vector, :]

# if direct bins are selected, just use the behavior vector directly
if direct_label:
    label_vector = np.round(distance)
else:
    # determine the bin edges from the percentiles of the target
    bins = np.percentile(distance, np.linspace(0, 100, bin_number+1))
    bins[-1] = np.ceil(bins[-1])
    bins[0] = np.floor(bins[0])

    # bin the target against the interior edges only, so labels always fall in
    # 0..bin_number-1 (digitizing against all the edges put the maximum in an
    # extra class whenever it was equal to the ceil'ed last edge)
    label_vector = np.digitize(distance, bins[1:-1])

print([np.min(calcium_data.flatten()), np.max(calcium_data.flatten())])

# shuffle the label vector (in place) if the control is enabled
if shuffle_labels:
    random.shuffle(label_vector)

# train the classifier
linear = svm.SVC(kernel='linear', C=1, decision_function_shape='ovr', random_state=0,
                 probability=False)

linear_distance = distance/np.max(distance)
print(linear_distance.shape)
print(calcium_data.shape)

# cross-validated decision function for every frame
linear_pred_save = cross_val_predict(linear, calcium_data, label_vector, cv=5, verbose=3,
                                     n_jobs=None, method='decision_function')

# turn the decision function into predicted labels. The columns follow the
# sorted class labels, so map the winning column back to its label instead of
# assuming that the labels are 0..n-1
class_labels = np.unique(label_vector)
if linear_pred_save.ndim == 1:
    # two classes: a single signed score, positive means the second class
    linear_pred = class_labels[(linear_pred_save > 0).astype(int)]
else:
    linear_pred = class_labels[np.argmax(linear_pred_save, axis=1)]

In [None]:
# Confusion matrix of the cross-validated prediction (rows: real, columns: predicted)
confusion_linear = confusion_matrix(label_vector, linear_pred)
# normalize the confusion matrix to rows. keepdims keeps the row totals as a
# column vector, so every row is divided by its own total (a flat vector of
# totals would broadcast across the columns instead)
row_totals = np.sum(confusion_linear, axis=1, keepdims=True)
confusion_linear = confusion_linear/row_totals
print('Accuracy:'+str(smet.balanced_accuracy_score(label_vector, linear_pred)))

# tick positions at the center of every cell, taken from the matrix size so
# they also work when no percentile bins were computed. The y ticks are
# flipped because the first row of the image is drawn at the top
class_number = confusion_linear.shape[0]
y_labels = [(class_number - idx - 0.5, el) for idx, el in enumerate(np.arange(class_number))]
x_labels = [(idx + 0.5, el) for idx, el in enumerate(np.arange(class_number))]
print(y_labels)

# plot
delta_image = hv.Image(confusion_linear, kdims=['Predicted distance', 'Real distance'],
                       bounds=[0, 0, confusion_linear.shape[1], confusion_linear.shape[0]])
delta_image.opts(yticks=y_labels, xticks=x_labels, colorbar=True, cmap='viridis', width=400, height=300)

delta_image

In [None]:
# Plot the prediction over time

# number of time points, plus the x axis and size options shared by all the curves
time_number = linear_pred.shape[0]
frame_axis = range(time_number)
curve_size = dict(height=600, width=800)

# real behavior and predicted label, both left unnormalized
normalized_param = distance
normalized_prediction = linear_pred.astype(float)

# behavioral variables on the same frames that were fed to the classifier
quadrant_info = sub_data.loc[keep_vector, 'cricket_0_quadrant']
angle_info = sub_data.loc[keep_vector, 'cricket_0_visual_angle']
speed_info = sub_data.loc[keep_vector, 'mouse_speed']

# filter the prediction by the quadrant: one copy is blanked (NaN) wherever the
# quadrant is above 0, the other wherever it is below 1
normalized_vis = np.where(quadrant_info > 0, np.nan, normalized_prediction)
normalized_nonvis = np.where(quadrant_info < 1, np.nan, normalized_prediction)

# one curve per trace
prediction = hv.Curve((frame_axis, normalized_prediction)).opts(**curve_size)
prediction_vis = hv.Curve((frame_axis, normalized_vis)).opts(**curve_size)
prediction_nonvis = hv.Curve((frame_axis, normalized_nonvis)).opts(**curve_size)
parameter = hv.Curve((frame_axis, normalized_param)).opts(**curve_size)

angle = hv.Curve((frame_axis, angle_info)).opts(tools=['hover'], **curve_size)
speed = hv.Curve((frame_axis, speed_info)).opts(**curve_size)

# overlay the real behavior and the prediction
compiled = parameter*prediction
compiled

In [None]:
# Multivariate regression: predict the target behavior(s) from the calcium
# activity with a cross-validated multi-task elastic net

# define the target variable (a list, so the target array is frames x targets)
# target_behavior = ['cricket_0_x', 'cricket_0_y']
# target_behavior = ['mouse_x', 'mouse_y']
target_behavior = ['cricket_0_mouse_distance']
direct_label = 0
# define the number of bins
bin_number = 5
# seed of the train/test split, so the reported score is reproducible
split_seed = 0

images = {}
scores = []

prediction = {}

# get the table
sub_data = data
# get the available columns and split them into cell and non-cell columns
labels = list(sub_data.columns)
cells = [el for el in labels if 'cell' in el]
not_cells = [el for el in labels if 'cell' not in el]
# get the cell data
calcium_data = np.array(sub_data[cells].copy())
# replace NaNs with 0 so the scaler and the regression can run
calcium_data[np.isnan(calcium_data)] = 0

# standardize every column (cell)
calcium_data = preprocessing.StandardScaler().fit_transform(calcium_data)

# get the target behavior and the distance to the cricket
parameter = sub_data.loc[:, target_behavior].to_numpy()
distance_to_prey = sub_data.loc[:, 'cricket_0_mouse_distance'].to_numpy()
# report the size of this cell's target (the previous print used a variable
# left over from the classification cell)
print(parameter.shape)
# keep a frame only if the first target differs from the previous frame
# (the padded first entry is 0, so the first frame is dropped)
keep_vector = np.pad(np.diff(parameter[:, 0])!=0, (1, 0), mode='constant', constant_values=0)
# frames with the cricket further than 3 away (not applied, see the commented line)
distance_vector = distance_to_prey>3

# keep_vector = (keep_vector) & (distance_vector)
parameter = parameter[keep_vector, :]
calcium_data = calcium_data[keep_vector, :]

# train the regression on 80% of the frames and test on the remaining 20%
calcium_train, calcium_test, parameter_train, parameter_test = \
    train_test_split(calcium_data, parameter, test_size=0.2, random_state=split_seed)
# the regularization strength and the l1 ratio are selected by cross-validation
linear = lin.MultiTaskElasticNetCV(max_iter=5000, l1_ratio=[.1, .5, .7, .9, .95, .99, 1], n_jobs=7)
linear.fit(calcium_train, parameter_train)
linear_pred = linear.predict(calcium_test)

print(calcium_data.shape)


# score on the held-out frames and report the selected hyperparameters
print('Exp variance:'+str(smet.r2_score(parameter_test, linear_pred)))
print(linear.alpha_)
print(linear.l1_ratio_)

In [None]:
# Plot the regression prediction against the real values of the test set, for
# however many targets were fitted (the cell used to assume exactly two)

# get the number of time points
time_number = linear_pred.shape[0]
# define the x axis
x_axis = np.arange(time_number)

# real and predicted values, always as (frames x targets) arrays
normalized_param = np.asarray(parameter_test)
if normalized_param.ndim == 1:
    normalized_param = normalized_param[:, np.newaxis]
normalized_prediction = linear_pred.astype(float)
if normalized_prediction.ndim == 1:
    normalized_prediction = normalized_prediction[:, np.newaxis]
# median filter with a 1x1 kernel, i.e. currently no smoothing
normalized_prediction = ss.medfilt(normalized_prediction, (1, 1))
target_number = normalized_param.shape[1]

# behavioral variables on the frames kept for the regression.
# NOTE(review): these are indexed with keep_vector, not with the test split,
# so they are not aligned to x_axis - confirm before plotting them
quadrant_info = sub_data.loc[:, 'cricket_0_quadrant'].to_numpy()
quadrant_info = quadrant_info[keep_vector]

distance_info = distance_to_prey[keep_vector]

heading_info = sub_data.loc[:, 'cricket_0_delta_heading'].to_numpy()
heading_info = heading_info[keep_vector]

# one real and one predicted curve per target
parameter_curves = [hv.Curve((x_axis, normalized_param[:, idx])).opts(height=300, width=800)
                    for idx in range(target_number)]
prediction_curves = [hv.Curve((x_axis, normalized_prediction[:, idx])).opts(height=300, width=800)
                     for idx in range(target_number)]

# absolute error per frame and target (square root of the squared error)
mse_info = np.sqrt((normalized_param - normalized_prediction)**2)
mse1 = hv.Curve((x_axis, mse_info[:, 0])).opts(height=300, width=800)

# real vs predicted scatter per target
scatters = [hv.Scatter((normalized_param[:, idx], normalized_prediction[:, idx]),
                       kdims=['Real', 'Predicted'])
            for idx in range(target_number)]

# stack the real/predicted overlays followed by the scatters in a single column
overlays = [real*predicted for real, predicted in zip(parameter_curves, prediction_curves)]
compiled = hv.Layout(overlays + scatters).cols(1)
compiled

In [None]:
# plot the cumulative histogram of the regression weights, one curve per target

# one row of weights per target; a 1D coef_ is promoted to a single row
weights = np.atleast_2d(linear.coef_)

weight_curves = []
for target_weights in weights:
    freq, edges = np.histogram(target_weights, 20)
    # np.histogram returns one more edge than counts, so pair every cumulative
    # count with the right edge of its bin to get equal-length x and y
    weight_curves.append(hv.Curve((edges[1:], np.cumsum(freq))))

# overlay the curves of all the targets
combined = hv.Overlay(weight_curves)
combined

In [None]:
%%time
# calculate regression for prediction into past and future: fit the regression
# with the behavior shifted relative to the calcium data by a range of lags,
# and average the test-set r2 over several random train/test splits

# define the target variable
# target_behavior = ['cricket_0_x', 'cricket_0_y']
target_behavior = ['mouse_x', 'mouse_y']
# target_behavior = ['cricket_0_mouse_distance']
# target_behavior = ['mouse_speed']
# target_behavior = ['cricket_0_delta_heading']
direct_label = 0
# define the number of bins
bin_number = 5

images = {}
scores = []

prediction = {}

# get the table
sub_data = data
# get the available columns and split them into cell and non-cell columns
labels = list(sub_data.columns)
cells = [el for el in labels if 'cell' in el]
not_cells = [el for el in labels if 'cell' not in el]
# get the cell data
calcium_data = np.array(sub_data[cells].copy())
# replace NaNs with 0 so the scaler and the regression can run
calcium_data[np.isnan(calcium_data)] = 0

# standardize every column (cell)
calcium_data = preprocessing.StandardScaler().fit_transform(calcium_data)

# get the target behavior and the distance to the cricket
parameter = sub_data.loc[:, target_behavior].to_numpy()
distance_to_prey = sub_data.loc[:, 'cricket_0_mouse_distance'].to_numpy()
# frames with the cricket further than 3 away (not applied, see the commented line)
distance_vector = distance_to_prey>3

# keep_vector = (keep_vector) & (distance_vector)
# keep every frame: dropping frames would break the frame-to-frame spacing the
# lags rely on
keep_vector = np.ones_like(distance_to_prey, dtype=bool)
parameter = parameter[keep_vector, :]
calcium_data = calcium_data[keep_vector, :]

# define the lag range
lag_range = 20
# define the number of shuffles
shuffle_number = 50
# seed of the random train/test splits, so the curve is reproducible
shuffle_seed = 0
# get the set of lags, symmetric around 0 (the end point is included, it used
# to stop at lag_range - 1)
lag_vector = np.arange(-lag_range, lag_range + 1)
# allocate memory for the coefficients
r2_list = []

# random generator for the splits
rng = np.random.RandomState(shuffle_seed)
# number of frames, and how many of them go to the training set (80%)
number_points = parameter.shape[0]
train_number = int(np.round(number_points*0.8))

# for all the shuffles
for shuff in np.arange(shuffle_number):
    # allocate memory for the within shuffle scores
    within_list = []

    # get a random vector to separate train (1) and test (0) set; the same
    # split is used for all the lags of this shuffle
    shuffle_vector = np.zeros((number_points))
    shuffle_vector[:train_number] = 1
    rng.shuffle(shuffle_vector)

    # for all the lags
    for lag in lag_vector:

        if lag > 0:
            # positive lag: calcium at frame t is paired with behavior at t + lag
            parameter_lag = parameter[lag:, :]
            calcium_data_lag = calcium_data[:-lag, :]
            shuffle_lag = shuffle_vector[:-lag]
        elif lag < 0:
            # negative lag: calcium at frame t is paired with behavior at t - |lag|
            parameter_lag = parameter[:lag, :]
            calcium_data_lag = calcium_data[-lag:, :]
            shuffle_lag = shuffle_vector[-lag:]
        else:
            parameter_lag = parameter
            calcium_data_lag = calcium_data
            shuffle_lag = shuffle_vector

        # split into train and test (the split vector is trimmed like the calcium data)
        calcium_train = calcium_data_lag[shuffle_lag==1, :]
        parameter_train = parameter_lag[shuffle_lag==1]
        calcium_test = calcium_data_lag[shuffle_lag==0, :]
        parameter_test = parameter_lag[shuffle_lag==0]

        # fit on the train set and score on the test set
        linear = lin.MultiTaskElasticNet(alpha=.1, max_iter=5000, l1_ratio=0.5)
        linear.fit(calcium_train, parameter_train)
        linear_pred = linear.predict(calcium_test)
        within_list.append(smet.r2_score(parameter_test, linear_pred))
    # save the scores for this shuffle
    r2_list.append(within_list)

# average across the shuffles, with the standard error of the mean
r2_average = np.mean(np.array(r2_list), axis=0)
r2_sem = np.std(np.array(r2_list), axis=0)/np.sqrt(shuffle_number)

# report the lag with the highest average r2 and plot r2 against the lag
print(lag_vector[np.argmax(r2_average)])
line = hv.Curve((lag_vector, r2_average), kdims=['Lags (frames)'], vdims=['r2'])
shade = hv.Spread((lag_vector, r2_average, r2_sem))
line * shade
