## Testing for a relationship between physical distance and statistical dependence

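In this notebook, we test for the presence and nature of a relationship between the physical distance separating recording electrodes and the statistical dependence (correlation) between the units recorded on them. The test is an ANOVA (parametric), implemented as an OLS regression on categorical dummy variables.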

In [31]:
''' Dependencies '''
from sklearn.decomposition import PCA
from tqdm.auto import tqdm
import scipy.io as sio
import warnings
import pandas as pd
import numpy as np
import scipy
import sys
import statsmodels.api as sm
from tqdm import tqdm as pbar

# my scripts
from pyaldata import * 
import preprocess

import utils
import importlib
importlib.reload(utils)
from utils import *

# Plotting
# from matplotlib.offsetbox import AnchoredText
import matplotlib.pyplot as plt
import seaborn as sns

%config InlineBackend.figure_format = 'retina'

# Placeholder sketch of a nested results layout: every top-level key maps to a
# dict with an (as yet empty) 'Var exp' entry.
Dict = {
    'Sessions' : {
        'Var exp': {
            # some lambda function EXP_VAR_THRESHOLDS
        }
    },  # this comma was missing, which made the whole cell a SyntaxError

    'Session 1' : {
        'Var exp': {
            # some lambda function EXP_VAR_THRESHOLDS
        }
    },
}

In [62]:
''' Parameters '''
# Sentinel distance used to mark a pair of units that sit on different arrays
OTHER_ARRAY_D = 50

# Grid of explained-variance thresholds: starts at 0.2, step 0.025, stops before
# 0.95 (subject to the usual float rounding of np.arange)
EXP_VAR_THRESHOLDS = np.arange(start=0.2, stop=0.95, step=0.025)

# Which population the analysis runs on
TYPE_ANALYSIS = 'pooled'  # alternative: 'm1', 'pmd' #TODO

# Recording sessions, named <monkey>_<task>_<perturbation>_<date>
SESSIONS = [
    'Chewie_CO_VR_2016-09-09',
    'Chewie_CO_VR_2016-09-12',
    'Chewie_CO_VR_2016-09-14',
    'Chewie_CO_VR_2016-10-06',
    'Chewie_CO_FF_2016-09-15',
    'Chewie_CO_FF_2016-09-21',
    'Chewie_CO_FF_2016-10-05',
    'Chewie_CO_FF_2016-10-07',
    'Mihili_CO_VR_2014-03-04',
    'Mihili_CO_VR_2014-03-06',
    'Mihili_CO_FF_2014-02-03',
    'Mihili_CO_FF_2014-02-17',
    'Mihili_CO_FF_2014-02-18',
    'Mihili_CO_FF_2014-03-07',
    'Mihili_CO_VR_2014-03-03',
]

In [63]:
''' Experimental sessions and electrode maps '''
# For each array: read its .cmp file, then pass the result through
# localize_elecs for electrodes 1..96.
m1_cmp = read_cmp(file_path='/Users/Daphne/Data/Chewie Left M1 SN 6250-001474.cmp')
m1_emap = localize_elecs(m1_cmp, elecs=range(1, 97))

pmd_cmp = read_cmp(file_path='/Users/Daphne/Data/Chewie Left PMd SN 6251-001469.cmp')
pmd_emap = localize_elecs(pmd_cmp, elecs=range(1, 97))

In [96]:
''' Perform preprocessing and store in sessions dictionary '''

main_dict = {}

# Only the first two sessions are processed. Each entry is keyed by the last
# ten characters of the session name (the date part).
for session_name in pbar(SESSIONS[:2]):
    session_df = preprocess.preprocess_data(session_name, '/Users/Daphne/Data/')
    session_key = session_name[-10:]
    main_dict[session_key] = {'df': session_df}

100%|██████████| 2/2 [00:28<00:00, 14.33s/it]


In [35]:
''' Trial-concatenate and perform dim reduction all sessions '''

# Firing-rate column used for each analysis type
RATES_COLUMN = {'pooled': 'both_rates', 'm1': 'M1_rates', 'pmd': 'PMd_rates'}
if TYPE_ANALYSIS not in RATES_COLUMN:
    # Previously an unknown value left X and N undefined and failed later with a NameError
    raise ValueError(f'Unknown TYPE_ANALYSIS {TYPE_ANALYSIS!r}; expected one of {sorted(RATES_COLUMN)}')
rates_col = RATES_COLUMN[TYPE_ANALYSIS]

# Per-session containers read by later cells. They are (re)created here so the
# cell works on a fresh kernel and does not grow the lists when re-run.
td_sessions = []
model_sessions, pcs_sessions = [], []
model_surr_sessions, pcs_surr_sessions = [], []

#### ITERATE THROUGH SESSIONS ####
for s in pbar(main_dict.keys()):

    # Trial data for this session.
    # NOTE(review): assumes preprocess.preprocess_data returns a trial-data DataFrame
    # (it is indexed by column name below) - confirm against preprocess.py.
    td = main_dict[s]['df']
    td_sessions.append(td)

    # Stack all trials along axis 0; N is the second dimension of the first trial's rate matrix
    X = np.concatenate(td[rates_col].values, axis=0)
    N = td[rates_col][0].shape[1]

    # PSEUDO SURROGATE #TODO
    # Standard-normal noise with the same shape as X. No seed is set in the visible
    # cells, so the surrogate differs between runs.
    X_surr = np.random.normal(size=(X.shape))

    #### DIM REDUCTION ####
    # True data
    model = PCA(n_components=N)
    model.fit(X)
    pcs = model.components_.T  # transposed components_, one column per PC
    main_dict[s]['model'] = model
    main_dict[s]['pcs'] = pcs
    model_sessions.append(model)
    pcs_sessions.append(pcs)

    # Surrogate
    model_surr = PCA(n_components=N)
    model_surr.fit(X_surr)
    pcs_surr = model_surr.components_.T
    # NOTE(review): the original ended with an unfinished `session_dict[session]['pcs'] =`;
    # the surrogate results are stored next to the empirical ones instead - confirm intent.
    main_dict[s]['model_surr'] = model_surr
    main_dict[s]['pcs_surr'] = pcs_surr
    model_surr_sessions.append(model_surr)
    pcs_surr_sessions.append(pcs_surr)

100%|██████████| 8/8 [00:30<00:00,  3.86s/it]


In [36]:
# Sanity check: the PC matrix of the first session is expected to be square (N x N)

first_session_pcs = pcs_sessions[0]
first_session_pcs.shape

(218, 218)

In [37]:
# TODO: Generate surrogate data with TME
# ...

### Compute correlations and spatial distances for all sessions

In [42]:
''' CHOOSE VAR EXPLAINED LEVEL '''

# Passed as `var_exp` to get_df_sessions_varexp.
# Presumably the fraction of variance (0-1) that the retained PCs must explain - confirm in utils.
VAR_EXPLAINED = 0.4

In [43]:
# Build the per-session tables for the empirical and the surrogate data at the
# chosen variance-explained level. Both results are indexed by session position
# further down. The other-array sentinel is taken from the OTHER_ARRAY_D
# parameter instead of repeating the literal 50, so the two cannot drift apart.
sessions_dfs, sessions_dfs_surr = get_df_sessions_varexp(
    sessions=sessions,
    var_exp=VAR_EXPLAINED,
    model_sessions=model_sessions,
    pcs_sessions=pcs_sessions,
    model_surr_sessions=model_surr_sessions,
    pcs_surr_sessions=pcs_surr_sessions,
    td_sessions=td_sessions,
    m1_emap=m1_emap,
    pmd_emap=pmd_emap,
    OTHER_ARRAY_D=OTHER_ARRAY_D,
)

100%|██████████| 8/8 [01:34<00:00, 11.85s/it]


### Make violin plots for all sessions, given var. explained level

In [4]:
# ''' Concatenate surrogate and real data dfs, make violins '''

# plt.rcParams['axes.prop_cycle'] = plt.cycler(color=['#69A5FF', '#DCDFE3'])
# plt.rcParams['axes.labelsize'] = 16

# for s in range(len(sessions)):
    
#     print(sessions[s])
    
#     df_emp_and_surr = pd.concat([sessions_dfs[s], sessions_dfs_surr[s]])
#     fig, axs = plt.subplots(1, 2, figsize=(17, 6))

#     sns.violinplot(x='category', y='correlation', hue='Type', data=df_emp_and_control, inner='quartile', 
#                 split=True, order=['same electrode', 'same array', 'other array'], legend=False, ax=axs[0])

#     sns.violinplot(x='within distance', y='correlation', hue='Type', data=df_emp_and_control, inner='quartile',
#                 split=True, order=['0', '(0, 2]','(2, 4]', '(4, inf)'], ax=axs[1], legend=True)

#     axs[0].set_ylim=(-1, 1.1)
#     axs[0].get_legend().remove()
#     axs[0].set_ylabel(r'$\rho$')
#     axs[1].set_ylim=(-1, 1.1)
#     axs[1].set_ylabel(r'$\rho$')
#     axs[1].legend(bbox_to_anchor=(1.3, 1))

#     # Make means different color
#     for l in axs[0].lines[1::3]:
#         l.set_linewidth(1.3)
#         l.set_color('#000000')

#     for l in axs[1].lines[1::3]:
#         l.set_linewidth(1.3)
#         l.set_color('#000000')
#     sns.despine(), plt.show()

### Run ANOVA tests

In [44]:
# Inspect the table of the first session
# (columns: correlation, distance, on array, category, within distance, Type)
sessions_dfs[0]

Unnamed: 0,correlation,distance,on array,category,within distance,Type
0,-0.148665,3.000000,M1,same array,"(2, 4]",Actual
1,0.180301,2.236068,M1,same array,"(2, 4]",Actual
2,0.370257,1.414214,M1,same array,"(0, 2]",Actual
3,-0.076905,2.236068,M1,same array,"(2, 4]",Actual
4,-0.520833,2.236068,M1,same array,"(2, 4]",Actual
...,...,...,...,...,...,...
23648,-0.388244,1.000000,PMd,same array,"(0, 2]",Actual
23649,0.631326,1.000000,PMd,same array,"(0, 2]",Actual
23650,0.675981,0.000000,PMd,same electrode,0,Actual
23651,0.407076,0.000000,PMd,same electrode,0,Actual


In [46]:
''' Choose session '''

# Position used to pick one session out of sessions_dfs and sessions_dfs_surr
SESSION_ID = 0

In [53]:
# Data sets to fit: label -> table of the chosen session
variants = {
  'Empirical study' : sessions_dfs[SESSION_ID],
  'Control study: TME': sessions_dfs_surr[SESSION_ID],
}

#############
# Regression set-up: marked by the author as working - change with care
#############

# Fixed within intervals, as (lower bound, upper bound] pairs on distance
INTERVALS = [(0, 2), (0, 4)]


# Dummy functions: each returns a 0/1 integer Series aligned with df
def int_dum(df, lb, ub):
    """Indicator of lb < distance <= ub."""
    dist = df['distance']
    return ((dist > lb) & (dist <= ub)).astype(int)


def on_arr(df, arr):
    """Indicator of the 'on array' column being equal to `arr`."""
    return (df['on array'] == arr).astype(int)


def same_ele(df):
    """Indicator of distance == 0 (same electrode)."""
    return (df['distance'] == 0).astype(int)


def same_arr(df):
    """Indicator of 0 < distance < OTHER_ARRAY_D (same array, different electrode)."""
    dist = df['distance']
    return ((dist > 0) & (dist < OTHER_ARRAY_D)).astype(int)


# Analyses methods: each maps column names to functions that build that column.
# 'fixed cols' take only df; 'variable cols' are evaluated once per interval.
analyses = {
  'single category' : {
    'fixed cols' : {
      'OA Constant' : lambda df: [1] * len(df),
      'SE (d = 0)' : same_ele,
      'SA (d exists)' : same_arr,
    },
    'variable cols' : {}
  },
  'multiple categories' : {
    'fixed cols' : {
      'M1 Constant' : lambda df: on_arr(df, 'M1'),
      'OA Constant' : lambda df: on_arr(df, 'OA'),
      'PMd Constant' : lambda df: on_arr(df, 'PMd'),
      'SE (d = 0)' : same_ele
    },
    'variable cols' : {
      'd in ' : lambda df, lb, ub: int_dum(df, lb, ub),
    },
  },
  'separate' : {
    'fixed cols' : {
      'M1 Constant' : lambda df: on_arr(df, 'M1'),
      'OA Constant' : lambda df: on_arr(df, 'OA'),
      'PMd Constant' : lambda df: on_arr(df, 'PMd'),
      'M1 SE (d = 0)' : lambda df: on_arr(df, 'M1') * same_ele(df),
      'PMd SE (d = 0)' : lambda df: on_arr(df, 'PMd') * same_ele(df),
    },
    'variable cols' : {
      'M1 d in ' : lambda df, lb, ub: int_dum(df, lb, ub) * on_arr(df, 'M1'),
      'PMd d in ' : lambda df, lb, ub: int_dum(df, lb, ub) * on_arr(df, 'PMd'),
    }
  },
}

# Fit one OLS model per (study, analysis) pair; results are keyed '<study> (<analysis>)'
res_dict = {}
for study, df in variants.items():
    for analysis, cols_dict in analyses.items():
        # Start from df's own index. An index-less frame would take a fresh RangeIndex
        # from the list-valued constant column, and the Series-valued dummies would
        # then be re-aligned against it (NaNs whenever df's index is not 0..n-1).
        df_X = pd.DataFrame(index=df.index)
        for col_name, col_func in cols_dict['fixed cols'].items():
            df_X[col_name] = col_func(df)
        for (lb, ub) in INTERVALS:
            for col_name, col_func in cols_dict['variable cols'].items():
                df_X[col_name + f'({lb:.1f}, {ub:.1f}]'] = col_func(df, lb, ub)
        # Alphabetical column order, so parameters are reported in a stable order
        df_X = df_X.reindex(sorted(df_X.columns), axis=1)
        # hasconst=True: the constant is supplied by the dummy columns themselves
        res = sm.OLS(df.correlation, df_X, hasconst=True).fit()
        title = study + f' ({analysis})'
        #print(res.summary(title=title, alpha=.05), end='\n\n\n\n')
        res_dict[title] = res

In [55]:
# Fitted regression results, keyed '<study> (<analysis>)'
res_dict

{'Empirical study (single category)': <statsmodels.regression.linear_model.RegressionResultsWrapper at 0x1b2334190>,
 'Empirical study (multiple categories)': <statsmodels.regression.linear_model.RegressionResultsWrapper at 0x1b2334ad0>,
 'Empirical study (separate)': <statsmodels.regression.linear_model.RegressionResultsWrapper at 0x1b2334590>,
 'Control study: TME (single category)': <statsmodels.regression.linear_model.RegressionResultsWrapper at 0x1b234d650>,
 'Control study: TME (multiple categories)': <statsmodels.regression.linear_model.RegressionResultsWrapper at 0x1b234d910>,
 'Control study: TME (separate)': <statsmodels.regression.linear_model.RegressionResultsWrapper at 0x1b234dd10>}

In [51]:
# Estimated coefficients of the 'multiple categories' model fitted on the empirical data
res_dict['Empirical study (multiple categories)'].params

M1 Constant        0.100014
OA Constant        0.030235
PMd Constant       0.095465
SE (d = 0)         0.111069
d in (0.0, 2.0]   -0.008658
d in (0.0, 4.0]   -0.000177
dtype: float64

In [52]:
# p-values of the same coefficients ('multiple categories' model, empirical data)
res_dict['Empirical study (multiple categories)'].pvalues

M1 Constant         9.820144e-22
OA Constant         2.080413e-14
PMd Constant       1.149939e-132
SE (d = 0)          5.456913e-05
d in (0.0, 2.0]     4.160525e-01
d in (0.0, 4.0]     9.798400e-01
dtype: float64

### Summary plots

In [58]:
# Estimated coefficients of the 'single category' model fitted on the empirical data
res_dict['Empirical study (single category)'].params

OA Constant      0.030235
SA (d exists)    0.064564
SE (d = 0)       0.176832
dtype: float64

In [None]:
''' Compute physical distances and correlations for range of manifold dims, across all sessions '''

# Reuse the threshold grid from the parameters cell instead of redefining it
exp_var_levels = EXP_VAR_THRESHOLDS
exp_var_to_pcs = []    # per session: component index found for every threshold
df_both_sessions = []  # one table per (session, threshold), appended in loop order
df_surr_sessions = []

# NOTE(review): `sessions` is not defined in the cells shown in this notebook - confirm where it is set.
for s in pbar(range(len(sessions))):

    # Select session data.
    # Uses model_sessions / pcs_sessions, the lists filled by the PCA cell; the former
    # model_both_sessions / pcs_both_sessions are not defined anywhere in the notebook.
    cum_var = model_sessions[s].explained_variance_ratio_.cumsum()
    # Index of the first component at which cumulative explained variance exceeds each level.
    # NOTE(review): this is a 0-based index but is used below as a component *count*
    # (pcs[:, :k]), so for k >= 2 the kept PCs stop one short of the threshold - confirm intent.
    components_range = [np.argmax(cum_var > level) for level in exp_var_levels]
    exp_var_to_pcs.append(components_range)

    for k in components_range:
        # Always keep at least two components
        if k == 0:
            k += 2
            warnings.warn('First PC explaines > Var. explained')
        elif k == 1:
            k += 1
            warnings.warn('Second PC explaines > Var. explained')

        # Take latent variables
        L_both = pcs_sessions[s][:, :k]

        # Get correlations (C), distances (D) and the third return value (A), stored as 'on array' below
        C_both, D_both, A_both = compute_stat_and_phys_distances(L_both, td_sessions[s]['M1_unit_guide'][0], td_sessions[s]['PMd_unit_guide'][0], m1_emap, pmd_emap)

        #TODO: compute compute_stat_and_phys_distances() for surrogates

        # 'on array' now takes A_both; it previously received D_both (the distances),
        # which left A_both unused and duplicated the distance column.
        df_both = pd.DataFrame(data={'correlation': C_both, 'abs correlation': abs(C_both), 'distance': D_both, 'on array': A_both})
        df_both['category'] = df_both['distance'].apply(lambda dist: 'same electrode' if dist == 0 else ('same array' if dist < OTHER_ARRAY_D else ('other array')))
        df_both['within distance'] = pd.cut(df_both['distance'], bins=[-0.1, 0.001, 2.01, 4.01, OTHER_ARRAY_D], labels=['0', '(0, 2]','(2, 4]', '(4, inf)'])
        # Indicator for when concatenating data
        df_both['Type'] = 'Actual'

#         df_surr = pd.DataFrame(data={'correlation': C_surr, 'abs correlation': abs(C_surr), 'distance': D_surr, 'on array': D_surr})
#         df_surr['category'] = df_surr['distance'].apply(lambda d: 'same electrode' if d == 0 else ('same array' if d < OTHER_ARRAY_D else ('other array')))
#         df_surr['within distance'] = pd.cut(df_surr['distance'], bins=[-0.1, 0.001, 2.01, 4.01, OTHER_ARRAY_D], labels=['0', '(0, 2]','(2, 4]', '(4, inf)'])
#         # Indicator for when concatenating data
#         df_surr['Type'] = 'Surrogate'

        df_both_sessions.append(df_both)
#         df_surr_sessions.append(df_surr)

In [84]:
# Empty accumulators for the p-values (OA / SA / SE terms) and the coefficients;
# four independent list objects.
p_vals_OA, p_vals_SA, p_vals_SE, coeffs = [], [], [], []

In [92]:
# p-value of the first regressor of the most recently fitted model. `res` is whatever
# the last executed fit left behind, e.g. the final iteration of the fitting loop.
# `.iloc[0]` selects by position explicitly; a bare `[0]` on this label-indexed
# Series relies on the deprecated positional fallback.
res.pvalues.iloc[0]

7.779819932251951e-10

In [59]:
# variants = {
#   'Empirical study' : df_emp,
# #   'Control study: TME': df_surr, 
# }

# #############
# # WORKS; DON'T EDIT
# #############

# # Fixed within intervals
# INTERVALS = [(0, 2), (0, 4)]

# # Dummy functions
# int_dum = lambda df, lb, ub: df['distance'].apply(lambda x: 1 if lb < x <= ub else 0)
# on_arr = lambda df, arr: df['on array'].apply(lambda x: 1 if x == arr else 0)
# same_ele = lambda df: df['distance'].apply(lambda x: 1 if x == 0 else 0)
# same_arr = lambda df: df['distance'].apply(lambda x: 1 if 0 < x < OTHER_ARRAY_D else 0)

# # Analyses methods
# analyses = {
#   'single category' : {
#     'fixed cols' : {
#       'OA Constant' : lambda df: [1] * len(df),
#       'SE (d = 0)' : same_ele,
#       'SA (d exists)' : same_arr,
#     },
#     'variable cols' : {}
#   },
#   'multiple categories' : {
#     'fixed cols' : {
#       'M1 Constant' : lambda df: on_arr(df, 'M1'),
#       'OA Constant' : lambda df: on_arr(df, 'OA'),
#       'PMd Constant' : lambda df: on_arr(df, 'PMd'),
#       'SE (d = 0)' : same_ele
#     },
#     'variable cols' : {
#       'd in ' : lambda df, lb, ub: int_dum(df, lb, ub),
#     },
#   },
#   'separate' : {
#     'fixed cols' : {
#       'M1 Constant' : lambda df: on_arr(df, 'M1'),
#       'OA Constant' : lambda df: on_arr(df, 'OA'),
#       'PMd Constant' : lambda df: on_arr(df, 'PMd'),
#       'M1 SE (d = 0)' : lambda df: on_arr(df, 'M1') * same_ele(df),
#       'PMd SE (d = 0)' : lambda df: on_arr(df, 'PMd') * same_ele(df),
#     },
#     'variable cols' : {
#       'M1 d in ' : lambda df, lb, ub: int_dum(df, lb, ub) * on_arr(df, 'M1'),
#       'PMd d in ' : lambda df, lb, ub: int_dum(df, lb, ub) * on_arr(df, 'PMd'),
#     }
#   },
# }
# res_dict = {}
# for study, df in variants.items():
#     for analysis, cols_dict in analyses.items():
#         df_X = pd.DataFrame()
#         for col_name, col_func in cols_dict['fixed cols'].items():
#             df_X[col_name] = col_func(df)
#         for (lb, ub) in intervals:
#             for col_name, col_func in cols_dict['variable cols'].items():
#                 df_X[col_name + f'({lb:.1f}, {ub:.1f}]'] = col_func(df, lb, ub)
#         df_X = df_X.reindex(sorted(df_X.columns), axis=1)
#         res = sm.OLS(df.correlation, df_X, hasconst=True).fit()
#         title = study + f' ({analysis})'
#         print(res.summary(title=title, alpha=.05), end='\n\n\n\n')
#         res_dict[title] = res