## Parametric testing - least squares regression

- Run all analyses (including preprocessing)
- Save each step in a dictionary: `main_dict` for the real data, `surr_dict` for the surrogate data

In [1]:
''' Dependencies '''
from sklearn.decomposition import PCA
from tqdm.auto import tqdm
import warnings
import pandas as pd
import numpy as np
import scipy
import sys
import statsmodels.api as sm
from tqdm import tqdm as pbar
from scipy.io import savemat, loadmat

# my scripts
from pyaldata import * 
import preprocess

import TME
import utils
import importlib
importlib.reload(utils)
importlib.reload(TME)
importlib.reload(preprocess)
from utils import *

  import pandas.util.testing as tm


## 0 Load data and parameterize

In [2]:
''' Parameters '''
# Distance value to indicate other array (pairs of units that are not on the same array).
OTHER_ARRAY_D = 50

# Population to analyse. Only 'pooled' is handled by the cells visible here;
# the alternatives 'm1' and 'pmd' are still TODO.
TYPE_ANALYSIS = 'pooled'

# Recording sessions to process. The last 10 characters of each name (the date)
# are used as the session key in main_dict.
SESSIONS = [
    'Chewie_CO_VR_2016-09-09',
    'Chewie_CO_VR_2016-09-12',
    'Chewie_CO_VR_2016-09-14',
    'Chewie_CO_VR_2016-10-06',
    'Chewie_CO_FF_2016-09-15',
    'Chewie_CO_FF_2016-09-21',
    'Chewie_CO_FF_2016-10-05',
    'Chewie_CO_FF_2016-10-07',
    'Mihili_CO_VR_2014-03-04',
    'Mihili_CO_VR_2014-03-06',
    'Mihili_CO_FF_2014-02-03',
    'Mihili_CO_FF_2014-02-17',
    'Mihili_CO_FF_2014-02-18',
    'Mihili_CO_FF_2014-03-07',
    'Mihili_CO_VR_2014-03-03',
]

# Variance-explained thresholds: 0.2, 0.225, ... in steps of 0.025, stop value 0.95.
# Rounded to 3 decimals so the values give clean dictionary keys when formatted.
THRESHOLDS_VARE = np.round(np.arange(0.2, 0.95, 0.025), 3)

# Distance intervals (lower, upper] used for the "d in" regression dummies; fixed within intervals.
INTERVALS = [(0, 2), (0, 4)]

# Manifold dimensionalities (number of leading PCs) to evaluate.
dim_range = [5, 10, 15, 20]

In [3]:
''' Experimental sessions and electrode maps '''
# Each map is built in two steps: read the array's .cmp file, then localize electrodes 1..96 on it.
# NOTE(review): both .cmp files are Chewie's, while SESSIONS also lists Mihili recordings —
# confirm that the same electrode layout applies to both animals.
m1_cmp = read_cmp(file_path='/Users/Daphne/Data/Chewie Left M1 SN 6250-001474.cmp')
m1_emap = localize_elecs(m1_cmp, elecs=range(1, 97))

pmd_cmp = read_cmp(file_path='/Users/Daphne/Data/Chewie Left PMd SN 6251-001469.cmp')
pmd_emap = localize_elecs(pmd_cmp, elecs=range(1, 97))

## 1 Analyse the real data 

In [5]:
''' Perform preprocessing and store '''

# Dictionary for empirical data: one entry per session, keyed by the
# last 10 characters of the session name (the recording date).
main_dict = {}

for session in pbar(SESSIONS):
    session_df = preprocess.preprocess_data(session, '/Users/Daphne/Data/')
    main_dict[session[-10:]] = {'df': session_df}

100%|██████████| 15/15 [01:30<00:00,  6.02s/it]


In [6]:
''' Trial-concatenate and perform dim reduction all sessions '''

# Only the pooled analysis ('both_rates') is implemented. Fail loudly up front instead of
# hitting a NameError on X / N (or silently reusing stale values left in the kernel).
if TYPE_ANALYSIS != 'pooled':
    raise NotImplementedError(
        f"TYPE_ANALYSIS={TYPE_ANALYSIS!r} is not supported yet; only 'pooled' is implemented."
    )

for s in pbar(main_dict.keys()):

    # Select trial data for session by key
    td = main_dict[s]['df']

    # Stack the per-trial arrays along the first axis
    X = np.concatenate(td['both_rates'].values, axis=0)

    # Full-rank PCA: one component per column of X. Taken from X itself rather than from
    # td.both_rates[0], which is a label lookup and breaks if the index does not contain 0.
    N = X.shape[1]

    model = PCA(n_components=N)
    model.fit(X)
    main_dict[s]['model'] = model
    main_dict[s]['pcs'] = model.components_.T  # columns are the principal axes

100%|██████████| 15/15 [00:04<00:00,  3.10it/s]


In [7]:
# ''' Prepare dictionaries to store each step for level of var exp '''

# for s in main_dict.keys():
#     main_dict[s]['vare'] = {}
#     for th in THRESHOLDS_VARE:
#         main_dict[s]['vare'][f'{th}'] = { 'distances_df': {} }

In [8]:
''' Prepare dictionaries to store each step for each manifold dimensionality '''

for s in main_dict.keys():
    # One sub-dictionary per entry of dim_range, keyed by the dimensionality as a string.
    # 'distances_df' starts as an empty placeholder and is filled in by a later cell.
    main_dict[s]['dim'] = {f'{d}': {'distances_df': {}} for d in dim_range}

In [10]:
''' Compute correlations and physical distances '''

for s in pbar(main_dict.keys()):

    # Get trial data
    td = main_dict[s]['df']
    # Variance-threshold alternative (disabled, kept for reference):
    #components_range = [np.argmax(main_dict[s]['model'].explained_variance_ratio_.cumsum() > THRESHOLDS_VARE[i]) for i in range(len(THRESHOLDS_VARE))]

    for d in dim_range:

        # Can't compute a correlation between two single values, so use at least 2 PCs.
        # (Same lower bound as the surrogate analysis; the result is still stored under key d.)
        r = max(d, 2)

        L = main_dict[s]['pcs'][:, :r] # Get the first r PCs

        # Get correlations and physical distances
        C, PD, A, _  = compute_stat_and_phys_distances(L, td['M1_unit_guide'][0], td['PMd_unit_guide'][0], m1_emap, pmd_emap)

        # Convert to dataframe
        df = pd.DataFrame(data={'correlation': C, 'distance': PD, 'on array': A})
        # distance == 0 -> same electrode; below the OTHER_ARRAY_D sentinel -> same array; otherwise other array
        df['category'] = df['distance'].apply(lambda d: 'same electrode' if d == 0 else ('same array' if d < OTHER_ARRAY_D else ('other array')))
        # Bin edges sit slightly past 0 / 2 / 4 so that those exact distances fall in the lower bin
        df['within distance'] = pd.cut(df['distance'], bins=[-0.1, 0.001, 2.01, 4.01, OTHER_ARRAY_D], labels=['0', '(0, 2]','(2, 4]', '(4, inf)'])
        df['Type'] = 'Actual'

        # Store dataframe in main dictionary
        # Variance-threshold alternative (disabled):
        #main_dict[s]['vare'][f'{THRESHOLDS_VARE[i]}']['distances_df'] = df

        # Based on manifold dim
        main_dict[s]['dim'][f'{d}']['distances_df'] = df

100%|██████████| 15/15 [01:04<00:00,  4.28s/it]


In [11]:
# Quick check of the dimensionality keys. `s` is not set in this cell: it is whatever
# session key the most recently executed loop over main_dict ended on.
main_dict[s]['dim'].keys()

dict_keys(['5', '10', '15', '20'])

In [12]:
# Dummy-variable builders. They depend on neither the session nor the dimensionality,
# so they are defined once instead of being rebuilt on every loop iteration.
def int_dum(df, lb, ub):
    """Return a 0/1 Series: 1 where lb < distance <= ub."""
    return (df['distance'].gt(lb) & df['distance'].le(ub)).astype(int)


def on_arr(df, arr):
    """Return a 0/1 Series: 1 where the 'on array' label equals `arr`."""
    return df['on array'].eq(arr).astype(int)


def same_ele(df):
    """Return a 0/1 Series: 1 where distance == 0 (same electrode)."""
    return df['distance'].eq(0).astype(int)


def same_arr(df):
    """Return a 0/1 Series: 1 where 0 < distance < OTHER_ARRAY_D (same array, other electrode)."""
    return (df['distance'].gt(0) & df['distance'].lt(OTHER_ARRAY_D)).astype(int)


# Analyses methods. 'fixed cols' are built once per design matrix;
# 'variable cols' are built once per (lb, ub) interval in INTERVALS.
analyses = {
    'single_cat': {
        'fixed cols': {
            'OA Constant': lambda df: [1] * len(df),  # plain intercept (all ones)
            'SE (d = 0)': same_ele,
            'SA (d exists)': same_arr,
        },
        'variable cols': {},
    },
    'within': {
        'fixed cols': {
            'M1 Constant': lambda df: on_arr(df, 'M1'),
            'OA Constant': lambda df: on_arr(df, 'OA'),
            'PMd Constant': lambda df: on_arr(df, 'PMd'),
            'SE (d = 0)': same_ele,
        },
        'variable cols': {
            'd in ': int_dum,
        },
    },
    'within_separate': {
        'fixed cols': {
            'M1 Constant': lambda df: on_arr(df, 'M1'),
            'OA Constant': lambda df: on_arr(df, 'OA'),
            'PMd Constant': lambda df: on_arr(df, 'PMd'),
            'M1 SE (d = 0)': lambda df: on_arr(df, 'M1') * same_ele(df),
            'PMd SE (d = 0)': lambda df: on_arr(df, 'PMd') * same_ele(df),
        },
        'variable cols': {
            'M1 d in ': lambda df, lb, ub: int_dum(df, lb, ub) * on_arr(df, 'M1'),
            'PMd d in ': lambda df, lb, ub: int_dum(df, lb, ub) * on_arr(df, 'PMd'),
        },
    },
}

for s in pbar(main_dict.keys()):

    for v in main_dict[s]['dim'].keys():

        # Data sets to regress on (only the empirical data in this section)
        variants = {
            'Actual': main_dict[s]['dim'][v]['distances_df'],
        }

        for study, df in variants.items():

            for analysis, cols_dict in analyses.items():
                # Assemble the design matrix column by column
                df_X = pd.DataFrame()
                for col_name, col_func in cols_dict['fixed cols'].items():
                    df_X[col_name] = col_func(df)
                for (lb, ub) in INTERVALS:
                    for col_name, col_func in cols_dict['variable cols'].items():
                        df_X[col_name + f'({lb:.1f}, {ub:.1f}]'] = col_func(df, lb, ub)

                # Alphabetical column order keeps coefficient order stable across fits
                df_X = df_X.reindex(sorted(df_X.columns), axis=1)

                # hasconst=True: the design already carries its own constant term (an explicit
                # ones column, or array indicators that presumably sum to one per row).
                res = sm.OLS(df.correlation, df_X, hasconst=True).fit()
                title = f'{study}_{analysis}'

                # Results live next to 'distances_df' under '<study>_<analysis>'
                if study == 'Actual': main_dict[s]['dim'][v][title] = res

100%|██████████| 15/15 [00:10<00:00,  1.46it/s]


In [13]:
# Persist all results. np.save stores a dict as a pickled 0-d object array, so reading it back
# needs: np.load('main_dict_dims.npy', allow_pickle=True).item()
np.save('main_dict_dims.npy', main_dict) 

## 2 Analyse surrogate data

In [5]:
''' Load unit guides '''
# Unit guides are read from .npy files in the working directory.
# NOTE(review): nothing in the visible cells writes these two files — confirm where they come from.
M1_UG = np.load('M1_UG.npy')
PMD_UG = np.load('PMD_UG.npy')

# Get surrogates (stored under key 'S' of the .mat file) and
# select a subset: indices 20..49 along the third axis.
surr_mat = loadmat('/Users/Daphne/Data/SurrTensor.mat')
S = surr_mat['S'][:, :, 20:50]

In [13]:
# Shape of the selected surrogate subset; the third axis indexes the surrogates kept by the 20:50 slice.
S.shape

(5000, 218, 30)

In [14]:
''' Initialize dictionary '''

# One empty entry per slice along the third axis of S. Keys are the slice index shifted by 20
# (matching the 20:50 subset taken from the surrogate tensor), formatted as strings.
surr_dict = {str(idx + 20): {} for idx in range(S.shape[2])}

In [16]:
# Full-rank PCA for every surrogate: as many components as S has columns.
N = S.shape[1]

for surr, surr_entry in pbar(surr_dict.items()):

    # Keys carry a +20 offset relative to the third axis of S, so undo it here
    X_surr = S[:, :, int(surr) - 20]

    model_surr = PCA(n_components=N).fit(X_surr)
    surr_entry['model'] = model_surr
    surr_entry['pcs'] = model_surr.components_.T  # columns are the principal axes

100%|██████████| 30/30 [00:05<00:00,  5.01it/s]


In [17]:
''' Prepare dictionaries to store each step for level of var exp '''

for s in surr_dict:
    # One sub-dictionary per variance-explained threshold, keyed by the threshold formatted as a
    # string; 'distances_df' starts as an empty placeholder and is filled in by a later cell.
    surr_dict[s]['vare'] = {f'{th}': {'distances_df': {}} for th in THRESHOLDS_VARE}

In [18]:
for s in pbar(surr_dict.keys()):

    # Cumulative variance explained by the leading PCs, computed once per surrogate
    cum_var = surr_dict[s]['model'].explained_variance_ratio_.cumsum()

    # Number of leading PCs needed for the cumulative variance to exceed each threshold.
    # searchsorted(side='right') counts the entries <= threshold, i.e. the 0-based index of the
    # first PC that crosses it; +1 turns that index into a component count. (Using the index
    # itself as the count drops the PC that actually crosses the threshold.)
    # NOTE(review): this yields one more PC per threshold than earlier runs of this cell did,
    # so previously saved surrogate results are not directly comparable.
    # Clipped to [2, number of PCs]: can't compute a correlation between two single values,
    # and a threshold that is never crossed falls back to using all PCs.
    components_range = np.clip(
        np.searchsorted(cum_var, THRESHOLDS_VARE, side='right') + 1, 2, cum_var.size
    )

    for th, r in zip(THRESHOLDS_VARE, components_range):

        L_surr = surr_dict[s]['pcs'][:, :r] # Get the first r PCs

        # Get correlations and physical distances
        C, PD, A, _ = compute_stat_and_phys_distances(L_surr, M1_UG, PMD_UG, m1_emap, pmd_emap)

        # Convert to dataframe
        df = pd.DataFrame(data={'correlation': C, 'distance': PD, 'on array': A})
        # distance == 0 -> same electrode; below the OTHER_ARRAY_D sentinel -> same array; otherwise other array
        df['category'] = df['distance'].apply(lambda d: 'same electrode' if d == 0 else ('same array' if d < OTHER_ARRAY_D else ('other array')))
        # Bin edges sit slightly past 0 / 2 / 4 so that those exact distances fall in the lower bin
        df['within distance'] = pd.cut(df['distance'], bins=[-0.1, 0.001, 2.01, 4.01, OTHER_ARRAY_D], labels=['0', '(0, 2]','(2, 4]', '(4, inf)'])
        df['Type'] = 'Surrogate'

        # Store dataframe in surrogate dictionary
        surr_dict[s]['vare'][f'{th}']['distances_df'] = df

100%|██████████| 30/30 [16:16<00:00, 32.54s/it]


In [19]:
# Dummy-variable builders. They depend on neither the surrogate nor the threshold,
# so they are defined once instead of being rebuilt on every loop iteration.
def int_dum(df, lb, ub):
    """Return a 0/1 Series: 1 where lb < distance <= ub."""
    return (df['distance'].gt(lb) & df['distance'].le(ub)).astype(int)


def on_arr(df, arr):
    """Return a 0/1 Series: 1 where the 'on array' label equals `arr`."""
    return df['on array'].eq(arr).astype(int)


def same_ele(df):
    """Return a 0/1 Series: 1 where distance == 0 (same electrode)."""
    return df['distance'].eq(0).astype(int)


def same_arr(df):
    """Return a 0/1 Series: 1 where 0 < distance < OTHER_ARRAY_D (same array, other electrode)."""
    return (df['distance'].gt(0) & df['distance'].lt(OTHER_ARRAY_D)).astype(int)


# Analyses methods. 'fixed cols' are built once per design matrix;
# 'variable cols' are built once per (lb, ub) interval in INTERVALS.
analyses = {
    'single_cat': {
        'fixed cols': {
            'OA Constant': lambda df: [1] * len(df),  # plain intercept (all ones)
            'SE (d = 0)': same_ele,
            'SA (d exists)': same_arr,
        },
        'variable cols': {},
    },
    'within': {
        'fixed cols': {
            'M1 Constant': lambda df: on_arr(df, 'M1'),
            'OA Constant': lambda df: on_arr(df, 'OA'),
            'PMd Constant': lambda df: on_arr(df, 'PMd'),
            'SE (d = 0)': same_ele,
        },
        'variable cols': {
            'd in ': int_dum,
        },
    },
    'within_separate': {
        'fixed cols': {
            'M1 Constant': lambda df: on_arr(df, 'M1'),
            'OA Constant': lambda df: on_arr(df, 'OA'),
            'PMd Constant': lambda df: on_arr(df, 'PMd'),
            'M1 SE (d = 0)': lambda df: on_arr(df, 'M1') * same_ele(df),
            'PMd SE (d = 0)': lambda df: on_arr(df, 'PMd') * same_ele(df),
        },
        'variable cols': {
            'M1 d in ': lambda df, lb, ub: int_dum(df, lb, ub) * on_arr(df, 'M1'),
            'PMd d in ': lambda df, lb, ub: int_dum(df, lb, ub) * on_arr(df, 'PMd'),
        },
    },
}

for s in pbar(surr_dict.keys()):

    for v in surr_dict[s]['vare'].keys():

        # Data sets to regress on (only the surrogate data in this section)
        variants = {
            'Surr': surr_dict[s]['vare'][v]['distances_df'],
        }

        for study, df in variants.items():

            for analysis, cols_dict in analyses.items():
                # Assemble the design matrix column by column
                df_X = pd.DataFrame()
                for col_name, col_func in cols_dict['fixed cols'].items():
                    df_X[col_name] = col_func(df)
                for (lb, ub) in INTERVALS:
                    for col_name, col_func in cols_dict['variable cols'].items():
                        df_X[col_name + f'({lb:.1f}, {ub:.1f}]'] = col_func(df, lb, ub)

                # Alphabetical column order keeps coefficient order stable across fits
                df_X = df_X.reindex(sorted(df_X.columns), axis=1)

                # hasconst=True: the design already carries its own constant term (an explicit
                # ones column, or array indicators that presumably sum to one per row).
                res = sm.OLS(df.correlation, df_X, hasconst=True).fit()
                title = f'{study}_{analysis}'

                # Results live next to 'distances_df' under '<study>_<analysis>'
                surr_dict[s]['vare'][v][title] = res

100%|██████████| 30/30 [03:05<00:00,  6.18s/it]


In [20]:
# Persist the surrogate results. np.save stores a dict as a pickled 0-d object array, so reading
# it back needs: np.load('surr_dict_30.npy', allow_pickle=True).item()
np.save('surr_dict_30.npy', surr_dict) 

In [None]:
#main_dict[s]['vare']['0.2']['Actual_single_cat'].params.values

In [None]:
# coeffs = []

# for i, v in enumerate(main_dict[s]['vare'].keys()):
    
#     coeffs.append(main_dict[s]['vare'][f'{v}']['Actual_single_cat'].params.values)

In [None]:
#main_dict.keys()

In [34]:
''' Select real data to generate surrogates '''
# # X = np.concatenate(main_dict['2016-09-09']['df']['both_rates'].values, axis=0) 
# # X = X[:5000, :]

# # savemat('X_concat_full.mat', {'X_CF':X}) 