In [1]:
import numpy as np
from scipy import linalg as LA
import itertools
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import os

def pca_on_np_array(nparray):
    '''
    PCA via eigen-decomposition of the covariance matrix.

    rows are members (mice), columns are variables (syllables)

    The input is NOT modified: centering is done on a float copy, so the
    caller's array keeps its original values (and integer input is accepted).

    Returns
    -------
    nparray_represented_as_pcs : the centered data projected onto the
        eigenvectors (rows = members, columns = PCs)
    evecs : eigenvectors as columns, ordered by descending eigenvalue
    evals : eigenvalues, descending (variance explained by each PC)
    '''
    # np.array(..., dtype=float) always copies, so the in-place subtraction
    # below only touches the local copy and never the caller's data
    centered = np.array(nparray, dtype=float)
    centered -= np.mean(centered, axis=0)

    # covariance between columns (variables); np.cov squeezes a single-variable
    # result down to 0-d, so force it back to 2-D before the decomposition
    cov = np.atleast_2d(np.cov(centered, rowvar=False))

    # eigh: eigen-decomposition for symmetric matrices, eigenvalues come back ascending
    evals, evecs = LA.eigh(cov)

    # SORT by eigenvalue, largest first (aka by how much variance each eigenvector explains)
    order = np.argsort(evals)[::-1]
    evecs = evecs[:, order]
    evals = evals[order]

    # this is your data represented as PCs
    nparray_represented_as_pcs = np.dot(centered, evecs)

    return nparray_represented_as_pcs, evecs, evals

def var_explained(evals):
    '''
    Fraction of the total carried by each eigenvalue: evals / sum(evals).

    evals is expected to support element-wise division (e.g. a numpy array).
    '''
    total = sum(evals)
    fractions = evals / total
    return fractions

def pc_dist(nparray1,nparray2):
    '''
    calculate the distance between two groups and within each group,
    with every distance measured between rows of the PC-score array.

    two groups should have the same number of variables columns

    PCA is done once on the COMBINED data (group 1 rows stacked on top of
    group 2 rows); the scores are then split back into the two groups.

    Returns (betweendist, dist1, dist2), three lists of Euclidean distances:
      betweendist : every (group 1 member, group 2 member) pair
      dist1       : every pair of members within group 1
      dist2       : every pair of members within group 2
    '''
    n_group1 = len(nparray1)

    # scores: data represented as pc scores rather than variable (syllable) values;
    # rows=mice, columns=PCs. The eigenvectors / eigenvalues are not needed here.
    scores, _evecs, _evals = pca_on_np_array(np.concatenate((nparray1, nparray2), axis=0))
    group1_scores = scores[:n_group1]
    group2_scores = scores[n_group1:]

    def _euclid(p, q):
        # square root of the sum of squared per-PC differences
        return np.linalg.norm(np.subtract(p, q))

    # within array 1 dist: pairs of mice (e.g. 1vs2, 1vs3, 1vs4, etc)
    dist1 = [_euclid(p, q) for p, q in itertools.combinations(group1_scores, 2)]

    # within array 2 dist
    dist2 = [_euclid(p, q) for p, q in itertools.combinations(group2_scores, 2)]

    # between array 1 and 2 dist: use the PC scores here as well, so all three
    # lists are measured in the same space. With every PC retained the projection
    # is a rotation of the centered data, so these values agree with distances
    # between the raw rows up to rounding.
    betweendist = [_euclid(p, q) for p, q in itertools.product(group1_scores, group2_scores)]

    return betweendist, dist1, dist2

In [2]:
# processing my data
n_mice = 9  # other values used previously: 12, 9
n_syllables = 14
print(n_mice)

# STatiStiCal SIGniFicAncE
# How to read the cutoff: all syllables contribute to the distance between groups mathematically,
# but only a (1 - cutoff) share of unintended (randomly walking) syllables should fall above the line.
# This is not the FDR! In all examples we draw from a random uniform distribution, however
# biological variables are often drawn from normal or exponential distributions.
cutoff = .95
compounds = int(np.ceil(np.log2(1/(1-cutoff))))

# one term per compounding step: the uniform share 1/n_syllables divided by 2, 4, 6, ...
comp = [(1.0/n_syllables)/(2.0*(i+1.0)) for i in range(compounds)]

# cutoff is rebound here: from now on it is the uniform share plus the summed terms above
cutoff = (1.0/n_syllables) + sum(comp)

9


In [3]:
# --- which recording to analyse (other values that have been used are listed on the right) ---
curr_dataset = 'Dataset_20190723'  # Dataset_20190723 / Dataset_20191007
which_day    = 'N1'                # N1
sesh_len     = '10min'             # 10min / 10min_noB
event_type   = 'poke_syl10'        # poke / poke_syl10
group1_name  = 'stim_noB'          # stim
group2_name  = 'cont'              # cont

file_path = '/Users/cakiti/Dropbox (Uchida Lab)/Korleki Akiti/Behavior/Standard setup/CombineAnalysis/'

# the two group files share everything in their name except the group label
syl_expr_prefix = curr_dataset+'/'+curr_dataset+'_sylExpr_'+which_day+'_'+event_type+'_'
group1_file_name = syl_expr_prefix+group1_name+'.csv'
group2_file_name = syl_expr_prefix+group2_name+'.csv'
# alternative naming scheme (group + session length instead of event type + group):
#group1_file_name = curr_dataset+'/'+curr_dataset+'_sylExpr_'+which_day+'_'+group1_name+'_'+sesh_len+'.csv'
#group2_file_name = curr_dataset+'/'+curr_dataset+'_sylExpr_'+which_day+'_'+group2_name+'_'+sesh_len+'.csv'

# header-less CSVs, converted straight to float64 arrays
csv_group1 = np.float64(pd.read_csv(os.path.join(file_path,group1_file_name), header = None))
csv_group2 = np.float64(pd.read_csv(os.path.join(file_path,group2_file_name), header = None))

print(csv_group1.shape)
print(csv_group2.shape)

# outputs of later cells are written into the dataset's own folder
save_where = os.path.join(file_path,curr_dataset)
print(save_where)

(8, 14)
(9, 14)
/Users/cakiti/Dropbox (Uchida Lab)/Korleki Akiti/Behavior/Standard setup/CombineAnalysis/Dataset_20190723


In [6]:
# extract PC distances array (first step in pc_dist): PCA on both groups stacked, group 1 rows first
nparray = np.concatenate((csv_group1,csv_group2),axis=0)
a,b,c = pca_on_np_array(nparray)
# a = data represented as pc scores rather than variable (syllable) values; rows=mice, columns=PCs
# b = eigenvectors
# c = eigenvalue corresponding to each eigenvector (use this to calculate variance explained by each pc)
display(a.shape)

# common stem for the files written from this analysis
out_stem = save_where+'/'+curr_dataset+'_'+which_day+'_'+sesh_len

# optional: also save the PC scores themselves
#np.savetxt(out_stem+'_PCSarray.txt', a, delimiter=',', fmt="%.2f")

# variance explained by each PC, one value per line
var_explained_file = out_stem+'_varExplained.txt'
display(var_explained_file)
np.savetxt(var_explained_file,var_explained(c),delimiter=',')

(17, 14)

'/Users/cakiti/Dropbox (Uchida Lab)/Korleki Akiti/Behavior/Standard setup/CombineAnalysis/Dataset_20190723/Dataset_20190723_N1_10min_varExplained.txt'

In [None]:
# PCA reconstruction = PC_scores * eigenvectors^T + mean
#     normalized reconstruction error = (|| X - X^recon ||^2) / ||X||^2 where ||X|| = Frobenius norm of X
# array_mean is the column mean of nparray as it currently stands in the kernel; adding it
# back puts the reconstruction on the same footing as nparray itself.
array_mean = np.mean(nparray, axis = 0)
#R=3

# the denominator does not depend on R, so compute it once outside the loop
nparray_sq_norm = np.linalg.norm(nparray,'fro') ** 2

recon_err_norm = []
# R = number of leading PCs kept. It runs 1..len(nparray) so that entry i of the saved
# list is the error for R=i, matching the "R=1-<len(nparray)>" label in the file name
# (starting at 0 would shift every entry by one relative to that label).
# When R exceeds the number of PCs the slices simply take all of them.
for R in range(1, len(nparray)+1):
    reconstr = np.dot(a[:,0:R],np.transpose(b[:,0:R])) + array_mean
    recon_err = np.linalg.norm((nparray-reconstr),'fro') ** 2
    norm = recon_err / nparray_sq_norm
    recon_err_norm.append(norm)


#np.savetxt(save_where+'/test_reconstruction_first3PCs.txt',reconstr, delimiter=',')
#np.savetxt(save_where+'/test_inputArray.txt',nparray, delimiter=',')
np.savetxt(save_where+'/'+curr_dataset+'_'+which_day+'_'+sesh_len+'_recon_err_norm_R=1-'+str(len(nparray))+'.txt',recon_err_norm,delimiter=',')