# Bootstrap tools

> Bootstrap cells from seq cell dataframe

In [None]:
#| default_exp bootstrap_tools

In [322]:
#| hide
import nbdev; nbdev.nbdev_export()

# Import

In [1]:
# | export 

import pandas as pd
import numpy as np
from scipy.signal import correlate

# Data Test

In [26]:
data = [('ACCA', 1, 1, 0, -1), ('ACCA', 2, 1, 1,-2), 
        ('CAAC', 1, 2, 0,-1), ('ACCA', 3, 4, 1,-1), 
        ('CAAC', 2, 2, 1,-2), ('CCCA', 3, 3, 1, -1)]
df = pd.DataFrame(data, columns=['seq', 'cb_encode', 'counts', 'sort_population', 'log_units_mnase'])
df

Unnamed: 0,seq,cb_encode,counts,sort_population,log_units_mnase
0,ACCA,1,1,0,-1
1,ACCA,2,1,1,-2
2,CAAC,1,2,0,-1
3,ACCA,3,4,1,-1
4,CAAC,2,2,1,-2
5,CCCA,3,3,1,-1


In [27]:
metadata = pd.DataFrame([[1,-1.0, 0], [2,-2.0, 1], [3,-1, 1] ], columns=['cb_encode', 'log_units_mnase', 'sort_population'])
metadata

Unnamed: 0,cb_encode,log_units_mnase,sort_population
0,1,-1.0,0
1,2,-2.0,1
2,3,-1.0,1


# Functions

In [28]:
# | export 

def sample_cells(df, n_sample):
    selected_cells = np.random.choice(df.cb_encode.unique(), size=n_sample)
    subsampled_df = df.set_index('cb_encode').loc[selected_cells,:].reset_index()
    return subsampled_df


In [29]:
b = sample_cells(df, 2)
assert len(b['cb_encode'].unique()) <= 2

In [30]:
df.pivot(index='seq',columns='cb_encode', values='counts').fillna(0)

cb_encode,1,2,3
seq,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ACCA,1.0,1.0,4.0
CAAC,2.0,2.0,0.0
CCCA,0.0,0.0,3.0


In [40]:
# | export

def digestion_profile_matrix(df, aggregation='mean'):
    averages = df.groupby(['seq','log_units_mnase']).aggregate(aggregation).reset_index()
    averages = averages.sort_values('log_units_mnase').reset_index(drop=True)
    dig_matrix = averages.pivot(index='seq', columns='log_units_mnase', values='counts').fillna(0)
    return dig_matrix


In [41]:
dig = digestion_profile_matrix(filled_df)
assert (dig.values == np.array([[1.0,2.5],[2.0,1.0],[0.0,1.5]])).all()

In [42]:
seq_allc, cb_encode_allc = pd.core.reshape.util.cartesian_product([df['seq'].unique(), df['cb_encode'].unique()])
df_all_comb = pd.DataFrame({'cb_encode':list(cb_encode_allc), 'seq':list(seq_allc)})
partial = pd.merge(df,df_all_comb, on=['cb_encode', 'seq'], how='outer')
partial['counts'] = partial['counts'].fillna(0)
partial = partial.set_index('cb_encode').fillna(metadata.set_index('cb_encode').to_dict()).reset_index()

In [43]:
# | export

def fill_not_found_sequences(df, metadata):
    seq_allc, cb_encode_allc = pd.core.reshape.util.cartesian_product([df['seq'].unique(), df['cb_encode'].unique()])
    df_all_comb = pd.DataFrame({'cb_encode':list(cb_encode_allc), 'seq':list(seq_allc)})
    partial = pd.merge(df,df_all_comb, on=['cb_encode', 'seq'], how='outer')
    partial['counts'] = partial['counts'].fillna(0)
    partial = partial.set_index('cb_encode').fillna(metadata.set_index('cb_encode').to_dict()).reset_index()
    return partial

In [44]:
filled_df = fill_not_found_sequences(df, metadata)
filled_df.pivot(index='seq', columns='cb_encode', values='counts')

cb_encode,1,2,3
seq,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ACCA,1.0,1.0,4.0
CAAC,2.0,2.0,0.0
CCCA,0.0,0.0,3.0


In [45]:
digestion_profile_matrix(filled_df)

log_units_mnase,-2.0,-1.0
seq,Unnamed: 1_level_1,Unnamed: 2_level_1
ACCA,1.0,2.5
CAAC,2.0,1.0
CCCA,0.0,1.5


In [46]:
# | export 

def bootstrap_population(seqcell, pop, samples_size, metadata):
    all_g0 = seqcell.loc[seqcell['sort_population']==pop]
    g0_boot = sample_cells(all_g0, samples_size)
    filled_df = fill_not_found_sequences(g0_boot, metadata)
    g0_digprof = digestion_profile_matrix(g0_boot)
    return g0_digprof

In [47]:
bootstrap_population(df, 1, 3, metadata)

log_units_mnase,-2,-1
seq,Unnamed: 1_level_1,Unnamed: 2_level_1
ACCA,1.0,4.0
CAAC,2.0,0.0
CCCA,0.0,3.0


In [48]:
# | export

def correlate_sequence(row, other, seq_name):
    try:
        other_row = other.loc[seq_name, :]
        cc = correlate(row, other_row, "full")
        return cc
    except KeyError:
        return np.array([np.nan] * other.shape[1]*2-1)

In [49]:
dig

log_units_mnase,-2.0,-1.0
seq,Unnamed: 1_level_1,Unnamed: 2_level_1
ACCA,1.0,2.5
CAAC,2.0,1.0
CCCA,0.0,1.5


In [50]:
row = dig.loc['ACCA']
other = dig
c = correlate_sequence(row, other, 'CAAC')
c
assert c.max() == 5.
assert c.argmax() == 2

In [51]:
# | export

def cross_correlation_on_all_sequences(g0_profile, inter_profile):
    corr_df_shape = (g0_profile.shape[0], g0_profile.shape[1]*2-1)
    range_values = (-1*(g0_profile.shape[1]-1), g0_profile.shape[1] )
    corr_df = pd.DataFrame(np.zeros(corr_df_shape), index=g0_profile.index, columns=list(range(*range_values)))

    for nm,row in g0_profile.iterrows():
        corr = correlate_sequence(row, inter_profile, nm)
        corr_df.loc[nm, :] = corr
    corr_df = corr_df.dropna()
    return corr_df


In [52]:
corr_df

Unnamed: 0_level_0,-1,0,1
seq,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ACCA,2.5,7.25,2.5
CAAC,2.0,5.0,2.0
CCCA,0.0,2.25,0.0


In [53]:
corr_df = cross_correlation_on_all_sequences(dig, dig)
assert (corr_df.loc[:, -1] == corr_df.loc[:, 1]).all()

In [54]:
corr_df

Unnamed: 0_level_0,-1,0,1
seq,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ACCA,2.5,7.25,2.5
CAAC,2.0,5.0,2.0
CCCA,0.0,2.25,0.0


In [55]:
# | export 

def gmean_score(pop1_digmatrix, pop2_digmatrix, threshold):
    gm_score = (pop1_digmatrix.aggregate('mean', axis=1) * pop2_digmatrix.aggregate('mean', axis=1)).dropna()
    i = gm_score.index[gm_score>threshold]
    return gm_score, i
    

In [57]:
pop1_digmatrix = pd.DataFrame([['AAA', 10, 1, 1],
                              ['BBB', 1, 0, 0], 
                              ['CCC', 1,1,10]], columns=['seq', -1, 0, 1]).set_index('seq')
pop2_digmatrix = pd.DataFrame([['AAA', 5, 1, 0],
                              ['BBB', 1, 1, 1], 
                              ['DDD', 1,1,10]], columns=['seq', -1, 0, 1]).set_index('seq')
gm_score, idx = gmean_score(pop1_digmatrix, pop2_digmatrix, threshold=1)
idx

Index(['AAA'], dtype='object', name='seq')

## Main bootstrap function

In [58]:
# | export 

def single_iteration_bootstrap_cc(seqcell, populations, samples_bootstrap, metadata, threshold=1):
    g0_digprof = bootstrap_population(seqcell, populations[0], samples_bootstrap, metadata)
    inter_digprof = bootstrap_population(seqcell, populations[1], samples_bootstrap, metadata)
    # filter low mean counts
    gmean, idx = gmean_score(g0_digprof, inter_digprof, threshold=threshold)
    g0_digprof = g0_digprof.loc[idx, :]
    inter_digprof = inter_digprof.loc[idx, :]
    
    cc = cross_correlation_on_all_sequences(g0_digprof, inter_digprof)
    return cc


In [59]:
df

Unnamed: 0,seq,cb_encode,counts,sort_population,log_units_mnase
0,ACCA,1,1,0,-1
1,ACCA,2,1,1,-2
2,CAAC,1,2,0,-1
3,ACCA,3,4,1,-1
4,CAAC,2,2,1,-2
5,CCCA,3,3,1,-1


In [60]:
b = single_iteration_bootstrap_cc(df, [1,1],100, metadata, threshold=0)
assert (b.loc[:, -1] == b.loc[:, 1]).all()

In [61]:
# | export 

def iter_bootstrap_cc(seqcell, populations= [0,1], samples_bootstrap=200, n_iter=100, apply_filter=True):    
    cross_correlations = []

    for _ in range(n_iter):
        cc = single_iteration_bootstrap_cc(seqcell, populations, samples_bootstrap)
        cross_correlations.append(cc)
        
    return cross_correlations


In [62]:
# | export 

def average_results_bootstrap(cross_correlations):
    cat_correlations = pd.concat(cross_correlations, axis=0, join='inner')
    mean_cc = cat_correlations.groupby(cat_correlations.index).mean()
    se_cc = cat_correlations.groupby(cat_correlations.index).sem()
    max_cc = mean_cc.idxmax(axis = 1)
    return mean_cc, se_cc, max_cc

In [68]:
corr_df2 = corr_df.copy()
corr_df2.loc[:,1] = [7.5,2.,4.]
cross_correlations = [corr_df, corr_df2]
mean_cc, se_cc, max_cc = average_results_bootstrap(cross_correlations)
assert (mean_cc.loc[1, :] == np.array([2.5,7.25,5])).all()
assert (max_cc == 0).all()

KeyError: 1

In [66]:
cross_correlations

[       -1     0    1
 seq                 
 ACCA  2.5  7.25  2.5
 CAAC  2.0  5.00  2.0
 CCCA  0.0  2.25  0.0,
        -1     0    1
 seq                 
 ACCA  2.5  7.25  7.5
 CAAC  2.0  5.00  2.0
 CCCA  0.0  2.25  4.0]

In [67]:
mean_cc

Unnamed: 0_level_0,-1,0,1
seq,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ACCA,2.5,7.25,5.0
CAAC,2.0,5.0,2.0
CCCA,0.0,2.25,2.0
