# Bootstrap tools

> Bootstrap cells from a seq-cell dataframe

In [None]:
#| default_exp bootstrap_tools

In [107]:
#| hide
import nbdev; nbdev.nbdev_export()

# Import

In [45]:
# | export 

import pandas as pd
import numpy as np
from scipy.signal import correlate

# Test data

In [46]:
# Toy dataset: one tuple per row, in the column order passed to pd.DataFrame below.
data = [('ACCA', 1, 1, 0, -1), ('ACCA', 2, 1, 1,-1), 
        ('CAAC', 1, 2, 0,-2), ('ACCA', 3, 4, 1,-1), 
        ('CAAC', 2, 2, 1,-2), ('CCCA', 3, 3, 1, -2)]
df = pd.DataFrame(data, columns=['seq', 'cb_encode', 'counts', 'sort_population', 'log_units_mnase'])
df

Unnamed: 0,seq,cb_encode,counts,sort_population,log_units_mnase
0,ACCA,1,1,0,-1
1,ACCA,2,1,1,-1
2,CAAC,1,2,0,-2
3,ACCA,3,4,1,-1
4,CAAC,2,2,1,-2
5,CCCA,3,3,1,-2


# Functions

In [47]:
# | export 

def sample_cells(df, n_sample):
    """Resample the rows of `df` by drawing cell identifiers at random.

    `n_sample` values are drawn from the distinct entries of the
    `cb_encode` column using `np.random.choice` with its default settings
    (sampling with replacement), so the same cell may be picked several
    times and its rows then appear once per pick.

    Returns a new DataFrame with `cb_encode` as the first column followed
    by the remaining columns of `df`.
    """
    cell_ids = df.cb_encode.unique()
    drawn = np.random.choice(cell_ids, size=n_sample)
    # Index by cell so every draw pulls in all rows of that cell.
    by_cell = df.set_index('cb_encode')
    return by_cell.loc[drawn, :].reset_index()


In [48]:
# Two draws can return at most two distinct cell identifiers.
b = sample_cells(df, 2)
assert len(b['cb_encode'].unique()) <= 2

In [49]:
# | export

def digestion_profile_matrix(df, aggregation='mean'):
    """Build a sequence-by-digestion-level matrix of aggregated counts.

    Rows of `df` are grouped by (`seq`, `log_units_mnase`) and reduced with
    `aggregation` (anything `GroupBy.aggregate` accepts; default 'mean').
    The aggregated `counts` column is then pivoted so that each row is a
    `seq` and each column a `log_units_mnase` value. Combinations absent
    from `df` are filled with 0.
    """
    grouped = df.groupby(['seq', 'log_units_mnase'])
    summary = grouped.aggregate(aggregation).reset_index()
    summary = summary.sort_values('log_units_mnase').reset_index(drop=True)
    wide = summary.pivot(index='seq', columns='log_units_mnase', values='counts')
    return wide.fillna(0)


In [50]:
# Rows are ACCA, CAAC, CCCA; columns are log_units_mnase -2 and -1.
# ACCA has counts 1, 1 and 4 at -1, so its mean is 2; absent combinations are 0.
dig = digestion_profile_matrix(df)
assert (dig.values == np.array([[0.0,2.0],[2.0,0.0],[3.0,0.0]])).all()

In [66]:
dig

log_units_mnase,-2,-1
seq,Unnamed: 1_level_1,Unnamed: 2_level_1
ACCA,0.0,2.0
CAAC,2.0,0.0
CCCA,3.0,0.0


In [69]:
# | export 

def bootstrap_population(seqcell, pop, samples_bootstrap):
    """Return a bootstrapped digestion-profile matrix for one population.

    Keeps the rows of `seqcell` whose `sort_population` equals `pop`,
    resamples their cells with `sample_cells` (`samples_bootstrap` draws)
    and summarises the resample with `digestion_profile_matrix`.
    """
    in_population = seqcell['sort_population'] == pop
    resampled = sample_cells(seqcell.loc[in_population], samples_bootstrap)
    return digestion_profile_matrix(resampled)

In [51]:
# | export

def correlate_sequence(row, other, seq_name):
    """Cross-correlate `row` with the row of `other` labelled `seq_name`.

    Parameters
    ----------
    row : 1-D profile, e.g. one row of a digestion-profile matrix.
    other : DataFrame whose index holds sequence names.
    seq_name : index label to look up in `other`.

    Returns
    -------
    numpy.ndarray
        The "full" cross-correlation of `row` with `other.loc[seq_name]`,
        of length ``len(row) + other.shape[1] - 1``. When `seq_name` is
        not present in `other`, an all-NaN array of that same length is
        returned instead of raising.
    """
    try:
        other_row = other.loc[seq_name, :]
    except KeyError:
        # Size the placeholder from the inputs (not a fixed 27) so it has the
        # length correlate(..., "full") would have produced for these
        # profiles, whatever their number of columns.
        return np.full(len(row) + other.shape[1] - 1, np.nan)
    return correlate(row, other_row, "full")

In [59]:
# In `dig`, ACCA is [0, 2] and CAAC is [2, 0], so the full cross-correlation
# has three entries and peaks at index 2 with value 2 * 2 = 4.
row = dig.loc['ACCA']
other = dig
c = correlate_sequence(row, other, 'CAAC')
assert c.max() == 4
assert c.argmax() == 2

In [64]:
# | export

def cross_correlation_on_all_sequences(g0_profile, inter_profile):
    """Cross-correlate every row of `g0_profile` with its match in `inter_profile`.

    The result is indexed like `g0_profile` and has one column per lag,
    labelled from ``-(n - 1)`` to ``n - 1`` where ``n`` is the number of
    columns of `g0_profile`. Each row is filled with the output of
    `correlate_sequence` for that sequence name; rows containing NaN are
    dropped before returning.
    """
    n_seqs, n_points = g0_profile.shape
    lags = list(range(1 - n_points, n_points))
    result = pd.DataFrame(
        np.zeros((n_seqs, 2 * n_points - 1)),
        index=g0_profile.index,
        columns=lags,
    )

    for seq_name, profile in g0_profile.iterrows():
        result.loc[seq_name, :] = correlate_sequence(profile, inter_profile, seq_name)

    return result.dropna()


In [68]:
# Correlating the matrix with itself: lags -1 and 1 must agree, and lag 0 is
# each row's sum of squares (2**2, 2**2, 3**2).
corr_df = cross_correlation_on_all_sequences(dig, dig)
assert (corr_df.loc[:, -1] == corr_df.loc[:, 1]).all()
assert (corr_df.loc[:, 0].values == np.array([4,4,9])).all()

In [71]:
corr_df

Unnamed: 0_level_0,-1,0,1
seq,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ACCA,0.0,4.0,0.0
CAAC,0.0,4.0,0.0
CCCA,0.0,9.0,0.0


In [73]:
# | export 

def average_results_bootstrap(cross_correlations):
    """Summarise a list of cross-correlation DataFrames across replicates.

    The frames are stacked row-wise, keeping only the columns common to all
    of them (`join='inner'`), and grouped by index label.

    Returns
    -------
    mean_cc : per-label mean of every column.
    se_cc : per-label standard error of the mean of every column.
    max_cc : for each label, the column of `mean_cc` holding its maximum.
    """
    stacked = pd.concat(cross_correlations, axis=0, join='inner')
    per_sequence = stacked.groupby(stacked.index)
    mean_cc = per_sequence.mean()
    se_cc = per_sequence.sem()
    return mean_cc, se_cc, mean_cc.idxmax(axis=1)

In [81]:
# The second replicate differs only at lag 1, so the mean at lag 1 is half of
# [2, 2, 4] and lag 0 stays the maximum for every sequence.
corr_df2 = corr_df.copy()
corr_df2.loc[:,1] = [2.,2.,4.]
cross_correlations = [corr_df, corr_df2]
mean_cc, se_cc, max_cc = average_results_bootstrap(cross_correlations)
assert (mean_cc.loc[:, 1] == np.array([1,1,2])).all()
assert (max_cc == 0).all()

In [103]:
# | export 

def single_iteration_bootstrap_cc(seqcell, populations, samples_bootstrap):
    """Run one bootstrap replicate of the cross-correlation between two populations.

    `populations[0]` and `populations[1]` are each resampled independently
    with `bootstrap_population` (`samples_bootstrap` draws each), and the
    two resulting profiles are passed, in that order, to
    `cross_correlation_on_all_sequences`.
    """
    reference = bootstrap_population(seqcell, populations[0], samples_bootstrap)
    comparison = bootstrap_population(seqcell, populations[1], samples_bootstrap)
    return cross_correlation_on_all_sequences(reference, comparison)


In [105]:
# Smoke test on the toy data: lags -1 and 1 of one bootstrapped
# cross-correlation between populations 0 and 1 agree.
b = single_iteration_bootstrap_cc(df, [0,1],100)
assert (b.loc[:, -1] == b.loc[:, 1]).all()

In [41]:
# | export 

def iter_bootstrap_cc(seqcell, populations=(0, 1), samples_bootstrap=200, n_iter=100):
    """Repeat the bootstrapped cross-correlation and summarise the replicates.

    Parameters
    ----------
    seqcell : DataFrame passed unchanged to `single_iteration_bootstrap_cc`.
    populations : pair of `sort_population` values to compare; only
        elements 0 and 1 are used. Defaults to ``(0, 1)``.
    samples_bootstrap : number of cell draws per population per replicate.
    n_iter : number of bootstrap replicates; must be at least 1.

    Returns
    -------
    (mean_cc, se_cc, max_cc) as produced by `average_results_bootstrap`.

    Raises
    ------
    ValueError
        If `n_iter` is smaller than 1, since there would be no replicates
        to average.
    """
    # Fail early with an explicit message instead of letting pd.concat
    # complain about an empty list further down.
    if n_iter < 1:
        raise ValueError(f"n_iter must be at least 1, got {n_iter}")

    cross_correlations = [
        single_iteration_bootstrap_cc(seqcell, populations, samples_bootstrap)
        for _ in range(n_iter)
    ]
    return average_results_bootstrap(cross_correlations)