# Bootstrap tools

> Bootstrap cells from a seq-cell dataframe and cross-correlate the resulting digestion profiles

In [None]:
#| default_exp bootstrap_tools

In [34]:
#| hide
# Run nbdev's export step for this project (hidden cell; not part of the exported module).
import nbdev; nbdev.nbdev_export()

# Import

In [2]:
# | export 

import pandas as pd
import numpy as np
from scipy.signal import correlate

# Test data

In [27]:
# Small toy dataset used by the checks in the cells below.
# Each tuple follows the column order given to the DataFrame constructor:
# (seq, cb_encode, counts, sort_population, log_units_mnase).
data = [('ACCA', 1, 1, 0, -1), ('ACCA', 2, 1, 1,-1), 
        ('CAAC', 1, 2, 0,-2), ('ACCA', 3, 4, 1,-1), 
        ('CAAC', 2, 2, 1,-2), ('CCCA', 3, 3, 1, -2)]
df = pd.DataFrame(data, columns=['seq', 'cb_encode', 'counts', 'sort_population', 'log_units_mnase'])
df

Unnamed: 0,seq,cb_encode,counts,sort_population,log_units_mnase
0,ACCA,1,1,0,-1
1,ACCA,2,1,1,-1
2,CAAC,1,2,0,-2
3,ACCA,3,4,1,-1
4,CAAC,2,2,1,-2
5,CCCA,3,3,1,-2


# Functions

In [28]:
# | export 

def bootstrap_cells(df, n_sample):
    """Resample cells and return every row belonging to the drawn cells.

    ``n_sample`` labels are drawn from the distinct values of
    ``df['cb_encode']`` with ``np.random.choice`` (default settings, so the
    same label may be drawn more than once). The rows of each drawn label are
    gathered in draw order; a label drawn k times contributes its rows k times.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing a ``cb_encode`` column.
    n_sample : int
        Number of labels to draw.

    Returns
    -------
    pandas.DataFrame
        The gathered rows with a fresh integer index; ``cb_encode`` becomes
        the first column.
    """
    cell_labels = df.cb_encode.unique()
    drawn = np.random.choice(cell_labels, size=n_sample)

    # Index by cell label so a list of (possibly repeated) labels can be
    # looked up in one go.
    by_cell = df.set_index('cb_encode')
    resampled = by_cell.loc[drawn, :]
    return resampled.reset_index()


In [29]:
# Drawing 2 cell labels can produce at most 2 distinct labels in the result
# (fewer when the same label is drawn twice).
b = bootstrap_cells(df, 2)
assert len(b['cb_encode'].unique()) <= 2

In [30]:
# | export

def digestion_profile_matrix(df, aggregation='mean'):
    """Build a sequence-by-digestion-level matrix of aggregated counts.

    Rows of ``df`` are grouped by ``(seq, log_units_mnase)``, the ``counts``
    of each group are reduced with ``aggregation``, and the result is pivoted
    so that each ``seq`` is a row and each ``log_units_mnase`` value is a
    column. Combinations that never occur in ``df`` are filled with 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``seq``, ``log_units_mnase`` and ``counts`` columns.
        Any other columns are ignored.
    aggregation : str or callable, default 'mean'
        Reduction passed to ``GroupBy.aggregate``.

    Returns
    -------
    pandas.DataFrame
        Matrix indexed by ``seq`` with one column per ``log_units_mnase``
        value.
    """
    group_keys = ['seq', 'log_units_mnase']
    # Only `counts` is pivoted below, so aggregate just that column.
    # Aggregating the whole frame makes reductions such as 'mean' fail as soon
    # as the frame carries an unrelated non-numeric column.
    averages = df.groupby(group_keys)[['counts']].aggregate(aggregation).reset_index()
    averages = averages.sort_values('log_units_mnase').reset_index(drop=True)
    dig_matrix = averages.pivot(index='seq', columns='log_units_mnase', values='counts')
    return dig_matrix.fillna(0)


In [33]:
# Expected matrix for the toy data: rows are seq (ACCA, CAAC, CCCA), columns
# are log_units_mnase (-2, -1); entries are mean counts, 0 where the pair is absent.
dig = digestion_profile_matrix(df)
assert (dig.values == np.array([[0.0,2.0],[2.0,0.0],[3.0,0.0]])).all()

In [4]:
# | export

def correlate_sequence(row, other, seq_name):
    """Cross-correlate one profile row with the matching row of another table.

    Parameters
    ----------
    row : array-like
        1-D profile of a single sequence.
    other : pandas.DataFrame
        Profile table indexed by sequence name.
    seq_name : hashable
        Index label to look up in ``other``.

    Returns
    -------
    numpy.ndarray
        The ``"full"`` cross-correlation of ``row`` with ``other.loc[seq_name]``.
        If ``seq_name`` is not in ``other``, an all-NaN array of the length
        that correlation would have had: ``len(row) + other.shape[1] - 1``
        (27 for two 14-point profiles).
    """
    # Keep the try body to the lookup so a KeyError raised elsewhere is not
    # mistaken for a missing sequence.
    try:
        other_row = other.loc[seq_name, :]
    except KeyError:
        # Match the length of a "full" correlation instead of assuming
        # 14-point profiles.
        n_lags = max(len(row) + other.shape[1] - 1, 0)
        return np.full(n_lags, np.nan)
    return correlate(row, other_row, "full")

In [5]:
# | export

def cross_correlation_on_all_sequences(g0_profile, inter_profile):
    """Cross-correlate every sequence of one profile matrix with another.

    For each row (sequence) of ``g0_profile`` the ``"full"`` cross-correlation
    with the row of the same name in ``inter_profile`` is computed. Sequences
    that are absent from ``inter_profile``, and rows whose result contains
    NaN, are dropped from the output.

    Parameters
    ----------
    g0_profile, inter_profile : pandas.DataFrame
        Profile matrices indexed by sequence name, one column per profile
        point.

    Returns
    -------
    pandas.DataFrame
        One row per shared sequence, one column per lag. With ``N`` columns in
        ``g0_profile`` and ``M`` in ``inter_profile`` the lags run from
        ``-(M - 1)`` to ``N - 1`` (``-13 .. 13`` for two 14-point profiles).
    """
    n_g0 = g0_profile.shape[1]
    n_inter = inter_profile.shape[1]
    # Lag axis of a "full" correlation; derived from the inputs so profiles
    # with any number of digestion levels are supported.
    lags = list(range(-(n_inter - 1), n_g0))

    # Start from NaN so sequences without a partner are removed by dropna().
    corr_df = pd.DataFrame(np.nan, index=g0_profile.index, columns=lags, dtype=float)

    for nm, row in g0_profile.iterrows():
        if nm not in inter_profile.index:
            continue
        corr_df.loc[nm, :] = correlate_sequence(row, inter_profile, nm)

    return corr_df.dropna()


In [43]:
# | export 

def bootstrap_population(seqcell, pop, samples_bootstrap):
    """Bootstrap one sorted population and return its digestion profile matrix.

    Keeps the rows of ``seqcell`` whose ``sort_population`` equals ``pop``,
    resamples their cells with ``bootstrap_cells`` (``samples_bootstrap``
    draws) and converts the resampled rows with ``digestion_profile_matrix``.

    Parameters
    ----------
    seqcell : pandas.DataFrame
        Table containing a ``sort_population`` column plus the columns
        required by the two helper functions.
    pop
        Value of ``sort_population`` to select.
    samples_bootstrap : int
        Number of cells to draw.

    Returns
    -------
    pandas.DataFrame
        Digestion profile matrix of the resampled population.
    """
    in_population = seqcell['sort_population'] == pop
    resampled = bootstrap_cells(seqcell.loc[in_population], samples_bootstrap)
    return digestion_profile_matrix(resampled)

In [6]:
# | export 

def bootstrap_cross_correlation(seqcell, populations=(0, 1), samples_bootstrap=200, n_iter=100):
    """Bootstrap the cross-correlation between two sorted populations.

    On each of ``n_iter`` iterations both populations are independently
    resampled (``samples_bootstrap`` cells each), turned into digestion
    profile matrices and cross-correlated sequence by sequence. The
    per-iteration results are then summarised per sequence.

    Parameters
    ----------
    seqcell : pandas.DataFrame
        Table accepted by ``bootstrap_population``.
    populations : sequence of length 2, default (0, 1)
        ``sort_population`` values of the reference population and of the
        population it is compared against.
    samples_bootstrap : int, default 200
        Number of cells drawn per population and iteration.
    n_iter : int, default 100
        Number of bootstrap iterations.

    Returns
    -------
    mean_cc : pandas.DataFrame
        Mean cross-correlation per sequence (rows) and lag (columns).
    se_cc : pandas.DataFrame
        Standard error of the mean across iterations, same layout.
    max_cc : pandas.Series
        For each sequence, the lag (column label) at which ``mean_cc`` is
        largest.
    """
    reference_pop, compared_pop = populations[0], populations[1]
    cross_correlations = []

    for _ in range(n_iter):
        g0_digprof = bootstrap_population(seqcell, reference_pop, samples_bootstrap)
        # The second profile must come from the *other* population; drawing
        # both from populations[0] would correlate a population with itself.
        inter_digprof = bootstrap_population(seqcell, compared_pop, samples_bootstrap)
        cc = cross_correlation_on_all_sequences(g0_digprof, inter_digprof)
        cross_correlations.append(cc)

    cat_correlations = pd.concat(cross_correlations, axis=0, join='inner')
    # Each sequence appears once per iteration; group on the index to
    # summarise across iterations.
    by_sequence = cat_correlations.groupby(cat_correlations.index)
    mean_cc = by_sequence.mean()
    se_cc = by_sequence.sem()
    max_cc = mean_cc.idxmax(axis=1)
    return mean_cc, se_cc, max_cc