# Compute Deconvolution of In-House EM-Seq Clinical Batches

## Initialisation

In [1]:
import os
import numpy as np
import pandas as pd
import random
import glob
import datetime
import itertools
import altair as alt
import seaborn as sns
import matplotlib.pyplot as plt
#import plotly.express as px
import nbformat

# Notebook display settings: render up to 1000 rows and 500 columns of a frame.
pd.options.display.max_rows = 1000
pd.options.display.max_columns = 500
# Turn off the row cap of Altair's data transformers.
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

### Parameters and File Paths

In [14]:
#--- Filtering and scoring parameters
# Minimum `number_molecules` a (sample, region) observation needs to be used.
FILTER_COV = 20
# Minimum region coverage coefficient; only referenced by the coverage
# filter cells, which are currently commented out.
FILTER_COV_COEFF = 0.5
# Column of the score table used as the per-region score.
SCORE_VAR = 'frac_alpha_leq_25pct'

#--- Local paths
# Paths are built with os.path.join so that the trailing slash on ROOT_DIR
# does not produce '//' inside every derived path.
ROOT_DIR = '/analysis/'
PROJECT_SLUG = '2023_06_15_BCdeconvolution_AS'
PROJECT_DIR = os.path.join(ROOT_DIR, PROJECT_SLUG)

DATA_DIR = os.path.join(PROJECT_DIR, 'stage')

# Sample annotations
# TODO: no file name is given here, so this points at the stage directory
# itself; set the annotation file before re-enabling the sample metadata cell.
SAMPLE_PATH = DATA_DIR
# Methylation data
SCORE_PATH = os.path.join(
    PROJECT_DIR,
    'output/meth_summaries/buffycoat_meth_summaries_cg_count_geq_3_deconvolution_v2.v23_conv.with_cpg_index.tsv.gz',
)
# Reference matrix
ATLAS_PATH = os.path.join(
    PROJECT_DIR, 'output/ref/deconv_inhouse_v2.atlas.tsv.gz'
#    DATA_DIR + '/data/deconv/deconv_inhouse_v2.atlas.tsv.gz'
)
# Mapping from region to reference cell type
REGION_PATH = os.path.join(
    PROJECT_DIR, 'output/ref/deconv_inhouse_v2.region_to_ref_celltype.tsv.gz'
)
# COVERAGE_PATH = (
#     PROJECT_DIR + '/data/region_coverage_estimates.deconvolution_v2.v23_conv.tsv'
# )
# Prefix for result files (suffixes such as '.estimates.tsv.gz' are appended)
RESULTS_PATH = os.path.join(
    PROJECT_DIR, 'output/deconv/deconv_inhouse_v2_blueprint'
)

### Sample Metadata

In [7]:
# sample_df = pd.read_csv(SAMPLE_PATH, sep='\t')
# BATCH_MAP = {
#     '1': 'batch_12', '2': 'batch_12', 'normals_2': 'batch_12',
#     '3': 'batch_345', '4': 'batch_345', '5': 'batch_345',
# }
# sample_df['batch_group'] = sample_df['batch_id'].map(BATCH_MAP)
# #sample_df.iloc[0]

### ATLAS

In [8]:
# Reference matrix: one row per region_id, one column per reference cell type.
atlas = (
    pd.read_csv(ATLAS_PATH, sep='\t')
    .set_index('region_id')
    .rename_axis(columns='ref_celltype')
)
atlas.shape

(280, 9)

In [9]:
# Region table (tab-separated); display how many rows it has.
ref_region_df = pd.read_table(REGION_PATH)
len(ref_region_df)

280

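Add coverage information and filter regions by coverage coefficient. This step is currently disabled: the two cells below are commented out, so all reference regions are kept.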

In [10]:
# add_df = pd.read_csv(COVERAGE_PATH, sep='\t')
# ref_region_df = ref_region_df.merge(add_df[['region_id', 'coeff']], how='left')
# ref_region_df['coeff'].fillna(0, inplace=True)
# ref_region_df.shape[0]

In [11]:
#ridxs = (ref_region_df['coeff']>=FILTER_COV_COEFF)
#ref_region_df = ref_region_df[ridxs].copy()
#ref_region_df.shape[0], ref_region_df['region_id'].nunique()

In [12]:
# Keep only the atlas rows listed in the region table, in that table's order.
kept_region_ids = ref_region_df['region_id']
atlas = atlas.loc[kept_region_ids, :].copy()
atlas.shape

(280, 9)

### Methylation Scores

In [15]:
# Note: for bip-ssm pipeline output it was necessary to run
#   score_df['sample_id'] = score_df['tube_id']
# after loading.
score_df = pd.read_table(SCORE_PATH)
score_df['sample_id'].nunique()

10

In [24]:
# Preview the first rows of the score table to check its columns.
score_df.head()

Unnamed: 0,sample_id,region_id,number_molecules,meth_k1,unmeth_k1,total_k1,meth_k3,unmeth_k3,total_k3,meth_k4,unmeth_k4,total_k4,frac_alpha_leq_25pct,frac_alpha_geq_75pct,tube_id
0,SSM_12_kZJs0Vjyh4I,Breast_Basal_Ep-chr1:2463572-2463704,870,2892,159,3051,1106,0,1306,348,0,438,0.0,0.901149,P1816023-1
1,SSM_13_UaP270DP6ik,Breast_Basal_Ep-chr1:2463572-2463704,1535,5149,357,5506,1971,1,2425,674,0,894,0.000651,0.891205,P1816038-1
2,SSM_14_KqhRdMUdMyg,Breast_Basal_Ep-chr1:2463572-2463704,1276,4253,265,4518,1608,2,1950,498,0,680,0.001567,0.911442,P1816024-1
3,SSM_15_izoqP38MM5Z,Breast_Basal_Ep-chr1:2463572-2463704,1123,3765,243,4008,1413,0,1748,458,0,632,0.0,0.902939,P1816102-1
4,SSM_16_S00uDEZl3o8,Breast_Basal_Ep-chr1:2463572-2463704,919,3064,159,3223,1175,1,1382,359,0,465,0.001088,0.928183,P1816039-1


## Compute Mixtures

In [16]:
from scipy.optimize import nnls

def compute_mixture(b, atlas):
    """Fit one sample's scores as a non-negative mixture of atlas columns.

    Parameters
    ----------
    b : pandas.Series
        Observed scores indexed by region. NaN entries are left out of the fit.
    atlas : pandas.DataFrame
        Reference matrix with regions as rows and reference cell types as
        columns. Every non-missing index label of ``b`` must be a row of
        ``atlas``.

    Returns
    -------
    pandas.Series
        Non-negative least-squares coefficients, indexed by the atlas columns.
    """
    observed = b[b.notna()]
    # Align the reference rows to the regions that were actually observed.
    reference = atlas.loc[observed.index, :]
    # nnls returns (solution, residual norm); only the solution is used.
    coefficients, _residual_norm = nnls(reference, observed)
    return pd.Series(coefficients, index=reference.columns)

In [17]:
# Samples to analyse: every distinct sample_id in the score table.
# Alternative when sample annotations are loaded:
#DATA_SAMPLES = sample_df['sample_id'].tolist()
sample_id_column = score_df['sample_id']
DATA_SAMPLES = sample_id_column.unique()
len(DATA_SAMPLES)

10

In [26]:
# Show the sample identifiers included in the analysis.
DATA_SAMPLES

array(['SSM_12_kZJs0Vjyh4I', 'SSM_13_UaP270DP6ik', 'SSM_14_KqhRdMUdMyg',
       'SSM_15_izoqP38MM5Z', 'SSM_16_S00uDEZl3o8', 'SSM_17_YaE1jtBet3h',
       'SSM_18_5lltWZyPyNZ', 'SSM_19_LPCY3EPZlY7', 'SSM_20_fxb2qd6Ed70',
       'SSM_21_C8Rl1ZVct5d'], dtype=object)

In [18]:
# Select the observations that enter the NNLS fit: regions present in the
# atlas, samples under analysis, and enough molecules per (sample, region).
ridxs = (
    score_df['region_id'].isin(atlas.index)
    & score_df['sample_id'].isin(DATA_SAMPLES)
    & (score_df['number_molecules'] >= FILTER_COV)
)
# Region x sample score matrix. Pairs removed by the filters above stay NaN
# on purpose: compute_mixture excludes NaN regions per sample, whereas
# filling them with 0 would feed fabricated zero scores into the fit.
data = score_df[ridxs].pivot_table(
    index='region_id', columns='sample_id', values=SCORE_VAR
)

In [19]:
# Fit every sample (one column of `data`) against the atlas, giving a
# cell-type x sample coefficient table, then reshape it to long format.
coeff_by_sample = data.apply(compute_mixture, axis=0, atlas=atlas)
fit_nnls = coeff_by_sample.reset_index().melt(
    id_vars='ref_celltype',
    var_name='sample_id',
    value_name='coeff',
)
fit_nnls.iloc[0]

ref_celltype              immune_b
sample_id       SSM_12_kZJs0Vjyh4I
coeff                     0.147703
Name: 0, dtype: object

In [20]:
# Sanity check: fit the first sample column directly and show its coefficients.
compute_mixture(data.iloc[:,0], atlas)

ref_celltype
immune_b                            0.147703
immune_t                            0.480124
immune_nk                           0.001350
immune_dend_plus_macro_plus_mono    0.056035
immune_eosi                         0.006534
immune_neutro                       0.209922
immune_eryth                        0.000000
immune_mega                         0.000000
eryth_prog                          0.000000
dtype: float64

In [35]:
# Peek at the first sample's score vector (first column of `data`).
data.iloc[:,0].head()

region_id
Eryth_prog-chr12:122875429-122875559    0.000000
Eryth_prog-chr12:123471883-123472181    0.002107
Eryth_prog-chr14:20871852-20872002      0.000000
Eryth_prog-chr16:15790488-15790760      0.000000
Eryth_prog-chr16:4494669-4494777        0.000000
Name: SSM_12_kZJs0Vjyh4I, dtype: float64

In [36]:
# Preview the score matrix (rows: region_id, columns: sample_id).
data.head()

sample_id,SSM_12_kZJs0Vjyh4I,SSM_13_UaP270DP6ik,SSM_14_KqhRdMUdMyg,SSM_15_izoqP38MM5Z,SSM_16_S00uDEZl3o8,SSM_17_YaE1jtBet3h,SSM_18_5lltWZyPyNZ,SSM_19_LPCY3EPZlY7,SSM_20_fxb2qd6Ed70,SSM_21_C8Rl1ZVct5d
region_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Eryth_prog-chr12:122875429-122875559,0.0,0.002793,0.0,0.0,0.0,0.004762,0.0,0.0,0.002976,0.003597
Eryth_prog-chr12:123471883-123472181,0.002107,0.00156,0.000938,0.000344,0.001983,0.000522,0.001001,0.002048,0.001212,0.001544
Eryth_prog-chr14:20871852-20872002,0.0,0.0,0.000519,0.0,0.00071,0.001669,0.0,0.000588,0.001091,0.0
Eryth_prog-chr16:15790488-15790760,0.0,0.0,0.0,0.0,0.0,0.000756,0.0,0.0,0.000481,0.0
Eryth_prog-chr16:4494669-4494777,0.0,0.000758,0.0,0.001837,0.0,0.001199,0.0,0.001042,0.002435,0.002081


In [29]:
# Re-check the atlas dimensions (number of regions, number of reference cell types).
atlas.shape

(280, 9)

In [31]:
# Preview the reference atlas (rows: region_id, columns: ref_celltype).
atlas.head()

ref_celltype,immune_b,immune_t,immune_nk,immune_dend_plus_macro_plus_mono,immune_eosi,immune_neutro,immune_eryth,immune_mega,eryth_prog
region_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Immune_Broad_B-chr1:1114772-1114971,0.954541,0.006215,0.0,0.012992,0.005208,0.002255,0.013889,0.012048,0.030417
Immune_Broad_B-chr10:45390756-45390847,0.905821,0.008588,0.045092,0.007154,0.0,0.008713,0.064953,0.021491,0.206392
Immune_Broad_B-chr10:126289835-126290065,0.947659,0.003568,0.06942,0.009761,0.0,0.003088,0.0,0.005155,0.055057
Immune_Broad_B-chr10:126290368-126290506,0.9282,0.005704,0.0,0.013708,0.048423,0.001361,0.009259,0.0,0.031746
Immune_Broad_B-chr11:75988912-75989088,0.973406,0.00664,0.006439,0.003429,0.0,0.003071,0.008475,0.007576,0.113996


## Compute Naive Cell Proportions

In [21]:
# Naive estimate: per sample, average SCORE_VAR over the regions that
# ref_region_df maps to each reference cell type.
ridxs = (
    score_df['sample_id'].isin(DATA_SAMPLES)
    & (score_df['number_molecules'] >= FILTER_COV)
)
region_to_celltype = ref_region_df[['region_id', 'ref_celltype']]
scores_with_celltype = score_df[ridxs].merge(region_to_celltype)
fit_naive = (
    scores_with_celltype
    .groupby(['sample_id', 'ref_celltype'])[SCORE_VAR]
    .mean()
    .rename('coeff')
    .reset_index()
)
fit_naive.iloc[0]

sample_id       SSM_12_kZJs0Vjyh4I
ref_celltype            eryth_prog
coeff                     0.000536
Name: 0, dtype: object

## Combine Different Estimates

In [22]:
# Put the NNLS and naive coefficients side by side, one row per
# (sample, reference cell type); the outer join keeps pairs found in only one table.
fit_combined = pd.merge(
    fit_nnls,
    fit_naive,
    how='outer',
    on=['sample_id', 'ref_celltype'],
    suffixes=('_nnls', '_naive'),
)
fit_combined.iloc[0]

ref_celltype              immune_b
sample_id       SSM_12_kZJs0Vjyh4I
coeff_nnls                0.147703
coeff_naive                0.14007
Name: 0, dtype: object

## Write Out

In [23]:
# ofile = RESULTS_PATH + '.estimates.tsv.gz'
# rv = fit_combined.copy()
# rv.to_csv(ofile, sep='\t', header=True, index=False)

In [232]:
# fit_combined.groupby('sample_id')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7f2152a629d0>

In [230]:
# Show only a bounded preview instead of dumping the whole table into the notebook.
fit_combined.head(10)

Unnamed: 0,ref_celltype,sample_id,coeff_nnls,coeff_naive
0,immune_b,P1816006-1,0.067293,0.067307
1,immune_t,P1816006-1,0.268491,0.235414
2,immune_nk,P1816006-1,0.045725,0.053343
3,immune_dend_plus_macro_plus_mono,P1816006-1,0.102458,0.095121
4,immune_eosi,P1816006-1,0.024208,0.031924
5,immune_neutro,P1816006-1,0.486761,0.419077
6,immune_eryth,P1816006-1,0.0,0.002813
7,immune_mega,P1816006-1,0.0,0.005244
8,eryth_prog,P1816006-1,0.0,0.001993
9,immune_b,P1816012-1,0.016973,0.020603
