# Compute Deconvolution of In-House EM-Seq Clinical Batches

## Initialisation

In [149]:
import os
import numpy as np
import pandas as pd
import random
import glob
import datetime
import itertools
import altair as alt
import seaborn as sns
import matplotlib.pyplot as plt
#import plotly.express as px
import nbformat

# Notebook display settings: allow long/wide frames to render in full and
# turn off Altair's max-rows check.
pd.options.display.max_rows = 1000
pd.options.display.max_columns = 500
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

### Parameters and File Paths

In [172]:
# --- Filtering thresholds and score column
# Minimum `number_molecules` a (sample, region) score needs to be used.
FILTER_COV = 20
# Threshold for the region coverage coefficient; only referenced by the
# commented-out coverage-filter cell in the visible part of this notebook.
FILTER_COV_COEFF = 0.5
# Column of the methylation summary table used as the per-region score.
SCORE_VAR = 'frac_alpha_leq_25pct'

# --- Local paths
ROOT_DIR = '/analysis/gh-aselewa/projects'
PROJECT_SLUG = '2023_06_15_BCdeconvolution_AS'
PROJECT_DIR = f'{ROOT_DIR}/{PROJECT_SLUG}'

DATA_DIR = f'{ROOT_DIR}/2023_06_15_BCdeconvolution_AS/stage'

# Sample annotations (no file name appended yet; equals DATA_DIR)
SAMPLE_PATH = DATA_DIR
# Methylation data
SCORE_PATH = (
    f'{PROJECT_DIR}/output/meth_summaries/blueprint_meth_summaries_cg_count_geq_3_deconvolution_v2.v23_conv.with_cpg_index.tsv.gz'
)
# Reference matrix
ATLAS_PATH = (
    f'{PROJECT_DIR}/output/ref/deconv_inhouse_v2.atlas.tsv.gz'
#    DATA_DIR + '/data/deconv/deconv_inhouse_v2.atlas.tsv.gz'
)
# Region -> reference cell type table
REGION_PATH = (
    f'{PROJECT_DIR}/output/ref/deconv_inhouse_v2.region_to_ref_celltype.tsv.gz'
)
# COVERAGE_PATH = (
#     PROJECT_DIR + '/data/region_coverage_estimates.deconvolution_v2.v23_conv.tsv'
# )
# Prefix for result files written by this notebook
RESULTS_PATH = f'{PROJECT_DIR}/output/deconv/deconv_inhouse_v2_blueprint'

### Sample Metadata

In [173]:
# sample_df = pd.read_csv(SAMPLE_PATH, sep='\t')
# BATCH_MAP = {
#     '1': 'batch_12', '2': 'batch_12', 'normals_2': 'batch_12',
#     '3': 'batch_345', '4': 'batch_345', '5': 'batch_345',
# }
# sample_df['batch_group'] = sample_df['batch_id'].map(BATCH_MAP)
# #sample_df.iloc[0]

### ATLAS

In [174]:
# Load the reference atlas keyed by region, and label the column axis so that
# later reshapes expose the columns as `ref_celltype`.
atlas = (
    pd.read_csv(ATLAS_PATH, sep='\t')
    .set_index('region_id')
    .rename_axis(columns='ref_celltype')
)
atlas.shape

(280, 9)

In [175]:
# Table mapping each atlas region to its reference cell type.
ref_region_df = pd.read_csv(REGION_PATH, delimiter='\t')
len(ref_region_df)

280

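Add coverage information. (Currently disabled: the two cells below are commented out, so no coverage-based region filtering is applied.)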

In [176]:
# add_df = pd.read_csv(COVERAGE_PATH, sep='\t')
# ref_region_df = ref_region_df.merge(add_df[['region_id', 'coeff']], how='left')
# ref_region_df['coeff'].fillna(0, inplace=True)
# ref_region_df.shape[0]

In [177]:
#ridxs = (ref_region_df['coeff']>=FILTER_COV_COEFF)
#ref_region_df = ref_region_df[ridxs].copy()
#ref_region_df.shape[0], ref_region_df['region_id'].nunique()

In [178]:
# Select the atlas rows listed in ref_region_df, in that table's order.
atlas = atlas.loc[ref_region_df['region_id'].tolist()].copy()
atlas.shape

(280, 9)

### Methylation Scores

In [214]:
# Load the methylation score table (one row per sample and region).
score_df = pd.read_csv(SCORE_PATH, delimiter='\t')
# For bip-ssm pipeline output the sample id had to be copied from the tube id:
#score_df['sample_id'] = score_df['tube_id']
score_df['sample_id'].nunique()

## Compute Mixtures

In [216]:
from scipy.optimize import nnls

def compute_mixture(b, atlas):
    """Estimate non-negative mixture coefficients for one sample.

    Solves ``min ||A x - b||`` subject to ``x >= 0`` with
    :func:`scipy.optimize.nnls`, where ``A`` is ``atlas`` restricted to the
    regions for which ``b`` has a value.

    Parameters
    ----------
    b : pandas.Series
        Observed scores indexed by region label. Missing values (NaN) are
        excluded from the fit.
    atlas : pandas.DataFrame
        Reference matrix with region labels as index and one column per
        reference component. Every non-missing label of ``b`` must be in
        ``atlas.index``; ``.loc`` raises ``KeyError`` otherwise.

    Returns
    -------
    pandas.Series
        One coefficient per column of ``atlas``, indexed by ``atlas.columns``.
        All coefficients are NaN when ``b`` has no observed region, because
        there is nothing to fit.
    """
    observed = b.dropna()
    if observed.empty:
        # Do not hand an empty system to the solver; report "not estimable".
        return pd.Series(np.nan, index=atlas.columns, dtype=float)

    atlas_obs = atlas.loc[observed.index, :]
    # nnls returns (solution, residual norm); only the solution is used.
    coeffs, _residual = nnls(
        atlas_obs.to_numpy(dtype=float), observed.to_numpy(dtype=float)
    )
    return pd.Series(coeffs, index=atlas_obs.columns)

In [217]:
# Samples to deconvolve: every distinct sample id in the score table.
#DATA_SAMPLES = sample_df['sample_id'].tolist()
DATA_SAMPLES = pd.unique(score_df['sample_id'])
len(DATA_SAMPLES)

10

In [218]:
# Keep only scores for atlas regions and selected samples that reach the
# minimum molecule count, then pivot to a region x sample matrix.
ridxs = (
    score_df['region_id'].isin(atlas.index)
    & score_df['sample_id'].isin(DATA_SAMPLES)
    & (score_df['number_molecules'] >= FILTER_COV)
)
# Region/sample pairs removed by the filters are left as NaN on purpose:
# compute_mixture() drops NaN regions before fitting. Filling them with 0
# would instead feed them to NNLS as observed scores of zero.
data = score_df[ridxs].pivot_table(
    index='region_id', columns='sample_id', values=SCORE_VAR
)

In [219]:
# Fit every sample column, then reshape the (ref_celltype x sample) result
# into long format: one row per (ref_celltype, sample_id) with its coeff.
fit_nnls = data.apply(compute_mixture, axis=0, args=(atlas,))
fit_nnls = fit_nnls.reset_index().melt(
    id_vars='ref_celltype', var_name='sample_id', value_name='coeff'
)
fit_nnls.iloc[0]

ref_celltype      immune_b
sample_id       P1816006-1
coeff             0.067293
Name: 0, dtype: object

In [220]:
# Spot check: fit the first sample column on its own.
compute_mixture(data[data.columns[0]], atlas)

ref_celltype
immune_b                            0.067293
immune_t                            0.268491
immune_nk                           0.045725
immune_dend_plus_macro_plus_mono    0.102458
immune_eosi                         0.024208
immune_neutro                       0.486761
immune_eryth                        0.000000
immune_mega                         0.000000
eryth_prog                          0.000000
dtype: float64

## Compute Naive Cell Proportions

In [221]:
# Naive estimate: per sample, average the score over the regions assigned to
# each reference cell type (no atlas fit involved).
ridxs = score_df['sample_id'].isin(DATA_SAMPLES) & (
    score_df['number_molecules'] >= FILTER_COV
)
fit_naive = (
    score_df[ridxs]
    .merge(ref_region_df[['region_id', 'ref_celltype']])
    .groupby(['sample_id', 'ref_celltype'])[SCORE_VAR]
    .mean()
    .rename('coeff')
    .reset_index()
)
fit_naive.iloc[0]

sample_id       P1816006-1
ref_celltype    eryth_prog
coeff             0.001993
Name: 0, dtype: object

## Combine Different Estimates

In [222]:
# Outer-join the two estimates on (sample_id, ref_celltype); the clashing
# `coeff` columns become coeff_nnls and coeff_naive.
fit_combined = pd.merge(
    fit_nnls,
    fit_naive,
    how='outer',
    on=['sample_id', 'ref_celltype'],
    suffixes=('_nnls', '_naive'),
)
fit_combined.iloc[0]

ref_celltype      immune_b
sample_id       P1816006-1
coeff_nnls        0.067293
coeff_naive       0.067307
Name: 0, dtype: object

## Write Out

In [223]:
# Write the combined estimates as a tab-separated file; pandas infers gzip
# compression from the .gz suffix.
ofile = RESULTS_PATH + '.estimates.tsv.gz'
# Create the output directory first so the write does not fail when it is
# missing (e.g. on a fresh project directory).
os.makedirs(os.path.dirname(ofile), exist_ok=True)
rv = fit_combined.copy()
rv.to_csv(ofile, sep='\t', header=True, index=False)

In [232]:
# NOTE(review): this only builds a GroupBy object and displays its repr;
# nothing is aggregated or assigned.
fit_combined.groupby('sample_id')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7f2152a629d0>

In [230]:
# Display the combined NNLS / naive estimates.
fit_combined

Unnamed: 0,ref_celltype,sample_id,coeff_nnls,coeff_naive
0,immune_b,P1816006-1,0.067293,0.067307
1,immune_t,P1816006-1,0.268491,0.235414
2,immune_nk,P1816006-1,0.045725,0.053343
3,immune_dend_plus_macro_plus_mono,P1816006-1,0.102458,0.095121
4,immune_eosi,P1816006-1,0.024208,0.031924
5,immune_neutro,P1816006-1,0.486761,0.419077
6,immune_eryth,P1816006-1,0.0,0.002813
7,immune_mega,P1816006-1,0.0,0.005244
8,eryth_prog,P1816006-1,0.0,0.001993
9,immune_b,P1816012-1,0.016973,0.020603
