In [47]:
import os
import numpy as np
import pandas as pd
import random
import glob
import datetime
import itertools
import altair as alt
import seaborn as sns
import matplotlib.pyplot as plt
import nbformat

# Raise pandas' display limits so wide/long tables are not truncated.
pd.options.display.max_rows = 1000
pd.options.display.max_columns = 500
# Turn off Altair's max-rows check; kept as the last expression so the
# cell still echoes the returned registry object.
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

### Parameters and File Paths

In [48]:
#--- Analysis parameters
FILTER_COV = 20
FILTER_COV_COEFF = 0.5
SCORE_VAR = 'frac_alpha_leq_25pct'

#--- Local paths
# All paths are derived from ROOT_DIR / PROJECT_SLUG with os.path.join so
# that the trailing slash on ROOT_DIR does not produce '//' in the results
# and the project slug is written in exactly one place.
ROOT_DIR = '/analysis/'
PROJECT_SLUG = '2023_06_15_BCdeconvolution_AS'
PROJECT_DIR = os.path.join(ROOT_DIR, PROJECT_SLUG)

DATA_DIR = os.path.join(PROJECT_DIR, 'stage')

# Sample annotations
# NOTE(review): the original appended an empty string here, so this points
# at the stage directory itself rather than a file -- TODO confirm the
# intended annotation file name.
SAMPLE_PATH = DATA_DIR

# Methylation data
# NOTE: SCORE_PATH is reassigned in the "Methylation Scores" section before
# it is read.
SCORE_PATH = os.path.join(
    PROJECT_DIR,
    'output',
    'meth_summaries',
    'buffycoat_meth_summaries_cg_count_geq_3_deconvolution_v2.v23_conv.with_cpg_index.tsv.gz',
)

# Reference matrix
# (previous location: DATA_DIR + '/data/deconv/deconv_inhouse_v2.atlas.tsv.gz')
ATLAS_PATH = os.path.join(
    PROJECT_DIR, 'output', 'ref', 'deconv_inhouse_v2.atlas.tsv.gz'
)
REGION_PATH = os.path.join(
    PROJECT_DIR, 'output', 'ref', 'deconv_inhouse_v2.region_to_ref_celltype.tsv.gz'
)
# COVERAGE_PATH = (
#     PROJECT_DIR + '/data/region_coverage_estimates.deconvolution_v2.v23_conv.tsv'
# )

# Where to store graphics
RESULTS_PATH = os.path.join(
    PROJECT_DIR, 'output', 'deconv', 'deconv_inhouse_v2_blueprint'
)

### ATLAS

In [125]:
# Load the reference atlas, key the rows by region_id and label the column
# axis as the reference cell type.
atlas = (
    pd.read_csv(ATLAS_PATH, sep='\t', index_col=None)
    .set_index('region_id')
    .rename_axis(columns='ref_celltype')
)
atlas.shape

(280, 9)

In [126]:
# Region table stored next to the atlas; its 'region_id' column is used
# below to select and order the atlas rows.
ref_region_df = pd.read_csv(REGION_PATH, sep='\t')
len(ref_region_df)

280

In [127]:
# Label-based selection: take the atlas rows named in ref_region_df, in
# that table's order, as an independent copy.
atlas = atlas.loc[ref_region_df['region_id']].copy()
atlas.shape

(280, 9)

### Methylation Scores

In [128]:
# NOTE: this replaces the SCORE_PATH value assigned in the parameters
# section with an absolute path from a different project directory.
SCORE_PATH = (
    '/analysis/gh-msun/projects/2023_06_26_SRT_deconvolution_MS/output/methyl_score/test_mixture.tsv.gz'
)
score_df = pd.read_csv(SCORE_PATH, sep='\t')
# score_df['sample_id'].nunique()

In [129]:
# Keep only the atlas rows whose region_id also appears in the score table.
atlas = atlas.loc[atlas.index.isin(score_df['region_id'])]

In [130]:
# Preview the first five atlas rows (regions x reference cell types).
atlas.head(n=5)

ref_celltype,immune_b,immune_t,immune_nk,immune_dend_plus_macro_plus_mono,immune_eosi,immune_neutro,immune_eryth,immune_mega,eryth_prog
region_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Immune_Broad_B-chr1:1114772-1114971,0.954541,0.006215,0.0,0.012992,0.005208,0.002255,0.013889,0.012048,0.030417
Immune_Broad_B-chr10:45390756-45390847,0.905821,0.008588,0.045092,0.007154,0.0,0.008713,0.064953,0.021491,0.206392
Immune_Broad_B-chr10:126289835-126290065,0.947659,0.003568,0.06942,0.009761,0.0,0.003088,0.0,0.005155,0.055057
Immune_Broad_B-chr10:126290368-126290506,0.9282,0.005704,0.0,0.013708,0.048423,0.001361,0.009259,0.0,0.031746
Immune_Broad_B-chr11:75988912-75989088,0.973406,0.00664,0.006439,0.003429,0.0,0.003071,0.008475,0.007576,0.113996


In [131]:
from scipy.optimize import nnls

def compute_mixture(b, atlas):
    """Estimate non-negative reference weights for one vector of scores.

    Solves ``min ||atlas_hat @ x - b_hat||_2`` subject to ``x >= 0`` with
    :func:`scipy.optimize.nnls`, where ``b_hat`` / ``atlas_hat`` are
    restricted to the regions that have a non-missing value in ``b`` and a
    row in ``atlas``.

    Parameters
    ----------
    b : pandas.Series
        Observed scores. Its index must use the same labels as
        ``atlas.index``. Missing values are dropped, and labels that are
        not atlas rows are ignored.
    atlas : pandas.DataFrame
        Reference matrix with one row per region and one column per
        reference component.

    Returns
    -------
    pandas.Series
        The NNLS solution, indexed by ``atlas.columns``. The weights are
        returned as fitted; they are not rescaled to sum to one.

    Raises
    ------
    TypeError
        If ``b`` is not a ``pandas.Series``.
    ValueError
        If no region is both observed in ``b`` and present in ``atlas``.
    """
    if not isinstance(b, pd.Series):
        raise TypeError(
            'b must be a pandas Series indexed like atlas.index, got {}'.format(
                type(b).__name__
            )
        )

    # Use only regions that are observed in `b` AND exist in the atlas; a
    # plain atlas.loc[b.index] raises KeyError as soon as `b` carries a
    # region the atlas does not have.
    usable = b.notna() & b.index.isin(atlas.index)
    b_hat = b[usable]
    if b_hat.empty:
        raise ValueError('no region is both observed in b and present in atlas')

    # Row order follows b_hat so the matrix and the target stay aligned.
    atlas_hat = atlas.loc[b_hat.index, :]
    coefficients, _residual_norm = nnls(
        atlas_hat.to_numpy(dtype=float), b_hat.to_numpy(dtype=float)
    )

    return pd.Series(coefficients, index=atlas_hat.columns)

In [133]:
# NOTE(review): compute_mixture looks atlas rows up by b.index, so `b`
# presumably has to be a Series of scores indexed by region_id. score_df as
# loaded by read_csv is a DataFrame with a default index, and the execution
# counter skips from 131 to 133 -- TODO confirm the preparation step that
# reshaped score_df before this call and restore it in the notebook.
compute_mixture(
    b=score_df,
    atlas=atlas,
)

ref_celltype
immune_b                            0.507198
immune_t                            0.494235
immune_nk                           0.000000
immune_dend_plus_macro_plus_mono    0.000000
immune_eosi                         0.006003
immune_neutro                       0.002169
immune_eryth                        0.008058
immune_mega                         0.000000
eryth_prog                          0.000000
dtype: float64