In [1]:
import os
import numpy as np
import pandas as pd
import random
import glob
import datetime
import itertools
import altair as alt
import seaborn as sns
import matplotlib.pyplot as plt
import nbformat

from scipy.optimize import nnls

# Raise pandas' display limits so wide/long frames are not truncated.
pd.options.display.max_rows = 1000
pd.options.display.max_columns = 500

# Turn off Altair's row limit for chart data.
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

### Parameters and File Paths

In [2]:
# Column of the methylation score table that is passed to the deconvolution.
SCORE_VAR = 'frac_alpha_leq_25pct'

#--- Local paths
# Every directory constant ends with '/', so child components are appended
# WITHOUT a leading slash (a leading slash here used to yield
# '.../output//experiment/...').
EXPERIMENT = 'BLUEPRINT_B'
ROOT_DIR = '/analysis/gh-msun/'
PROJECT_SLUG = '2023_06_26_SRT_deconvolution_MS'
PROJECT_DIR = ROOT_DIR + f'projects/{PROJECT_SLUG}/output/'
DATA_DIR = PROJECT_DIR + f'experiment/{EXPERIMENT}/'
SCORE_DIR = DATA_DIR + 'methyl_score/'

# Region -> reference cell type table
REGION_PATH = (
    PROJECT_DIR
    + 'reference/deconv_inhouse_v2.region_to_ref_celltype.tsv.gz'
)

# Reference matrix
ATLAS_PATH = (
    PROJECT_DIR
    + 'reference/deconv_inhouse_v2.atlas.tsv.gz'
)

# Methylation score matrix
SCORE_PATH = (
    SCORE_DIR
    + 'E1B_E18CD4_E18CD8_E18NK_E18MONO_E18NEUTRO/'
    + 'mix0_seed512070.tsv.gz'
)
    
# FILTER_COV = 20
# FILTER_COV_COEFF = 0.5

# # Sample annotations
# SAMPLE_PATH = DATA_DIR + ''
# # Methylation data
# SCORE_PATH = (
#     PROJECT_DIR + '/output/meth_summaries/buffycoat_meth_summaries_cg_count_geq_3_deconvolution_v2.v23_conv.with_cpg_index.tsv.gz'
# )

# COVERAGE_PATH = (
#     PROJECT_DIR + '/data/region_coverage_estimates.deconvolution_v2.v23_conv.tsv'
# )
# # Where to store graphics
# RESULTS_PATH = PROJECT_DIR + '/output/deconv/deconv_inhouse_v2_blueprint'

In [96]:
def get_file_paths(directory):
    '''
    Return the absolute paths of every entry directly inside `directory`.

    directory -- path to the directory to list (not recursive)

    output: list of absolute paths, sorted by entry name. os.listdir() gives
    no ordering guarantee, so sorting keeps downstream results (e.g. the
    column order of replicate tables) reproducible between runs.
    '''
    return [
        os.path.abspath(os.path.join(directory, filename))
        for filename in sorted(os.listdir(directory))
    ]


def compute_deconvolution_nnls(score_df_path, score_type, atlas, match=True):
    '''
    Run nonnegative least squares, min ||Ax - b||_2 subject to x >= 0.
    The solution x is the deconvolution of b.

    Reasoning for match=True:
    For a mixture with a low total read count there may be regions missing
    from the score table (at 1M reads this is not a problem), and the score
    table may also contain regions that the atlas does not have. With
    match=True both sides are restricted to their shared regions before
    fitting. With match=False every region of the score table must be
    present in the atlas, otherwise the row lookup raises KeyError.

    score_df_path -- path to a tab-separated methylation score table that has
                     a 'region_id' column and a `score_type` column
    score_type -- hypo or hyper score column: e.g. 'frac_alpha_leq_25pct'
    atlas -- atlas dataframe indexed by region id, one column per reference
    match -- restrict atlas and scores to their common regions (default True)

    output: pandas Series of NNLS coefficients indexed by the atlas columns
    '''
    # load score df and index it by region (the column itself is kept)
    score_df = pd.read_csv(score_df_path, sep='\t')
    score_df = score_df.set_index('region_id', drop=False)

    b = score_df[score_type]
    A = atlas

    # match index between A and b
    if match:
        region_count_before = A.shape[0]
        A = A[A.index.isin(b.index)]
        region_count_after = A.shape[0]
        region_count_diff = region_count_before - region_count_after
        print(f'Dropped: {region_count_diff} regions.')

        # Also drop score regions the atlas does not know; without this the
        # .loc lookup below raises KeyError for them.
        in_atlas = b.index.isin(A.index)
        ignored_count = int((~in_atlas).sum())
        if ignored_count:
            print(f'Ignored: {ignored_count} score regions missing from atlas.')
            b = b[in_atlas]

    # order the rows of A exactly like b
    A_sorted = A.loc[b.index, :]

    # run NNLS; the first element of the result is the coefficient vector
    fit = nnls(A_sorted.to_numpy(), b.to_numpy())
    x = pd.Series(fit[0], index=A_sorted.columns)

    return x


def compute_deconvolution_n_times(mixture_replicates_path, score_type, atlas, match=True):
    '''
    Deconvolve every replicate score file found in one mixture directory.

    mixture_replicates_path -- path to a mixture (proportion) directory of replicates (e.g. ../E1B_E18CD4_E18CD8_E18NK_E18MONO_E18NEUTRO/)
    score_type -- score column to deconvolve, passed to compute_deconvolution_nnls
    atlas -- atlas dataframe, passed to compute_deconvolution_nnls
    match -- passed to compute_deconvolution_nnls

    output: pandas df with one column per replicate (in the order returned
    by get_file_paths) and one row per atlas column
    '''
    # given path to mixture grab all paths to mixture replicates
    replicate_paths = get_file_paths(mixture_replicates_path)

    # run deconvolution for each replicate
    results = [
        compute_deconvolution_nnls(score_df_path=path,
                                   atlas=atlas,
                                   score_type=score_type,
                                   match=match)
        for path in replicate_paths
    ]

    # one column per replicate
    df = pd.concat(results, axis=1)

    return df


def compute_deconvolution_methyl_score_dir(path_to_methyl_score_dir, score_type, atlas, match=True):
    '''
    Run the replicate-level deconvolution for every entry of a methyl_score
    directory.

    path_to_methyl_score_dir -- directory whose entries are mixture (proportion) directories
    score_type -- score column to deconvolve
    atlas -- atlas dataframe
    match -- passed through to compute_deconvolution_n_times

    output: list with one dataframe per entry, in the order given by get_file_paths
    '''
    # every entry of the methyl_score directory is treated as one mixture proportion
    mixture_dirs = get_file_paths(path_to_methyl_score_dir)

    return [
        compute_deconvolution_n_times(mixture_replicates_path=mixture_dir,
                                      score_type=score_type,
                                      atlas=atlas,
                                      match=match)
        for mixture_dir in mixture_dirs
    ]
    

def mse():
    '''Placeholder: not implemented yet, takes no arguments and returns None.'''
    return None

### ATLAS

In [97]:
# Load the reference matrix: rows indexed by region_id, columns labelled 'ref_celltype'.
atlas = (
    pd.read_csv(ATLAS_PATH, sep='\t', index_col=None)
    .set_index('region_id')
    .rename_axis(columns='ref_celltype')
)
atlas.shape

(280, 9)

In [98]:
# Region table read from REGION_PATH; show its row count.
ref_region_df = pd.read_csv(REGION_PATH, sep='\t')
len(ref_region_df.index)

280

In [99]:
# Select the atlas rows in the order of the region table's region_id column
# (a region_id missing from the atlas raises KeyError), then take a copy.
atlas = atlas.loc[ref_region_df['region_id']].copy()
atlas.shape

(280, 9)

## Deconvolution

In [102]:
%%time
testing = compute_deconvolution_methyl_score_dir(path_to_methyl_score_dir='/analysis/gh-msun/projects/2023_06_26_SRT_deconvolution_MS/output/experiment/BLUEPRINT_B/methyl_score/', 
                                                  score_type=SCORE_VAR, 
                                                  atlas=atlas, 
                                                  match=False)



In [109]:
# Result table of the first mixture directory: one row per reference cell type, one column per replicate.
testing[0]

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9
ref_celltype,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
immune_b,0.101535,0.09924,0.097298,0.097559,0.09718,0.09692,0.103476,0.098088,0.097822,0.101656
immune_t,0.347402,0.345183,0.344621,0.359225,0.342931,0.35731,0.347976,0.344807,0.344799,0.346449
immune_nk,0.171501,0.176011,0.175269,0.177142,0.17702,0.178673,0.171056,0.174771,0.175616,0.177923
immune_dend_plus_macro_plus_mono,0.182177,0.165915,0.187635,0.186075,0.184327,0.185269,0.180335,0.190779,0.18641,0.183593
immune_eosi,0.001008,0.00191,0.000571,0.0,0.00015,0.00033,0.002138,8e-06,0.001877,0.002045
immune_neutro,0.171058,0.176353,0.175993,0.180332,0.174722,0.174815,0.174809,0.16635,0.171552,0.172839
immune_eryth,0.001152,0.000806,0.0,0.0,0.000213,0.0,0.003741,0.00048,0.0,0.001633
immune_mega,0.0,8.8e-05,0.000753,0.0,0.0,0.0,0.0,0.001014,0.000429,0.000518
eryth_prog,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# sys