In [None]:
###############################################
###############################################
# Setup
# This chunk ingests pre-computed variables
###############################################
###############################################

# [Shift + Return] to run chunks 

###############################################

# settings
wkdir, s3dir = '/home/ubuntu/data/DL20181011_melanocyte_test_data', 'daniel.le-work/MEL_project'

%reload_ext autoreload
%autoreload 2
from scanpy_helpers import *
warnings.filterwarnings('ignore')
%matplotlib inline

# import data from s3
# Each pickle is loaded and bound to a notebook-level variable named after its
# file stem (adata_subset1, adata_subset2, ...).
# NOTE: pickle.load can execute arbitrary code -- only load trusted files.
for file in ['adata_subset1', 'adata_subset2', 'full_adata', 'pre_adata','adata_subset1_KRT','raw_adata']:
#     ! aws s3 cp s3://{s3dir}/{file}.p {wkdir}/
    # `with` closes the file handle after each load (the original leaked it).
    with open(f'{wkdir}/{file}.p', 'rb') as handle:
        # Bind directly in the notebook namespace instead of exec-ing a string.
        globals()[file] = pickle.load(handle)

# Exclude the patients with too few cells, then show the UMAP embedding of the
# full and the filtered data coloured by age and general location.
adata_subset2_filtered = adata_subset2[
    ~adata_subset2.obs.patient.isin(['A1015LM', 'A1017LM', 'A1012M']).values
]

# plot full data
for umap_adata in (full_adata, adata_subset2_filtered):
    sc.pl.umap(umap_adata, color=['age','general_location'], cmap = 'magma_r')

print('Completed')


In [None]:
# Relate per-patient median transcriptional noise to the number of cells.
# For every patient, the obs columns whose name matches the patient id are
# selected and rows with missing values are dropped; the row count and the
# median of the first matching column are recorded.
input_adata = adata_subset2
obs_table = input_adata.obs
noise_by_patient = {patient_id: obs_table.filter(regex=patient_id).dropna()
                    for patient_id in set(obs_table.patient)}

patient_list = list(noise_by_patient.keys())
count_list = [len(noise_frame) for noise_frame in noise_by_patient.values()]
med_list = [noise_frame.iloc[:, 0].median() for noise_frame in noise_by_patient.values()]

plot_df = pd.DataFrame({'median': med_list,
                        'counts': count_list,
                        'patient': patient_list})

count_vs_noise_plot = (ggplot(plot_df, aes('counts', 'median', label='patient'))
                       + theme_bw()
                       + geom_text())
print(count_vs_noise_plot)

# patients with the fewest cells
print(plot_df.sort_values('counts').head())


In [None]:
# cell-wise transcription noise vs grouping
# Collect every obs column starting with 'noise' plus the grouping column and
# reshape to long format (one row per cell / noise column).
input_adata = adata_subset2_filtered
groupby = 'age_group'
noise_cols = [col for col in input_adata.obs.columns if col.startswith('noise')]
plot_df = pd.melt(input_adata.obs.loc[:, noise_cols + [groupby]], id_vars=groupby)

# plot
plotnine.options.figure_size=(4,4)
print(ggplot(plot_df)
         +theme_bw()
         +theme(axis_text_x=element_text(angle=90))
         +geom_boxplot(aes(groupby,'value'))
         +labs(y='transcriptional noise',x='')
         +scale_y_log10())

# Compute the Kruskal-Wallis H-test for independent samples
# One sample per observed group level; missing labels and groups without any
# non-missing value are skipped because they cannot contribute a sample.
groups_dict = {}
for group in plot_df[groupby].dropna().unique():
    group_vec = plot_df.loc[plot_df[groupby] == group, 'value'].dropna().tolist()
    if group_vec:
        groups_dict[group] = group_vec

# Unpack the samples directly; the original built a source string and eval-ed
# it, which failed for non-string group labels or labels containing quotes.
hstat, pval = stats.kruskal(*groups_dict.values())
print('H-test: ', hstat, 'p-value', pval)


In [None]:
# Per-patient median transcriptional noise against patient age.
# Age is taken from the first cell of each patient; the noise median comes from
# the first obs column whose name matches the patient id.
input_adata = adata_subset2_filtered
obs_table = input_adata.obs

patient_list = list(set(obs_table.patient))
age_list = [obs_table[obs_table.patient == patient_id].age.values[0]
            for patient_id in patient_list]
med_list = [obs_table.filter(regex=patient_id).dropna().iloc[:, 0].median()
            for patient_id in patient_list]

plot_df = pd.DataFrame({'median': med_list,
                        'age': age_list,
                        'patient': patient_list})

age_noise_plot = (ggplot(plot_df, aes('age', 'median'))
                  + theme_bw()
                  + geom_point(alpha = 0.5, color ='red')
                  + geom_text(aes(label='patient')))
print(age_noise_plot)

# Pearson correlation between age and median noise, reported as R squared
ages = plot_df.age.values.tolist()
medians = plot_df['median'].values.tolist()
r, pval = pearsonr(ages, medians)
print('R2: ', r**2, 'p-value', pval)


In [None]:
# # calculate transcription noise per gene
# For every gene, the coefficient of variation (std / mean) is computed within
# each level of `groupby`, then correlated with the rank of the sorted level.
groupby='age_group'
input_adata = adata_subset2_filtered

cat, df = prepare_dataframe(input_adata,
                  var_names=input_adata.var_names,
                  groupby=groupby)

# coefficient of variation vs age bin
# Alternatives explored earlier: use gene_std alone (std vs age bin) or
# gene_mean / gene_std (SNR vs age bin) in place of the ratio below.
gene_std = df.groupby(groupby).std()
gene_mean = df.groupby(groupby).mean()
# The groupby result is already indexed by `groupby`; only sorting is needed
# (the original reset and re-set the same index, a no-op round trip).
gene_coef = (gene_std / gene_mean).sort_index()

# Pearson correlation of each gene's per-bin coefficient with the bin rank.
bin_rank = list(range(gene_coef.shape[0]))
r_list=[]
p_list=[]
mu_list=[]
for gene_name in gene_coef.columns:
    r,p = pearsonr(bin_rank, gene_coef.loc[:, gene_name])
    r_list.append(r)
    p_list.append(p)
    mu_list.append(df.loc[:,gene_name].mean())
coef_df = pd.DataFrame({'gene':gene_coef.columns,
                         'R':r_list,
                         'pval':p_list,
                       'mu':mu_list})
coef_df['R2'] = coef_df['R']**2
coef_df['1-R2'] = 1-coef_df['R2']

# The condition string is kept verbatim because it is used as the plot title;
# the flag itself is computed directly rather than by eval-ing that string.
condition_cmd = '[((R2 > 0.95) and (mu > 1)) for R2,mu in zip(coef_df["R2"], coef_df.mu)]'
condition_vec = ((coef_df['R2'] > 0.95) & (coef_df['mu'] > 1)).tolist()
coef_df['color'] = condition_vec
coef_df = coef_df.sort_values('pval', ascending=True)
coef_df = coef_df.dropna()

# plot and print
print(ggplot(coef_df)
      +theme_bw()
      +geom_point(aes('mu','R2', color = 'color'), alpha = 0.5)
      +ggtitle(condition_cmd)
      +labs(y = 'R2', x = 'mean log(exp)'))

selected_genes = coef_df[coef_df.color == True]
print(selected_genes)

# Look up annotations for the selected genes; entries without a 'summary'
# field are reported as 'N/A'.
gene = selected_genes.gene
out = symbol2field(gene)
full_report = ['{}: {}\n{}'.format(idx, x['query'], x.get('summary', 'N/A'))
               for idx, x in enumerate(out)]
output = search_term = ''
# Plain loop instead of a list comprehension used only for its side effect
# (which also echoed a list of None as the cell output).
for report_entry in full_report:
    if search_term in report_entry:
        print(report_entry)

In [None]:
# look at distribution of covar with respect to age bin ==> ridge plot like
# Requires `gene_coef` and `groupby` from the per-gene noise cell above.
# gene_coef is indexed by `groupby` with one column per gene; it is transposed
# and melted to long format with the coefficient in 'value'.
plot_df = gene_coef.T.rename(columns=str).reset_index()
plot_df = pd.melt(plot_df, id_vars='index')
# Re-create an ordered categorical using the category order of gene_coef's
# index so that the facets and boxes follow that order.
# NOTE(review): assumes the melted variable column is named `groupby` -- confirm.
plot_df[groupby] = (plot_df[groupby]
                    .astype(str)
                    .astype(CategoricalDtype(gene_coef.reset_index()[groupby].cat.categories.tolist(), ordered=True)))

# density of the coefficient per group level, one facet row per level
print(ggplot(plot_df, aes('value', fill=groupby,
                         color=groupby))
     + theme_bw()
     + geom_density()
     + scale_x_log10()
     + facet_grid(f'{groupby}~'))

# same values as box plots on a log10 axis
print(ggplot(plot_df, aes(groupby, 'value'))
     + theme_bw()
     + scale_y_log10()
     + geom_boxplot())

In [None]:
# plot expression
# Show expression of a single gene by group and on the UMAP, then print the
# annotation returned by symbol2field for it.
gene = 'BIRC6'
input_adata = adata_subset2_filtered
groupby='age_bin'

gene2plots(input_adata, gene, groupby)
# NOTE(review): the UMAP is coloured by 'age_group' although `groupby` is
# 'age_bin' -- confirm this is intentional.
sc.pl.umap(input_adata, color=[gene, 'age_group'], cmap='magma_r')
out = symbol2field(gene)
# Plain loop instead of a list comprehension used only for printing (which
# also echoed a list of None as the cell output). Missing summaries -> 'N/A'.
for idx, x in enumerate(out):
    print(f'{idx}: ',
          x['query'],
          '\n', x.get('summary', 'N/A'))

# Dev

In [None]:
# Build a cells x genes table of raw values for the filtered cells, joined with
# the age annotations of each cell.
input_adata = adata_subset2_filtered
groupby = ['age', 'age_bin', 'age_group']

# cell and gene ids from filtered adata obj
cell_names = input_adata.obs_names.tolist()
gene_names = input_adata.var_names.tolist() + [x for x in pre_adata.index.tolist() if x.startswith('ERCC-')]

# filter raw data
# Select rows by label so every row keeps its own gene name. The original
# dropped the gene-name column and re-assigned `gene_names` by position, which
# mislabels genes whenever pre_adata's row order differs from `gene_names`.
pre_adata_processed = pre_adata.loc[pre_adata.index.isin(gene_names), cell_names]
missing_genes = set(gene_names) - set(pre_adata_processed.index)
if missing_genes:
    print(f'{len(missing_genes)} requested genes are absent from pre_adata')

# process table
pre_adata_T = pre_adata_processed.T
pre_adata_T.columns = pre_adata_processed.index.tolist()
pre_adata_T = (pre_adata_T
               .rename_axis('cell_name')
               .reset_index())
# left join: one row per cell of input_adata, annotations first
pre_adata_T = pd.merge((input_adata
                        .obs
                        .loc[:,groupby]
                        .rename_axis('cell_name')
                        .reset_index()),
                       pre_adata_T,
                       how='left',
                       on='cell_name')



In [None]:
# Summary statistics of the ERCC columns of pre_adata_T, one row per ERCC, with
# the coefficient of variation (std / mean) and log-transformed variants,
# ordered by mean.
ercc_stats = (pre_adata_T
              .filter(regex='ERCC-')
              .describe()
              .T
              .assign(covar=lambda tbl: tbl['std'] / tbl['mean'])
              .assign(log_covar=lambda tbl: np.log(tbl['covar']))
              .reset_index()
              .assign(log10_std=lambda tbl: np.log10(tbl['std']))
              .assign(log10_mean=lambda tbl: np.log10(tbl['mean']))
              .assign(log10_covar=lambda tbl: np.log10(tbl['covar']))
              .sort_values('mean'))

# fit exponential decay to covar
import scipy.optimize

def model_func(t, A, K, C):
    """Exponential decay with offset: ``A * exp(-K * t) + C``."""
    decay = np.exp(-K * t)
    return A * decay + C
def fit_exp_nonlinear(t, y):
    """Fit ``model_func`` to ``(t, y)`` and return the parameters ``(A, K, C)``.

    Uses ``scipy.optimize.curve_fit`` with at most 1000 function evaluations;
    the covariance estimate is discarded.
    """
    fitted_params = scipy.optimize.curve_fit(model_func, t, y, maxfev=1000)[0]
    A, K, C = fitted_params
    return A, K, C

# Fit A * exp(-K * mean) + C to the ERCC coefficient of variation and store the
# fitted curve next to the observations for plotting.
A, K, C = fit_exp_nonlinear(ercc_stats['mean'].values, ercc_stats['covar'].values)
fit_y = model_func(ercc_stats['mean'].values, A, K, C)
ercc_stats['nl_fit'] = fit_y

# fit line to std
def f(x, A, B):
    """Straight-line model ``y = A*x + B`` used as the curve_fit target."""
    return B + A*x

# Fit std = A * mean + B over the ERCCs.
# NOTE: `A` is rebound here to the slope of the line; it no longer holds the
# amplitude of the exponential fit computed earlier in this cell.
A,B = scipy.optimize.curve_fit(f, ercc_stats['mean'].values, ercc_stats['std'].values)[0] # your data x, y to fit
fit_y_line = f(ercc_stats['mean'].values, A, B)
ercc_stats['l_fit'] = fit_y_line

# coefficient of variation vs mean with the exponential fit in red
print(ggplot(ercc_stats)
      +theme_bw()
      +geom_point(aes('mean','covar'))
      +geom_line(aes('mean', 'nl_fit'), color = 'red'))

# standard deviation vs mean with the linear fit in red
print(ggplot(ercc_stats)
      +theme_bw()
      +geom_point(aes('mean','std'))
      +geom_line(aes('mean', 'l_fit'), color = 'red'))


In [None]:
# Summary statistics for every numeric column of pre_adata_T (one row per
# column), with the coefficient of variation and an ERCC flag.
# The definition used to be commented out, so this cell raised NameError on a
# fresh kernel; it is restored so the notebook runs top to bottom.
all_genes_table = pre_adata_T.describe().T
all_genes_table['covar'] = all_genes_table['std']/all_genes_table['mean']
all_genes_table['ercc'] = [x.startswith('ERCC-') for x in all_genes_table.index]

In [None]:
# Standard deviation vs mean on log-log axes, coloured by the ERCC flag.
# The first row of all_genes_table is skipped.
# NOTE(review): presumably that row is the numeric 'age' annotation rather than
# a gene -- confirm.
print(ggplot(all_genes_table.iloc[1:,:])
     +theme_bw()
     +geom_point(aes('mean', 'std', color = 'ercc'), alpha=0.1)
     +scale_x_log10()
     +scale_y_log10())

# entropy
Entropy is calculated from the read-count distribution.

In [None]:
# Per-patient, per-ERCC summary metrics, merged into one table.
# NOTE: bin_entropy and the adata_* helpers are defined in a later cell of this
# notebook; that cell must be run first.
starting_adata = adata_subset2
pre_adata_ercc = pre_adata[[x.startswith('ERCC-') for x in pre_adata.index]]

# `starting_adata` is now used consistently; the original assigned it but then
# passed adata_subset2 directly, so changing the setting had no effect.
# NOTE(review): entropy is computed on raw values (log=False) while the other
# metrics use their default log=True -- confirm this mix is intended.
entropy_df_ercc = bin_entropy(starting_adata, pre_adata_ercc, log = False)
median_df_ercc = adata_median (starting_adata, pre_adata_ercc)
mean_df_ercc = adata_mean (starting_adata, pre_adata_ercc)
coefvar_df_ercc = adata_coefvar (starting_adata, pre_adata_ercc)
iqr_df_ercc = adata_iqr (starting_adata, pre_adata_ercc)
std_df_ercc = adata_std (starting_adata, pre_adata_ercc)

# Successively right-join every metric onto the entropy table by gene/patient.
groupby='patient'
metrics_df_ercc = entropy_df_ercc
for sub_df in [median_df_ercc, coefvar_df_ercc, iqr_df_ercc, mean_df_ercc, std_df_ercc]:
    metrics_df_ercc = pd.merge(metrics_df_ercc, sub_df, how='right', on=['gene',groupby])

In [None]:
# Pairwise correlations among the metrics. Restricting to numeric/bool columns
# first keeps this working on pandas >= 2, where corr() no longer silently
# drops the string columns ('gene', 'patient').
metrics_df_ercc.select_dtypes(include=['number', 'bool']).corr()

In [None]:
# Fit each metric against the mean per patient and report the normalised
# root-mean-square deviation (RMSD divided by the mean of the metric).
# NOTE: fit_poly and fit_exp are defined in a later cell of this notebook; that
# cell must be run first.

# entropy ~ 2nd-degree polynomial of the mean
var = 'exp_entropy'
test = fit_poly(metrics_df_ercc, 'exp_mean', var, 2, 'patient')
test['res'] = test[var] - test['fit']
test['se'] = test['res'] ** 2
print(f'{var} NRMSD =', np.sqrt(test['se'].mean())/test[var].mean())
print(ggplot(test)
      +theme_bw()
      +theme(aspect_ratio = 1)
     +geom_point(aes('exp_mean', var, color='patient'))
     +geom_line(aes('exp_mean', 'fit'))
    +facet_wrap('~patient')
     )

# coefficient of variation ~ exponential decay of the mean (rows with missing
# values removed before fitting); `var` and `test` are overwritten
var = 'exp_coefvar'
test = fit_exp(metrics_df_ercc.dropna(), 'exp_mean', var, 'patient')
test['res'] = test[var] - test['fit']
test['se'] = test['res'] ** 2
print(f'{var} NRMSD =', np.sqrt(test['se'].mean())/test[var].mean())
print(ggplot(test)
      +theme_bw()
      +theme(aspect_ratio = 1)
     +geom_point(aes('exp_mean', var, color='patient'))
     +geom_line(aes('exp_mean', 'fit'))
    +facet_wrap('~patient')
     )

In [None]:
# Squared error between the observed coefficient of variation and its fit.
# The original subtracted a non-existent 'resdiual' column (KeyError); the
# fitted values are stored in 'fit' by the previous cell.
test['se'] = (test['exp_coefvar'] - test['fit'])**2
test.head()


In [None]:
# Long format: one row per (patient, gene, metric); median and mean are kept as
# identifier columns so every other metric can be drawn against the mean.
plot_df_ercc = pd.melt(metrics_df_ercc, id_vars=['patient','gene','exp_median', 'exp_mean'])
# plot_df_ercc = plot_df_ercc[plot_df_ercc.patient != 'A1015LM']

# one facet row per metric, one line per patient, free axis scales
print(ggplot(plot_df_ercc)
      +theme_bw()
      +theme(aspect_ratio = 1)
     +geom_line(aes('exp_mean', 'value', color='patient'))
     +facet_grid('variable~.',scales='free')
#      +scale_x_log10() 
#      +scale_y_log10() 
     )

In [None]:
from scipy.optimize import curve_fit

def exp_model(x, a, b, c):
    """Exponential decay with offset: ``a * exp(-b * x) + c``."""
    return a * np.exp(-b * x) + c

def fit_exp (full_metrics, xval, yval, groupby, ercc_only = False):
    """Fit ``exp_model`` separately within each level of ``groupby``.

    Parameters
    ----------
    full_metrics : DataFrame with columns ``xval``, ``yval`` and ``groupby``
        (and ``gene`` when ``ercc_only`` is set).
    xval, yval : names of the predictor and response columns.
    groupby : name of the column defining the groups.
    ercc_only : when true, only rows whose ``gene`` starts with ``'ERCC-'`` are
        used to estimate the parameters; the fitted curve is still evaluated
        for every row of the group.

    Returns
    -------
    DataFrame containing the rows of every group with an added ``'fit'``
    column, groups in order of first appearance. Empty when there are no groups.

    Raises
    ------
    Whatever ``scipy.optimize.curve_fit`` raises when a group cannot be fitted.
    """
    fitted_slices = []
    # unique() gives a reproducible group order; missing labels cannot match
    # any row and are skipped.
    for feature in full_metrics[groupby].dropna().unique():
        # copy so the new column never writes into a view of the caller's frame
        df_slice = full_metrics[full_metrics[groupby] == feature].copy()
        if ercc_only:
            input_slice = df_slice[[x.startswith('ERCC-') for x in df_slice.gene]]
        else:
            input_slice = df_slice

        x = input_slice[xval].values
        y = input_slice[yval].values
        popt, pcov = curve_fit(exp_model, x, y)
        # Evaluate on all rows of the group. The original evaluated on the
        # fitting subset only, which raised a length mismatch for ercc_only.
        df_slice['fit'] = exp_model(df_slice[xval].values, *popt)
        fitted_slices.append(df_slice)
    # DataFrame.append was removed in pandas 2.0; concatenate once instead.
    return pd.concat(fitted_slices) if fitted_slices else pd.DataFrame()

def fit_poly (full_metrics, xval, yval, poly_num, groupby, ercc_only = False):
    """Fit a polynomial of degree ``poly_num`` within each level of ``groupby``.

    Parameters
    ----------
    full_metrics : DataFrame with columns ``xval``, ``yval`` and ``groupby``
        (and ``gene`` when ``ercc_only`` is set).
    xval, yval : names of the predictor and response columns.
    poly_num : polynomial degree passed to ``np.polyfit``.
    groupby : name of the column defining the groups.
    ercc_only : when true, only rows whose ``gene`` starts with ``'ERCC-'`` are
        used to estimate the coefficients; the polynomial is evaluated for
        every row of the group.

    Returns
    -------
    DataFrame containing the rows of every group with an added ``'fit'``
    column, groups in order of first appearance. Empty when there are no groups.
    """
    fitted_slices = []
    # unique() gives a reproducible group order; missing labels cannot match
    # any row and are skipped.
    for feature in full_metrics[groupby].dropna().unique():
        # copy so the new column never writes into a view of the caller's frame
        df_slice = full_metrics[full_metrics[groupby] == feature].copy()
        if ercc_only:
            input_slice = df_slice[[x.startswith('ERCC-') for x in df_slice.gene]]
        else:
            input_slice = df_slice

        x = input_slice[xval].values
        y = input_slice[yval].values
        p = np.poly1d(np.polyfit(x, y, poly_num))
        df_slice['fit'] = p(df_slice[xval].values)
        fitted_slices.append(df_slice)
    # DataFrame.append was removed in pandas 2.0; concatenate once instead.
    return pd.concat(fitted_slices) if fitted_slices else pd.DataFrame()

def bin_entropy(starting_adata, pre_adata, groupby = 'patient', bin_size = 10, log = True):
    """Entropy of the binned value distribution of every gene within each group.

    Parameters
    ----------
    starting_adata : annotated data object; its ``obs_names`` select the
        columns of ``pre_adata`` and ``obs[groupby]`` assigns cells to groups.
    pre_adata : DataFrame whose columns include those cell names; its row index
        is reported as ``'gene'``.
    groupby : obs column defining the groups.
    bin_size : width of the histogram bins.
    log : when true, values are transformed with ``log(x + 1)`` first.

    Returns
    -------
    DataFrame with columns ``'exp_entropy'``, ``groupby`` and ``'gene'``, one
    row per gene and group. Empty when there are no groups.
    """
    cell_names = starting_adata.obs_names.values.tolist()
    # NOTE(review): values are truncated to integers *after* the optional log
    # transform, as in the original -- confirm this is intended.
    source = np.log(pre_adata + 1) if log else pre_adata
    input_adata = source.astype(int).loc[:, cell_names]

    slices = []
    for feature in tqdm.tqdm(list(set(starting_adata.obs[groupby]))):
        group_cells = starting_adata[starting_adata.obs[groupby] == feature].obs_names.tolist()
        feature_slice = input_adata.loc[:, group_cells]
        binned_entropy_list = []

        for row_values in feature_slice.values:
            max_value = np.max(row_values)
            # Edges 0, bin_size, 2*bin_size, ... with enough bins that the
            # maximum lies inside the last one. The original hard-coded a step
            # of 10 (ignoring bin_size) and left a maximum that was an exact
            # multiple of the bin size outside the histogram.
            n_bins = int(max_value // bin_size) + 1
            bin_seq = bin_size * np.arange(n_bins + 1)
            binned_counts, edges = np.histogram(row_values, bins = bin_seq)
            binned_probs = binned_counts / np.sum(binned_counts)
            binned_entropy_list.append(stats.entropy(binned_probs))

        slice_df = pd.DataFrame({'exp_entropy':binned_entropy_list})
        slice_df[groupby] = feature
        slice_df['gene'] = feature_slice.index.values.tolist()
        slices.append(slice_df)
    # DataFrame.append was removed in pandas 2.0; concatenate once instead.
    return pd.concat(slices) if slices else pd.DataFrame()

def adata_median (starting_adata, pre_adata, groupby = 'patient', log = True):
    """Median value of every gene within each group.

    Parameters
    ----------
    starting_adata : annotated data object; its ``obs_names`` select the
        columns of ``pre_adata`` and ``obs[groupby]`` assigns cells to groups.
    pre_adata : DataFrame whose columns include those cell names; its row index
        is reported as ``'gene'``.
    groupby : obs column defining the groups.
    log : when true, values are transformed with ``log(x + 1)`` first.

    Returns
    -------
    DataFrame with columns ``'exp_median'``, ``groupby`` and ``'gene'``.
    Empty when there are no groups.
    """
    cell_names = starting_adata.obs_names.values.tolist()
    # NOTE(review): values are truncated to integers *after* the optional log
    # transform, as in the original -- confirm this is intended.
    source = np.log(pre_adata + 1) if log else pre_adata
    input_adata = source.astype(int).loc[:, cell_names]

    slices = []
    for feature in tqdm.tqdm(list(set(starting_adata.obs[groupby]))):
        group_cells = starting_adata[starting_adata.obs[groupby] == feature].obs_names.tolist()
        feature_slice = input_adata.loc[:, group_cells]
        slice_df = pd.DataFrame({'exp_median':np.median(feature_slice.values, axis = 1).flatten().tolist()})
        slice_df[groupby] = feature
        slice_df['gene'] = feature_slice.index.values.tolist()
        slices.append(slice_df)
    # DataFrame.append was removed in pandas 2.0; concatenate once instead.
    return pd.concat(slices) if slices else pd.DataFrame()

def adata_mean (starting_adata, pre_adata, groupby = 'patient', log = True):
    """Mean value of every gene within each group.

    Parameters
    ----------
    starting_adata : annotated data object; its ``obs_names`` select the
        columns of ``pre_adata`` and ``obs[groupby]`` assigns cells to groups.
    pre_adata : DataFrame whose columns include those cell names; its row index
        is reported as ``'gene'``.
    groupby : obs column defining the groups.
    log : when true, values are transformed with ``log(x + 1)`` first.

    Returns
    -------
    DataFrame with columns ``'exp_mean'``, ``groupby`` and ``'gene'``.
    Empty when there are no groups.
    """
    cell_names = starting_adata.obs_names.values.tolist()
    # NOTE(review): values are truncated to integers *after* the optional log
    # transform, as in the original -- confirm this is intended.
    source = np.log(pre_adata + 1) if log else pre_adata
    input_adata = source.astype(int).loc[:, cell_names]

    slices = []
    for feature in tqdm.tqdm(list(set(starting_adata.obs[groupby]))):
        group_cells = starting_adata[starting_adata.obs[groupby] == feature].obs_names.tolist()
        feature_slice = input_adata.loc[:, group_cells]
        slice_df = pd.DataFrame({'exp_mean':np.mean(feature_slice.values, axis = 1).flatten().tolist()})
        slice_df[groupby] = feature
        slice_df['gene'] = feature_slice.index.values.tolist()
        slices.append(slice_df)
    # DataFrame.append was removed in pandas 2.0; concatenate once instead.
    return pd.concat(slices) if slices else pd.DataFrame()

def adata_coefvar (starting_adata, pre_adata, groupby = 'patient', log=True):
    """Coefficient of variation (std / mean) of every gene within each group.

    Parameters
    ----------
    starting_adata : annotated data object; its ``obs_names`` select the
        columns of ``pre_adata`` and ``obs[groupby]`` assigns cells to groups.
    pre_adata : DataFrame whose columns include those cell names; its row index
        is reported as ``'gene'``.
    groupby : obs column defining the groups.
    log : when true, values are transformed with ``log(x + 1)`` first.

    Returns
    -------
    DataFrame with columns ``'exp_coefvar'``, ``groupby`` and ``'gene'``.
    Genes whose group mean is zero yield NaN or inf. Empty when there are no
    groups.
    """
    cell_names = starting_adata.obs_names.values.tolist()
    # NOTE(review): values are truncated to integers *after* the optional log
    # transform, as in the original -- confirm this is intended.
    source = np.log(pre_adata + 1) if log else pre_adata
    input_adata = source.astype(int).loc[:, cell_names]

    slices = []
    for feature in tqdm.tqdm(list(set(starting_adata.obs[groupby]))):
        group_cells = starting_adata[starting_adata.obs[groupby] == feature].obs_names.tolist()
        feature_slice = input_adata.loc[:, group_cells]

        coefvar = np.std(feature_slice.values, axis = 1) / np.mean(feature_slice.values, axis = 1)

        slice_df = pd.DataFrame({'exp_coefvar':coefvar.flatten().tolist()})
        slice_df[groupby] = feature
        slice_df['gene'] = feature_slice.index.values.tolist()
        slices.append(slice_df)
    # DataFrame.append was removed in pandas 2.0; concatenate once instead.
    return pd.concat(slices) if slices else pd.DataFrame()

def adata_iqr (starting_adata, pre_adata, groupby = 'patient', log = True):
    """Interquartile range (75th - 25th percentile) of every gene within each group.

    Parameters
    ----------
    starting_adata : annotated data object; its ``obs_names`` select the
        columns of ``pre_adata`` and ``obs[groupby]`` assigns cells to groups.
    pre_adata : DataFrame whose columns include those cell names; its row index
        is reported as ``'gene'``.
    groupby : obs column defining the groups.
    log : when true, values are transformed with ``log(x + 1)`` first.

    Returns
    -------
    DataFrame with columns ``'exp_iqr'``, ``groupby`` and ``'gene'``.
    Empty when there are no groups.
    """
    cell_names = starting_adata.obs_names.values.tolist()
    # NOTE(review): values are truncated to integers *after* the optional log
    # transform, as in the original -- confirm this is intended.
    source = np.log(pre_adata + 1) if log else pre_adata
    input_adata = source.astype(int).loc[:, cell_names]

    slices = []
    for feature in tqdm.tqdm(list(set(starting_adata.obs[groupby]))):
        group_cells = starting_adata[starting_adata.obs[groupby] == feature].obs_names.tolist()
        feature_slice = input_adata.loc[:, group_cells]

        q25 = np.percentile(feature_slice.values, 25, axis = 1)
        q75 = np.percentile(feature_slice.values, 75, axis = 1)
        iqr = np.abs(q75-q25)

        slice_df = pd.DataFrame({'exp_iqr':iqr.flatten().tolist()})
        slice_df[groupby] = feature
        slice_df['gene'] = feature_slice.index.values.tolist()
        slices.append(slice_df)
    # DataFrame.append was removed in pandas 2.0; concatenate once instead.
    return pd.concat(slices) if slices else pd.DataFrame()

def adata_std (starting_adata, pre_adata, groupby = 'patient', log=True):
    """Standard deviation of every gene within each group.

    Parameters
    ----------
    starting_adata : annotated data object; its ``obs_names`` select the
        columns of ``pre_adata`` and ``obs[groupby]`` assigns cells to groups.
    pre_adata : DataFrame whose columns include those cell names; its row index
        is reported as ``'gene'``.
    groupby : obs column defining the groups.
    log : when true, values are transformed with ``log(x + 1)`` first.

    Returns
    -------
    DataFrame with columns ``'exp_std'``, ``groupby`` and ``'gene'``.
    Empty when there are no groups.
    """
    cell_names = starting_adata.obs_names.values.tolist()
    # NOTE(review): values are truncated to integers *after* the optional log
    # transform, as in the original -- confirm this is intended.
    source = np.log(pre_adata + 1) if log else pre_adata
    input_adata = source.astype(int).loc[:, cell_names]

    slices = []
    for feature in tqdm.tqdm(list(set(starting_adata.obs[groupby]))):
        group_cells = starting_adata[starting_adata.obs[groupby] == feature].obs_names.tolist()
        feature_slice = input_adata.loc[:, group_cells]

        slice_df = pd.DataFrame({'exp_std':np.std(feature_slice.values, axis = 1).flatten().tolist()})
        slice_df[groupby] = feature
        slice_df['gene'] = feature_slice.index.values.tolist()
        slices.append(slice_df)
    # DataFrame.append was removed in pandas 2.0; concatenate once instead.
    return pd.concat(slices) if slices else pd.DataFrame()

In [None]:
# Entropy vs mean for all genes, with the trend estimated on the ERCCs only.
starting_adata = adata_subset2

# These two definitions used to be commented out, so the cell raised NameError
# on a fresh kernel; they are restored so the notebook runs top to bottom.
# NOTE(review): both are computed over all genes and may be slow.
mean_df = adata_mean (starting_adata, pre_adata)
entropy_df = bin_entropy(starting_adata, pre_adata, log = False)

# right-join the mean onto the entropy table by gene/patient
groupby='patient'
metrics_df = pd.merge(entropy_df, mean_df, how='right', on=['gene',groupby])

# 2nd-degree polynomial of entropy on mean, fitted per patient on ERCC rows
# only and evaluated for every gene; deviations are kept as a difference and
# as a ratio.
metrics_df = fit_poly(metrics_df, 'exp_mean', 'exp_entropy', 2, 'patient', ercc_only=True)
metrics_df['residual'] = metrics_df['exp_entropy'] - metrics_df['fit']
metrics_df['residual_ratio'] = metrics_df['exp_entropy'] / metrics_df['fit']
# drop rows whose mean is exactly zero
metrics_df = metrics_df[metrics_df.exp_mean != 0]


In [None]:
# Plot a random subset of metrics_df. The sample size is capped at the number
# of available rows (the original raised when fewer than 10000 existed) and
# the draw is seeded so the figures are reproducible.
plot_df = metrics_df.sample(n=min(10000, len(metrics_df)), random_state=0)

# entropy vs mean with the fitted trend, one panel per patient
print(ggplot(plot_df)
      +theme_bw()
      +theme(aspect_ratio = 1)
     +geom_point(aes('exp_mean', 'exp_entropy', color='patient'))
     +geom_line(aes('exp_mean', 'fit', group = 'patient'))
     +facet_wrap('~patient',scales='free')
#      +scale_x_log10() 
#      +scale_y_log10() 
     )

# residual (entropy - fit) vs mean
print(ggplot(plot_df)
      +theme_bw()
      +theme(aspect_ratio = 1)
     +geom_point(aes('exp_mean', 'residual', color='patient'))
     +facet_wrap('~patient',scales='free')
#      +scale_x_log10() 
#      +scale_y_log10() 
     )

# residual ratio (entropy / fit) vs mean
print(ggplot(plot_df)
      +theme_bw()
      +theme(aspect_ratio = 1)
     +geom_point(aes('exp_mean', 'residual_ratio', color='patient'))
     +facet_wrap('~patient',scales='free')
#      +scale_x_log10() 
#      +scale_y_log10() 
     )

In [None]:
# Wide table: one row per patient, one column per gene, holding residual_ratio.
metrics_preCorr = metrics_df.pivot(index = 'patient',
                                    columns = 'gene',
                                    values = 'residual_ratio')

# Attach each patient's age (unique patient/age pairs from obs) with a right
# join, so every patient of the pivot table is kept.
# NOTE(review): drop('index', ...) assumes reset_index() produces a column
# named 'index' -- confirm the obs index is unnamed.
metrics_preCorr = pd.merge((adata_subset2
                       .obs
                       .loc[:,['patient','age']]
                       .reset_index()
                       .drop('index', axis=1)
                       .drop_duplicates()),
                      metrics_preCorr,
                      'right',
                      'patient')

#filters
# remove one patient by id and keep only rows with age > 0
metrics_preCorr = metrics_preCorr[metrics_preCorr.patient != 'A1015LM']
metrics_preCorr = metrics_preCorr[metrics_preCorr.age > 0]
metrics_preCorr.sort_values('age')

In [None]:
# Pearson correlation between age and the residual ratio of every gene.
gene_list = []
r_list = []
pval_list = []
for col in [x for x in metrics_preCorr.columns if x not in ['patient','age']]:
    # dropna() removes patients without a value. The original compared with
    # `!= np.nan`, which is always True, so missing values were never removed.
    df_slice = metrics_preCorr.loc[:,['age', col]].dropna()
    if len(df_slice) < 2:
        # a correlation needs at least two observations
        r, pval = np.nan, np.nan
    else:
        r,pval = pearsonr(df_slice.age.values, df_slice[col].values)
    gene_list.append(col)
    r_list.append(r)
    pval_list.append(pval)

corr_df = pd.DataFrame({'gene':gene_list,
                        'r':r_list,
                        'pval':pval_list})
# undefined correlations are reported as 0
corr_df = corr_df.replace(np.nan, 0)

In [None]:
# 20 genes with the largest (most positive) correlation coefficient
corr_df.sort_values('r', ascending=False).head(20)

In [None]:
# plot expression
# Residual ratio of one gene against patient age, its expression by group and
# on the UMAP, followed by the annotation returned by symbol2field.
gene = 'TAF6'
input_adata = adata_subset2
groupby='age_bin'

print(ggplot(metrics_preCorr.loc[:,['age', gene]])
      +theme_bw()
      +geom_line(aes('age',gene))
      +labs(y='residual_ratio'))

gene2plots(input_adata, gene, groupby)
sc.pl.umap(input_adata, color=[gene, groupby], cmap='magma_r')
out = symbol2field(gene)
# Plain loop instead of a list comprehension used only for printing; entries
# without a 'summary' field are reported as 'N/A'.
for idx, x in enumerate(out):
    print(f'{idx}: ',
          x['query'],
          '\n', x.get('summary', 'N/A'))


In [None]:
def adata_residual (starting_adata, metrics_df, groupby = 'patient'):
    """Row-wise interquartile range of the ``metrics_df`` rows of each group.

    Parameters
    ----------
    starting_adata : annotated data object; ``obs[groupby]`` lists the groups.
    metrics_df : DataFrame with a ``groupby`` column.
    groupby : name of the grouping column.

    Returns
    -------
    DataFrame with columns ``'exp_iqr'``, ``groupby`` and ``'gene'`` (the row
    index of the group's slice). Empty when there are no groups.
    """
    input_adata = metrics_df

    slices = []
    for feature in tqdm.tqdm(list(set(starting_adata.obs[groupby]))):
        # The original indexed an undefined name `input_data` (NameError).
        feature_slice = input_adata[input_adata[groupby] == feature]

        # NOTE(review): the percentiles run across *all* columns of each row,
        # including the grouping column -- presumably only numeric metric
        # columns should be used; confirm before relying on this function.
        q25 = np.percentile(feature_slice.values, 25, axis = 1)
        q75 = np.percentile(feature_slice.values, 75, axis = 1)
        iqr = np.abs(q75-q25)

        slice_df = pd.DataFrame({'exp_iqr':iqr.flatten().tolist()})
        slice_df[groupby] = feature
        slice_df['gene'] = feature_slice.index.values.tolist()
        slices.append(slice_df)
    # DataFrame.append was removed in pandas 2.0; concatenate once instead.
    return pd.concat(slices) if slices else pd.DataFrame()




# fit line: mean vs iqr
# Ordinary least squares of exp_iqr on exp_mean over all rows of metrics_df;
# the fitted value and the residual (observed - fitted) are stored as columns.
# NOTE(review): metrics_df needs an 'exp_iqr' column here, and LinearRegression
# is only imported explicitly in a later cell -- confirm the intended run order.
obs_y = metrics_df.exp_iqr.values.reshape(-1, 1) # iqr
obs_x = metrics_df.exp_mean.values.reshape(-1, 1) # mean
reg = LinearRegression().fit(obs_x, 
                             obs_y)
pred_y = reg.predict(obs_x) # predicted iqr
metrics_df['fit_value'] = pred_y
res_y = obs_y-pred_y
metrics_df['residual'] = res_y


In [None]:
# Attach each patient's age to metrics_df (right join keeps every metrics row).
# NOTE: metrics_df is overwritten in place, so this cell is not idempotent --
# re-running it merges the age column again.
metrics_df = pd.merge((adata_subset2
                       .obs
                       .loc[:,['patient','age']]
                       .reset_index()
                       .drop('index', axis=1)
                       .drop_duplicates()),
                      metrics_df,
                      'right',
                      'patient')
# ten rows with the largest residual among those with a non-zero IQR
metrics_df[metrics_df.exp_iqr > 0].sort_values('residual', ascending = False).head(10)

In [None]:
# Regress out effect of median from entropy
# NOTE(review): `return_df` is not defined at notebook level in any visible
# cell, and the 'median' / 'binned_entropy' columns are not produced by the
# visible helpers -- this cell presumably depends on an earlier version of the
# analysis; confirm.
# keep non-ERCC genes only
trimmed_return_df = return_df[[not x.startswith('ERCC-') for x in return_df.gene]]
# log10 of the median; log10(0) = -inf is replaced by 0
trimmed_return_df['log10_median'] = np.log10(trimmed_return_df['median'].values.tolist())
trimmed_return_df['log10_median'] = trimmed_return_df['log10_median'].replace(-np.inf, 0)
obs_y = trimmed_return_df.binned_entropy.values.reshape(-1, 1) # entropy
obs_x = trimmed_return_df['log10_median'].values.reshape(-1, 1) # median
reg = LinearRegression().fit(obs_x, 
                             obs_y)
pred_y = reg.predict(obs_x) # predicted entropy
trimmed_return_df['fit_entropy'] = pred_y
# residual = observed - predicted entropy
res_y = obs_y-pred_y
trimmed_return_df['residual_entropy'] = res_y

In [None]:
# correlations among features
# Restricting to numeric/bool columns first keeps this working on pandas >= 2,
# where corr() no longer silently drops string columns.
trimmed_return_df.select_dtypes(include=['number', 'bool']).corr()

In [None]:
# Plot a random subset of the genes with a non-zero median. The sample size is
# capped at the number of available rows and the draw is seeded so the figures
# are reproducible.
num = 2000
nonzero_median_df = trimmed_return_df[trimmed_return_df['median'] > 0]
plot_df = nonzero_median_df.sample(min(num, len(nonzero_median_df)), random_state=0)

# residual entropy vs log10 median
print(ggplot(plot_df)
      +geom_point(aes('log10_median','residual_entropy'))
      )

# binned entropy vs log10 median with the fitted line
print(ggplot(plot_df)
      +geom_point(aes('log10_median','binned_entropy'))
      +geom_line(aes('log10_median','fit_entropy'))
     )

In [None]:
# preview the sampled plotting table
plot_df.head()

In [None]:
# Correlate every gene's per-patient feature value with patient age, for each
# of the three entropy features, and stack the results.
adult_only = False

# unique patient/age pairs; the same for every feature, so built once
patient_age = (starting_adata
               .obs
               .loc[:, ['patient', 'age']]
               .reset_index(drop=True)
               .drop_duplicates())

corr_frames = []
for feature in ['entropy', 'binned_entropy','residual_entropy']: 
    # patients as rows, genes as columns, with age joined on
    corr_df = (pd.merge(patient_age,
                        (trimmed_return_df.pivot(index = 'gene',
                                                 columns = 'patient',
                                                 values = feature)
                         .T
                         .reset_index()),
                        how='right',
                        on='patient')
               .drop('patient', axis = 1))
    if adult_only == True:
        corr_df = corr_df[corr_df.age > 0]
    # correlation of every gene column with age
    corr_df = corr_df.drop('age', axis=1).apply(lambda x: x.corr(corr_df['age']))
    corr_df = pd.DataFrame(corr_df).reset_index()
    corr_df.columns = ['gene','R']
    # undefined correlations are reported as 0
    corr_df['R'] = corr_df['R'].replace(np.nan, 0)
    corr_df['R2'] = corr_df['R'] ** 2
    corr_df['feature'] = feature
    corr_frames.append(corr_df)

# DataFrame.append was removed in pandas 2.0; concatenate once instead.
full_corr_df = pd.concat(corr_frames) if corr_frames else pd.DataFrame()


In [None]:
# Report the genes whose chosen feature correlates most strongly with age,
# together with their annotation from symbol2field.
feature = 'residual_entropy'
ascending = False
full_corr_slice = full_corr_df[full_corr_df.feature == feature].sort_values('R', ascending=ascending)

topn = 10
print(full_corr_slice.head(topn))
gene = full_corr_slice.head(topn).gene
out = symbol2field(gene)
# Plain loop instead of a list comprehension used only for printing (which
# also overwrote `out` with a list of None). Missing summaries -> 'N/A'.
for idx, x in enumerate(out):
    print(f'{idx}: ',
          x['query'],
          '\n', x.get('summary', 'N/A'))


In [None]:
# implement metric from https://www.biorxiv.org/content/10.1101/526491v1

###############################################
# Correlation between age and the absolute residual of a per-gene linear
# fit of expression on age
# Builds spear_df with, per gene: the Pearson r / p-value of |residual| vs age,
# the R2 of the expression ~ age fit, and a multiple-testing flag
###############################################
###############################################

# Specify number of genes to print from top of list sorted on R2
topn = 20

# Specify whether to use all samples or only adult samples in correlation
adults_only = False

###############################################

from sklearn.linear_model import LinearRegression

ref = 'age'
input_adata = adata_subset2
if adults_only == True:
    input_adata = input_adata[input_adata.obs.age > 0]
# cells x genes expression table with the age of every cell appended
corr_age = pd.DataFrame(input_adata.X)
corr_age.columns = input_adata.var_names
corr_age[ref] = input_adata.obs[ref].tolist()

df = corr_age.dropna()._get_numeric_data()
# age is the same predictor for every gene, so it is prepared once
obs_x = df[ref].values.reshape(-1, 1) # age
r_list = []
p_list = []
gene_list = []
exp_r2_list = []
for col in [x for x in df.columns.tolist() if x != ref]:
    obs_y = df[col].values.reshape(-1, 1) # expression
    reg = LinearRegression().fit(obs_x, 
                                 obs_y)
    pred_y = reg.predict(obs_x)
    r2 = reg.score(obs_x, obs_y, sample_weight=None)

    # correlate the size of the deviation from the fit with age
    res_y = np.abs(pred_y - obs_y).flatten()
    r, pval = stats.pearsonr(obs_x.flatten(), res_y)
    
    r_list.append(r)
    p_list.append(pval)
    gene_list.append(col)
    exp_r2_list.append(r2)

# NOTE: the table is called spear_df but holds Pearson statistics.
spear_df = pd.DataFrame({'gene':gene_list,
                        'residual_r':r_list, 
                        'pval':p_list,
                        'exp_r2':exp_r2_list})
# Replace exact zeros by the smallest positive p-value so the log is finite;
# skipped when no positive p-value exists (the original raised on min([])).
positive_pvals = spear_df.loc[spear_df['pval'] > 0, 'pval']
if len(positive_pvals) > 0:
    spear_df['pval'] = spear_df['pval'].replace(0, positive_pvals.min())
spear_df['neglog10_pval'] = -np.log10(spear_df['pval'])
# Bonferroni threshold over the number of genes tested. The original divided
# by len(df), i.e. the number of cells, which is not the number of hypotheses.
n_tests = max(len(spear_df), 1)
spear_df['pass'] = spear_df.neglog10_pval > -np.log10(0.05/n_tests)
spear_df['residual_r2'] = spear_df['residual_r'] ** 2



In [None]:
# rho metric has strong correlation to median expression value

# Scatter a random subset of genes. The sample size is capped at the number of
# available rows and the draw is seeded so the figure is reproducible.
num = 2000
print(ggplot(spear_df.sample(n=min(num, len(spear_df)), random_state=0))
      +geom_point(aes('exp_r2','residual_r2'), alpha = 0.1))

# Restricting to numeric/bool columns first keeps corr() working on
# pandas >= 2, where the string 'gene' column is no longer dropped silently.
spear_df.select_dtypes(include=['number', 'bool']).corr()


In [None]:
# 20 genes with the largest squared residual/age correlation
spear_df.sort_values('residual_r2', ascending=False).head(20)