In [1]:
# Execute the shared setup notebook, but only when this notebook is the entry
# point. NOTE(review): names used below without a local import (np, pd, plt,
# sns, Parallel, path constants, ...) presumably come from global.ipynb - confirm.
if __name__ == '__main__':
    %run ../../global.ipynb

In [2]:
from modules.utils import scale_df
from modules.drug_suggestion import subset_to_reasonable_drugs, simplify_syn_index
# Three subgroups plus the p53-mutant SHH subtype, which is looked up in its own label map.
subgroups = ['G3', 'G4', 'SHH', 'p53 SHH']

# Subgroup specificity of scores from expression and screen

The idea here is to compute (mean score for the subgroup) - (mean score for other PDXs) and compare this to a null distribution generated by repeatedly shuffling the subgroup labels and recomputing the difference.

This function can consume a lot of memory; I had to close some other programs to make space. If necessary, it can be written more efficiently by keeping a running count of exceedances rather than appending every permuted result and counting at the end.

I could also do e.g. pairwise comparisons in addition to subgroup-vs-rest, if we had reason to.

In [None]:
def compute_subgroup_meandif_pvals(lodf, subgroup, n_perm=10000):
    """Permutation p-values for subgroup-vs-rest differences in mean score.

    For every column (drug) of ``lodf`` the observed statistic is
    (mean over the subgroup's PDX rows) - (mean over all remaining rows),
    ignoring NaNs. The null distribution is built by shuffling the row labels
    ``n_perm`` times and recomputing the statistic. The p-value is the fraction
    of shuffles whose statistic is >= the observed one (one-sided).

    Parameters
    ----------
    lodf : pandas.DataFrame
        Scores with PDX identifiers as the index and drugs as the columns.
    subgroup : str
        Subgroup label. 'p53 SHH' is looked up in PDX2SUBGROUP_W_SHH_P53,
        every other label in PDX2SUBGROUP.
    n_perm : int, default 10000
        Number of label permutations.

    Returns
    -------
    pandas.Series
        p-values indexed by drug, named ``subgroup`` and sorted ascending.
        Drugs whose observed difference is undefined (e.g. no score at all in
        the subgroup) get NaN instead of a spuriously significant 0.

    Raises
    ------
    ValueError
        If ``n_perm`` is smaller than 1.
    KeyError
        If a PDX of the subgroup is missing from ``lodf.index``.
    """
    if n_perm < 1:
        raise ValueError('n_perm must be a positive integer')

    label_map = PDX2SUBGROUP_W_SHH_P53 if subgroup == 'p53 SHH' else PDX2SUBGROUP
    pdxs = [pdx for pdx in alpha_twenty_samples if label_map[pdx] == subgroup]
    missing = [pdx for pdx in pdxs if pdx not in lodf.index]
    if missing:
        raise KeyError('PDXs not found in the score table: {}'.format(missing))

    # Work on plain arrays: one float copy up front instead of one DataFrame
    # copy per permutation.
    scores = lodf.values.astype(float)
    present = ~np.isnan(scores)
    filled = np.where(present, scores, 0.0)
    labels = lodf.index.values

    def _mean_difference(in_group):
        """NaN-skipping (group mean - rest mean) per column for a boolean row mask."""
        out_group = ~in_group
        # 0/0 (no valid score on one side) is meant to yield NaN, so silence the warning.
        with np.errstate(invalid='ignore', divide='ignore'):
            group_mean = filled[in_group].sum(axis=0) / present[in_group].sum(axis=0)
            rest_mean = filled[out_group].sum(axis=0) / present[out_group].sum(axis=0)
        return group_mean - rest_mean

    observed = _mean_difference(np.isin(labels, pdxs))

    # Running count of permutations that reach the observed difference; nothing
    # but this vector is kept between iterations.
    n_exceed = np.zeros(scores.shape[1], dtype=np.int64)
    seeds = np.random.RandomState(RANDOM_SEED).randint(MAX_SEED, size=n_perm)
    for seed in seeds:
        # Row i of the shuffled table carries label shuffled[i]; the permuted
        # subgroup is every row whose new label belongs to the subgroup.
        shuffled = np.random.RandomState(seed).permutation(labels)
        permuted = _mean_difference(np.isin(shuffled, pdxs))
        with np.errstate(invalid='ignore'):
            n_exceed += permuted >= observed  # comparisons against NaN count as "not reached"

    # NOTE: count / n_perm can be exactly 0; a (count + 1) / (n_perm + 1)
    # correction would avoid that but would shift every downstream threshold.
    fractions = n_exceed / float(n_perm)
    pvals = pd.Series(np.where(np.isnan(observed), np.nan, fractions),
                      index=lodf.columns, name=subgroup)
    return pvals.sort_values()

In [None]:
# Load the precomputed score, p-value, viability and hit tables. The first CSV
# column becomes the index, which the cells below use as the PDX identifier.
exp_log_odds_df = pd.read_csv(COMBINED_EXP_LOG_ODDS_FILE, index_col=0)
exp_pvals_df = pd.read_csv(COMBINED_EXP_LOG_ODDS_PVALS_FILE, index_col=0)
screen_log_odds_df = pd.read_csv(SCREEN_LOG_ODDS_FILE, index_col=0)
screen_viab_df = pd.read_csv(SCREEN_VIAB_BY_EXP_SYNONYM_FILE, index_col=0)
screen_hits_df = pd.read_csv(SCREEN_HITS_BY_EXP_SYNONYM_FILE, index_col=0)

In [None]:
# Number of label permutations passed to every compute_subgroup_meandif_pvals call below.
n_perm = 10000

In [None]:
# Permutation test on the expression scores: one task per subgroup.
exp_subgroup_meandif_pvals = Parallel(n_jobs=os.cpu_count())(
    delayed(compute_subgroup_meandif_pvals)(exp_log_odds_df, subgroup, n_perm)
    for subgroup in subgroups
)

# Put the per-subgroup p-value Series side by side, then transpose so that
# subgroups are rows and drugs are columns.
exp_subgroup_meandif_pvals_df = pd.concat(exp_subgroup_meandif_pvals, axis=1).T
exp_subgroup_meandif_pvals_df.head()

In [None]:
# Permutation test on the screen scores: one task per subgroup.
screen_subgroup_meandif_pvals = Parallel(n_jobs=os.cpu_count())(
    delayed(compute_subgroup_meandif_pvals)(screen_log_odds_df, subgroup, n_perm)
    for subgroup in subgroups
)

# Put the per-subgroup p-value Series side by side, then transpose so that
# subgroups are rows and drugs are columns.
screen_subgroup_meandif_pvals_df = pd.concat(screen_subgroup_meandif_pvals, axis=1).T
screen_subgroup_meandif_pvals_df.head()

In [None]:
def add_sample_column(df):
    """Return a copy of ``df`` with its index duplicated into a 'sample' column.

    The input frame is left untouched.
    """
    return df.assign(sample=df.index.copy())

In [None]:
# Stack the expression and screen score tables, tagging every row with its
# sample, then add the two scores per sample and drug.
log_odds_conc_df = pd.concat(
    [add_sample_column(exp_log_odds_df), add_sample_column(screen_log_odds_df)],
    axis=0,
)
combined_log_odds_df = log_odds_conc_df.groupby('sample').sum().fillna(0)

In [None]:
# Permutation test on the combined (expression + screen) scores: one task per subgroup.
combo_subgroup_meandif_pvals = Parallel(n_jobs=os.cpu_count())(
    delayed(compute_subgroup_meandif_pvals)(combined_log_odds_df, subgroup, n_perm)
    for subgroup in subgroups
)

In [None]:
# Put the per-subgroup p-value Series side by side, then transpose so that
# subgroups are rows and drugs are columns.
combo_subgroup_meandif_pvals_df = pd.concat(combo_subgroup_meandif_pvals, axis=1).T
combo_subgroup_meandif_pvals_df.head()

In [None]:
# The 20 drugs with the smallest combined-score p-values for G3.
combo_subgroup_meandif_pvals_df.loc['G3'].sort_values().head(20)

In [None]:
# Map each drug label to the integer IDs found among its ' /// '-separated
# synonyms. Drugs without any all-digit synonym get no entry.
exp_screen_drugs_to_cids = defaultdict(set)
for drug in combo_subgroup_meandif_pvals_df.columns:
    numeric_ids = {int(syn) for syn in drug.split(' /// ') if syn.isdigit()}
    if numeric_ids:
        exp_screen_drugs_to_cids[drug] |= numeric_ids

In [None]:
# The filtered/annotated p-value table does not depend on the subgroup, so it
# is built once instead of once per loop iteration.
rdrugs_df = subset_to_reasonable_drugs(exp_subgroup_meandif_pvals_df, exp_screen_drugs_to_cids, DRUG_ANNOTATION_DIR, annot=True)

# For each subgroup keep up to 10 drugs with p < 0.05, most significant first.
sg_disc_drugs = []
drug_block_ends = []  # cumulative number of heatmap rows after each subgroup's drugs
for subgroup in 'G3 G4 SHH'.split():
    mdps = rdrugs_df.loc[:, subgroup]
    sig = mdps[mdps < 0.05].sort_values()
    sg_disc_drugs.extend(sig.head(10).index.tolist())
    drug_block_ends.append(len(sg_disc_drugs))

toheat = exp_log_odds_df.loc[:, sg_disc_drugs]
toheat.index.name = 'PDX'
toheat.columns = simplify_syn_index(toheat.columns)
hm = sns.heatmap(scale_df(toheat.loc[G3_SAMPLES + G4_SAMPLES + SHH_SAMPLES]).T, cbar_kws={'label': 'score, row-normalized'}, square=True)

# Separators between the subgroup blocks. Their positions are derived from the
# data so they stay correct when a subgroup has fewer than 10 significant drugs
# or the number of samples per subgroup changes.
for row_end in drug_block_ends[:-1]:
    hm.axhline(row_end, c='k')
for col_end in (len(G3_SAMPLES), len(G3_SAMPLES) + len(G4_SAMPLES)):
    hm.axvline(col_end, c='k')

# Colour every PDX tick label by the subgroup of that PDX.
for xt in hm.xaxis.get_ticklabels():
    xt.set_color(official_subtype_colors[PDX2SUBGROUP[xt.get_text()]])
plt.title('Subgroup-specific drugs based on expression');
plt.savefig(COMBINED_EXP_SUBGROUP_SPECIFIC_DRUGS_HEATMAP_FILE, bbox_inches='tight', dpi=600)

# Hits per PDX and subgroup based on expression

In [None]:
# Re-read the expression p-values (the same file was already loaded above) and
# call a drug a "hit" for a PDX when its p-value is below 0.05.
exp_pvals_df = pd.read_csv(COMBINED_EXP_LOG_ODDS_PVALS_FILE, index_col=0)
exp_hits_df = exp_pvals_df < 0.05

In [None]:
# Number of PDXs in which each drug is a hit, and the drugs with at least one
# hit (ordered from most to fewest hits).
nhits_per_drug = exp_hits_df.sum(axis=0)
ge1_drugs = nhits_per_drug[nhits_per_drug > 0].sort_values(ascending=False).index
len(ge1_drugs)

The mock Venn diagram in the next cell is drawn only to extract the face colors that the matplotlib_venn package uses, because we like them and want to reuse them below.

In [None]:
# Three overlapping dummy sets, drawn only so that the patch colours can be read back.
ss = [range(10), range(4, 14), range(8, 18)]
venn_for_bar = venn3_unweighted([set(s) for s in ss], '1 2 3'.split(), set_colors='r g y'.split())
rgy_colors = [patch.get_facecolor() for patch in venn_for_bar.patches]
# NOTE(review): patches 0, 1 and 3 are presumably the three single-set regions - confirm
# against matplotlib_venn. `y` is reused as a plain variable name by later cells.
r, g, y = np.array(rgy_colors)[[0, 1, 3]]
subgroup_to_rgy = dict(zip('SHH G4 G3'.split(), [r, g, y]))

# Copy of the official subtype colours without the 'WNT' entry (KeyError if it is absent).
colors = dict(official_subtype_colors)
del colors['WNT']

## hits per PDX

In [None]:
# Number of hit drugs per PDX, largest first.
nhits_per_pdx = exp_hits_df.sum(axis=1).sort_values(ascending=False)

In [None]:
# Horizontal bar graph: one bar per PDX, coloured by the subgroup of that PDX.
ax2 = plt.gca()
y = range(len(nhits_per_pdx))
ax2.barh(
    y,
    nhits_per_pdx.values,
    color=[subgroup_to_rgy[PDX2SUBGROUP[s]] for s in nhits_per_pdx.index],
)
ax2.set_yticks(y)
ax2.set_yticklabels(nhits_per_pdx.index, rotation=0, fontsize=11)
ax2.set_xlabel('# effective drugs', fontsize=16)

# One legend patch per subgroup, placed to the right of the axes.
legend_handles = [
    mpl_patches.Patch(color=subgroup_to_rgy[subtype], label=subtype)
    for subtype in 'G4 SHH G3'.split()
]
ax2.legend(handles=legend_handles, prop={'size': 16}, loc=(1, 0.35))
plt.savefig(COMBINED_EXP_N_HITS_PER_PDX_BARGRAPH_FILE, bbox_inches='tight', dpi=600)

### Annotate drugs effective in at least one PDX

In [None]:
# Annotation table for every drug that is a hit in at least one PDX: simplified
# name as index, plus mechanism of action, targets and the full synonym string.
our_drug_to_moa, our_drug_to_target = create_drug_annot_dicts(ge1_drugs)
passed_ge1_pdx_sorted = sorted(ge1_drugs)
simple_index = simplify_syn_index(passed_ge1_pdx_sorted)
passed_ge1_pdx_annot_df = pd.DataFrame(index=simple_index, columns='moa targets synonyms'.split())
passed_ge1_pdx_annot_df['synonyms'] = passed_ge1_pdx_sorted
# Drugs without an annotation are left as NaN.
passed_ge1_pdx_annot_df['moa'] = [our_drug_to_moa.get(d, np.nan) for d in passed_ge1_pdx_sorted]
passed_ge1_pdx_annot_df['targets'] = [our_drug_to_target.get(d, np.nan) for d in passed_ge1_pdx_sorted]
passed_ge1_pdx_annot_df.index.name = 'drug_name'
passed_ge1_pdx_annot_df.to_csv(COMBINED_EXP_HITS_PER_PDX_UNION_AUTO_ANNOT_FILE)

In [None]:
# How many of these drugs received a mechanism-of-action annotation.
passed_ge1_pdx_annot_df.moa.dropna().shape

## Hits per subgroup

In [None]:
# Per subgroup, the drugs that are hits in enough of its PDXs: the hit count
# must reach the 95th percentile of a Binomial(n_pdxs, 0.5) distribution.
sg2passed = {}
passed_ge1_subgroup = set()
for subgroup in subgroups:
    if subgroup == 'p53 SHH':
        continue  # only the three main subgroups are considered here
    pdxs = [pdx for pdx in alpha_twenty_samples if PDX2SUBGROUP[pdx] == subgroup]
    min_hits = binom(len(pdxs), 0.5).ppf(0.95)
    subdf = exp_hits_df.loc[pdxs]
    sums = subdf.sum(axis=0)
    passed = sums[sums >= min_hits].index
    sg2passed[subgroup] = list(passed)
    passed_ge1_subgroup.update(passed)

### Venn diagram: effective drugs in each subtype

In [None]:
# Build the labels AND the sets from the same key order. Taking the sets from
# sg2passed.values() (insertion order) while the labels follow
# SUBGROUPS_BY_SIZE_DESCENDING would attach labels to the wrong circles
# whenever the two orders differ.
venn_labels = ['{} ({} drugs)'.format(key, len(sg2passed[key])) for key in SUBGROUPS_BY_SIZE_DESCENDING]
venn_sets = [set(sg2passed[key]) for key in SUBGROUPS_BY_SIZE_DESCENDING]
venn = venn3_unweighted(venn_sets, venn_labels)
# Enlarge every label; entries can be None for regions that are not drawn.
for text in list(venn.set_labels) + list(venn.subset_labels):
    if text is not None:
        text.set_fontsize(16)
plt.savefig(COMBINED_EXP_HITS_PER_SUBGROUP_VENN_FILE, bbox_inches='tight', dpi=600)

### Annotate drugs effective in at least one subgroup

In [None]:
# Annotation table for every drug that passed in at least one subgroup:
# simplified name as index, plus mechanism of action, targets and synonyms.
our_drug_to_moa, our_drug_to_target = create_drug_annot_dicts(passed_ge1_subgroup)
passed_ge1_subgroup_sorted = sorted(passed_ge1_subgroup)
simple_index = simplify_syn_index(passed_ge1_subgroup_sorted)
passed_ge1_subgroup_annot_df = pd.DataFrame(index=simple_index, columns='moa targets synonyms'.split())
passed_ge1_subgroup_annot_df['synonyms'] = passed_ge1_subgroup_sorted
# Drugs without an annotation are left as NaN.
passed_ge1_subgroup_annot_df['moa'] = [our_drug_to_moa.get(d, np.nan) for d in passed_ge1_subgroup_sorted]
passed_ge1_subgroup_annot_df['targets'] = [our_drug_to_target.get(d, np.nan) for d in passed_ge1_subgroup_sorted]
passed_ge1_subgroup_annot_df.index.name = 'drug_name'
passed_ge1_subgroup_annot_df.to_csv(COMBINED_EXP_HITS_PER_SUBGROUP_UNION_AUTO_ANNOT_FILE)

In [None]:
# How many of these drugs received a mechanism-of-action annotation.
passed_ge1_subgroup_annot_df.moa.dropna().shape

# G3 and p53 SHH candidate drugs
Note: the code above has changed somewhat since I last ran this section, so it may need some minor refactoring.

- avg combined score in subgroup/type
- avg exp
- avg screen
- whether sig in sg by screen
- whether sig in sg by exp
- whether clin rel
- whether sig better in this sg vs others by screen
- whether sig better in this sg vs others by exp
- whether sig better combined?
- mean diff by exp
- mean diff by screen
- mean diff combined

In [None]:
# Inspect the column labels (drug identifiers) of the expression score table.
exp_log_odds_df.columns

In [None]:
# Display the value of the CMAP2CID_FILE constant.
CMAP2CID_FILE

In [None]:
sg_candidate_dfs = []
# rows = sorted(set().union(exp_log_odds_df.columns, pdx_dr_syn2.columns)) #2018-11-13
# NOTE: pdx_dr_syn2 is not defined anywhere in this notebook, so the candidate
# rows currently come from the expression-scored drugs only.
rows = sorted(set().union(exp_log_odds_df.columns))

# Close the file deterministically instead of leaking the handle.
with open('clue_moa2cids.json', 'r') as clue_file:
    clue_moa2cids = json.load(clue_file)

# Invert moa -> [cid, ...] into cid -> moa (a later moa wins for a repeated cid).
clue_cid2moa = {}
for moa, cids in clue_moa2cids.items():
    for cid in cids:
        clue_cid2moa[cid] = moa

# Attach a CLUE moa to every drug that has a numeric synonym known to CLUE.
mydrug2cluemoa = {}
for drug in rows:
    syns = drug.split(' /// ')
    for syn1 in syns:
        if syn1.isdigit():
            syn_int = int(syn1)
            if syn_int in clue_cid2moa:
                mydrug2cluemoa[drug] = clue_cid2moa[syn_int]
columns = ['clinically_relevant',
           'rdrug_moa',
           'clue_moa',
           'mean_combined_score',
           'hit_exp',
           'mean_exp_score',
           'hit_screen',
           'mean_screen_score',
           'mean_screen_viab',
           'better_in_subgroup_pval_combo',
           'better_in_subgroup_pval_exp',
           'better_in_subgroup_pval_screen',
           'synonyms'
          ]

for subgroup, pdxs in zip(['G3', 'p53 SHH'], [G3_SAMPLES, SHH_P53_SAMPLES]):
    print(subgroup)
    sg_candidate_df = pd.DataFrame(index=rows, columns=columns)

    # Clinical relevance and the moa reported by subset_to_reasonable_drugs.
    rdrugs_df = subset_to_reasonable_drugs(sg_candidate_df.T, exp_screen_drugs_to_cids, DRUG_ANNOTATION_DIR, annot=True)
    rdrugs = [d for d in rdrugs_df.index if d in sg_candidate_df.index]
    sg_candidate_df['clinically_relevant'] = 0
    sg_candidate_df.loc[rdrugs, 'clinically_relevant'] = 1
    # Single .loc call: chained `df[col].loc[rows] = ...` may write to a temporary copy.
    sg_candidate_df.loc[rdrugs, 'rdrug_moa'] = rdrugs_df.loc[rdrugs, 'moa']
    sg_candidate_df['clue_moa'] = [mydrug2cluemoa.get(drug, 'N/A') for drug in rows]

    # A drug is a subgroup hit when its hit count over the subgroup's PDXs reaches
    # the 0.935 quantile of Binomial(n_pdxs, 0.5).
    # NOTE(review): the "Hits per subgroup" section uses 0.95 - confirm 0.935 is intended.
    hit_threshold = binom(len(pdxs), 0.5).ppf(0.935)
    exp_hits_sums = exp_hits_df.loc[pdxs].sum(axis=0)
    exp_passed = exp_hits_sums[exp_hits_sums >= hit_threshold].index.tolist()
    screen_hits_sums = screen_hits_df.loc[pdxs].sum(axis=0)
    screen_passed = screen_hits_sums[screen_hits_sums >= hit_threshold].index.tolist()
    # Membership test per drug. The previous `if screen_passed in index.values`
    # compared a whole list against the array and never flagged hits reliably.
    sg_candidate_df['hit_exp'] = sg_candidate_df.index.isin(exp_passed).astype('int64')
    sg_candidate_df['hit_screen'] = sg_candidate_df.index.isin(screen_passed).astype('int64')

    # reindex (rather than .loc) leaves NaN for drugs absent from a table
    # instead of raising KeyError.
    sg_candidate_df['mean_exp_score'] = exp_log_odds_df.loc[pdxs].mean(axis=0).reindex(sg_candidate_df.index)
    sg_candidate_df['mean_screen_score'] = screen_log_odds_df.loc[pdxs].mean(axis=0).reindex(sg_candidate_df.index)
    sg_candidate_df['mean_screen_viab'] = screen_viab_df.loc[pdxs].mean(axis=0).reindex(sg_candidate_df.index)
    sg_candidate_df['mean_combined_score'] = sg_candidate_df.mean_exp_score.fillna(0) + sg_candidate_df.mean_screen_score.fillna(0)
    sg_candidate_df['synonyms'] = sg_candidate_df.index
    sg_candidate_df['better_in_subgroup_pval_combo'] = combo_subgroup_meandif_pvals_df.loc[subgroup].reindex(sg_candidate_df.index)
    sg_candidate_df['better_in_subgroup_pval_exp'] = exp_subgroup_meandif_pvals_df.loc[subgroup].reindex(sg_candidate_df.index)
    sg_candidate_df['better_in_subgroup_pval_screen'] = screen_subgroup_meandif_pvals_df.loc[subgroup].reindex(sg_candidate_df.index)

    # Switch to the simplified drug names only after all label-based lookups are done.
    sg_candidate_df.index = simplify_syn_index(sg_candidate_df.index)
    sg_candidate_df.index.name = 'drug_name'
    sg_candidate_df = sg_candidate_df.sort_values(by='mean_combined_score', ascending=False)
    sg_candidate_dfs.append(sg_candidate_df)

In [None]:
!pip install cuzcatlan

In [None]:
import cuzcatlan as cusca

# Compare mean screen score with mean screen viability for the last subgroup
# table built above. A drug is dropped when EITHER value is exactly 0, using
# one shared mask: filtering x and y separately can leave them with different
# drugs, which misaligns the paired observations.
keep = ((sg_candidate_df.mean_screen_score != 0) & (sg_candidate_df.mean_screen_viab != 0)).values
x = sg_candidate_df.mean_screen_score[keep].copy()
y = sg_candidate_df.mean_screen_viab[keep].copy()
cusca.compute_information_coefficient(x, y)

In [None]:
# Scatter plot of the x and y series prepared in the previous cell.
plt.scatter(x, y)

In [None]:
# Display both candidate tables (G3 first, then p53 SHH) in full.
sg_candidate_dfs

In [None]:
# Write the two candidate tables to an Excel file in the current directory,
# passing 'G3' and 'p53 SHH' as their names.
# NOTE(review): presumably one sheet per table - confirm against save_xls.
from modules.utils import save_xls
save_xls(sg_candidate_dfs, ['G3', 'p53 SHH'], './TEMP_new_attempt_g3_p53shh_candidate_drugs.xlsx')

In [None]:
# Clinically relevant drugs in the last table built by the loop above (p53 SHH).
sg_candidate_df[(sg_candidate_df.clinically_relevant == 1)]

In [None]:
# Display the full subgroup x drug table of expression-based permutation p-values.
exp_subgroup_meandif_pvals_df