In [1]:
import os,sys
import pandas as pd
import numpy as np
import scanpy as sc
from statsmodels.stats.multitest import multipletests

import matplotlib.pyplot as plt
import seaborn as sns

from sc_target_evidence_utils import association_utils, sc_evidence_utils


In [2]:
## r2py setup
import rpy2.rinterface_lib.callbacks
import logging
rpy2.rinterface_lib.callbacks.logger.setLevel(logging.ERROR)

In [3]:
%load_ext rpy2.ipython

In [4]:
%%R
library(tidyverse)
library(patchwork)

remove_x_axis <- function(){
  theme(axis.text.x = element_blank(), axis.ticks.x = element_blank(), axis.title.x = element_blank())  
}

remove_y_axis <- function(){
  theme(axis.text.y = element_blank(), axis.ticks.y = element_blank(), axis.title.y = element_blank())  
}

[0;1;31mSystem has not been booted with systemd as init system (PID 1). Can't operate.[0m
[0;1;31mFailed to create bus connection: Host is down[0m


In [5]:
%%R

### Plotting utils
evidence_labels <- c(
    "all_sc_evidence" = "cell type & disease cell specific", 
    "bulk_disease_evidence" = "DE in disease (tissue)", 
    "disease_ct_evidence" = "DE in disease (cell type)", 
    "disease_evidence" = "Disease cell specific", 
    "ct_marker_evidence" = "Cell type specific", 
    "has_genetic_support" = "Genetic association"
    )

universe_labels <- c(
    'protein_coding_targets'= 'protein-coding targets',
    'sm_tractable_targets' = 'SM tractable targets',
    'ab_tractable_targets' = "Ab tractable targets",
    'known_drug_targets' = 'known drug targets\n(reached phase I)',
    'nuclear_receptors' = 'nuclear receptors', 
    'catalytic_receptors' = 'catalytic receptors', 
    'rhodop_gpcr' = 'rhodopsin-like\nGPCRs', 
    'transporters' = 'transporters', 
    'kinases' = 'kinases', 
    'enzymes' = 'enzymes', 
    'ion_channels' = 'ion channels'
    )


# Plot odds-ratio (no faceting)
plot_OR <- function(
    results_all_df, 
    pval_alpha=0.05, 
    base_font_size=20,
    text_position='right',
    y_value = 'clinical_status',
    evidence_levels = c('ct_marker_evidence', 'disease_evidence', 'disease_ct_evidence', 'bulk_disease_evidence', 'has_genetic_support', 'all_sc_evidence', 'is_hvg_normal', 'is_hvg_disease'),
    clinical_status_levels = c(rev(c('druggable', 'safe', 'effective', 'approved'))),
    hide_zeros = TRUE
){
    pl_df <- results_all_df %>%
        mutate(clinical_status = factor(str_remove(clinical_status, 'is_'), levels=clinical_status_levels)) %>%
        mutate(evidence = factor(evidence, levels=evidence_levels)) %>%
        mutate(is_signif = pval < pval_alpha) %>%
        rename(setNames(y_value,'y_val')) 
    
    pl <- ggplot(pl_df, aes(y=y_val, x=odds_ratio, color=is_signif)) 
    
    if (isTRUE(hide_zeros)){
        pl <- pl +
            geom_point(
                data = . %>% filter(n_supported_approved > 0),
                size=3
            ) +
            geom_pointrange(data = . %>% filter(n_supported_approved > 0),
                            aes(xmin=ci_low, xmax=ci_high))
    } else {
        pl <- pl +
            geom_point(size=3) +
            geom_pointrange(aes(xmin=ci_low, xmax=ci_high))}
        
    pl <- pl + geom_text(aes(label = paste0(n_supported_approved,' / ', n_success)), 
                      x = ifelse(text_position == 'right', Inf, -Inf) , 
                      hjust= ifelse(text_position == 'right', 1.1, -1) , 
                      size=5,
                      color='black',) +
        geom_vline(xintercept=1, linetype=2) +
        scale_x_log10(labels = scales::label_number()) +
        xlab('Odds Ratio') +
        ylab(y_value) +
        theme_classic(base_size=base_font_size) +
        scale_color_manual(values=c('TRUE' = 'red', 'FALSE'='grey50'), 
                           name=paste0("Significant enrichment\n(Fisher's test p-value < ", round(pval_alpha, digits=2), ')')) +
        theme(strip.text.y=element_text(angle=0), strip.background = element_rect(color=NA, fill = "grey"))    
    pl
}

In [6]:
figdir = '/home/jovyan/mount/gdrive/sc_targetID/plots/association_results_nelson/'
if not os.path.exists(figdir):
    os.mkdir(figdir)

### Save sc evidence tables

In [13]:
def get_de_markers(t, lfc_thresh=5.0, signif_thresh=0.01, features='targets'):
    de_res = pd.read_csv(f'../data/DE_celltype_{t.replace("_","-")}.{features}.csv')
    
    targets_df = pd.DataFrame(de_res.gene_id.unique(), columns=['gene_id'])
    targets_df['disease_relevant_tissue'] = t
    targets_df['is_hvg_normal'] = 1
    top_genes = de_res[(de_res.lfc > lfc_thresh) & (de_res.adj_pval < signif_thresh)].gene_id.unique().tolist()
    targets_df['ct_marker_evidence'] = targets_df.gene_id.isin(top_genes).astype(int)
    return(targets_df)

#### DE analysis

HVG test

In [14]:
ct_targets_df = pd.DataFrame()
for t in all_tissue_ids:
    try:
        targets_df = get_de_markers(t, features='hvgs', lfc_thresh=5.0, signif_thresh=0.01)
        ct_targets_df = pd.concat([ct_targets_df, targets_df])
    except:
        print(f'Skipping {t}')
        pass
ct_targets_wide_df = ct_targets_df.pivot(
            index='gene_id', 
            columns='disease_relevant_tissue', 
            values='ct_marker_evidence'
        )
ct_targets_wide_df.columns = ['ct_marker_evidence_'+x for x in ct_targets_wide_df.columns]
assert ct_targets_wide_df.index.is_unique
assert all(ct_targets_wide_df.sum() < 5000)
ct_targets_wide_df.to_csv('../data/ct_marker_evidence.de.hvgs.csv')

Skipping skin
Skipping kidney
Skipping prostate


In [15]:
ct_targets_df = pd.DataFrame()
for t in all_tissue_ids:
    try:
        targets_df = get_de_markers(t, features='targets', lfc_thresh=5.0, signif_thresh=0.01)
        ct_targets_df = pd.concat([ct_targets_df, targets_df])
    except:
        print(f'Skipping {t}')
        pass
ct_targets_wide_df = ct_targets_df.pivot(
            index='gene_id', 
            columns='disease_relevant_tissue', 
            values='ct_marker_evidence'
        )
ct_targets_wide_df.columns = ['ct_marker_evidence_'+x for x in ct_targets_wide_df.columns]
assert ct_targets_wide_df.index.is_unique
assert all(ct_targets_wide_df.sum() < 5000)
ct_targets_wide_df.to_csv('../data/ct_marker_evidence.de.targets.csv')

Skipping skin
Skipping kidney
Skipping prostate


### Merge with clinical success evidence

In [8]:
def _get_ti_pair_tissue_evidence(ti_pair):
        dtr_evidence = ti_pair[['ct_marker_evidence_' + x.replace(' ', '_') for x in ti_pair['disease_relevant_tissues']]]
        if dtr_evidence.isna().all(): # Keep only HVGs
            return(np.nan)
        else:
            if dtr_evidence.sum() > 0:
                return 1
            else:
                return 0
    
def merge_evidence_success(
    ct_targets_wide_df, 
    nelson_targets_df, 
    ev='ct_marker_evidence',
    keep_nelson_cols = ['ti_uid', 'indication_mesh_term', 'gene_id', 'gene_name','combined_max_phase','target_status', 'disease_relevant_tissues'],
    how = 'left'
):
    all_scrnaseq_tissues = ct_targets_wide_df.set_index('gene_id').columns.str.replace(f'{ev}_', '').tolist()
    dtr_cols = nelson_targets_df.columns[nelson_targets_df.columns.str.startswith("dtr_")].tolist()
    
    tissues_all_df = nelson_targets_df[keep_nelson_cols + success_cols + dtr_cols]
    ti_no = tissues_all_df.shape[0]
    tissues_all_df = pd.merge(tissues_all_df, ct_targets_wide_df, how=how)
    
    tissues_all_df['disease_relevant_tissues'] = tissues_all_df['disease_relevant_tissues'].str.split(",")
    tissues_all_df['disease_relevant_tissues'] = [np.intersect1d(x, all_scrnaseq_tissues).tolist() for x in tissues_all_df['disease_relevant_tissues']]
    if how == 'left':
        # Keep T-I pairs with at least one tissue measured with scRNA-seq
        tissues_all_df = tissues_all_df[[len(x)>0 for x in tissues_all_df['disease_relevant_tissues']]].copy()
        tissues_all_df[ev] = tissues_all_df.apply(_get_ti_pair_tissue_evidence, axis=1)
        tissues_all_df.drop(ct_targets_wide_df.set_index('gene_id').columns, axis=1, inplace=True)
        assert tissues_all_df.ti_uid.is_unique
        assert not any(tissues_all_df.combined_max_phase.isna())
    else:
        assert ct_targets_wide_df.shape[1] == 2
        tissues_all_df[ev] = tissues_all_df[ct_targets_wide_df.set_index('gene_id').columns[0]]
        tissues_all_df.drop(ct_targets_wide_df.set_index('gene_id').columns, axis=1, inplace=True)
        assert tissues_all_df.gene_id.nunique() <= 5000
#     tissues_all_df['disease_relevant_tissues'] = [x[0] for x in tissues_all_df['disease_relevant_tissues']]
    return(tissues_all_df)

## Cell type specificity analysis

Read Minikel et al. table

In [12]:
success_cols = ['succ_p_1', 'succ_1_2', 'succ_2_3', 'succ_3_a']
nelson_targets_df = pd.read_csv('../data/filtered_nelson_disease_relevant_tissues_07032024.clean.csv', index_col=0)
dtr_cols = nelson_targets_df.columns[nelson_targets_df.columns.str.startswith("dtr_")].tolist()
nelson_targets_df.drop(dtr_cols + ['disease_relevant_tissues'], axis=1, inplace=True)

## Add disease-relevant tissue annotation
dtr_annotation = pd.read_csv('../data/tissue_indication_matching_manual_review_ET07252024.csv')[['manual_tissue_annotation', 'indication_mesh_term']]
dtr_annotation.columns = ['disease_relevant_tissues','indication_mesh_term']
dtr_annotation['disease_relevant_tissues'] = np.where(dtr_annotation.disease_relevant_tissues == 'eye (retina)', 'eye', dtr_annotation.disease_relevant_tissues.str.replace(' ', '_'))
dtr_annotation = dtr_annotation[[',' not in x for x in dtr_annotation.disease_relevant_tissues]].copy() # exclude multi-tissue conditions
nelson_targets_df = pd.merge(nelson_targets_df, dtr_annotation)
nelson_targets_df = nelson_targets_df[nelson_targets_df['disease_relevant_tissues'] != 'none'].copy() # Keep diseases with annotated DTR
dtr_dummies = pd.get_dummies(nelson_targets_df['disease_relevant_tissues'])
dtr_dummies.columns = [f'dtr_{x}' for x in dtr_dummies.columns]
nelson_targets_df = pd.concat([nelson_targets_df, dtr_dummies], axis=1)

## Annotate genetic evidence (indirect, as Minikel et al.)
nelson_targets_df['genetic_evidence_minikel'] = (nelson_targets_df['target_status'] == 'genetically supported target').astype(int)

In [13]:
nelson_targets_df.to_csv('../data/filtered_nelson_disease_relevant_tissues_07032024.clean.drt_annotation.csv')

Preclinical HVGs

In [11]:
success_cols = ['succ_p_1', 'succ_1_2', 'succ_2_3', 'succ_3_a']
evidence_table = pd.read_csv('../data/ct_marker_evidence.de.hvgs.csv')
nelson_targets_df = pd.read_csv('../data/filtered_nelson_disease_relevant_tissues_07032024.clean.drt_annotation.csv')
keep_nelson_cols = ['ti_uid', 'indication_mesh_term', 'gene_id', 'gene_name','combined_max_phase','target_status', 'disease_relevant_tissues', 'genetic_evidence_minikel']
merged_table = merge_evidence_success(evidence_table, nelson_targets_df, keep_nelson_cols=keep_nelson_cols)
## Keep only tested HVGs
merged_table = merged_table[~merged_table.ct_marker_evidence.isna()].copy()

All HVGs per disease

In [12]:
merged_table_hvgs = pd.DataFrame()
for ev_t in evidence_table.columns.drop('gene_id'):
    t = ev_t.replace("ct_marker_evidence_", '')
    tissue_targets_df = nelson_targets_df[nelson_targets_df['disease_relevant_tissues'] == t].copy()
    tissue_indications = tissue_targets_df.indication_mesh_term.unique()
    for i in tissue_indications: # Test all HVGs for each indication
        merged_table_tissue_i = merge_evidence_success(
            evidence_table[['gene_id', ev_t]].dropna(), 
            tissue_targets_df[tissue_targets_df.indication_mesh_term == i], 
            keep_nelson_cols=keep_nelson_cols, how='right')
        merged_table_tissue_i['genetic_evidence_minikel'] = merged_table_tissue_i['genetic_evidence_minikel'].fillna(0)
        merged_table_tissue_i['tissue_test'] = t
        merged_table_tissue_i['indication_mesh_term'] = i
        merged_table_hvgs = pd.concat([merged_table_hvgs, merged_table_tissue_i])

In [13]:
# assert you have the same genes tested for all indications in the same tissue
merged_table_hvgs.groupby(['tissue_test', 'indication_mesh_term']).size()

tissue_test      indication_mesh_term         
blood            Agammaglobulinemia               5000
                 Anemia                           5000
                 Anemia, Aplastic                 5000
                 Anemia, Hemolytic, Autoimmune    5000
                 Anemia, Iron-Deficiency          5000
                                                  ... 
nose             Rhinitis, Allergic, Perennial    5000
                 Rhinitis, Allergic, Seasonal     5000
                 Sinusitis                        5000
small_intestine  Duodenal Ulcer                   5000
                 Malabsorption Syndromes          5000
Length: 227, dtype: int64

In [17]:
dtr_cols.append('dtr_pancreas')
merged_table_hvgs = merged_table_hvgs.drop(['ti_uid', 'gene_name', 'combined_max_phase', 'target_status', 'disease_relevant_tissues'] + list(np.intersect1d(dtr_cols, merged_table_hvgs.columns)), axis=1)
merged_table_hvgs.to_csv('../data/expanded_DRT_analysis.target_disease_table.all_hvgs.csv')
merged_table.to_csv('../data/expanded_DRT_analysis.target_disease_table.preclinical_hvgs.csv')

In [19]:
df = pd.DataFrame(~merged_table_hvgs[success_cols[0]].isna())
df['tissue_test'] = merged_table_hvgs['tissue_test'].values
n_gd = df.groupby('tissue_test').sum()
n_gd.columns = ['n_preclinical']

In [20]:

map_labels = {
    'n_diseases':'n_diseases',
    'n_preclinical':'Preclinical',
    'succ_p_1':'Phase I',
    'succ_1_2':'Phase II',
    'succ_2_3':'Phase III',
    'succ_3_a':'Launched',
}

In [21]:
n_diseases = merged_table_hvgs[['indication_mesh_term', 'tissue_test']].drop_duplicates().groupby('tissue_test').size()
n_diseases.name = 'n_diseases'
n_success = merged_table_hvgs.groupby('tissue_test')[success_cols].sum()
pl_df = pd.concat([n_gd, n_success, n_diseases], axis=1).reset_index()
pl_df = pl_df.melt(id_vars=['tissue_test'])
pl_df['variable'] = pl_df['variable'].map(map_labels)


In [116]:
%%R -i pl_df -w 800 -h 300 -i figdir
p1 <- pl_df %>%
filter(variable != 'n_diseases') %>%
mutate(variable = factor(variable, levels=c('Preclinical', 'Phase I', 'Phase II', 'Phase III', 'Launched'))) %>%
ggplot(aes(variable, tissue_test, fill=log(value), label=value)) +
geom_tile() +
geom_text() +
ylab('Disease-relevant tissue') +
xlab('Phase reached') +
scale_fill_distiller(palette='Reds', direction=1, na.value='white', name='# G-D pairs') +
theme_classic(base_size=20) +
theme(axis.text.x=element_text(angle=90, hjust=1, vjust=0.5), axis.text.y=element_blank(), axis.title.y=element_blank())

p2 <- pl_df %>%
filter(variable == 'n_diseases') %>%
ggplot(aes(tissue_test, y=value, label=value)) +
geom_col(fill='lightgray') +
geom_text() +
coord_flip() +
xlab('Disease-relevant\ntissue') +
ylab('# diseases') +
theme_classic(base_size=20) 

(p2 + p1) + plot_layout(widths=c(1,3)) 
ggsave(paste0(figdir, 'diseases_dtr_counts.pdf'), width=10, height=4);
ggsave(paste0(figdir, 'diseases_dtr_counts.png'), width=10, height=4)

  for name, values in obj.iteritems():


In [49]:
disease_table = merged_table_hvgs[['indication_mesh_term', 'tissue_test']].drop_duplicates()
disease_table

Unnamed: 0,indication_mesh_term,tissue_test
0,Cocaine-Related Disorders,blood
0,Graft vs Host Disease,blood
0,Infections,blood
0,COVID-19,blood
0,Myelodysplastic Syndromes,blood
...,...,...
0,Nasal Polyps,nose
0,Sinusitis,nose
0,"Rhinitis, Allergic, Perennial",nose
0,Duodenal Ulcer,small_intestine


In [68]:
# Merge with MeSH IDs
disease_table_nelson = pd.read_csv('../data/nelson_s01.csv', index_col=0, encoding='cp1252')
disease_ids = disease_table_nelson[['indication_mesh_id', 'indication_mesh_term']].drop_duplicates()

disease_table = pd.merge(disease_table, disease_ids, how='left')
disease_table.to_csv('../data/suppl_table_6.expanded_disease_set.csv', index=False)

TypeError: to_csv() got an unexpected keyword argument 'index_col'

### Max phase reached analysis

In [14]:
# Test by max phase reached 
merged_table[success_cols] = merged_table[success_cols].fillna(False)
merged_table[success_cols] = merged_table[success_cols].astype(int)
merged_table_hvgs[success_cols] = merged_table_hvgs[success_cols].fillna(False)
merged_table_hvgs[success_cols] = merged_table_hvgs[success_cols].astype(int)

results_combo_df = pd.DataFrame()
for ph in success_cols:
    for ev in ['ct_marker_evidence', 'genetic_evidence_minikel']:
        # HVG test
        or_df = association_utils.get_OR(
                merged_table_hvgs.fillna(0),  
                evidence_col = ev, clinical_status_col=ph)
        or_df['universe'] = 'HVGs'
        results_combo_df = pd.concat([results_combo_df, or_df], axis=0)
        # preclinical test
        or_df = association_utils.get_OR(
                merged_table.fillna(0),  
                evidence_col = ev, clinical_status_col=ph)
        or_df['universe'] = 'Preclinical_HVGs'
        results_combo_df = pd.concat([results_combo_df, or_df], axis=0)
    


In [15]:
sig, fdr, _,_ = multipletests(results_combo_df.pval, method='fdr_bh')
results_combo_df['pval'] = fdr

In [16]:
evidence_labels = {
    'ct_marker_evidence':'Cell type specific',
    'genetic_evidence_minikel':'Genetic evidence (indirect, Minikel et al.)'
}

succ2maxphase = {
    'succ_p_1':'Phase I',
    'succ_1_2':'Phase II',
    'succ_2_3':'Phase III',
    'succ_3_a':'Launched',
}

results_combo_df['clinical_status'] = results_combo_df['clinical_status'].map(succ2maxphase)
results_combo_df['evidence'] = results_combo_df['evidence'].map(evidence_labels)

In [17]:
results_combo_df

Unnamed: 0,odds_ratio,ci_low,ci_high,pval,n_success,n_insuccess,n_supported_approved,n_supported,evidence,clinical_status,universe
0,1.687927,1.500167,1.895169,6.446507e-17,1864.0,1133136.0,360.0,141091.0,Cell type specific,Phase I,HVGs
0,1.276195,1.066259,1.528925,0.005375071,1864.0,1621.0,360.0,616.0,Cell type specific,Phase I,Preclinical_HVGs
0,1014.037907,792.609159,1301.288231,0.0,1864.0,1133136.0,174.0,289.0,"Genetic evidence (indirect, Minikel et al.)",Phase I,HVGs
0,1.348196,1.047903,1.739577,0.01290877,1864.0,1621.0,174.0,289.0,"Genetic evidence (indirect, Minikel et al.)",Phase I,Preclinical_HVGs
0,1.697766,1.475217,1.948022,1.083907e-12,1319.0,1133681.0,256.0,141091.0,Cell type specific,Phase II,HVGs
0,1.208087,1.007319,1.44759,0.02544459,1319.0,2166.0,256.0,616.0,Cell type specific,Phase II,Preclinical_HVGs
0,814.544388,636.80386,1040.78572,6.227335e-308,1319.0,1133681.0,133.0,289.0,"Genetic evidence (indirect, Minikel et al.)",Phase II,HVGs
0,1.444745,1.124775,1.85379,0.002968262,1319.0,2166.0,133.0,289.0,"Genetic evidence (indirect, Minikel et al.)",Phase II,Preclinical_HVGs
0,1.6846,1.328657,2.117461,2.077943e-05,482.0,1134518.0,93.0,141091.0,Cell type specific,Phase III,HVGs
0,1.133619,0.877124,1.454572,0.1850026,482.0,3003.0,93.0,616.0,Cell type specific,Phase III,Preclinical_HVGs


In [83]:
%%R -i results_combo_df -w 1200 -h 600 -i figdir
pl_hvgs <- plot_OR(filter(results_combo_df, universe == 'HVGs'), 
        y_value='evidence',
        text_position='right', 
        evidence_levels=c('Cell type specific', 'Genetic evidence (indirect, Minikel et al.)'),
        clinical_status_levels = c('Phase I','Phase II', 'Phase III', 'Launched'),
        base_font_size=22, pval_alpha=0.1
       ) +
    facet_grid(clinical_status ~ ., scales='free_x') +
    expand_limits(x=170) +
    ylab("Omic evidence") 

pl_preclinical <- plot_OR(filter(results_combo_df, universe != 'HVGs'), 
        y_value='evidence',
        text_position='right', 
        evidence_levels=c('Cell type specific', 'Genetic evidence (indirect, Minikel et al.)'),
        clinical_status_levels = c('Phase I','Phase II', 'Phase III', 'Launched'),
        base_font_size=22, pval_alpha=0.1
       ) +
    facet_grid(clinical_status ~ ., scales='free_x') +
    expand_limits(x=6) +
    ylab("Omic evidence") 
    
(pl_hvgs / pl_preclinical ) + plot_layout(guides='collect')  
ggsave(paste0(figdir, 'cell_type_specific.all.pdf'), width=16, height=8)

  for name, values in obj.iteritems():


Save tables for sharing

In [18]:
results_combo_df.to_csv('../data/suppl_table_5.expanded_drt_analysis.odds_ratios.csv')

In [93]:
results_combo_df.to_csv('../data/expanded_DRT_analysis.OR_results.csv')

### Progression analysis by disease-relevant tissue

In [24]:
evidence_table = pd.read_csv('../data/ct_marker_evidence.de.hvgs.csv')
keep_nelson_cols = ['ti_uid', 'indication_mesh_term', 'gene_id', 'gene_name','combined_max_phase','target_status', 'disease_relevant_tissues', 'genetic_evidence_minikel']
merged_table = merge_evidence_success(evidence_table, nelson_targets_df, keep_nelson_cols=keep_nelson_cols)
merged_table['ct_marker_evidence'] = merged_table['ct_marker_evidence'].fillna(0)

In [26]:
merged_table['disease_relevant_tissues'] = [x[0] for x in merged_table['disease_relevant_tissues']]
merged_table.groupby(['disease_relevant_tissues']).sum()

  merged_table.groupby(['disease_relevant_tissues']).sum()


Unnamed: 0_level_0,genetic_evidence_minikel,dtr_blood,dtr_bone_marrow,dtr_brain,dtr_colon,dtr_esophagus,dtr_eye,dtr_heart,dtr_kidney,dtr_liver,dtr_lung,dtr_nose,dtr_pancreas,dtr_prostate,dtr_skin,dtr_small_intestine,ct_marker_evidence
disease_relevant_tissues,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
blood,147,2617,0,0,0,0,0,0,0,0,0,0,0,0,0,0,87.0
bone_marrow,0,0,11,0,0,0,0,0,0,0,0,0,0,0,0,0,3.0
brain,157,0,0,2672,0,0,0,0,0,0,0,0,0,0,0,0,194.0
colon,25,0,0,0,261,0,0,0,0,0,0,0,0,0,0,0,6.0
esophagus,8,0,0,0,0,43,0,0,0,0,0,0,0,0,0,0,2.0
eye,21,0,0,0,0,0,436,0,0,0,0,0,0,0,0,0,44.0
heart,61,0,0,0,0,0,0,817,0,0,0,0,0,0,0,0,58.0
liver,22,0,0,0,0,0,0,0,0,519,0,0,0,0,0,0,91.0
lung,59,0,0,0,0,0,0,0,0,0,685,0,0,0,0,0,111.0
nose,13,0,0,0,0,0,0,0,0,0,0,114,0,0,0,0,20.0


In [52]:
results_tissue_df = pd.DataFrame()
tissues = ['blood', 'brain', 'colon', 'eye', 'heart', 'liver', 'lung']
for t in tissues:
    # Preclinical test
    for ph in success_cols[1:]:
        try:
            test_df = merged_table[merged_table[f'dtr_{t}'] == 1].copy()
            test_df = test_df[[ph, 'ct_marker_evidence']].dropna().astype(int)
            or_df = association_utils.get_OR(
                    test_df,  
                    evidence_col = 'ct_marker_evidence', clinical_status_col=ph)
            or_df['universe'] = 'Preclinical'
            or_df['dtr'] = t
            results_tissue_df = pd.concat([results_tissue_df, or_df], axis=0)
        except:
            print(t, ' - ',ph)
            pass
        
sig, fdr, _,_ = multipletests(results_tissue_df.pval, method='fdr_bh')
results_tissue_df['pval'] = fdr

colon  -  succ_3_a


In [49]:
succ2maxphase = {
    'succ_1_2':'Phase I to II',
    'succ_2_3':'Phase II to III',
    'succ_3_a':'Phase III to Launch',
}

results_tissue_df['clinical_status'] = results_tissue_df['clinical_status'].map(succ2maxphase)

Unnamed: 0,odds_ratio,ci_low,ci_high,pval,n_success,n_insuccess,n_supported_approved,n_supported,evidence,clinical_status,universe,dtr
0,0.359822,0.112821,1.277902,0.983371,144.0,22.0,17.0,23.0,ct_marker_evidence,succ_1_2,Preclinical,colon
0,1.876233,0.432158,7.535769,0.717473,33.0,81.0,5.0,12.0,ct_marker_evidence,succ_2_3,Preclinical,colon
0,0.903304,0.085141,12.776961,0.983371,18.0,11.0,3.0,5.0,ct_marker_evidence,succ_3_a,Preclinical,colon


In [50]:
%%R -i results_tissue_df -w 1500 -h 400 -i figdir
p1 <- plot_OR(filter(results_tissue_df, universe == 'Preclinical'), y_value='dtr',
        text_position='right', 
        evidence_levels=c('ct_marker_evidence'),
        clinical_status_levels = c('Phase I to II', 'Phase II to III', 'Phase III to Launch'),
               base_font_size=22, pval_alpha=0.1
       ) +
    facet_grid(.~clinical_status) +
    expand_limits(x=200) +
    ylab("Clinical status") 

p1 
ggsave(paste0(figdir, 'cell_type_specific.progression_tissue.pdf'), width=16, height=4)

  for name, values in obj.iteritems():


In [579]:
results_tissue_df = pd.DataFrame()
tissues = ['blood', 'brain', 'colon', 'eye', 'heart', 'liver', 'lung']
for t in tissues:
    # Preclinical test
    for ph in success_cols[1:]:
        try:
            test_df = merged_table[merged_table[f'dtr_{t}'] == 1].copy()
            test_df = test_df[[ph, 'genetic_evidence_minikel']].dropna().astype(int)
            or_df = association_utils.get_OR(
                    test_df,  
                    evidence_col = 'genetic_evidence_minikel', clinical_status_col=ph)
            or_df['universe'] = 'Preclinical'
            or_df['dtr'] = t
            results_tissue_df = pd.concat([results_tissue_df, or_df], axis=0)
        except:
            print(t, ' - ',ph)
            pass
        
sig, fdr, _,_ = multipletests(results_tissue_df.pval, method='fdr_bh')
results_tissue_df['pval'] = fdr

In [580]:
succ2maxphase = {
    'succ_1_2':'Phase I to II',
    'succ_2_3':'Phase II to III',
    'succ_3_a':'Phase III to Launch',
}

results_tissue_df['clinical_status'] = results_tissue_df['clinical_status'].map(succ2maxphase)

In [570]:
%%R -i results_tissue_df -w 1500 -h 400
p1 <- plot_OR(filter(results_tissue_df, universe == 'Preclinical'), y_value='dtr',
        text_position='right', 
        evidence_levels=c('ct_marker_evidence'),
        clinical_status_levels = c('Phase I to II', 'Phase II to III', 'Phase III to Launch'),
               base_font_size=22, pval_alpha=0.1
       ) +
    facet_grid(.~clinical_status) +
    expand_limits(x=200) +
    ylab("Clinical status") 

p1 
ggsave(paste0(figdir, 'cell_type_specific.progression_tissue_genetic.pdf'), width=16, height=4)

In [None]:
# nelson_targets_df[success_cols] = nelson_targets_df[success_cols].fillna(False)
# nelson_targets_df[success_cols] = nelson_targets_df[success_cols].astype(int)


### Test different disease-relevant tissue annotation

For lung and colon, where association with phase I success is weak --> no significant difference

In [39]:
evidence_table = pd.read_csv('../data/ct_marker_evidence.de.hvgs.csv')
keep_nelson_cols = ['ti_uid', 'indication_mesh_term', 'gene_id', 'gene_name','combined_max_phase','target_status', 'disease_relevant_tissues', 'genetic_evidence_minikel']

# evidence_table_colon = 
evidence_table['ct_marker_evidence_colon'] = evidence_table['ct_marker_evidence_small_intestine'].fillna(0) + evidence_table['ct_marker_evidence_colon'].fillna(0)

In [40]:
merged_table = merge_evidence_success(evidence_table, nelson_targets_df, keep_nelson_cols=keep_nelson_cols)
merged_table['ct_marker_evidence'] = merged_table['ct_marker_evidence'].fillna(0)

In [41]:
merged_table['disease_relevant_tissues'] = [x[0] for x in merged_table['disease_relevant_tissues']]
merged_table.groupby(['disease_relevant_tissues']).sum()

  merged_table.groupby(['disease_relevant_tissues']).sum()


Unnamed: 0_level_0,genetic_evidence_minikel,dtr_blood,dtr_bone_marrow,dtr_brain,dtr_colon,dtr_esophagus,dtr_eye,dtr_heart,dtr_kidney,dtr_liver,dtr_lung,dtr_nose,dtr_pancreas,dtr_prostate,dtr_skin,dtr_small_intestine,ct_marker_evidence
disease_relevant_tissues,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
blood,147,2617,0,0,0,0,0,0,0,0,0,0,0,0,0,0,87.0
bone_marrow,0,0,11,0,0,0,0,0,0,0,0,0,0,0,0,0,3.0
brain,157,0,0,2672,0,0,0,0,0,0,0,0,0,0,0,0,194.0
colon,25,0,0,0,261,0,0,0,0,0,0,0,0,0,0,0,37.0
esophagus,8,0,0,0,0,43,0,0,0,0,0,0,0,0,0,0,2.0
eye,21,0,0,0,0,0,436,0,0,0,0,0,0,0,0,0,44.0
heart,61,0,0,0,0,0,0,817,0,0,0,0,0,0,0,0,58.0
liver,22,0,0,0,0,0,0,0,0,519,0,0,0,0,0,0,91.0
lung,59,0,0,0,0,0,0,0,0,0,685,0,0,0,0,0,111.0
nose,13,0,0,0,0,0,0,0,0,0,0,114,0,0,0,0,20.0


In [42]:
results_tissue_df = pd.DataFrame()
tissues = ['colon']
for t in tissues:
    # Preclinical test
    for ph in success_cols[1:]:
        try:
            test_df = merged_table[merged_table[f'dtr_{t}'] == 1].copy()
            test_df = test_df[[ph, 'ct_marker_evidence']].dropna().astype(int)
            or_df = association_utils.get_OR(
                    test_df,  
                    evidence_col = 'ct_marker_evidence', clinical_status_col=ph)
            or_df['universe'] = 'Preclinical'
            or_df['dtr'] = t
            results_tissue_df = pd.concat([results_tissue_df, or_df], axis=0)
        except:
            print(t, ' - ',ph)
            pass
        
sig, fdr, _,_ = multipletests(results_tissue_df.pval, method='fdr_bh')
results_tissue_df['pval'] = fdr

In [43]:
results_tissue_df

Unnamed: 0,odds_ratio,ci_low,ci_high,pval,n_success,n_insuccess,n_supported_approved,n_supported,evidence,clinical_status,universe,dtr
0,0.359822,0.112821,1.277902,0.983371,144.0,22.0,17.0,23.0,ct_marker_evidence,succ_1_2,Preclinical,colon
0,1.876233,0.432158,7.535769,0.717473,33.0,81.0,5.0,12.0,ct_marker_evidence,succ_2_3,Preclinical,colon
0,0.903304,0.085141,12.776961,0.983371,18.0,11.0,3.0,5.0,ct_marker_evidence,succ_3_a,Preclinical,colon


### What is the fraction of approved targets that are cell type specific?

In [571]:
merged_table = merge_evidence_success(evidence_table, nelson_targets_df, keep_nelson_cols=keep_nelson_cols)
merged_table['disease_relevant_tissues'] = [x[0] for x in merged_table['disease_relevant_tissues']]
merged_table['ct_marker_evidence'] = merged_table['ct_marker_evidence'].fillna(0)
merged_table['genetic_evidence_minikel'] = merged_table['genetic_evidence_minikel'].fillna(0)

In [572]:
def _annotate(ti):
    if (ti['genetic_evidence_minikel'] == 1) & (ti['ct_marker_evidence'] == 1):
        return 'both'
    elif (ti['genetic_evidence_minikel'] == 0) & (ti['ct_marker_evidence'] == 1):
        return 'cell type marker'
    elif (ti['genetic_evidence_minikel'] == 1) & (ti['ct_marker_evidence'] == 0):
        return 'genetic'
    else:
        return 'none'

merged_table['ti_annotation'] = [_annotate(x) for _, x in  merged_table.iterrows()]

In [573]:
sum((merged_table['ti_annotation'] == 'genetic') & (merged_table['combined_max_phase'] == 'Launched'))

76

In [574]:
ev_sum = merged_table[['combined_max_phase', 'disease_relevant_tissues','ti_annotation']].value_counts().reset_index()
ev_sum.columns = ['combined_max_phase','disease_relevant_tissues','ti_annotation', 'n_targets']

In [575]:
%%R -i ev_sum -w 600 -h 700
fill_palette <- c(
    'genetic' = '#e41a1c',
    'cell type marker' = '#377eb8',
    'both' = '#984ea3',
    'none' = '#999999')

ev_sum %>%
    filter(combined_max_phase != "Preclinical") %>%
    mutate(combined_max_phase = factor(combined_max_phase, levels=c('Preclinical', 'Phase I', 'Phase II', 'Phase III','Launched'))) %>%
    ggplot(aes(x=combined_max_phase, y=n_targets, fill=ti_annotation)) +
    geom_col() +
    # facet_wrap(disease_relevant_tissues~., scales='free_y')
    scale_fill_manual(values=fill_palette, name='Omic support') +
    theme_classic(base_size = 24) +
    ylab('No. of approved or\ninvestigational targets\n(227 diseases)') +
    xlab('Max. phase reached') +
    theme(axis.text.x=element_text(angle=90, hjust=1, vjust=0.5)) 

ggsave(paste0(figdir, 'cell_type_specific.barplot.pdf'), width=8, height=6);

In [576]:
%%R -i ev_sum -w 1200 -h 1000
fill_palette <- c(
    'genetic' = '#e41a1c',
    'cell type marker' = '#377eb8',
    'both' = '#984ea3',
    'none' = '#999999')

ev_sum %>%
    filter(combined_max_phase != "Preclinical") %>%
    filter(!disease_relevant_tissues %in% c('bone_marrow', 'small_intestine')) %>%
    mutate(combined_max_phase = factor(combined_max_phase, levels=c('Preclinical', 'Phase I', 'Phase II', 'Phase III','Launched'))) %>%
    ggplot(aes(x=combined_max_phase, y=n_targets, fill=ti_annotation)) +
    geom_col() +
    facet_wrap(disease_relevant_tissues~., scales='free_y', nrow=2) +
    scale_fill_manual(values=fill_palette, name='Omic support') +
    theme_classic(base_size = 24) +
    ylab('No. of approved or\ninvestigational targets\n(227 diseases)') +
    xlab('Max. phase reached') +
    theme(axis.text.x=element_text(angle=90, hjust=1, vjust=0.5)) 

ggsave(paste0(figdir, 'cell_type_specific.barplot_by_tissue.pdf'), width=12, height=8);

## Check reasons for stopping

In [570]:
ot2024_targets_df_full.to_csv('../data/OT2024_reasons2stop.csv')

In [28]:
ot2024_targets_df_full = pd.read_csv('../data/OT2024_reasons2stop.csv')

  ot2024_targets_df_full = pd.read_csv('../data/OT2024_reasons2stop.csv')


In [163]:
# stop_3_evidence_df = pd.merge(evidence_table, nelson_targets_df, how='left')
# stop_3_evidence_df = stop_3_evidence_df[~stop_3_evidence_df['succ_3_a'].isna()]

To share with Rasa

In [430]:
ot2024_targets_df_full

Unnamed: 0.1,Unnamed: 0,targetId,diseaseId,nctid,clinicalStatus,clinicalPhase,studyStartDate,stopStatus,isStopped,phase4,...,taLabel,taLabelSimple,gc,lof_tolerance,rnaDistribution,rnaSpecificity,partnersBin,datasourceId,datatypeId,total
0,0,ENSG00000004468,EFO_0000203,NCT03236428,"Active, not recruiting",2,2017-11-24,,,,...,hematologic disease,Other,3.0,LoF tolerant,Detected in many,Tissue enhanced,none,chembl,known_drug,413311
1,1,ENSG00000004468,EFO_0000203,NCT03236428,"Active, not recruiting",2,2017-11-24,,,,...,hematologic disease,Other,3.0,LoF tolerant,Detected in many,Tissue enhanced,none,europepmc,literature,413311
2,2,ENSG00000004468,EFO_0000203,NCT03236428,"Active, not recruiting",2,2017-11-24,,,,...,hematologic disease,Other,,LoF tolerant,Detected in many,Tissue enhanced,none,chembl,known_drug,413311
3,3,ENSG00000004468,EFO_0000203,NCT03236428,"Active, not recruiting",2,2017-11-24,,,,...,hematologic disease,Other,,LoF tolerant,Detected in many,Tissue enhanced,none,europepmc,literature,413311
4,4,ENSG00000004468,EFO_0000203,NCT03236428,"Active, not recruiting",2,2017-11-24,,,,...,hematologic disease,Other,,LoF tolerant,Detected in many,Tissue enhanced,none,chembl,known_drug,413311
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2778287,2778287,ENSG00000278195,MONDO_0008315,NCT00510224,Terminated,2,2007-07-01,Terminated,stopped,,...,cell proliferation disorder,Oncology,,,Detected in some,Tissue enhanced,from1to10,chembl,known_drug,413311
2778288,2778288,ENSG00000278195,MONDO_0008315,NCT00510224,Terminated,2,2007-07-01,Terminated,stopped,,...,cell proliferation disorder,Oncology,,,Detected in some,Tissue enhanced,from1to10,slapenrich,affected_pathway,413311
2778289,2778289,ENSG00000278195,MONDO_0008315,NCT00166725,Completed,2,2004-02-01,,,,...,cell proliferation disorder,Oncology,,,Detected in some,Tissue enhanced,from1to10,chembl,known_drug,413311
2778290,2778290,ENSG00000278195,MONDO_0008315,NCT00166725,Completed,2,2004-02-01,,,,...,cell proliferation disorder,Oncology,,,Detected in some,Tissue enhanced,from1to10,slapenrich,affected_pathway,413311


In [435]:
success_cols = ['succ_p_1', 'succ_1_2', 'succ_2_3', 'succ_3_a']
stop_reasons_df = ot2024_targets_df_full.rename({'targetId':'gene_id', 'diseaseId':'disease_ontology_id', 'phase2':'succ_1_2', 'phase3':'succ_2_3', 'phase4':'succ_3_a'}, axis=1)
ot_efo_to_keep = [mondo2efo[x] for x in evidence_table.disease_ontology_id.unique()]
stop_reasons_df = stop_reasons_df[stop_reasons_df['disease_ontology_id'].isin(ot_efo_to_keep)]

# Rename disease_ontology_id
stop_reasons_df['disease_ontology_id'] = [efo2mondo[x] for x in stop_reasons_df['disease_ontology_id']]
stop_reasons_df = stop_reasons_df[['gene_id', 'disease_ontology_id', 'prediction', 'why_stopped', 'nctid', 'clinicalPhase', 'isStopped', 'status']].drop_duplicates()

In [436]:
stop_3_evidence_df = pd.merge(evidence_table, stop_reasons_df, how='left', on = ['disease_ontology_id', 'gene_id'])

In [439]:
stop_3_evidence_df.drop(['disease_ct_evidence','bulk_disease_evidence','is_hvg_normal','is_hvg_disease'], axis=1).to_csv('../data/evidence_reasons2stop.csv')

There can be many trials for the same G-D pair

In [249]:
stop_3_evidence_df[(stop_3_evidence_df['disease_ontology_id'] == 'MONDO_0005011') & (stop_3_evidence_df['gene_id'] == 'ENSG00000173585')]

Unnamed: 0,gene_id,gene_name,disease,disease_ontology_id,ct_marker_evidence,disease_evidence,disease_ct_evidence,bulk_disease_evidence,genetic_evidence_OT,is_hvg_normal,is_hvg_disease,prediction,why_stopped,nctid,phase,isStopped,succ_1_2,succ_2_3,succ_3_a
58636,ENSG00000173585,CCR9,Crohn disease,MONDO_0005011,1.0,1.0,1.0,0.0,0.0,1.0,1.0,,,NCT01536418,,stopped,Phase II+,Phase III+,
58637,ENSG00000173585,CCR9,Crohn disease,MONDO_0005011,1.0,1.0,1.0,0.0,0.0,1.0,1.0,,,NCT01611805,,,,,
58638,ENSG00000173585,CCR9,Crohn disease,MONDO_0005011,1.0,1.0,1.0,0.0,0.0,1.0,1.0,,,NCT00102921,,,Phase II+,,
58639,ENSG00000173585,CCR9,Crohn disease,MONDO_0005011,1.0,1.0,1.0,0.0,0.0,1.0,1.0,,,NCT01277666,,,Phase II+,Phase III+,
58640,ENSG00000173585,CCR9,Crohn disease,MONDO_0005011,1.0,1.0,1.0,0.0,0.0,1.0,1.0,,,NCT00306215,,,Phase II+,,
58641,ENSG00000173585,CCR9,Crohn disease,MONDO_0005011,1.0,1.0,1.0,0.0,0.0,1.0,1.0,,,NCT01489943,,,,,
58642,ENSG00000173585,CCR9,Crohn disease,MONDO_0005011,1.0,1.0,1.0,0.0,0.0,1.0,1.0,Negative,This study was terminated due to the lack of e...,NCT01318993,Phase 3,stopped,Phase II+,Phase III+,
58643,ENSG00000173585,CCR9,Crohn disease,MONDO_0005011,1.0,1.0,1.0,0.0,0.0,1.0,1.0,Negative,This study was terminated due to the lack of e...,NCT01316939,Phase 3,stopped,Phase II+,Phase III+,
58644,ENSG00000173585,CCR9,Crohn disease,MONDO_0005011,1.0,1.0,1.0,0.0,0.0,1.0,1.0,,,NCT01827631,,,,,
58645,ENSG00000173585,CCR9,Crohn disease,MONDO_0005011,1.0,1.0,1.0,0.0,0.0,1.0,1.0,,,NCT01114607,,,,,
