In [236]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from src.data_paths import *

In [237]:
# Load the two inputs of this analysis: the COSMIC protein table (tab-separated
# text) and the merged dataset (pickle).
# NOTE(review): unpickling can execute arbitrary code - only load trusted files.
cosmic_proteins = pd.read_csv(COSMIC_PROTEINS, delimiter='\t')
df = pd.read_pickle(MERGED)

In [238]:
# Peek at the first row to see which columns the COSMIC table provides
cosmic_proteins.iloc[:1]

Unnamed: 0,Gene,Gene synonym,Ensembl,Gene description,Uniprot,Chromosome,Position,Protein class,Biological process,Molecular function,...,Cancer prognostics - Pancreatic Adenocarcinoma (TCGA),Cancer prognostics - Pancreatic Adenocarcinoma (validation),Cancer prognostics - Prostate Adenocarcinoma (TCGA),Cancer prognostics - Rectum Adenocarcinoma (TCGA),Cancer prognostics - Rectum Adenocarcinoma (validation),Cancer prognostics - Skin Cuteneous Melanoma (TCGA),Cancer prognostics - Stomach Adenocarcinoma (TCGA),Cancer prognostics - Testicular Germ Cell Tumor (TCGA),Cancer prognostics - Thyroid Carcinoma (TCGA),Cancer prognostics - Uterine Corpus Endometrial Carcinoma (TCGA)
0,ABI1,"ABI-1, E3B1, SSH3BP1",ENSG00000136754,Abl interactor 1,Q8IZP0,10,26746593-26861087,"Cancer-related genes, Disease related genes, P...",Host-virus interaction,,...,unprognostic (6.17e-3),unprognostic (9.66e-2),unprognostic (8.60e-2),unprognostic (1.16e-1),unprognostic (3.28e-2),unprognostic (1.01e-1),unprognostic (3.66e-1),unprognostic (1.72e-1),unprognostic (1.03e-1),unprognostic (1.72e-1)


### Check if synonyms could be used

In [239]:
# Flatten the comma-separated 'Gene synonym' column into one flat array of
# stripped synonym names. Missing synonyms are dropped first: str(NaN) would
# otherwise inject the literal token 'nan' into the lookup values.
arr = cosmic_proteins['Gene synonym'].dropna().apply(
    lambda synonyms: np.array([name.strip() for name in str(synonyms).split(',')])
)
# np.concatenate raises on an empty sequence, so fall back to an empty array
all_elems = np.concatenate(arr.tolist()) if len(arr) else np.array([], dtype=str)

# Count the rows of df whose target_name / name_y equals one of the synonyms.
# all_elems must be flat (not a list of lists) for isin to compare names.
df['target_name'].isin(all_elems).sum(), df['name_y'].isin(all_elems).sum()

(np.int64(0), np.int64(0))

In [240]:
# Inspect the 'Uniprot' column of the COSMIC table
cosmic_proteins.loc[:, 'Uniprot']

0      Q8IZP0
1      P00519
2      P42684
3      P25106
4      O95573
        ...  
567    Q9UBW7
568    Q9NQX6
569    Q8TF68
570    Q96K83
571    Q15696
Name: Uniprot, Length: 572, dtype: object

In [241]:
# Columns reused by the membership checks below
cosmic_gene_names = cosmic_proteins['Gene']
cosmic_uniprot_ids = cosmic_proteins['Uniprot']
bind_uniprot_ids = df['swissprot_protein_id'].dropna()

# Matching by gene name. Each count is a number of matching rows, so in the
# "bind in cosmic" figures a protein contributes once per row of df it appears in.
nb_bind_in_cosmic_bn = df['name_y'].isin(cosmic_gene_names).sum()
nb_cosmic_in_bind_bn = cosmic_gene_names.isin(df['name_y']).sum()
nb_bind_in_cosmic_by_targetname_bn = df['target_name'].isin(cosmic_gene_names).sum()

# Matching by uniprot id (rows of df without an id are left out)
nb_bind_in_cosmic_bu = bind_uniprot_ids.isin(cosmic_uniprot_ids).sum()
nb_cosmic_in_bind_bu = cosmic_uniprot_ids.isin(bind_uniprot_ids).sum()

# One print call; the empty strings reproduce the blank separator lines
print(
    f"Number of bind proteins in cosmic by name_y: {nb_bind_in_cosmic_bn}",
    f"Number of cosmic proteins in bind by name_y: {nb_cosmic_in_bind_bn}",
    "",
    f"Number of bind proteins in cosmic by target_name: {nb_bind_in_cosmic_by_targetname_bn}",
    "",
    f"Number of bind proteins in cosmic by uniprot: {nb_bind_in_cosmic_bu}",
    f"Number of cosmic proteins in bind by uniprot: {nb_cosmic_in_bind_bu}",
    sep="\n",
)

Number of bind proteins in cosmic by name_y: 20199
Number of cosmic proteins in bind by name_y: 45

Number of bind proteins in cosmic by target_name: 0

Number of bind proteins in cosmic by uniprot: 32320
Number of cosmic proteins in bind by uniprot: 75


### Cancer related proteins

In [242]:
# Row masks: which rows of df match a COSMIC entry by gene name / by uniprot id
matches_cosmic_gene = df['name_y'].isin(cosmic_proteins['Gene'])
matches_cosmic_uniprot = df['swissprot_protein_id'].isin(cosmic_proteins['Uniprot'])

bind_cancer_proteins_by_name = df.loc[matches_cosmic_gene]
bind_cancer_proteins_by_uniprot = df.loc[matches_cosmic_uniprot]

# Distinct target names reached through each matching strategy
bcpbn_set = set(bind_cancer_proteins_by_name['target_name'])
bcpbu_set = set(bind_cancer_proteins_by_uniprot['target_name'])

print(
    f"Number of cancer proteins in bind by name_y: {len(bcpbn_set)}",
    f"Number of cancer proteins in bind by uniprot: {len(bcpbu_set)}",
    f"Difference between the two sets:",
    f"  - {len(bcpbn_set.difference(bcpbu_set))} proteins in name_y but not in uniprot",
    f"  - {len(bcpbu_set.difference(bcpbn_set))} proteins in uniprot but not in name_y",
    sep="\n",
)

Number of cancer proteins in bind by name_y: 91
Number of cancer proteins in bind by uniprot: 142
Difference between the two sets:
  - 4 proteins in name_y but not in uniprot
  - 55 proteins in uniprot but not in name_y


In [243]:
# Rows of df whose target was matched by gene name only (never by uniprot id),
# shown with the three columns needed to see why the uniprot match failed.
cancer_proteins_namey_not_uniprot = df.loc[df['target_name'].isin(bcpbn_set - bcpbu_set)]
cancer_proteins_namey_not_uniprot.loc[:, ['name_y', 'target_name', 'swissprot_protein_id']]

Unnamed: 0,name_y,target_name,swissprot_protein_id
64111,EGFR,"Phosphatidylinositol 4,5-bisphosphate 3-kinase...",Q504U8
64125,EGFR,Receptor protein-tyrosine kinase/Serine/threon...,Q504U8
64139,EGFR,DNA-dependent protein kinase catalytic subunit...,Q504U8
64153,EGFR,"Phosphatidylinositol 4,5-bisphosphate 3-kinase...",Q504U8


In [244]:
# Look up the COSMIC entry for EGFR to compare its uniprot id with the one in df
cosmic_proteins.loc[cosmic_proteins['Gene'].eq('EGFR'), ['Gene', 'Gene synonym', 'Uniprot']]

Unnamed: 0,Gene,Gene synonym,Uniprot
141,EGFR,"ERBB, ERBB1, ERRP",P00533


In [251]:
# Union of the rows matched by gene name and by uniprot id (kept together even
# though the uniprot ids of the two sources do not always agree); rows matched
# by both strategies are kept once.
cancer_related_proteins = pd.concat(
    [bind_cancer_proteins_by_name, bind_cancer_proteins_by_uniprot]
).drop_duplicates()

# Every row of df whose target_name belongs to one of the matched proteins
is_cancer_target = df['target_name'].isin(cancer_related_proteins['target_name'])
cancer_related_proteins_df = df.loc[is_cancer_target]

# nunique(dropna=False) counts exactly what len(series.unique()) counts
print(
    f"Number of pairs of cancer proteins with their ligand in bind: {len(cancer_related_proteins_df)}",
    f"Number of unique cancer proteins: {cancer_related_proteins['target_name'].nunique(dropna=False)}",
    f"  - {bind_cancer_proteins_by_name['target_name'].nunique(dropna=False)} by name_y",
    f"  - {bind_cancer_proteins_by_uniprot['target_name'].nunique(dropna=False)} by uniprot",
    sep="\n",
)

Number of pairs of cancer proteins with their ligand in bind: 67058
Number of unique cancer proteins: 146
  - 91 by name_y
  - 142 by uniprot


### Ligands related to these proteins

In [263]:
# Keep only the pairs whose ligand has a name (name_x is not missing)
has_ligand_name = cancer_related_proteins_df['name_x'].notna()
ligands_related_to_cancer_proteins = cancer_related_proteins_df.loc[has_ligand_name]

named_ligands = set(ligands_related_to_cancer_proteins['name_x'])
targets_with_named_ligand = set(ligands_related_to_cancer_proteins['target_name'])
nb_cancer_targets = cancer_related_proteins['target_name'].nunique(dropna=False)

print(
    f"Number of pairs of ligands related to cancer proteins (keep only the ones for which we have a name): {len(ligands_related_to_cancer_proteins)}",
    f"Number of unique ligands related to cancer proteins: {len(named_ligands)}",
    f"Number of cancer proteins that matched to a ligand: {len(targets_with_named_ligand)} out of {nb_cancer_targets}",
    sep="\n",
)

Number of pairs of ligands related to cancer proteins (keep only the ones for which we have a name): 3624
Number of unique ligands related to cancer proteins: 405
Number of cancer proteins that matched to a ligand: 66 out of 146


In [264]:
# Keep only the ligands that are in DrugBank: rows with a missing
# 'class_superclass' value are dropped.
# NOTE(review): the link between 'class_superclass' and DrugBank is taken from
# the original comment; confirm against the code that builds the merged dataset.
drugs_related_to_cancer_proteins = ligands_related_to_cancer_proteins.dropna(
    subset=['class_superclass']
)

print(
    f"Number of pairs of drugs related to cancer proteins: {len(drugs_related_to_cancer_proteins)}",
    f"Number of unique drugs related to cancer proteins: {drugs_related_to_cancer_proteins['name_x'].nunique(dropna=False)}",
    sep="\n",
)

Number of pairs of drugs related to cancer proteins: 3127
Number of unique drugs related to cancer proteins: 299


### All proteins related to these ligands

In [265]:
# Names of the ligands / drugs found to bind at least one cancer-related protein
cancer_ligand_names = ligands_related_to_cancer_proteins['name_x'].unique()
cancer_drug_names = drugs_related_to_cancer_proteins['name_x'].unique()

# Go back to the full dataset: every pair involving one of those ligands / drugs,
# whatever the protein on the other side
all_prots_related_to_cancer_ligands = df.loc[df['name_x'].isin(cancer_ligand_names)]
all_prots_related_to_cancer_drugs = df.loc[df['name_x'].isin(cancer_drug_names)]

print(
    f"Number of unique proteins related to cancer ligands: {all_prots_related_to_cancer_ligands['target_name'].nunique(dropna=False)}",
    f"Number of unique proteins related to cancer drugs: {all_prots_related_to_cancer_drugs['target_name'].nunique(dropna=False)}",
    sep="\n",
)

Number of unique proteins related to cancer ligands: 1651
Number of unique proteins related to cancer drugs: 1565


### Differentiate between secondary effects and drugs

In [266]:
# target_name column of every pair involving a cancer-related ligand / drug
all_prots_related_to_ligands_target_names = all_prots_related_to_cancer_ligands['target_name']
all_prots_related_to_drugs_target_names = all_prots_related_to_cancer_drugs['target_name']
# Distinct target names of the cancer-related proteins themselves
direct_prots_related_target_names = cancer_related_proteins['target_name'].unique()

# A target counts as "direct" when it is itself a cancer-related protein and as
# "secondary" otherwise; one mask per frame serves both sides of the split.
drug_target_is_cancer = all_prots_related_to_drugs_target_names.isin(direct_prots_related_target_names)
direct_effect_prots = all_prots_related_to_drugs_target_names[drug_target_is_cancer].unique()
secondary_effect_prots = all_prots_related_to_drugs_target_names[~drug_target_is_cancer].unique()

ligand_target_is_cancer = all_prots_related_to_ligands_target_names.isin(direct_prots_related_target_names)
ligands_direct_effect_prots = all_prots_related_to_ligands_target_names[ligand_target_is_cancer].unique()
ligands_secondary_effect_prots = all_prots_related_to_ligands_target_names[~ligand_target_is_cancer].unique()

print(
    f"Number of proteins directly targeted by drugs: {len(direct_effect_prots)}",
    f"Number of proteins representing secondary effect of drugs: {len(secondary_effect_prots)}",
    "",
    f"Number of proteins directly targeted by ligands: {len(ligands_direct_effect_prots)}",
    f"Number of proteins representing secondary effect of ligands: {len(ligands_secondary_effect_prots)}",
    sep="\n",
)

Number of proteins directly targeted by drugs: 61
Number of proteins representing secondary effect of drugs: 1504

Number of proteins directly targeted by ligands: 66
Number of proteins representing secondary effect of ligands: 1585


In [267]:
# Summary: bind the frames built in the previous cells to their final names.
# cancer_related_proteins_df already carries its final name, so it is not
# re-bound here (assigning it to itself was a no-op).

# Pairs (cancer protein, named ligand) and the subset kept as drugs
ligand_related_to_cancer_df = ligands_related_to_cancer_proteins
drug_related_to_cancer_df = drugs_related_to_cancer_proteins

# Every pair of df involving one of those ligands / drugs
all_proteins_related_to_cancer_ligands_df = all_prots_related_to_cancer_ligands
all_proteins_related_to_cancer_drugs_df = all_prots_related_to_cancer_drugs

# Rows of df for the direct / secondary targets of the drugs
drugs_direct_effect_prots_df = df[df['target_name'].isin(direct_effect_prots)]
drugs_secondary_effect_prots_df = df[df['target_name'].isin(secondary_effect_prots)]

# Rows of df for the direct / secondary targets of the ligands
ligands_direct_effect_prots_df = df[df['target_name'].isin(ligands_direct_effect_prots)]
ligands_secondary_effect_prots_df = df[df['target_name'].isin(ligands_secondary_effect_prots)]

In [270]:
# Final recap of the headline counts; the empty strings reproduce the blank
# lines of the report, including the trailing one.
print(
    f"Number of drugs related to cancer proteins: {drugs_related_to_cancer_proteins['name_x'].nunique(dropna=False)}",
    f"Number of ligands related to cancer proteins: {ligands_related_to_cancer_proteins['name_x'].nunique(dropna=False)}",
    "",
    f"Number of proteins directly targeted by drugs: {drugs_direct_effect_prots_df['target_name'].nunique(dropna=False)}",
    f"Number of proteins representing secondary effect of drugs: {drugs_secondary_effect_prots_df['target_name'].nunique(dropna=False)}",
    "",
    sep="\n",
)


Number of drugs related to cancer proteins: 299
Number of ligands related to cancer proteins: 405

Number of proteins directly targeted by drugs: 61
Number of proteins representing secondary effect of drugs: 1504



Note that the set of cancer-related proteins contains 146 distinct proteins, but because we keep only the ones that are linked to a named ligand in BindingDB, we end up using only 66 of those 146 proteins (61 for drugs). So the difference between the number of cancer-related proteins and the number of proteins directly targeted by ligands comes entirely from this filtering step:
```python
    ligands_related_to_cancer_proteins = cancer_related_proteins_df.dropna(subset=['name_x'])
```
In summary:
- we have 146 cancer-related proteins
- they link to 405 named ligands (299 of which are drugs), which themselves link to only 66 of the cancer proteins (61 for drugs)
- across the whole dataset, these 405 ligands link to 1651 proteins (1565 for the drugs)
- of all these proteins, as we've seen, 66 are cancer-related (61 for drugs), and the remaining 1585 (1504 for drugs) are likely proteins on which the ligand or drug has secondary effects

The gap between the 146 cancer-related proteins and the 66 that we actually use exists because only a subset of the cancer proteins can be used in our analysis. As explained above, this subset is determined by the `dropna` on `name_x`, the name of the ligand: a cancer protein is kept only if at least one of its ligands has a name.