# Chemicals analysis

In [61]:
import os
import gc
import re
import requests
import time
import random

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import xml.etree.ElementTree as ET
from tqdm import tqdm
import pandas as pd
from concurrent.futures import ThreadPoolExecutor

import seaborn as sns
import networkx as nx
import statsmodels.api as sm
import statsmodels.formula.api as smf

from statsmodels.stats import diagnostic
from scipy import stats

from src.drugbank_XML_drugparser import DrugParser
from src.drugbank_bindingdb_merger import DrugBank_BindingDB_Merger
from src.preprocessing import Preprocessing, ColumnClean

from src.data_paths import *

%matplotlib inline

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Importing the merged BindingDB (BDB) and DrugBank (DB) dataframe


In [62]:

# Load the merged frame from the pickle located at MERGED
# (presumably provided by the star-import of src.data_paths — verify).
# NOTE(review): unpickling can execute arbitrary code; only load files produced by this project.
merged_df = pd.read_pickle(MERGED)

In [63]:
# Show the column labels of the merged frame.
merged_df.columns

Index(['ki', 'ph', 'temp', 'ic50', 'ec50', 'kd', 'kon', 'koff', 'doi',
       'target_name', 'pubchem_cid', 'chebi_id', 'chembl_id', 'drugbank_id',
       'kegg_id', 'zinc_id', 'smiles', 'inchi_key', 'bindingdb_id',
       'swissprot_target_chain_name', 'swissprot_target_chain_entry_name',
       'swissprot_protein_id', 'Unique_ID', 'drugbank_drug_name',
       'drugbank_drug_unii', 'drugbank_drug_toxicity',
       'drugbank_drug_class_kingdom', 'drugbank_drug_class_superclass',
       'drugbank_drug_synonyms', 'drugbank_drug_categories',
       'drugbank_drug_patent_approved', 'drugbank_drug_interaction',
       'Matched_On', 'drugbank_protein_name',
       'drugbank_protein_general_function',
       'drugbank_protein_specific_function', 'drugbank_protein_organism'],
      dtype='object')

In [64]:
# Distinct non-null values of the "swissprot_target_chain_entry_name" column.
merged_df["swissprot_target_chain_entry_name"].dropna().unique()

<StringArray>
[  'POL_HV1BR',   'POL_HV196', 'CP3A4_HUMAN', 'GALK1_HUMAN',   'POL_HV1N5',
 'CASP3_HUMAN', 'CASP1_HUMAN', 'CASP4_HUMAN', 'CASP7_HUMAN', 'CASP6_HUMAN',
 ...
 'CISY2_MYCTU',    'M2_I34A1',  'BMI1_HUMAN',  'CYSM_MYCTU', 'CYSK2_MYCTU',
  'TRYS_CRIFA',  'PDF2_ARATH',  'ANGT_HUMAN', 'TGFB2_HUMAN',  'LUXQ_VIBHA']
Length: 6533, dtype: string

In [65]:
# Distinct non-null values of the "drugbank_protein_name" column.
merged_df["drugbank_protein_name"].dropna().unique()

array(['gag-pol', 'CCNA2', 'CCND1', 'CDK6', 'CDK5', '72', 'CDK7', 'CDK9',
       'CDK2', 'FNTB', 'PGGT1B', 'PIK3CA', 'ITGB1', 'ITGB7', 'ITGA5',
       'PIK3R2', 'PIK3CD', 'parE', 'gyrB', 'ureB', 'PIK3CB', 'GABRB3',
       'GABRA1', 'GABRA2', 'GABRA3', 'GABRA5', 'GABRA4', 'GABRA6',
       'GABRB2', 'APH1A', 'PRKAB1', 'PRKDC', 'MTOR', 'ABL1', 'HCK',
       'EGFR', 'EPHB4', 'PDGFRB', 'RARA', 'TGM2', 'JAK3', 'HTR1B',
       'MAPK9', 'INCENP', 'PDE3B', 'KDM1A', 'HSP90AA1', 'BTK', 'FYN',
       'LYN', 'MAPK1', 'HDAC8', 'PIK3R1', 'HTR2C', 'GHSR', 'PRKAG1',
       'PTGS2', 'GUCY1B1', 'GSTA1', 'TP53', 'ADRA2B', 'CHRNB2', 'MET',
       'ALK', 'RET', 'PPARG', 'GRM2', 'ITGB3', 'ATP1B1', 'HDAC2',
       'MAPKAPK2', 'MDM2', 'RXRA', 'MAPK3', 'MAP2K2', 'CACNB3', 'CFB',
       'TAS1R3', 'TAS1R2', 'GSTP1', 'TAB1', 'FGFR1', 'ROCK1', 'ROCK2',
       'ITGA4', 'CRBN', 'RAD51', 'SCN3A', 'CYP3A5', 'SERPINE1',
       'SERPINC1', 'ITGAV', 'ITGB6', 'RRM2', 'XIAP', 'CACNA2D1', 'P2RX2',
       'GRIN2B', 'KDR', 'GR

## Importing the cancer-related protein table (COSMIC) and filtering merged_df

In [67]:

# Load the COSMIC protein-class table (tab-separated).
protein_classes = pd.read_csv("data/clean/protein_class_COSMIC.tsv", sep='\t')

# Unique, non-null gene symbols in first-seen order.
# Nulls must go: re.escape() below raises TypeError on NaN. Duplicates would only
# bloat the alternation without changing what it matches.
cancer_proteins = protein_classes["Gene"].dropna().unique().tolist()

# One alternation regex that matches any gene symbol as a whole word.
pattern_protein_names = '|'.join(rf"\b{re.escape(term)}\b" for term in cancer_proteins)

# Rows whose SwissProt protein id or DrugBank protein name contains a gene symbol
# (case-insensitive; missing values never match).
# NOTE(review): 'swissprot_protein_id' is matched against gene symbols here, while
# protein_classes also has a 'Uniprot' column — confirm this is the intended comparison.
is_cancer_target = (
    merged_df['swissprot_protein_id'].str.contains(pattern_protein_names, case=False, na=False)
    | merged_df['drugbank_protein_name'].str.contains(pattern_protein_names, case=False, na=False)
)

# Non-inplace reset_index(): the previous row labels are kept in a new "index"
# column, exactly as before, but without mutating a derived frame in place.
filtered_df = merged_df[is_cancer_target].reset_index()

print(filtered_df.shape)
filtered_df.head()

(20199, 38)


Unnamed: 0,index,ki,ph,temp,ic50,ec50,kd,kon,koff,doi,...,drugbank_drug_class_superclass,drugbank_drug_synonyms,drugbank_drug_categories,drugbank_drug_patent_approved,drugbank_drug_interaction,Matched_On,drugbank_protein_name,drugbank_protein_general_function,drugbank_protein_specific_function,drugbank_protein_organism
0,8960,,7.6,22.0,840.0,,,,,10.1021/jm0201722,...,,,,,,,CCND1,Transcription factor binding,Regulatory component of the cyclin D1-CDK4 (DC...,Humans
1,8961,,7.6,22.0,490.0,,,,,10.1021/jm0201722,...,,,,,,,CCND1,Transcription factor binding,Regulatory component of the cyclin D1-CDK4 (DC...,Humans
2,8962,,7.6,22.0,640.0,,,,,10.1021/jm0201722,...,,,,,,,CCND1,Transcription factor binding,Regulatory component of the cyclin D1-CDK4 (DC...,Humans
3,8963,,7.6,22.0,1900.0,,,,,10.1021/jm0201722,...,,,,,,,CCND1,Transcription factor binding,Regulatory component of the cyclin D1-CDK4 (DC...,Humans
4,8964,,7.6,22.0,2800.0,,,,,10.1021/jm0201722,...,,,,,,,CCND1,Transcription factor binding,Regulatory component of the cyclin D1-CDK4 (DC...,Humans


In [98]:
# Print every column label of the COSMIC protein-class table, one per line.
for col_name in protein_classes.columns:
    print(col_name)

Gene
Gene synonym
Ensembl
Gene description
Uniprot
Chromosome
Position
Protein class
Biological process
Molecular function
Disease involvement
Evidence
HPA evidence
UniProt evidence
NeXtProt evidence
RNA tissue specificity
RNA tissue distribution
RNA tissue specificity score
RNA tissue specific nTPM
RNA single cell type specificity
RNA single cell type distribution
RNA single cell type specificity score
RNA single cell type specific nTPM
RNA single nuclei brain specificity
RNA single nuclei brain distribution
RNA single nuclei brain specificity score
RNA single nuclei brain specific nTPM
RNA cancer specificity
RNA cancer distribution
RNA cancer specificity score
RNA cancer specific FPKM
RNA brain regional specificity
RNA brain regional distribution
RNA brain regional specificity score
RNA brain regional specific nTPM
RNA blood cell specificity
RNA blood cell distribution
RNA blood cell specificity score
RNA blood cell specific nTPM
RNA blood lineage specificity
RNA blood lineage distri

In [96]:
# Values shared by the COSMIC "Uniprot" column and the filtered frame's
# "swissprot_protein_id" column.
cosmic_uniprot_values = set(protein_classes["Uniprot"])
filtered_swissprot_values = set(filtered_df["swissprot_protein_id"])
inter = cosmic_uniprot_values & filtered_swissprot_values
len(inter)

45

In [69]:
# Count (a) how many merged_df rows carry a DrugBank protein name that occurs as a
# substring of at least one COSMIC gene symbol, and (b) the total number of
# (row, gene symbol) hits.
#
# The row-by-row version scanned the whole Gene column twice per merged_df row.
# Here every distinct protein name is scanned once and weighted by its number of
# occurrences, which gives the same totals. regex=False matches the name literally,
# so names containing regex metacharacters cannot be misread or raise.
gene_symbols = protein_classes["Gene"].dropna().astype(str)
name_counts = merged_df["drugbank_protein_name"].dropna().value_counts()

counter = 0
total_rows = 0
for prot_name, n_rows in name_counts.items():
    n_hits = int(gene_symbols.str.contains(str(prot_name), regex=False).sum())
    if n_hits:
        counter += int(n_rows)
        total_rows += n_hits * int(n_rows)

print(counter, total_rows)

KeyboardInterrupt: 

In [70]:
# Shape of the merged_df rows whose "drugbank_protein_name" exactly equals a value of the COSMIC "Gene" column.
merged_df[merged_df["drugbank_protein_name"].isin(protein_classes["Gene"])].shape

(20199, 37)

In [71]:
# Shape of the COSMIC rows whose "Gene" value appears in merged_df's "drugbank_protein_name" column.
protein_classes[protein_classes["Gene"].isin(merged_df["drugbank_protein_name"])].shape

(45, 107)

In [72]:
# Values present in both the COSMIC "Gene" column and merged_df's
# "drugbank_protein_name" column.
cosmic_gene_values = set(protein_classes["Gene"])
drugbank_name_values = set(merged_df["drugbank_protein_name"])
overlap = cosmic_gene_values & drugbank_name_values
print("Overlapping values:", overlap)
len(overlap)

Overlapping values: {'MAP3K1', 'PIK3CA', 'PDGFRA', 'EGFR', 'HSP90AB1', 'PPARG', 'IDH2', 'MET', 'ALK', 'EPAS1', 'TP53', 'BCR', 'CD274', 'PIK3CB', 'CDK6', 'ATR', 'RAF1', 'JAK3', 'AKT2', 'MAP2K2', 'RET', 'ABL1', 'PIK3R1', 'IDH1', 'KDR', 'JAK2', 'ATIC', 'ABL2', 'ROS1', 'KRAS', 'FGFR2', 'CCND1', 'CTNNB1', 'NPM1', 'KCNJ5', 'RAC1', 'MDM2', 'FGFR1', 'RARA', 'MTOR', 'BTK', 'MAPK1', 'HSP90AA1', 'JAK1', 'PDGFRB'}


45

In [73]:
# Intersection of the "drugbank_protein_name" values with the COSMIC "Gene" values.
# Set intersection is commutative, so the operand order does not affect the result.
overlap = set(merged_df["drugbank_protein_name"]).intersection(set(protein_classes["Gene"]))
print("Overlapping values:", overlap)
len(overlap)

Overlapping values: {'MAP3K1', 'PIK3CA', 'PDGFRA', 'EGFR', 'HSP90AB1', 'PPARG', 'IDH2', 'MET', 'ALK', 'EPAS1', 'TP53', 'BCR', 'CD274', 'PIK3CB', 'CDK6', 'ATR', 'RAF1', 'JAK3', 'AKT2', 'MAP2K2', 'RET', 'ABL1', 'PIK3R1', 'IDH1', 'KDR', 'JAK2', 'ATIC', 'ABL2', 'ROS1', 'KRAS', 'FGFR2', 'CCND1', 'CTNNB1', 'NPM1', 'KCNJ5', 'RAC1', 'MDM2', 'FGFR1', 'RARA', 'MTOR', 'BTK', 'MAPK1', 'HSP90AA1', 'JAK1', 'PDGFRB'}


45

In [74]:
# For every text-like column, print which boolean outcomes occur when testing
# whether a cell contains "muta" (case-insensitive; missing values count as False).
# The columns are selected from filtered_df — the frame that is actually searched —
# rather than from merged_df, so the selection and the lookup cannot drift apart.
string_columns = filtered_df.select_dtypes(include=["object", "string"])


for col in string_columns:
    print()
    print(col)
    print(filtered_df[col].str.contains("muta", na=False, case=False).unique())
    print()


doi
<BooleanArray>
[False]
Length: 1, dtype: boolean


target_name
<BooleanArray>
[False]
Length: 1, dtype: boolean


chembl_id
<BooleanArray>
[False]
Length: 1, dtype: boolean


drugbank_id
<BooleanArray>
[False]
Length: 1, dtype: boolean


kegg_id
<BooleanArray>
[False]
Length: 1, dtype: boolean


zinc_id
<BooleanArray>
[False]
Length: 1, dtype: boolean


smiles
<BooleanArray>
[False]
Length: 1, dtype: boolean


inchi_key
<BooleanArray>
[False]
Length: 1, dtype: boolean


swissprot_target_chain_name
<BooleanArray>
[False]
Length: 1, dtype: boolean


swissprot_target_chain_entry_name
<BooleanArray>
[False]
Length: 1, dtype: boolean


swissprot_protein_id
[False]


drugbank_drug_name
[False]


drugbank_drug_unii
[False]


drugbank_drug_toxicity
[False  True]


drugbank_drug_class_kingdom
[False]


drugbank_drug_class_superclass
[False]


drugbank_drug_synonyms
[False]


drugbank_drug_categories
[False]


drugbank_drug_patent_approved
[False]


drugbank_drug_interaction
[False]


Match

In [75]:
# Distinct "drugbank_protein_specific_function" texts that mention "mutation"
# (case-insensitive; missing values never match).
mentions_mutation = filtered_df["drugbank_protein_specific_function"].str.contains("mutation", na=False, case=False)
filtered_df.loc[mentions_mutation, "drugbank_protein_specific_function"].unique()


array(['Tyrosine-protein kinase that acts as cell-surface receptor for fibroblast growth factors and plays an essential role in the regulation of cell proliferation, differentiation, migration and apoptosis, and in the regulation of embryonic development. Required for normal embryonic patterning, trophoblast function, limb bud development, lung morphogenesis, osteogenesis and skin development. Plays an essential role in the regulation of osteoblast differentiation, proliferation and apoptosis, and is required for normal skeleton development. Promotes cell proliferation in keratinocytes and immature osteoblasts, but promotes apoptosis in differentiated osteoblasts. Phosphorylates PLCG1, FRS2 and PAK4. Ligand binding leads to the activation of several signaling cascades. Activation of PLCG1 leads to the production of the cellular signaling molecules diacylglycerol and inositol 1,4,5-trisphosphate. Phosphorylation of FRS2 triggers recruitment of GRB2, GAB1, PIK3R1 and SOS1, and mediates a

In [76]:
# Lower-case mutant labels that are searched for (as whole words, case-insensitively)
# in the text columns of the filtered frame.
# Exact duplicates have been removed (first occurrence kept, order preserved): the list
# is only joined into a regex alternation, so repeats added nothing but pattern size.
# Near-duplicates such as "181c"/"y181c" and "e109q"/"fgd e109q" are kept on purpose.
mutants_list = [
    "m351t", "q252h", "t315i", "t315n", "y253f", "d246a", "m243a", "a356t",
    "d130v", "e74f", "m144i", "m918t", "s345c", "t338m", "v523i", "y355f",
    "aeaf", "etkf", "hy58a", "hy58f", "sfsf", "e50d", "e50q", "t338i",
    "s190a", "l300p", "v214a", "v219a", "w223a", "f130v", "f98y", "i100l",
    "h185a", "r501e", "a71v", "d30n", "g73s", "i47l", "i50l", "i50v",
    "i84v", "l23i", "l23v", "l76m", "v32i", "v82a", "v82i", "a-1",
    "triple mutant", "hd32a", "hd32e", "hd32n", "r140q", "r172s", "h1047r", "h176g",
    "d835h", "d835y", "v599e", "f78l", "g719s", "l858r", "t790m", "f203l",
    "c325s", "h976y", "m36i", "l10i", "l90m", "m46i", "i54v", "v82f",
    "k-60c", "v-18c", "mdr-hm", "mdr-qm", "nam-10", "k103n", "l100i", "p236l",
    "v106a", "v179d", "y181c", "y188l", "k510a", "k510e", "g110a", "g110d",
    "l111a", "y477a", "d597n", "n368d", "c59r", "s108n", "n822k", "v559d",
    "d1228h", "y1230h", "h52r", "l306a", "l54v", "f82h", "k89t", "g550e",
    "v82t", "181c", "m336v", "g117e", "l119v", "m114v", "t670i", "l49i",
    "t183a", "q181k", "pkar1", "pkar2", "a16v", "s108t", "e109q", "h40a",
    "g48v", "l63p", "e166d", "t222a", "l83v", "h84d", "a160c", "d97c",
    "f99c", "g157c", "s204c", "t162c", "y209c", "d56c", "f77c", "l140c",
    "r132c", "r144c", "r185c", "r194c", "t126c", "t142c", "c1a", "f82v",
    "del 661-681", "fgd e109q", "fgd h40a", "v6",
]


In [79]:
# Text-like columns of the filtered frame.
string_columns = filtered_df.select_dtypes(include=["object", "string"])

# Alternation regex matching any mutant label as a whole word.
mutants_protein_names = '|'.join(r"\b" + re.escape(term) + r"\b" for term in mutants_list)

# Gather every cell value, across all text columns, that mentions a mutant label
# (case-insensitive; missing values never match).
matches = []

for col in string_columns:
    has_mutant = filtered_df[col].str.contains(mutants_protein_names, case=False, na=False)
    col_matches = filtered_df.loc[has_mutant, col].tolist()
    matches += col_matches

In [85]:
# Collapse the collected cell values to the distinct ones.
set(matches)

{'Isocitrate dehydrogenase [NADP] cytoplasmic [R132C]',
 'Phosphatidylinositol 3-kinase regulatory subunit alpha/4,5-bisphosphate 3-kinase catalytic subunit alpha isoform [H1047R]',
 'Tyrosine-protein kinase ABL1 [T315I]/Breakpoint cluster region protein'}

In [83]:
# Display the "target_name" column of the filtered frame.
filtered_df["target_name"]

0        Cyclin-dependent kinase 4/G1/S-specific cyclin-D1
1        Cyclin-dependent kinase 4/G1/S-specific cyclin-D1
2        Cyclin-dependent kinase 4/G1/S-specific cyclin-D1
3        Cyclin-dependent kinase 4/G1/S-specific cyclin-D1
4        Cyclin-dependent kinase 4/G1/S-specific cyclin-D1
                               ...                        
20194    Cyclin-dependent kinase 6/G1/S-specific cyclin-D1
20195    Cyclin-dependent kinase 6/G1/S-specific cyclin-D1
20196    Cyclin-dependent kinase 6/G1/S-specific cyclin-D1
20197    Cyclin-dependent kinase 6/G1/S-specific cyclin-D1
20198    Cyclin-dependent kinase 6/G1/S-specific cyclin-D1
Name: target_name, Length: 20199, dtype: string

In [89]:
# Display the "target_name" column with missing values dropped.
filtered_df["target_name"].dropna()

0        Cyclin-dependent kinase 4/G1/S-specific cyclin-D1
1        Cyclin-dependent kinase 4/G1/S-specific cyclin-D1
2        Cyclin-dependent kinase 4/G1/S-specific cyclin-D1
3        Cyclin-dependent kinase 4/G1/S-specific cyclin-D1
4        Cyclin-dependent kinase 4/G1/S-specific cyclin-D1
                               ...                        
20194    Cyclin-dependent kinase 6/G1/S-specific cyclin-D1
20195    Cyclin-dependent kinase 6/G1/S-specific cyclin-D1
20196    Cyclin-dependent kinase 6/G1/S-specific cyclin-D1
20197    Cyclin-dependent kinase 6/G1/S-specific cyclin-D1
20198    Cyclin-dependent kinase 6/G1/S-specific cyclin-D1
Name: target_name, Length: 20199, dtype: string

In [None]:
for protein in filter