# Using KEGG to extract viable drugs

In [6]:
import pandas as pd
from Bio.KEGG import REST

In [7]:
human_pathways = REST.kegg_list("pathway","hsa").read()

In [8]:
print(len(human_pathways))

21706


In [9]:
# Filter all cancer pathways
cancer_pathway = []
for line in human_pathways.rstrip().split("\n"):
    entry, description = line.split("\t")
    if 'cancer' in description:
        cancer_pathway.append((entry, description))

In [10]:
print(len(cancer_pathway))

17


In [11]:
for pathway in cancer_pathway:
    print(pathway)

('path:hsa05200', 'Pathways in cancer - Homo sapiens (human)')
('path:hsa05202', 'Transcriptional misregulation in cancer - Homo sapiens (human)')
('path:hsa05205', 'Proteoglycans in cancer - Homo sapiens (human)')
('path:hsa05206', 'MicroRNAs in cancer - Homo sapiens (human)')
('path:hsa05210', 'Colorectal cancer - Homo sapiens (human)')
('path:hsa05212', 'Pancreatic cancer - Homo sapiens (human)')
('path:hsa05213', 'Endometrial cancer - Homo sapiens (human)')
('path:hsa05215', 'Prostate cancer - Homo sapiens (human)')
('path:hsa05216', 'Thyroid cancer - Homo sapiens (human)')
('path:hsa05219', 'Bladder cancer - Homo sapiens (human)')
('path:hsa05222', 'Small cell lung cancer - Homo sapiens (human)')
('path:hsa05223', 'Non-small cell lung cancer - Homo sapiens (human)')
('path:hsa05224', 'Breast cancer - Homo sapiens (human)')
('path:hsa05226', 'Gastric cancer - Homo sapiens (human)')
('path:hsa05230', 'Central carbon metabolism in cancer - Homo sapiens (human)')
('path:hsa05231', 'Ch

In [12]:
pathway_file = REST.kegg_get(cancer_pathway[0][0]).read()

In [13]:
for line in pathway_file.rstrip().split("\n"):
    print (line)

ENTRY       hsa05200                    Pathway
NAME        Pathways in cancer - Homo sapiens (human)
CLASS       Human Diseases; Cancer: overview
PATHWAY_MAP hsa05200  Pathways in cancer
DISEASE     H00559  von Hippel-Lindau syndrome
            H00646  Odontoonychodermal dysplasia
            H00857  Oligodontia-colorectal cancer syndrome
            H00881  Li-Fraumeni syndrome
            H00895  Basal cell nevus syndrome
            H00947  Pilomatricoma
            H01007  Choroid plexus papilloma
            H01023  Juvenile polyposis syndrome
            H01025  Familial adenomatous polyposis
DRUG        D00094  Tretinoin (JAN/USP/INN)
            D00327  Fluoxymesterone (JAN/USP/INN)
            D00408  Methyltestosterone (JP17/USP/INN)
            D00554  Ethinyl estradiol (USP)
            D00586  Flutamide (JP17/USP/INN)
            D00745  Interferon alfa-2a (USAN/INN)
            D00747  Interferon gamma-1b (USAN/INN)
            D00748  Aldesleukin (USAN/INN)
           

In [14]:
pathway_file = REST.kegg_get(cancer_pathway[1][0]).read()
for line in pathway_file.rstrip().split("\n"):
    print (line)

ENTRY       hsa05202                    Pathway
NAME        Transcriptional misregulation in cancer - Homo sapiens (human)
DESCRIPTION In tumor cells, genes encoding transcription factors (TFs) are often amplified, deleted, rearranged via chromosomal translocation and inversion, or subjected to point mutations that result in a gain- or loss-of- function. In hematopoietic cancers and solid tumors, the translocations and inversions increase or deregulate transcription of the oncogene. Recurrent chromosome translocations generate novel fusion oncoproteins, which are common in myeloid cancers and soft-tissue sarcomas. The fusion proteins have aberrant transcriptional function compared to their wild-type counterparts. These fusion transcription factors alter expression of target genes, and thereby result in a variety of altered cellular properties that contribute to the tumourigenic process.
CLASS       Human Diseases; Cancer: overview
PATHWAY_MAP hsa05202  Transcriptional misregulation in 

In [15]:
pathway_file = REST.kegg_get(cancer_pathway[3][0]).read()
for line in pathway_file.rstrip().split("\n"):
    print (line)

ENTRY       hsa05206                    Pathway
NAME        MicroRNAs in cancer - Homo sapiens (human)
DESCRIPTION MicroRNA (miRNA) is a cluster of small non-encoding RNA molecules of 21 - 23 nucleotides in length, which controls gene expression post-transcriptionally either via the degradation of target mRNAs or the inhibition of protein translation. Using high-throughput profiling, dysregulation of miRNAs has been widely observed in different stages of cancer. The upregulation (overexpression) of specific miRNAs could lead to the repression of tumor suppressor gene expression, and conversely the downregulation of specific miRNAs could result in an increase of oncogene expression; both these situations induce subsequent malignant effects on cell proliferation, differentiation, and apoptosis that lead to tumor growth and progress. The miRNA signatures of cancer observed in various studies differ significantly. These inconsistencies occur due to the differences in the study populations 

In [16]:
lines = pathway_file.split("\n")

In [17]:
for index, line in enumerate(lines):
    if "DRUG" in line:
        start_index = index
        print (index)
        print (line)

15
DRUG        D03021  Azacitidine (JAN/USAN/INN)


In [18]:
for index, line in enumerate(lines):
    if "ORGANISM" in line:
        end_index = index
        print (index)
        print (line)

24
ORGANISM    Homo sapiens (human) [GN:hsa]


In [19]:
line = lines[16].replace(lines[13][:12],"")
print (line)

D03665  Decitabine (USAN/INN)


In [20]:
line.find("(")

19

In [21]:
line.replace(line[line.find("(")-1:],"")

'D03665  Decitabine'

In [22]:
for line in lines[start_index:end_index:1]:
    print(line.replace(line[:12],"").replace(line[line.find("(")-1:],""))

D03021  Azacitidine
D03665  Decitabine
D11163  Cobomarsen
D11164  Cobomarsen sodium
D11444  Tazemetostat
D11485  Tazemetostat hydrobromide
D11551  Valemetost
D11662  Valemetostat tosilate
D11740  Roducitabine


In [23]:
import time
start_time = time.time()
cancer_drugs = list()
cancer_pathways = ['path:hsa05200','path:hsa05202','path:hsa05206',
                   'path:hsa05230','path:hsa05231','path:hsa05235'] # sample three general pathways
for pathway in cancer_pathways:
    pathway_file = REST.kegg_get(pathway).read() # query and read each pathway
    # iterate through each KEGG pathway file, keeping track of which section
    # of the file I am in, only read the drugs in each pathway
    for index, line in enumerate(pathway_file.rstrip().split("\n")):
        if "DRUG" in line:
            start_index = index
        if "ORGANISM" in line:
            end_index = index
    print (pathway, start_index, end_index)
    for line in pathway_file.rstrip().split("\n")[start_index:end_index:1]:
            print (line)
            cancer_drugs.append(line.replace(line[:12],"").replace(line[line.find("(")-1:],""))
print ("Total run time: %.4f" %(time.time()-start_time))

path:hsa05200 13 309
DRUG        D00094  Tretinoin (JAN/USP/INN)
            D00327  Fluoxymesterone (JAN/USP/INN)
            D00408  Methyltestosterone (JP17/USP/INN)
            D00554  Ethinyl estradiol (USP)
            D00586  Flutamide (JP17/USP/INN)
            D00745  Interferon alfa-2a (USAN/INN)
            D00747  Interferon gamma-1b (USAN/INN)
            D00748  Aldesleukin (USAN/INN)
            D00753  Sirolimus (JAN/USAN/INN)
            D00958  Testosterone enanthate (JP17/USP)
            D00961  Bicalutamide (JAN/USP/INN)
            D00965  Nilutamide (USAN/INN)
            D00966  Tamoxifen citrate (JP17/USP)
            D00967  Toremifene citrate (JAN/USAN)
            D01161  Fulvestrant (JAN/USP/INN)
            D01441  Imatinib mesylate (USAN)
            D01534  Dromostanolone propionate (USAN)
            D01977  Gefitinib (JAN/USAN/INN)
            D02106  Arsenic trioxide (JP17/USAN)
            D02714  Everolimus (JAN/USAN/INN)
            D02745  Interfe

path:hsa05202 83 85
DRUG        D00094  Tretinoin (JAN/USP/INN)
            D11073  Tepotinib hydrochloride (USAN)
path:hsa05206 15 24
DRUG        D03021  Azacitidine (JAN/USAN/INN)
            D03665  Decitabine (USAN/INN)
            D11163  Cobomarsen (USAN/INN)
            D11164  Cobomarsen sodium (USAN)
            D11444  Tazemetostat (USAN/INN)
            D11485  Tazemetostat hydrobromide (JAN/USAN)
            D11551  Valemetostat
            D11662  Valemetostat tosilate (JAN)
            D11740  Roducitabine (USAN)
path:hsa05230 5 10
DRUG        D07257  Lonidamine (INN)
            D11712  Pralsetinib (USAN/INN)
            D11713  Selpercatinib (JAN/USAN/INN)
            D11738  Telaglenastat (USAN/INN)
            D11739  Telaglenastat hydrochloride (USAN)
path:hsa05231 5 5
path:hsa05235 5 17
DRUG        D10316  Nivolumab (USAN/INN)
            D10390  Pidilizumab (USAN)
            D10574  Pembrolizumab (USAN)
            D10773  Atezolizumab (USAN/INN)
            D1080

In [24]:
print (len(cancer_drugs))
for drug in cancer_drugs:
    print (drug)

324
D00094  Tretinoin
D00327  Fluoxymesterone
D00408  Methyltestosterone
D00554  Ethinyl estradiol
D00586  Flutamide
D00745  Interferon alfa-2a
D00747  Interferon gamma-1b
D00748  Aldesleukin
D00753  Sirolimus
D00958  Testosterone enanthate
D00961  Bicalutamide
D00965  Nilutamide
D00966  Tamoxifen citrate
D00967  Toremifene citrate
D01161  Fulvestrant
D01441  Imatinib mesylate
D01534  Dromostanolone propionate
D01977  Gefitinib
D02106  Arsenic trioxide
D02714  Everolimus
D02745  Interferon alfa-2b
D02748  Peginterferon alfa-2b
D02815  Alitretinoin
D02970  Aprinocarsen sodium
D03061  Batimastat
D03106  Bexarotene
D03218  Axitinib
D03235  Filgrastim
D03247  Lenograstim
D03252  Bosutinib
D03257  Trastuzumab
D03305  Interferon alfa
D03350  Canertinib dihydrochloride
D03357  Interferon gamma-1a
D03455  Cetuximab
D03658  Dasatinib
D03682  Denileukin diftitox
D03800  Rebimastat
D03802  Tanomastat
D04014  Enzastaurin hydrochloride
D04023  Erlotinib hydrochloride
D04024  Lapatinib ditosylate
D0

In [25]:
unique_drug = list(set(cancer_drugs))
print (len(unique_drug))

320


In [26]:
drug_lst = []
for drug in unique_drug:
    drug_id = drug[:7].rstrip()
    drug_name = drug[7:].lstrip()
    drug_lst.append([drug_id, drug_name])

In [27]:
for i in range(10):
    print (drug_lst[i])

['D10543', 'Apitolisib']
['D10688', 'Abemaciclib']
['D08108', 'Lapatinib']
['D01441', 'Imatinib mesylate']
['D11107', 'Glasdegib maleate']
['D08805', 'Interferon gamma-n1']
['D00965', 'Nilutamide']
['D11775', 'Tarextumab']
['D10316', 'Nivolumab']
['D05350', 'Panitumumab']


In [96]:
import pandas as pd
import numpy as np
df = pd.DataFrame(drug_lst, columns=['Drug_id','Drug_description'])

In [99]:
df.head()

Unnamed: 0,Drug_id,Drug_description
0,D10543,Apitolisib
1,D10688,Abemaciclib
2,D08108,Lapatinib
3,D01441,Imatinib mesylate
4,D11107,Glasdegib maleate


In [100]:
df.to_csv('cancer_drug.csv',index=False)

In [101]:
df = pd.read_csv('cancer_drug.csv')

In [102]:
df.head()

Unnamed: 0,Drug_id,Drug_description
0,D10543,Apitolisib
1,D10688,Abemaciclib
2,D08108,Lapatinib
3,D01441,Imatinib mesylate
4,D11107,Glasdegib maleate
