In [15]:
import bs4
import requests

base = 'https://www.ebi.ac.uk'
urls = {'https://www.ebi.ac.uk/merops/cgi-bin/smi_summary?mid=J24.100'} # use set to avoid duplicates
max_length = 1

for url in urls:
    content = requests.get(url).content

    soup = bs4.BeautifulSoup(content, 'lxml')
    
    link = soup.find_all(class_ = "inhibit button")
    
    link = base + (link[0].get('href'))
    
    print(link)
    
#     links = {base + link.get('href') for link in links}
#     urls.update(links)
    
    print(urls)
    
    if len(urls) > max_length: # for demonstration purposes, include this so we don't run forever..
        break

https://www.ebi.ac.uk/merops/cgi-bin/inhibitors.pl?id=J24.100
{'https://www.ebi.ac.uk/merops/cgi-bin/smi_summary?mid=J24.100'}


In [1]:
import bs4
import requests
import pubchempy as pcp
import time

count = 0
name_mernum_edge_list = []
name_cid_edge_list = []
cid_mernum_edge_list = []
cid_smiles_edge_list = []

no_inhibits_link_urls = []
no_mernum_urls = []


base = 'https://www.ebi.ac.uk'

urls = set({})

content = requests.get('https://www.ebi.ac.uk/merops/cgi-bin/smi_index').content

soup = bs4.BeautifulSoup(content, 'lxml')
table = soup.find('table')
links = table.find_all('a') #All the links of the table
#     print(links)
links = {base + link.get('href') for link in links}
urls.update(links)

# print(urls)


for url in urls:
    count+=1
    if count % 100 == 0:
        print(count)
    time.sleep(1) #Add slight delay to each iteration to avoid overloading pcp servers. Unfortunately makes everything pretty slow
    content = requests.get(url).content

    soup = bs4.BeautifulSoup(content, 'lxml')
    
    inhibit_link = soup.find_all(class_ = "inhibit button")
    
    if len(inhibit_link) != 0:
        inhibit_link = base + (inhibit_link[0].get('href')) #The link to the page listing the inhibitors 
        
    dt_tags = soup.find_all('dt'); #First one is always 'common name'. There are usually 'other names' and occasionally 'CID at pubchem'
    dt_strings = []
    other_names = []
    for tag in dt_tags: #Storing the text for future checks. This array is parallel to dt_tags
        dt_strings.append(tag.string)
    
    cids = set()
    
    try:
        CID_index = dt_strings.index('CID at PubChem') #See if the CID is already on the page
        cids.add(int(dt_tags[CID_index].find_next_siblings('dd')[0].string)) #Goes to the 'CID at PubChem' and finds the text following it
                                                                             #Casts the cid to an int so it's the same type as the later cids
    except:
        pass
        
    try:
        other_names_index = dt_strings.index('Other names') #Store other names in case the common name yields no results at pubchem
        other_names = dt_tags[other_names_index].find_next_siblings('dd')[0].string.split('; ') #Multiple names are separated by ; so split separates them into a list
        
    except:
        pass
    
    common_name = dt_tags[0].find_next_siblings('dd')[0].string
    compounds = pcp.get_compounds(common_name, 'name') #Stores all the compounds resulting from pubchempy lookup

        
    for name in other_names:
        cs_ = pcp.get_compounds(name, 'name') #get compound from each 'other name' and check for duplicates
        for c_ in cs_: #To avoid duplicates (unfortunately I couldn't use a set here because compounds aren't hashable)
            if c_ not in compounds:
                compounds.append(c_)

    
    for compound in compounds: #Going to every compound in the list, find the cid and smiles
        cid = compound.cid
        cids.add(cid)
        smiles = compound.isomeric_smiles
        cid_smiles_edge_list.append((cid, smiles))
        
#     print(common_name, cids)

    try:
        inhibit_soup = bs4.BeautifulSoup(requests.get(inhibit_link).content, 'lxml')
        inhibit_table = inhibit_soup.find('table').find_all('td')
        for row in inhibit_table: #Go to each row of the inhitbit_table
            for item in row.find_all('a'): #unfortunately includes refrences too, but we can just look for 'pepsum' in the link
                temp_link = item.get('href')
                if 'pepsum' in temp_link:
                    mernum_link = base + temp_link
                    mernum_soup = bs4.BeautifulSoup(requests.get(mernum_link).content, 'lxml') #Go to the merid page
                    mernum_inlines = mernum_soup.find_all(class_ = 'inline')
                    mernum = ''
                    i = 0
                    while 'MER' not in mernum: #The mernum is almost always the first element, but occasionally there are additional background links. This keeps looping until 'MER' text is found
                        if i == len(mernum_inlines):
                            break
                        mernum = mernum_inlines[i].text
                        i+=1
                    if mernum == '': #If not mernum was found, skip the rest..
                        no_mernum_urls.append(mernum_link)
                        continue
                    if len(cids) == 0: #All CIDs found are used in the edge list. If no CIDs were found, common name is used instead
                        name_mernum_edge_list.append((common_name, mernum))
                    else:
                        for cid in cids:
                            cid_mernum_edge_list.append((cid, mernum))
    except:
#         print('oop ', url)
        no_inhibits_link_urls.append(url)
        
#     merid_col = inhibit_table[0]['MERID']
#     for merid in merid_col:
#         name_merid_edge_list.append((common_name, merid))

100
200
300
400
500
600
700
800
900
1000
1100
1200
1300


In [2]:
inhibit_soup = bs4.BeautifulSoup(requests.get('https://www.ebi.ac.uk/merops/cgi-bin/inhibitors.pl?id=J22.400').content, 'lxml')
table = inhibit_soup.find('table').find_all('td')
for e in table:
    for j in (e.find_all('a')):
        if(len(j) > 0):
            p = j.get('href')
            if 'pepsum' in p:
                print(p)

/merops/cgi-bin/pepsum?mid=M01.001
/merops/cgi-bin/pepsum?mid=M01.001
/merops/cgi-bin/pepsum?mid=M01.001
/merops/cgi-bin/pepsum?mid=M01.001


In [6]:
print(len(cid_mernum_edge_list))
print(cid_mernum_edge_list)

39565
[(56684141, 'MER0000589'), (137347853, 'MER0000928'), (137347853, 'MER0000928'), (124628, 'MER0001019'), (45486355, 'MER0000375'), (45486355, 'MER0000376'), (45486355, 'MER0000383'), (45486355, 'MER0001514'), (10865444, 'MER0001516'), (10089677, 'MER0001090'), (89921642, 'MER0000295'), (134692755, 'MER0000295'), (60800, 'MER0001063'), (5362422, 'MER0001063'), (60800, 'MER0001063'), (5362422, 'MER0001063'), (60800, 'MER0001084'), (5362422, 'MER0001084'), (60800, 'MER0001080'), (5362422, 'MER0001080'), (60800, 'MER0001090'), (5362422, 'MER0001090'), (60800, 'MER0001067'), (5362422, 'MER0001067'), (60800, 'MER0001077'), (5362422, 'MER0001077'), (60800, 'MER0002384'), (5362422, 'MER0002384'), (60800, 'MER0013587'), (5362422, 'MER0013587'), (60800, 'MER0013586'), (5362422, 'MER0013586'), (60800, 'MER0001108'), (5362422, 'MER0001108'), (60800, 'MER0005214'), (5362422, 'MER0005214'), (60800, 'MER0001126'), (5362422, 'MER0001126'), (60800, 'MER0003902'), (5362422, 'MER0003902'), (60800, 

In [5]:
no_dupes_cid_mernum_edge_list = list(set(cid_mernum_edge_list)) #cast the list to a set and back to a list to remove dupelicates
no_dupes_cid_smiles_edge_list = list(set(cid_smiles_edge_list))

In [8]:
name_mernum_edge_list

[('Ser-Trp-Phe-Pro', 'MER0157380'),
 ('N-mercaptoacetyl-Phe-Tyr-amide', 'MER0001024'),
 ('trifluoroacetyl-peptide inhibitors of elastases ', 'MER0000142'),
 ('trifluoroacetyl-peptide inhibitors of elastases ', 'MER0000142'),
 ('trifluoroacetyl-peptide inhibitors of elastases ', 'MER0000142'),
 ('trifluoroacetyl-peptide inhibitors of elastases ', 'MER0000142'),
 ('trifluoroacetyl-peptide inhibitors of elastases ', 'MER0000142'),
 ('trifluoroacetyl-peptide inhibitors of elastases ', 'MER0000142'),
 ('trifluoroacetyl-peptide inhibitors of elastases ', 'MER0000142'),
 ('trifluoroacetyl-peptide inhibitors of elastases ', 'MER0000142'),
 ('aldehyde or ketone inhibitors of metallopeptidases ', 'MER0001190'),
 ('aldehyde or ketone inhibitors of metallopeptidases ', 'MER0013450'),
 ('aldehyde or ketone inhibitors of metallopeptidases ', 'MER0013450'),
 ('aldehyde or ketone inhibitors of metallopeptidases ', 'MER0013450'),
 ('GC375', 'MER0002195'),
 ('SCH 697466', 'MER0005221'),
 ('Tyr-Phe-hydro

In [8]:
count #The number of

1324

In [14]:
import pubchempy as pcp

In [19]:
results = pcp.get_compounds('ovalicin ', 'name')
print(results)
# print(results[0].ci)

[Compound(10957430), Compound(284995), Compound(442372), Compound(51049440), Compound(129638019), Compound(153274278)]


In [50]:
protease_file = open('D:/Downloads/protease.lib')
lines = protease_file.readlines(200000000)
# for line in lines:
#     print(line)

>MER0000001 - chymotrypsin B (Homo sapiens) [S01.152]#S01A#{peptidase unit: 34-263}~source CTRB_HUMAN~



In [46]:
import os
print(os.path.getsize('D:/Downloads/protease.lib'))

854456305


In [83]:
protease_file = open('D:/Downloads/protease.lib')
lines = protease_file.readlines()
prev_mernum = lines.pop(0)[1:11]
mernum_aasequence_dict = {}
prev_sequence = ''

for line in lines:
    count += len(line)
    if line[1:4] == 'MER': #Checks if it's at a mernum line. If so, insert the mernum in the dictionary with the sequence and reset the sequence
        mernum_aasequence_dict[prev_mernum] = prev_sequence
        prev_mernum = line[1:11]
        prev_sequence = ''
    else: #Otherwise keep adding to the previous sequence
        prev_sequence+=line[:-2] #This chops off the '\n' at the end of every line

mernum_aasequence_dict[prev_mernum] = prev_sequence #I update the dictionary at each new mernum line, so I have to manually insert the last one

In [84]:
mernum_aasequence_dict

{'MER0000001': 'MAFLWLLSCWALLGTTFGCGVPAIHPVLSGLSRIVNGEDAVPGSWPWQVSLQDKTGFHFGGSLISEDWVVTAAHCGVRTSDVVVAGEFDQGSDEENIQVLKIAKVFKNPKFSILTVNNITLLKLATPARFSQTVSAVCLPSADDDFPAGTLCATTGWGKTKYNANKTPDKLQQAALPLSNAECKKSWGRRITDVMICAGASGVSSCMGDSGGPLVCQKDGAWTLVGIVSWGSDTCSSSPGVYARVTKLIPWVQKILAA',
 'MER0000002': 'MNFLWLLSYCALLGTAFGCGVPAIQPVLSGLSRIVNGEEAVPGSWPWQVSLQDKTGFHFGGSLINENWVVTAAHCGVTTSDVVVAGEFDQGSSSEKIQKLKIAKVFKNSKYNSLTINNITLLKLSTAASFSQTVSAVCLPSASDDFAAGTTCVTTGWGLTRYTNANTPDRLQQASLPLSNTNCKKYWGTKIKDAMICAGASGVSSCMGDSGGPLVCKKNGAWTLVGIVSWGSSTCSSTPGVYARVTALVNWVQQTLAA',
 'MER0000003': 'MAFLWLVSCFALVGATFGCGVPTIQPVLTGLSRIVNGEDAIPGSWPWQVSLQDKTGFHFGGSLISEDWVVTAAHCGVKTSDVVVAGEFDQGSDEENIQVLKIAQVFKNPKFNMFTVRNITLLKLATPAQFSETVSAVCLPNVDDDFPPGTVCATTGWGKTKYNALKTPEKLQQAALPVSEADCKKSWGSKITDVMTCAGASGVSSCMGDSGGPLVCQKDGVWTLAGIVSWGSGVCSSTPAVYSRVTALMPWVQQILEA',
 'MER0000004': 'CGVPAIQPVLSGLARIVNGEDAVPGSWPWQVSLQDSTGFHFCGGSLISEDWVVTAAHCGTTSDVVVAGEFDQGLETEDTQVLKIGKVFKNPKFSILTVRNDITLLKLATPAQFSETVSVCLPSADEDFPAGMLCATTGWGKTKYNALKTPDKL