In [1]:
import bs4
import requests
import pubchempy as pcp
import time

# Accumulators used by the scraping loop.
count = 0  # number of molecule URLs visited so far
name_mernum_edge_list, name_cid_edge_list = [], []
cid_mernum_edge_list, cid_smiles_edge_list = [], []

# URLs set aside for later inspection: molecule pages whose inhibitor page could
# not be processed, and peptidase pages on which no MER number was found.
no_inhibits_link_urls, no_mernum_urls = [], []

base = 'https://www.ebi.ac.uk'

# Download the MEROPS index page and turn every link inside its first table
# into an absolute URL. A set is used so repeated links are visited only once.
content = requests.get('https://www.ebi.ac.uk/merops/cgi-bin/smi_index').content
soup = bs4.BeautifulSoup(content, 'lxml')
table = soup.find('table')
links = set(base + anchor.get('href') for anchor in table.find_all('a'))
urls = set(links)


# Main scraping loop. For every molecule page in `urls`:
#   1. read the common name, the optional 'Other names' and the optional
#      'CID at PubChem' entries from the page's <dt>/<dd> pairs;
#   2. look the names up on PubChem (pubchempy) to collect CIDs and SMILES;
#   3. follow the "inhibit button" link and, for each 'pepsum' link found there,
#      read the MER number and record a (cid, mernum) edge per CID, or a
#      (common_name, mernum) edge when no CID is known.
#
# NOTE(review): name_cid_edge_list is never appended to by this loop, so it
# stays empty. Confirm whether (common_name, cid) edges were meant to be stored.

# Molecule URLs skipped because PubChem answered with an HTTP error; kept so
# they can be retried instead of being lost silently.
pubchem_failed_urls = []

for url in urls:
    try:
        # Progress indicator: one dot per URL, running total every 100 URLs.
        print('.', end = '')
        count+=1
        if count % 100 == 0:
            print(count)
        content = requests.get(url).content

        soup = bs4.BeautifulSoup(content, 'lxml')

        # Link to the page listing what this molecule inhibits (None when the
        # page has no such button).
        inhibit_buttons = soup.find_all(class_ = "inhibit button")
        if inhibit_buttons:
            inhibit_link = base + (inhibit_buttons[0].get('href'))
        else:
            inhibit_link = None

        # The first <dt> is treated as the common name. 'Other names' and
        # 'CID at PubChem' are looked up by label and may be absent.
        dt_tags = soup.find_all('dt')
        dt_strings = [tag.string for tag in dt_tags] # parallel to dt_tags
        other_names = []

        cids = set()

        try:
            CID_index = dt_strings.index('CID at PubChem') # ValueError if the page has no CID entry
            # Cast to int so it has the same type as the CIDs returned by pubchempy.
            cids.add(int(dt_tags[CID_index].find_next_siblings('dd')[0].string))
        except (ValueError, IndexError, TypeError):
            pass # no usable CID on the page; rely on the PubChem name lookup below

        try:
            other_names_index = dt_strings.index('Other names') # ValueError if absent
            # Multiple names are separated by '; '.
            other_names = dt_tags[other_names_index].find_next_siblings('dd')[0].string.split('; ')
        except (ValueError, IndexError, AttributeError):
            pass # no alternative names listed

        common_name = dt_tags[0].find_next_siblings('dd')[0].string
        compounds = pcp.get_compounds(common_name, 'name') # all compounds PubChem returns for the common name

        for name in other_names:
            # A list membership test is used for de-duplication (the original
            # author notes that compounds are not hashable).
            for candidate in pcp.get_compounds(name, 'name'):
                if candidate not in compounds:
                    compounds.append(candidate)

        for compound in compounds: # record the CID and SMILES of every compound found
            cid = compound.cid
            cids.add(cid)
            smiles = compound.isomeric_smiles
            cid_smiles_edge_list.append((cid, smiles))

        if inhibit_link is None:
            no_inhibits_link_urls.append(url)
            continue

        try:
            inhibit_soup = bs4.BeautifulSoup(requests.get(inhibit_link).content, 'lxml')
            inhibit_table = inhibit_soup.find('table').find_all('td')
            for row in inhibit_table: # each <td> cell of the inhibitor table
                for item in row.find_all('a'):
                    # The cells also contain reference links; only 'pepsum'
                    # links are followed. Anchors without an href are skipped.
                    temp_link = item.get('href')
                    if not temp_link or 'pepsum' not in temp_link:
                        continue
                    mernum_link = base + temp_link
                    mernum_soup = bs4.BeautifulSoup(requests.get(mernum_link).content, 'lxml')
                    mernum_inlines = mernum_soup.find_all(class_ = 'inline')
                    # Take the first 'inline' element whose text contains 'MER'.
                    # If none does, mernum stays '' so the page is reported in
                    # no_mernum_urls and unrelated text is never stored.
                    mernum = next(
                        (inline.text for inline in mernum_inlines if 'MER' in inline.text),
                        '',
                    )
                    if mernum == '':
                        no_mernum_urls.append(mernum_link)
                        continue
                    if len(cids) == 0: # no CID known: fall back to the common name
                        name_mernum_edge_list.append((common_name, mernum))
                    else:
                        for cid in cids:
                            cid_mernum_edge_list.append((cid, mernum))
        except Exception:
            # Best effort: any failure while processing the inhibitor page
            # (request error, missing table, ...) marks this molecule URL.
            # `Exception` rather than a bare except, so Ctrl-C / kernel
            # interrupt still stops the scrape.
            no_inhibits_link_urls.append(url)

    except pcp.PubChemHTTPError:
        print('Server Busy')
        pubchem_failed_urls.append(url) # this molecule was not processed
        time.sleep(5) # short delay to avoid overloading the PubChem servers

....................................................................................................100
....................................................................................................200
....................................................................................................300
....................................................................................................400
....................................................................................................500
....................................................................................................600
....................................................................................................700
....................................................................................................800
....................................................................................................900
................................................................

In [8]:
count # Number of URLs visited by the scraping loop (one per molecule link taken from the index table)

1324

In [20]:
# Remove duplicate edges: each list is collapsed into a set and expanded back
# into a list. The resulting order is whatever the set iteration yields.
cid_smiles_edge_list = [*set(cid_smiles_edge_list)]
cid_mernum_edge_list = [*set(cid_mernum_edge_list)]
name_cid_edge_list = [*set(name_cid_edge_list)]
name_mernum_edge_list = [*set(name_mernum_edge_list)]


In [21]:
# Build {MER number: sequence} from the MEROPS protease library
# (4th item on https://www.ebi.ac.uk/merops/download_list.shtml).
# Change the path below to wherever the file was saved.
#
# A line whose characters 1-3 are 'MER' is a header; its characters 1-10 are
# used as the MER number. Every following line up to the next header is
# sequence text and is concatenated.
mernum_sequence_dict = {}
prev_mernum = None
sequence_parts = []

with open('D:/Downloads/protease.lib') as protease_file: # closed automatically on exit
    for line_number, line in enumerate(protease_file):
        # The first line is always taken as a header, matching the file layout
        # this parser was written for.
        if line_number == 0 or line[1:4] == 'MER':
            if prev_mernum is not None:
                mernum_sequence_dict[prev_mernum] = ''.join(sequence_parts)
            prev_mernum = line[1:11]
            sequence_parts = []
        else:
            # Strip only the trailing newline/whitespace. Slicing a fixed number
            # of characters would also cut sequence letters whenever the line
            # ending is a single '\n' or the last line has no newline at all.
            sequence_parts.append(line.rstrip())

# The dictionary is updated when the *next* header is reached, so the final
# record has to be stored explicitly (nothing to store for an empty file).
if prev_mernum is not None:
    mernum_sequence_dict[prev_mernum] = ''.join(sequence_parts)

In [24]:
import pickle

# Pickle each edge list into its own file in the working directory. The `with`
# blocks make sure every file is flushed and closed as soon as it is written.
with open("name_mernum_edge_list.p", "wb") as f:
    pickle.dump(name_mernum_edge_list, f)
with open("name_cid_edge_list.p", "wb") as f:
    pickle.dump(name_cid_edge_list, f)
with open("cid_mernum_edge_list.p", "wb") as f:
    pickle.dump(cid_mernum_edge_list, f)
with open("cid_smiles_edge_list.p", "wb") as f:
    pickle.dump(cid_smiles_edge_list, f)

In [19]:
# Reload a pickled edge list. Only unpickle files you created yourself:
# pickle.load can execute arbitrary code from the file.
with open("name_mernum_edge_list.p", "rb") as f:
    name_mernum_edge_list = pickle.load(f)

In [26]:
import bz2

# The sequence dictionary is very large, so it is written through a bz2
# compressor instead of as a plain pickle file.
with bz2.open('mernum_sequence_dict.pbz2', 'wb') as f:
    pickle.dump(mernum_sequence_dict, f)

In [1]:
import pickle
import _pickle as cPickle
import bz2

# Load any compressed pickle file
def decompress_pickle(file):
    """Load and return the object stored in a bz2-compressed pickle file.

    Parameters
    ----------
    file : str, path-like or file object
        Passed straight to ``bz2.BZ2File`` and opened for reading.

    Returns
    -------
    object
        Whatever object was pickled into the file.

    Notes
    -----
    Unpickling can execute arbitrary code, so only use this on trusted files.
    """
    # The context manager closes the compressed stream even if unpickling fails.
    with bz2.BZ2File(file, 'rb') as compressed:
        return cPickle.load(compressed)

In [6]:
mernum_sequence_dict = decompress_pickle('mernum_sequence_dict.pbz2') # Reload the bz2-compressed pickle of the MER number -> sequence dictionary