# Web scraping the GuideToPharmacology website

## Output data

1. edgelist.pickle -> (ligand_id, receptor_id). Size: [21569, 2]
2. receptors.pickle -> (receptor_id, aa). Size: [752]
3. smiles.pickle -> (ligand_id, smiles). Size: [2825]
4. aa.pickle -> (ligand_id, aa). Size: [1240]

In [1]:
from bs4 import BeautifulSoup
import numpy as np
import os
import requests
import yaml
import pandas as pd
from tqdm import tqdm
import pickle
import re

In [818]:
yaml_file = 'GuideToPharmacology.yml'

# Read the scraping catalog. The with-block closes the file handle as soon as
# parsing is done instead of leaving it open until garbage collection.
with open(yaml_file, 'r') as catalog_file:
    catalog = yaml.safe_load(catalog_file)
base = catalog['base']  # prefix that the page paths below are appended to
urls = catalog['urls']  # landing-page paths, each appended to `base`

## Landing page -> get Receptor names and links

In [819]:
def get_soup_table(response, *args, **kwargs):
    """Parse ``response.text`` as HTML and return its first matching ``<table>``.

    Any extra positional or keyword arguments are forwarded to
    ``BeautifulSoup.find`` after the ``'table'`` tag name, so callers can
    narrow the match (for example with an attribute dictionary). The result
    is whatever ``find`` returns.
    """
    parsed_page = BeautifulSoup(response.text, 'html.parser')
    return parsed_page.find('table', *args, **kwargs)

def get_links(table, inclusion_word):
    """Collect ``{link text: href}`` for the table cells that link to a target.

    For every ``<td>`` inside every ``<tr>`` of ``table`` the first ``<a>``
    element is inspected. It is kept when its ``href`` contains
    ``inclusion_word``. Spaces and newlines are stripped from the link text
    before it is used as the key, so two links with the same cleaned text
    overwrite each other (the last one wins).

    Parameters:
        table: a parsed table element exposing ``find_all``.
        inclusion_word: substring that must occur in the ``href``.

    Returns:
        dict mapping the cleaned link text to the raw ``href`` value.
    """
    links = {}
    for table_row in table.find_all('tr'):
        for entry in table_row.find_all('td'):
            anchor = entry.find('a')
            if anchor is None:
                continue
            # .get() rather than ['href']: an <a> without an href attribute
            # is skipped instead of aborting the scan with a KeyError.
            link = anchor.get('href')
            if link is None or inclusion_word not in link:
                continue
            name = anchor.text.replace(' ', '').replace('\n', '')
            links[name] = link

    return links

In [820]:
# Visit every landing page listed in the catalog and merge the receptor links
# found on each. Keys are the cleaned link texts, so the final count is the
# number of distinct names across all pages.
receptor_links = {}
for url in urls:
    response = requests.get(base + url)
    table = get_soup_table(response)
    if table is None:
        # A page without any <table> would make get_links fail on None and
        # abort the whole loop; report it and move on to the next page.
        print(f'No table found at {base + url}, skipping')
        continue
    links = get_links(table, 'familyId')
    receptor_links.update(links)
print(f'There are a total of {len(receptor_links)} unique receptors')

There are a total of 373 unique receptors


### Get complete data tables (pKd, pKi, etc)

In [None]:
def get_table(response, receptor_name, classes = ('agonists', 'antagonists'), uniprot_pattern = 'https://www.uniprot.org/uniprot/(.*)",'):
    """Parse the ligand tables of one receptor page into a DataFrame.

    Parameters:
        response: raw page body as bytes; it is parsed with BeautifulSoup and
            also ``.decode()``d for the UniProt regex search.
        receptor_name: value repeated in the ``Name`` column.
        classes: ``id`` values of the ``<table>`` elements to read. Each id is
            also recorded as the ``Class`` of the ligands found in that table.
            A table that is absent from the page is skipped.
        uniprot_pattern: regular expression with one capture group; its first
            match on the decoded page fills the ``Uniprot`` column.

    Returns:
        pandas.DataFrame with the columns ``Name``, ``ID``, ``Ligand``,
        ``Action``, ``Value``, ``Parameter``, ``Uniprot`` and ``Class``, one
        row per ligand link.

    Raises:
        IndexError: ``uniprot_pattern`` does not match the page.
        ValueError: raised by pandas when the collected cells do not line up
            in groups of seven per ligand link, which leaves the columns with
            different lengths.
    """
    # Cells holding only this arrow glyph are ignored. The second spelling is
    # the same character after its UTF-8 bytes were mis-read as Latin-1; it is
    # kept so that either form of the marker is recognised.
    skipped_cells = ('⤷', 'â¤·')

    cell_texts = []
    ligand_ids = []
    ligand_classes = []

    soup = BeautifulSoup(response, 'html.parser')

    for class_ in classes:
        table = soup.find('table', {'id': class_})
        if table is None:
            # The page has no table with this id, so there is nothing to read
            # for this class; the other classes are still processed.
            continue

        for table_body in table.find_all('tbody'):
            for cell in table_body.find_all('td'):
                text = cell.text.replace('\n', '').replace(' ', '')
                if text and text not in skipped_cells:
                    cell_texts.append(text)
                for anchor in cell.find_all('a'):
                    # Anchors without an href are ignored; this replaces the
                    # former bare ``except`` around ``href['href']``.
                    target = anchor.get('href', '')
                    if 'ligandId=' in target and ')' not in target:
                        ligand_ids.append(target.split('ligandId=')[1])
                        ligand_classes.append(class_)

    uniprot_matches = re.findall(uniprot_pattern, response.decode())
    if not uniprot_matches:
        # Same exception type as the former ``[0]`` lookup, clearer message.
        raise IndexError(f'no UniProt link found on the page of {receptor_name}')

    # The column slices below take every seventh collected cell, starting at
    # offsets 0, 2, 3 and 4 within each group of seven.
    results = {}
    results['Name'] = [receptor_name] * len(ligand_ids)
    results['ID'] = ligand_ids
    results['Ligand'] = cell_texts[::7]
    results['Action'] = cell_texts[2::7]
    results['Value'] = cell_texts[3::7]
    results['Parameter'] = cell_texts[4::7]
    results['Uniprot'] = [uniprot_matches[0]] * len(ligand_ids)
    results['Class'] = ligand_classes
    return pd.DataFrame(results)

# Download every receptor page and parse its ligand tables. A receptor whose
# page cannot be fetched or parsed is reported and skipped, so one failure
# does not discard the tables already collected.
dataframes = []

for receptor_name, link in receptor_links.items():
    print(f'Now running {receptor_name}')
    url = base + link
    try:
        # The request lives inside the try so that network errors are handled
        # like parse errors; the timeout stops a stalled connection from
        # blocking the loop indefinitely.
        response = requests.get(url, timeout = 60).content
        dataframes.append(get_table(response, receptor_name))
    except Exception as e:
        print(f'Error with {receptor_name} {e}')
    


In [873]:
# Stack the per-receptor tables, keep only the rows whose 'Uniprot' entry is
# exactly six characters long, and write the result to gpcr.csv without the
# index column.
dataframe = pd.concat(dataframes)
cleaning_index = [len(accession) == 6 for accession in dataframe['Uniprot']]
dataframe = dataframe[cleaning_index]
dataframe.to_csv('gpcr.csv', index=False)

### Get edge list of receptor ligands
#### Note: This functionality is replaced in the section below that downloads all of the dataframes

In [877]:
# Two-column array pairing each row's ligand 'ID' with its 'Uniprot' value.
edgelist = dataframe[['ID', 'Uniprot']].values

# Distinct ids that the download steps further down iterate over.
unique_ligands = dataframe['ID'].unique()
unique_receptors = dataframe['Uniprot'].unique()

# Empty containers; `smiles` and `aa` are populated by the ligand loop.
receptors = {}
smiles = {}
aa = {}


In [None]:
# Disabled legacy edge-list scraper. It is kept as a bare string literal, so
# none of the code inside it is executed. The text refers to `patterns`, which
# is not defined in any of the cells shown here.
"""def get_uniprot_ligand(url, patterns): # given a url of a receptor landing page
    results = []
    page = requests.get(url).content.decode()
    for _, pattern in patterns.items():
        results.append(re.findall(pattern, page))    
    results[0] = [r for r in results[0] if r.isdigit()] # check to make sure is digit
    results[1] = [r for r in results[1] if len(r) == 6]
    return results

for receptor, suffix in tqdm(receptor_links.items()): # iterate through each receptor
    url = base + suffix # each receptor url
    ids = get_uniprot_ligand(url, patterns) # get the ids 
    ligands_ids, receptors_ids = ids[0], ids[1] 
    edgelist.extend([(ligand, receptor) for ligand in ligands_ids for receptor in receptors_ids])
    
edgelist = list(set(edgelist))
"""


### For every unique receptor in edgelist, get fasta

In [None]:
# given uniprot id, gives sequence
def get_uniprot(uniprotid, prefix = 'https://www.uniprot.org/uniprot/', suffix = '.fasta'):
    """Download the FASTA text for ``uniprotid`` and return its sequence.

    The URL requested is ``prefix + uniprotid + suffix``. The first line of
    the response (the FASTA header) is discarded and all remaining lines are
    concatenated into one string.

    Parameters:
        uniprotid: accession string inserted between ``prefix`` and ``suffix``.
        prefix: URL part placed before the accession.
        suffix: URL part placed after the accession.

    Returns:
        The sequence as a single string; empty when the response has no line
        after the first one.
    """
    fasta_url = prefix + uniprotid + suffix
    fasta_text = requests.get(fasta_url).content.decode()
    # splitlines() yields no trailing empty element, so the last sequence line
    # is kept even when the body does not end with a newline (the previous
    # split('\n')[1:-1] dropped it), and '\r\n' endings leave no stray '\r'.
    sequence_lines = fasta_text.splitlines()[1:]
    return ''.join(sequence_lines)
# Map every distinct 'Uniprot' value from the scraped tables to the sequence
# downloaded for it.
receptor_sequences = {
    accession: get_uniprot(accession)
    for accession in unique_receptors
}


### For every unique ligand in edgelist, get sequence/smiles

In [None]:
def get_ligand_response(ligandid, prefix = 'https://www.guidetopharmacology.org/GRAC/LigandDisplayForward?tab=structure&ligandId=', suffix = ''):
    """Request the page of one ligand and return the ``requests`` response.

    The URL is the concatenation of ``prefix``, ``ligandid`` and ``suffix``;
    the response object is returned as received, without any checks.
    """
    return requests.get(''.join((prefix, ligandid, suffix)))

def get_ligand_aa(response):
    """Return the peptide sequence shown on a ligand page, or ``None``.

    The page is only inspected when its text contains ``'Peptide Sequence'``.
    Within the table of class ``receptor_data_tables``, rows of class ``info``
    are scanned for left-aligned cells. The first such cell whose text has no
    ``'-'`` is returned with spaces and newlines removed.

    Parameters:
        response: ``requests`` response of a ligand page.

    Returns:
        The cleaned cell text, or ``None`` when the page has no peptide
        marker, no matching table, or no qualifying cell.
    """
    # response.text (also used by get_soup_table) instead of a strict
    # content.decode(), which raises on bodies that are not valid UTF-8.
    if 'Peptide Sequence' not in response.text:
        return None

    table = get_soup_table(response, {'class': 'receptor_data_tables'})
    if table is None:
        # Marker text present but no table of the expected class.
        return None

    for table_row in table.find_all('tr', {'class': 'info'}):
        for entry in table_row.find_all('td', {'style': 'text-align:left;'}):
            if '-' in entry.text:
                continue
            return entry.text.replace(' ', '').replace('\n', '')

    return None

def get_ligand_smiles(response):
    """Return the text of the second left-aligned cell of the page's first table.

    The caller stores this value as the ligand's SMILES string. Spaces and
    newlines are removed from the cell text.

    Parameters:
        response: ``requests`` response of a ligand page.

    Returns:
        The cleaned cell text, or ``''`` when the page has no table or the
        table has fewer than two left-aligned cells.
    """
    table = get_soup_table(response)
    if table is None:
        return ''

    entries = [entry.text for entry in table.find_all('td', {'style': 'text-align:left;'})]

    # Index 1 is read below, so a single matching cell is not enough; the
    # previous ``len(entries) == 0`` check let that case raise IndexError.
    if len(entries) < 2:
        return ''

    return entries[1].replace(' ', '').replace('\n', '')

In [None]:
# Classify every ligand: a peptide sequence, when one is found, goes into
# `aa`; otherwise a non-empty SMILES string goes into `smiles`. Ligands with
# neither are left out of both dictionaries.
for ligandid in tqdm(unique_ligands):
    response = get_ligand_response(ligandid)

    peptide = get_ligand_aa(response)
    if peptide:
        aa[ligandid] = peptide
        continue

    structure = get_ligand_smiles(response)
    if structure != '':
        smiles[ligandid] = structure

### Save and dump data

In [None]:
import pickle

# Write each result dictionary to its own pickle file. The with-block flushes
# and closes every file handle, which the bare open(...) calls never did.
for output_name, payload in (
    ('receptors.pickle', receptor_sequences),
    ('smiles.pickle', smiles),
    ('aa.pickle', aa),
):
    with open(output_name, 'wb') as handle:
        pickle.dump(payload, handle)