In [52]:
# Author: Ella Chee
# Date: July 2024
# Description: Script to scrape antibody availability data from antibodypedia.com 
# for a list of UniProt IDs, given a CSV list of proteins, returns CSV containing
# the antibody link, number of antibodies, and number of providers for each protein.

# Imports
import sys
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def scrape_antibodypedia_data(uniprot_id):
    '''Scrapes antibodypedia.com for the access link, number of antibodies and providers for a given UniProt ID'''

    # Set up the Selenium WebDriver
    driver = webdriver.Chrome()

    # Base URL
    base_url = 'https://www.antibodypedia.com/explore/uniprot%3A'

    # Add UniProt ID to base URL
    url = f'{base_url}{uniprot_id}'

    # Navigate to the URL
    driver.get(url)

    # Wait for the table to load
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.ID, "search_results_table"))
    )
    print('Found table for UNIPROT:', uniprot_id)

    data = []

    # Extract the link to the antibodies and number of antibodies
    try:
        link_tag = driver.find_element(By.XPATH, '//*[@id="search_results_table"]/tbody/tr/td[6]/a')
        if link_tag:
            antibodies_link = link_tag.get_attribute('href')
            number_of_antibodies = link_tag.text.split(' ')[0]
            print('Antibodies Link:', antibodies_link)
            print('Number of Antibodies:', number_of_antibodies)
    except Exception as error:
        print('Error: no antibodies found for UNIPROT:', uniprot_id)
        antibodies_link = 'None'
        number_of_antibodies = 0
        
    # Extract number of providers from the text within the div
    try:
        providers_div = driver.find_element(By.CLASS_NAME, 'txtOne')
        if providers_div:
            try:
                number_of_providers = providers_div.find_element(By.XPATH, '//*[@id="search_results_table"]/tbody/tr/td[6]/div/b').text
                print('Number of Providers:', number_of_providers)
            except Exception as error:
                print('Error: no providers found for UNIPROT:', uniprot_id)
                number_of_providers = 0
    except Exception as error:
        print('Error: no providers found for UNIPROT:', uniprot_id)
        number_of_providers = 0


    # Append the data to the list
    data.append({
        'UNPROT': uniprot_id,
        'Antibody Link': antibodies_link,
        'Number of Antibodies': number_of_antibodies,
        'Number of Providers': number_of_providers
    })

    driver.quit()
    return data

def main():
    '''Main function to scrape the data from antibodypedia.com and save it to a CSV file, 
        given a CSV of UniProt IDs'''

    df = pd.read_csv('unique_to_mTbG4P_dedup.csv')
    uniprot_df = df[['UNIPROT']].copy()

    for index, row in uniprot_df.iterrows():
        uniprot_id = row['UNIPROT']
        data = scrape_antibodypedia_data(uniprot_id)
        uniprot_df.loc[index, 'Antibody Link'] = data[0]['Antibody Link']
        uniprot_df.loc[index, 'Number of Antibodies'] = data[0]['Number of Antibodies']
        uniprot_df.loc[index, 'Number of Providers'] = data[0]['Number of Providers']
        print('Scraped data for UniProt ID:', uniprot_id)
    
    # Save the updated dataframe to a new CSV file
    uniprot_df.to_csv('antibody_availability_data.csv', index=False)

main()


Found table for UNIPROT: A0A087WUL8
Antibodies Link: https://www.antibodypedia.com/gene/82366/NBPF19
Number of Antibodies: 0
Error w/ //*[@id="search_results_table"]/tbody/tr/td[6]/div/b
Scraped data for UniProt ID: A0A087WUL8
Found table for UNIPROT: A0A5B6
Error w/ //*[@id="search_results_table"]/tbody/tr/td[6]/a
Error w/ //*[@id="search_results_table"]/tbody/tr/td[6]/div/b
Scraped data for UniProt ID: A0A5B6
Found table for UNIPROT: A2RRP1
Antibodies Link: https://www.antibodypedia.com/gene/26924/NBAS
Number of Antibodies: 84
Number of Providers: 24
Scraped data for UniProt ID: A2RRP1


  uniprot_df.loc[index, 'Number of Providers'] = data[0]['Number of Providers']


Found table for UNIPROT: A4UGR9
Antibodies Link: https://www.antibodypedia.com/gene/33785/XIRP2
Number of Antibodies: 50
Number of Providers: 14
Scraped data for UniProt ID: A4UGR9
Found table for UNIPROT: A5PKW4
Antibodies Link: https://www.antibodypedia.com/gene/31428/PSD
Number of Antibodies: 50
Number of Providers: 16
Scraped data for UniProt ID: A5PKW4
Found table for UNIPROT: A6NCQ9
Antibodies Link: https://www.antibodypedia.com/gene/62437/RNF222
Number of Antibodies: 6
Number of Providers: 6
Scraped data for UniProt ID: A6NCQ9
Found table for UNIPROT: A6NIE6
Error w/ //*[@id="search_results_table"]/tbody/tr/td[6]/a
Error w/ //*[@id="search_results_table"]/tbody/tr/td[6]/div/b
Scraped data for UniProt ID: A6NIE6
Found table for UNIPROT: A6NJZ7
Antibodies Link: https://www.antibodypedia.com/gene/21503/RIMBP3C
Number of Antibodies: 0
Error w/ //*[@id="search_results_table"]/tbody/tr/td[6]/div/b
Scraped data for UniProt ID: A6NJZ7
Found table for UNIPROT: A6NMT0
Antibodies Link: htt

In [8]:
import sys
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def track_references_antipodypedia(antibody_id):
    '''
    Track antibodies with references from antibodypedia.
    '''
    
    # Set up the Selenium  WebDriver, construct URL, navigate to URL
    driver = webdriver.Chrome()
    url = f'https://www.antibodypedia.com/gene/{antibody_id}?reference%5B%5D=yes'
    driver.get(url)
    
    # Load table
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.ID, "featured_antibodies"))
    )
    print('Found table for antibody:', antibody_id)

    # Extract the link to the antibodies and number of antibodies
    try:
        link_tag = driver.find_element(By.XPATH, '//*[@id="filter_results"]/b[1]')
        if link_tag:
            referenced_antibodies = link_tag.text
            print('Number of Referenced Antibodies:', referenced_antibodies)
    except Exception as error:
        print('Error: no references found for antibody:', antibody_id)
        referenced_antibodies = 0
    driver.quit()
    return referenced_antibodies


In [9]:
track_references_antipodypedia('33922/AGPS')

Found table for antibody: 33922/AGPS
Number of Referenced Antibodies: 1


'1'

In [17]:
import requests
from bs4 import BeautifulSoup
from lxml import etree 

def scrape_antibodypedia_data(uniprot_id):
    '''
    Scrapes antibodypedia.com for the access link, 
    number of antibodies and providers for a given UniProt ID.
    '''
    url = f'https://www.antibodypedia.com/explore/uniprot%3A{uniprot_id}'
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    body = soup.find('body')

    dom = soup.etree.HTML(str(body))
    print(dom.xpath('//*[@id="search_results_table"]/tbody/tr/td[6]/div')[0].text)

    # Extract the link to the antibodies and number of antibodies
    try:
        elements = soup.select('#search_results_table tbody tr td:nth-of-type(6) a')
        if elements:
            element = elements[0]
            antibodies_link = element.get('href')
            antibody_id = antibodies_link.split('/')[-1]
            number_of_antibodies = element.text.split(' ')[0]
            print('Antibodies Link:', antibodies_link)
            print('Number of Antibodies:', number_of_antibodies)
            print('Antibody ID:', antibody_id)
        else:
            raise Exception("No elements found")
    except Exception as error:
        print(f'Error: no antibodies found for UNIPROT: {uniprot_id} ({error})')
        antibodies_link = 'None'
        number_of_antibodies = 0
        antibody_id = 'None'

    # Extract number of providers from the text within the div
    try:
        providers_div = soup.select('#search_results_table tbody tr td:nth-of-type(6) div.txtOne')
        if providers_div:
            providers_element = providers_div[0].find('b')
            if providers_element:
                number_of_providers = providers_element.text
                print('Number of Providers:', number_of_providers)
            else:
                raise Exception("No providers element found")
        else:
            raise Exception("No providers div found")
    except Exception as error:
        print(f'Error: no providers found for UNIPROT: {uniprot_id} ({error})')
        number_of_providers = 0

    print('Queried for protein:', uniprot_id)
    return (antibodies_link, antibody_id, number_of_antibodies, number_of_providers)

def track_references_antibodypedia(antibody_id):
    '''
    Track antibodies with references from antibodypedia.
    '''
    if antibody_id == 'None':
        return 0
    
    url = f'https://www.antibodypedia.com/gene/{antibody_id}?reference%5B%5D=yes'
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Extract the link to the antibodies and number of antibodies
    try:
        elements = soup.select('#filter_results b:nth-of-type(1)')
        if elements:
            referenced_antibodies = elements[0].text
            print('Number of Referenced Antibodies:', referenced_antibodies)
        else:
            raise Exception("No elements found")
    except Exception as error:
        print(f'Error: no references found for antibody: {antibody_id} ({error})')
        referenced_antibodies = 0

    return referenced_antibodies

def query_antibodypedia(uniprots):
    '''
    Query antibodypedia given a list of proteins and return 
    a list of antibody links, number of antibodies, and number of providers.
    '''
    links = []
    ids = []
    antibodies = []
    providers = []
    references = []

    for u in uniprots:
        link, id, antibody, provider = scrape_antibodypedia_data(u)
        links.append(link)
        ids.append(id)
        antibodies.append(antibody)
        providers.append(provider)
    
    for i in ids:
        references.append(track_references_antibodypedia(i))
    
    print('Successfully queried antibodypedia')
    return links, antibodies, providers, references

query_antibodypedia(['A8MTJ6'])


AttributeError: 'NoneType' object has no attribute 'HTML'