In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import os

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

Create the Selenium webdriver instance (a Firefox browser session) that the DALI queries below will drive

In [2]:
# Shared Firefox WebDriver session; passed to get_dali_query for every query in this notebook.
driver = webdriver.Firefox()

In [3]:
def _parse_dali_matches(query_text: str) -> list:
    """Extract the matched chain identifiers from the text of a DALI report.

    The text is split on '#'; the fourth piece (index 3) is taken as the hit
    table. Its first line (the remainder of the table header) is dropped and
    the second whitespace-separated field of every remaining non-blank line
    is collected, in order.

    Raises:
        IndexError: if the text has fewer than four '#'-separated pieces or a
            non-blank table line has fewer than two fields.
    """
    result_table = query_text.split('#')[3]
    rows = result_table.split('\n')[1:]  # [1:] skips the header line

    matches = []
    for row in rows:
        fields = row.split()
        # Skip empty AND whitespace-only lines (a bare truthiness test on the
        # raw line lets '   ' through and then fails on fields[1]).
        if fields:
            matches.append(fields[1])
    return matches


def get_dali_query(prot: str, web_driver, timeout: float = 3600.0, poll_interval: float = 2.0):
    """Submit `prot` to the DALI web server and return the matched chains.

    Opens the DALI page in `web_driver`, clicks the tab with id 'ui-id-3',
    types `prot` into the field with id 'taxonomy', submits the form, waits
    for the 'Download matches against PDB90' link, follows it and parses the
    contents of the resulting <pre> element.

    Args:
        prot: Query identifier typed into the form (e.g. '3m5lA').
        web_driver: Selenium WebDriver used for *all* browser interaction
            (the function no longer falls back on a notebook-global driver).
        timeout: Maximum number of seconds to wait for the job to finish.
        poll_interval: Seconds between checks for the download link.

    Returns:
        List of chain identifiers (second column of the hit table), in the
        order they appear in the report.

    Raises:
        selenium.common.exceptions.TimeoutException: if the download link
            does not appear within `timeout` seconds.
    """
    url = 'http://ekhidna2.biocenter.helsinki.fi/dali/'
    download_link_text = 'Download matches against PDB90'
    web_driver.get(url)

    # Formulate and send query
    web_driver.find_element(By.ID, 'ui-id-3').click()
    web_driver.find_element(By.ID, 'taxonomy').send_keys(prot)
    web_driver.find_element(By.NAME, 'submit').click()

    # Wait until job finishes. WebDriverWait sleeps between polls and only
    # ignores "element not found", so browser crashes and keyboard interrupts
    # propagate instead of being swallowed by a busy loop.
    link = WebDriverWait(web_driver, timeout, poll_frequency=poll_interval).until(
        EC.presence_of_element_located((By.LINK_TEXT, download_link_text))
    )
    link.click()

    query_text = web_driver.find_element(By.TAG_NAME, 'pre').get_attribute('innerHTML')

    # Parse the whole DALI query
    return _parse_dali_matches(query_text)

In [4]:
# Single-query smoke test: run one DALI search for '3m5lA' through the shared driver.
result_set = get_dali_query('3m5lA', driver)

In [5]:
# Display the chain identifiers returned by the query above.
result_set

['6p6v-A',
 '6p6s-A',
 '6bqj-C',
 '5wdx-A',
 '1rgq-B',
 '6p6t-A',
 '1cu1-A',
 '5lkl-A',
 '5lkl-B',
 '3otp-A',
 '5jd8-A',
 '2w5e-A',
 '3stj-A',
 '3nwu-B',
 '5ilb-A',
 '3lkw-A',
 '2wv9-A',
 '3nzi-A',
 '4ynn-A',
 '3pv3-A',
 '3k6y-A',
 '2z9i-A',
 '6z05-A',
 '4fln-B',
 '3qo6-B',
 '4ink-A',
 '4m9m-A',
 '5il9-A',
 '1sgc-A',
 '5y28-C',
 '5t69-A',
 '2w7u-B',
 '3lgu-A',
 '5hma-A',
 '1l1j-A',
 '5lc0-A',
 '4ri0-A',
 '1soz-C',
 '6urv-F',
 '4k1t-C',
 '6u1b-A',
 '2ea3-A',
 '2pfe-A',
 '3u1i-B',
 '5yvu-B',
 '4wjg-2',
 '1p01-A',
 '2oua-A',
 '5mrr-A',
 '1qy6-A',
 '2qaa-A',
 '2yol-A',
 '4ic5-A',
 '2as9-A',
 '2sfa-A',
 '5mm8-A',
 '3sti-A',
 '4nsy-B',
 '1te0-A',
 '1agj-A',
 '2m9p-A',
 '6e0u-A',
 '1arc-A',
 '5t1v-B',
 '1cgi-E',
 '5zvj-A',
 '5dj7-A',
 '1spj-A',
 '4ic6-A',
 '1hpg-A',
 '5zfh-A',
 '1npm-A',
 '5c2z-A',
 '5mrt-A',
 '2oq5-A',
 '4h4f-A',
 '1elt-A',
 '2zch-P',
 '2qxg-A',
 '4dgj-A',
 '1p3c-A',
 '6ku7-A',
 '1bru-P',
 '5fht-A',
 '3i77-A',
 '1pq5-A',
 '3gdv-A',
 '2eek-A',
 '1euf-A',
 '1ym0-A',
 '2ijd-1',

# Now let's query DALI for every clan PA protein, grouped by family

In [6]:
# Query chains keyed by family code. Each entry is a 4-character PDB ID
# followed by a 1-character chain ID (e.g. '1L1NA' -> PDB 1L1N, chain A).
pa_proteins = {
    'C03': ['1L1NA', '1HAVA', '1CQQA', '2J92A', '2ZU3A', '2HRVA'],
    'C04': ['1LVMA'],
    'C37': ['1WQSA'],
    'S01': [
        '2GMTB', '1IAUA', '1A0LA', '2PSYA', '2OQ5A', '1OP0A', '2F91B', '1FI8A',
        '1BRAA', '1SGTA', '1TRYA', '2W5EA', '1DPOA', '1HYLA', '1AZZA', '1A0JA',
        '1TRNA', '1HNEE', '1CGHA', '1FUJA', '1ORFA', '3FZZA', '2ZGCA', '3RP2A',
        '2F9PA', '1MZAA', '2RDLA', '1JRSA', '2JETA', '1ELCA', '1EKBB', '1PYTC',
        '3DFJA', '2ZCHP', '1SGFG', '1GVZA', '1TONA', '1AO5A', '2R9PC', '1MD7A',
        '1ELVA', '2ODPA', '3GOVA', '2OLGA', '1ZJDA', '1PFXC', '1DANH', '1HCGA',
        '1ABJH', '1AUTC', '1FIZA', '1O5FL', '1YBWA', '1Q3XA', '1MLWA', '1BUIA',
        '1LO6A', '1YM0A', '1NPMA', '2BDGA', '1SGCA', '1SGPE', '1HPGA', '2ALPA',
        '1QY6A', '1EXFA', '1KY9A', '1SO7A', '1LCYA', '1ARCA', '2VIDA', '2AS9A',
        '2QXIA', '2GV6A', '2SFAA', '1EQ9A', '1P3EA', '2AIQA', '1L1JA', '2XXLA',
        '1SGFA', '2PFEA',
    ],
    'S03': ['2SNVA'],
    'S06': ['1WXRA'],
    'S29': ['1A1RA'],
    'S32': ['1MBMA'],
    'S39': ['1ZYOA'],
}

In [None]:
# Query DALI for every chain and store one result file per chain, grouped by family.
for family, pdb_list in pa_proteins.items():
    # makedirs also creates the './dali_dataset' parent when it is missing
    # (os.mkdir would raise FileNotFoundError) and tolerates re-runs.
    outdir = os.path.join('./dali_dataset', family)
    os.makedirs(outdir, exist_ok=True)

    for p in pdb_list:
        # Use a loop-local name so the `result_set` displayed earlier in the
        # notebook is not overwritten by the batch run.
        matches = get_dali_query(p, driver)

        # Write out new data: first line is the query chain rewritten as
        # '<pdb id in lowercase>-<chain>', followed by one match per line.
        p_reformatted = p[:4].lower() + '-' + p[4]
        with open(os.path.join(outdir, p + '.txt'), 'w') as f:
            f.write(p_reformatted + '\n')
            f.writelines(m + '\n' for m in matches)