In [19]:
import re
import threading
from concurrent.futures import wait

from robobrowser import RoboBrowser

In [2]:
def fetch_thermophiles(pfam_id):
    """
    Submit a Pfam ID to the protDataTherm web form and extract the
    "Thermophilic Sequences" entry from the result page.

    pfam_id str: PFAM ID (e.g. "PF07408")

    returns: list of strings, one per occurrence of the "Thermophilic Sequences"
             header in the first <textarea> of the response. Each string is the
             single line that follows the header (the pattern's ``.`` does not
             cross newlines). In the recorded run for PF07408 this was one
             string of comma-separated UniProt accessions, not one accession
             per list element. Empty list when the header is absent.
    raises: IndexError if the response page contains no <textarea>
    """
    session = RoboBrowser(parser="lxml", user_agent="wswp")
    session.open('http://profiles.bs.ipm.ir/softwares/protdatatherm/')

    # Fill in the search form (the one posting to 2.php) and send it.
    query_form = session.get_form(action='2.php')
    query_form.fields['name'].value = pfam_id
    session.submit_form(query_form, timeout=100)

    # The report is read from the first <textarea> on the result page.
    report = session.select('textarea')[0].text

    # Header block followed by the line holding the sequences.
    thermophile_section = re.compile(r"""------------------------------------------------------------
Thermophilic Sequences:
------------------------------------------------------------
(.*)""")

    return thermophile_section.findall(report)

In [3]:
# Smoke test: query a single Pfam family and inspect what the scraper returns.
fetch_thermophiles("PF07408")

['A0A0M0KJM6, A0A0A8X143, A0A0N0Y981, G9QMD8, A0A0H4NXZ5, A0A0D6ZEV9, A0A090KRH0, L0EDJ8, U2YC59, Q5L118, A0A0D8BYS0, A0A0E0T9S3, A0A087LHY8, G8MZJ4, A0A098L483, I3E568, A0A068LVF5, A0A094JZP4, A0A0B0HPR3, M5QY60, A0A0D0RX96']

In [4]:
from bs4 import BeautifulSoup

In [5]:
# Read the whole Pfam-C file into memory as a list of lines (line endings kept).
with open('./Pfam-C', 'r', encoding='windows-1252') as f:
    pfam_text = list(f)

In [6]:
# Pfam accession: the literal "PF" followed by five digits.
# Raw string so that \d reaches the regex engine untouched; in a normal string
# literal "\d" is an invalid escape sequence and triggers a warning on newer Pythons.
pfam_pattern = re.compile(r'PF\d{5}')

In [7]:
# Sanity check: the pattern should pull the accession out of a "#=GF MB" line.
pfam_pattern.findall('#=GF MB   PF02834;\n')

['PF02834']

In [8]:
# Distinct Pfam accessions found in Pfam-C: the first accession on every line
# that contains one. search() runs the regex once per line (the previous
# version called findall() twice per line) and yields the same leftmost match
# that findall()[0] did, because the pattern has no capture groups.
ids = {
    match.group(0)
    for match in map(pfam_pattern.search, pfam_text)
    if match
}

In [9]:
# from https://stackoverflow.com/questions/519633/lazy-method-for-reading-big-file-in-python
def read_in_chunks(file_object, chunk_size=1024):
    """Generator that walks through a file in batches of whole lines.

    Each yielded item is the list returned by
    ``file_object.readlines(chunk_size)``; ``chunk_size`` is passed as the
    size hint (default 1024). Iteration stops at the first empty batch."""
    # iter(callable, sentinel) keeps calling readlines() until it returns [].
    for batch in iter(lambda: file_object.readlines(chunk_size), []):
        yield batch

In [10]:
# Collect every distinct Pfam accession from the Pfam-A FASTA file, reading it
# in batches of lines so the whole file never has to sit in memory at once.
pfam_ids = set()

with open('/home/gil/Downloads/Pfam-A.fasta', 'r') as f:
    for batch in read_in_chunks(f):
        # Only the first accession on a line is kept, if the line has one.
        first_hits = (pfam_pattern.search(row) for row in batch)
        pfam_ids.update(hit.group(0) for hit in first_hits if hit is not None)

In [11]:
# Number of distinct Pfam accessions collected above.
len(pfam_ids)

16479

In [12]:
from tqdm import tqdm_notebook

In [13]:
from concurrent.futures import ThreadPoolExecutor

In [14]:
def split_list(a, n=4):

    """
    Splits a list into n lists of near-equal length.

    a: sequence supporting len() and integer indexing
    n int: number of groups to produce (expected to be >= 1)

    The first ``(len(a) // n) * n`` items are dealt out as n consecutive
    slices. The ``len(a) % n`` leftover items are then handed out one each to
    the first groups, walking backwards from the end of ``a``.

    One item of ``a`` is inspected to decide how items are copied: if it is a
    list, ``a`` is treated as a list of single-element lists, so every item is
    unwrapped to its only element and items whose length is not 1 are dropped.
    Otherwise items are copied as they are.

    returns: list of n lists (n empty lists when ``a`` is empty)
    raises: ZeroDivisionError if n == 0
    """
    total = len(a)
    chunk_len = total // n
    leftover = total % n

    if total == 0:
        # Nothing to probe or distribute; still hand back one (empty) group per worker.
        return [[] for _ in range(n)]

    # Probe at index chunk_len as before, but clamp it to the last valid index:
    # with n == 1 chunk_len equals len(a), which used to raise IndexError.
    nested = isinstance(a[min(chunk_len, total - 1)], list)

    def pick(start, stop):
        """Copy a[start:stop], unwrapping single-element lists in nested mode."""
        if nested:
            return [a[j][0] for j in range(start, stop) if len(a[j]) == 1]
        return [a[j] for j in range(start, stop)]

    iterators = [pick(k * chunk_len, (k + 1) * chunk_len) for k in range(n)]

    # Leftovers go through pick() too, so nested items are unwrapped the same
    # way as the main slices instead of being appended as raw lists.
    for k in range(leftover):
        iterators[k].extend(pick(total - 1 - k, total - k))

    return iterators

In [15]:
# Serialises appends to the output file and the read-modify-write on the
# shared progress counter when several worker_group() calls run in threads.
_WORKER_STATE_LOCK = threading.Lock()


def worker_group(names, result_list, completed, file):

    """
    Groups up work, so that one worker can process several Pfam IDs in a row.

    names: iterable of Pfam IDs to query with fetch_thermophiles
    result_list: list-like object (needs ``extend``) that collects every result
    completed: shared counter object with a readable and writable ``.value``
               attribute (e.g. a ``Manager().Value``); incremented once per
               processed name, whether or not it produced results
    file str: path of the text file the results are appended to, one result
              string per line

    returns: None
    """

    for name in names:
        result = fetch_thermophiles(name)
        if result:
            result_list.extend(result)
            with _WORKER_STATE_LOCK:
                with open(file, 'a') as f:
                    # Newline-terminate each entry; without a separator the last
                    # accession of one family runs into the first of the next.
                    f.writelines(entry + '\n' for entry in result)
        # ``completed += 1`` only rebound the local name (and is not supported
        # by manager Value proxies), so the caller never saw any progress.
        # Updating ``.value`` is a get followed by a set, hence the lock.
        with _WORKER_STATE_LOCK:
            completed.value += 1

In [16]:
from multiprocessing import Manager

In [None]:
file = '/home/gil/PhD_notebooks/Thermophiles/thermophiles.txt'
result_list = []

# The slice skips the first 11433+705 IDs, presumably those already fetched in
# earlier runs (TODO confirm).
# NOTE(review): pfam_ids is a set of strings, so list(pfam_ids) is not
# guaranteed to come out in the same order after a kernel restart; the offset
# is only meaningful within the session that produced it.
for pfam_id in tqdm_notebook(list(pfam_ids)[11433+705:]):
    result = fetch_thermophiles(pfam_id)
    if result:
        result_list.extend(result)
        with open(file, 'a') as f:
            # Newline-terminate each entry; without a separator the last
            # accession of one family runs into the first of the next.
            f.writelines(entry + '\n' for entry in result)

In [17]:
# Shared state handed to every worker: the collected results and a counter of
# processed Pfam IDs that drives the progress bar.
manager = Manager()
results = manager.list()
completed = manager.Value('i', 0)
file = '/home/gil/PhD_notebooks/Thermophiles/thermophiles.txt'

with ThreadPoolExecutor(4) as executor:

    jobs = split_list(list(pfam_ids), 4)

    futures = {executor.submit(worker_group, j, results, completed, file) for j in jobs}

    with tqdm_notebook(total=len(pfam_ids)) as pbar:
        pending = futures
        while pending:
            # wait() blocks for up to a second instead of spinning on the
            # counter. The loop ends when every future has finished, so a
            # worker that raised can no longer leave this cell hanging.
            pending = wait(pending, timeout=1).not_done
            pbar.update(completed.value - pbar.n)

    # Exceptions raised inside a worker are stored on its future; surface them.
    for future in futures:
        future.result()

KeyboardInterrupt: 

In [76]:
# Query every Pfam family in turn and pool the distinct result strings.
thermophiles = set()

for pfam_id in tqdm_notebook(pfam_ids):
    # update() with an empty result is a no-op, so no emptiness check is needed.
    thermophiles.update(fetch_thermophiles(pfam_id))

KeyboardInterrupt: 