In [1]:
# Import the fabapi module, as well as some other stuff for working with data.
# The star import stays first so the explicit imports below always win if
# fabapi happens to export a name such as `pd`, `np`, `tqdm`, `os` or `path`.
from fabapi import *
import pandas as pd
import numpy as np
from tqdm import tqdm

import os
from os import path

In [2]:
def get_save_q(name, outfname, qtype, field, val):
    """Run one equality query and save every page of results to a CSV file.

    A ``Query`` of type ``qtype`` is built, filtered with
    ``equal_to(field, val)``, and paged through with ``next()`` until it
    returns ``None``. All pages are concatenated and written to ``outfname``
    without the index column.

    Parameters
    ----------
    name : str
        Human-readable label, used only in the progress messages.
    outfname : str
        Path of the CSV file to write.
    qtype : str
        Passed straight to ``Query(...)``.
    field : str
        Field name passed to ``Query.equal_to``.
    val : str
        Value the field must equal.

    Returns
    -------
    None
        Nothing is written when the query reports zero results or yields
        no pages.
    """
    q = Query(qtype)
    q.equal_to(field, val)

    count = q.count()
    print(f'{name} ({val}): {count} results')

    if count == 0:
        return

    # Collect pages until next() returns None. The None sentinel itself is
    # never stored, so len(dfs) is the true number of pages fetched.
    # NOTE(review): assumes each page is a pandas DataFrame - confirm against fabapi.
    dfs = []
    page_df = q.next()
    while page_df is not None:
        dfs.append(page_df)
        page_df = q.next()

    if not dfs:
        # pd.concat raises ValueError on an empty list; skip writing instead so
        # the missing output file causes a retry on the next run.
        print(f'{name} ({val}): no pages returned, nothing written')
        return

    print(f'Concatenating {len(dfs)} results and writing output')
    results = pd.concat(dfs)
    results.to_csv(outfname, index=False)

def get_save_all(outdir, qdicts, qtype='annotations_pfam', field='ko'):
    """Fetch and save results for each query description in ``qdicts``.

    Every entry must provide a ``'name'`` (used as the CSV file stem inside
    ``outdir``) and a ``'query'`` (the value handed to ``get_save_q``).
    Entries whose CSV already exists are reported and skipped, so
    re-running only fetches what is still missing.
    """
    for entry in tqdm(qdicts):
        label = entry['name']
        target = path.join(outdir, f'{label}.csv')
        if not path.exists(target):
            # No cached output yet: run the query and write the CSV.
            get_save_q(label, target, qtype, field, entry['query'])
        else:
            print(f'output for {label} already exists, skipping')


In [3]:
# KEGG `ko` values to look up. `name` becomes the stem of the per-query CSV
# file and `query` is the value matched against the queried field.
kos2query = [
    dict(name='rubisco_LSU', query='K01601'),
    dict(name='rubisco_SSU', query='K01602'),
    dict(name='sbpase_bifuncI', query='K01086'),
    dict(name='sbpase_bifuncII', query='K11532'),
    dict(name='sbpaseI', query='K01100'),
    dict(name='sbpaseII', query='K22315'),
    dict(name='fdh', query='K00122'),
    dict(name='fdh_large', query='K00123'),
    dict(name='fd_h2ase', query='K00532'),
    dict(name='fd_h2ase_large', query='K00533'),
    dict(name='membrane_h2ase_alpha', query='K18016'),
    dict(name='membrane_h2ase_beta', query='K18017'),
    dict(name='hoxK_h2ase_small', query='K06282'),
    dict(name='hoxG_h2ase_large', query='K06281'),
    dict(name='hoxZ_h2ase_cyt', query='K03620'),
    dict(name='hoxY_h2ase_small', query='K18007'),
    dict(name='hoxH_h2ase_large', query='K00436'),
    dict(name='hoxU_nife_h2ase_small', query='K18006'),
    dict(name='hoxF_nife_h2ase_large', query='K18005'),
    dict(name='meoh_dehydrogenase', query='K00093'),
    dict(name='n2ase_delta', query='K00531'),
    dict(name='n2ase_femo_alpha', query='K02586'),
    dict(name='n2ase_femo_beta', query='K02591'),
]

outdir = '../output/findabug/kegg_annotations/'
# to_csv does not create missing parent directories, so make sure the output
# directory exists; this lets the notebook run top to bottom on a fresh checkout.
os.makedirs(outdir, exist_ok=True)

# Save the list of requested queries next to the per-query result files.
queries_df = pd.DataFrame(kos2query)
queries_df.to_csv(path.join(outdir, 'queries.csv'), index=False)

In [4]:
# Fetch the annotations_kegg hits for every entry in kos2query; entries whose
# CSV already exists in outdir are skipped rather than re-queried.
get_save_all(
    outdir=outdir,
    qdicts=kos2query,
    qtype='annotations_kegg',
    field='ko',
)

  0%|          | 0/23 [00:00<?, ?it/s]

100%|██████████| 23/23 [00:00<00:00, 241.96it/s]

output for rubisco_LSU already exists, skipping
output for rubisco_SSU already exists, skipping
output for sbpase_bifuncI already exists, skipping
output for sbpase_bifuncII already exists, skipping
output for sbpaseI already exists, skipping
sbpaseII (K22315): 0 results
output for fdh already exists, skipping
output for fdh_large already exists, skipping
output for fd_h2ase already exists, skipping
output for fd_h2ase_large already exists, skipping
output for membrane_h2ase_alpha already exists, skipping
output for membrane_h2ase_beta already exists, skipping
output for hoxK_h2ase_small already exists, skipping
output for hoxG_h2ase_large already exists, skipping
output for hoxZ_h2ase_cyt already exists, skipping
output for hoxY_h2ase_small already exists, skipping
output for hoxH_h2ase_large already exists, skipping
output for hoxU_nife_h2ase_small already exists, skipping
output for hoxF_nife_h2ase_large already exists, skipping
meoh_dehydrogenase (K00093): 0 results
output for n2as




In [5]:
# Pfam `pfam` values to look up. `name` becomes the stem of the per-query CSV
# file and `query` is the value matched against the queried field.
pfams2query = [
    dict(name='BMC_domain', query='PF00936'),
    dict(name='Csos2', query='PF12288'),
]

outdir = '../output/findabug/pfam_annotations/'
# to_csv does not create missing parent directories, so make sure the output
# directory exists; this lets the notebook run top to bottom on a fresh checkout.
os.makedirs(outdir, exist_ok=True)

# Save the list of requested queries next to the per-query result files.
queries_df = pd.DataFrame(pfams2query)
queries_df.to_csv(path.join(outdir, 'queries.csv'), index=False)

In [6]:
# Fetch the annotations_pfam hits for every entry in pfams2query; entries whose
# CSV already exists in outdir are skipped rather than re-queried.
get_save_all(
    outdir=outdir,
    qdicts=pfams2query,
    qtype='annotations_pfam',
    field='pfam',
)

100%|██████████| 2/2 [00:00<00:00, 903.36it/s]

output for BMC_domain already exists, skipping
output for Csos2 already exists, skipping



