In [1]:
import typing as t
import os
from glob import glob

import pandas as pd
import numpy as np
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord

%matplotlib inline

In [2]:
# Tab-separated table of GH matches; the cells below rely on its 'sample'
# and 'query' columns.
gh_path = '/home/is6/glyco/annotations.tsv'

gh_matches = pd.read_csv(gh_path, delimiter='\t')
# show which sample labels occur in the table
gh_matches['sample'].unique()

array(['chz', 'sp'], dtype=object)

In [3]:
# Sample label to analyse: it is compared against gh_matches['sample'] and
# interpolated into the annotation, taxonomy and output paths used below.
sample = 'sp'

In [4]:
# Unique 'query' identifiers of the GH matches recorded for the selected
# sample; they are later intersected with CDS locus tags. A boolean mask on
# the 'sample' column picks the rows directly, without a per-group callback.
gh_loci = set(gh_matches.loc[gh_matches['sample'] == sample, 'query'])

In [5]:
def extract_cds_loci(contig: SeqRecord) -> t.Set[str]:
    """Collect the locus tags of all CDS features on a contig.

    Parameters
    ----------
    contig
        Record whose ``features`` are scanned; only features whose ``type``
        equals ``'CDS'`` are considered.

    Returns
    -------
    set of str
        The first ``locus_tag`` qualifier value of every CDS feature. CDS
        features with no (or an empty) ``locus_tag`` qualifier are skipped
        rather than raising ``KeyError``/``IndexError``: they cannot match
        any locus, and one such feature should not abort the whole scan.
    """
    loci = set()
    for feat in contig.features:
        if feat.type != 'CDS':
            continue
        # qualifiers maps a qualifier name to a list of values
        tags = feat.qualifiers.get('locus_tag')
        if tags:
            loci.add(tags[0])
    return loci


annotation_root = f'/home/is6/annotation/{sample}'
# glob returns matches in arbitrary order: sort them so the chosen file is
# reproducible, and fail with an explicit message when nothing matches
# instead of a bare IndexError.
annotation_candidates = sorted(glob(f'{annotation_root}/*.gbk'))
if not annotation_candidates:
    raise FileNotFoundError(f'no *.gbk annotation found in {annotation_root}')
annotation_path = annotation_candidates[0]
# materialise all records so the annotation can be iterated more than once
annotation = list(SeqIO.parse(annotation_path, 'gb'))

# fetch IDs of contigs bearing GH proteins, i.e. contigs with at least one
# CDS locus tag that is also present in gh_loci
contigs_gh = [contig.id for contig in annotation if extract_cds_loci(contig) & gh_loci]

In [6]:
# Load the per-contig CAT taxonomy table and keep the rows of GH-bearing contigs

taxonomy_path = f'/home/is6/SolidBin/CAT_taxonomic_classification/{sample}/contigs_taxonomy_noscores.tsv'
taxa_levels = ['superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species']
taxonomy = (
    pd.read_csv(
        taxonomy_path,
        sep='\t',
        usecols=['# contig', *taxa_levels],
        index_col='# contig',
    )
    .fillna('NA')  # missing ranks are labelled with the literal string 'NA'
)
# restrict to the GH contigs that have a row in the table, in contigs_gh order
taxonomy_gh = taxonomy.loc[[cid for cid in contigs_gh if cid in taxonomy.index]]
# which GH contigs have no row in the CAT predictions?
set(contigs_gh).difference(taxonomy_gh.index)

{'contig_2342', 'contig_2394', 'contig_2531'}

In [7]:
# Write one headerless two-column TSV per taxonomic level: taxon label and the
# fraction of GH-bearing contigs assigned to it.
os.makedirs(f'gh_taxa_summary/{sample}', exist_ok=True)

for level in taxa_levels:
    tax_counts = taxonomy_gh[level].value_counts()
    # normalise counts so the values of each level sum to 1
    tax_freqs = tax_counts.div(tax_counts.sum())
    tax_freqs.to_csv(f'gh_taxa_summary/{sample}/{level}.tsv', sep='\t', header=False)
    

  import sys
