In [1]:
import numpy as np
import pandas as pd
import glob

from tqdm import tqdm
from Bio import SeqIO
from matplotlib import pyplot as plt
from os import path
from seq_util import *

In [2]:
# Collect the paths of every FASTA (.faa) file in the GTDB bac120
# marker-gene directory; each file is named after one marker gene.
dirname = '../../kofamscan/bac120_marker_genes_reps_r207/faa/'
fnames = glob.glob(path.join(dirname, '*.faa'))
# Preview the first few matches as a sanity check on the glob pattern.
fnames[:3]

['../../kofamscan/bac120_marker_genes_reps_r207/faa/TIGR01391.faa',
 '../../kofamscan/bac120_marker_genes_reps_r207/faa/TIGR00064.faa',
 '../../kofamscan/bac120_marker_genes_reps_r207/faa/TIGR02075.faa']

In [3]:
# Parse every bac120 marker-gene FASTA file and compute one NOSC value per
# sequence. The result is a column-oriented dict with three parallel lists
# (gene_id, accession, NOSC) that is turned into a DataFrame downstream.
# NOTE: this is slow -- the recorded run took ~5h49m for the 120 files.
bac120_nosc_data = dict(gene_id=[], accession=[], NOSC=[])
for fpath in tqdm(fnames):
    # The gene identifier (Pfam / TIGRFAM) is the filename without its extension
    gene_id = path.splitext(path.basename(fpath))[0]
    for record in SeqIO.parse(fpath, 'fasta'):
        try:
            # calc_protein_nosc comes from seq_util; it returns a pair whose
            # ratio Ce/NC is stored as the sequence's NOSC value.
            Ce, NC = calc_protein_nosc(record.seq)
            nosc = Ce / NC
        except ValueError:
            # This sequence contains a non-specific amino acid
            # TODO: could calculate the NOSC with some error range
            # presuming either random aminos or some null distribution
            # np.nan (lowercase) is used because the np.NaN alias was
            # removed in NumPy 2.0.
            nosc = np.nan
        # Append to all three lists together so they always stay aligned.
        bac120_nosc_data['gene_id'].append(gene_id)
        bac120_nosc_data['accession'].append(record.id)
        bac120_nosc_data['NOSC'].append(nosc)

  0%|          | 0/120 [00:00<?, ?it/s]

100%|██████████| 120/120 [5:48:39<00:00, 174.33s/it]  


In [4]:
# Build a long-format table from the parallel lists: one row per
# (gene_id, accession) pair with its NOSC value.
bac120_nosc_df = pd.DataFrame.from_dict(bac120_nosc_data)
# Show the first rows to check the parse.
bac120_nosc_df.head()

Unnamed: 0,gene_id,accession,NOSC
0,TIGR01391,RS_GCF_001027105.1,-0.120462
1,TIGR01391,RS_GCF_001457635.1,-0.136591
2,TIGR01391,RS_GCF_001544255.1,-0.138392
3,TIGR01391,RS_GCF_001457695.1,-0.208469
4,TIGR01391,RS_GCF_000006945.2,-0.119282


In [5]:
# Persist the long-format table as CSV. The default integer index is
# written as an unnamed first column.
bac120_nosc_df.to_csv(path_or_buf='../data/gtdb/r207/bac120_nosc_vals.csv')

In [6]:
# Reshape long -> wide: one row per accession (genome), one column per
# marker gene, cells holding the NOSC value. pivot() raises if any
# (accession, gene_id) pair occurs more than once.
bac120_nosc_mat = bac120_nosc_df.pivot(
    index='accession',
    columns='gene_id',
    values='NOSC',
)
# Save the wide matrix at full float precision.
bac120_nosc_mat.to_csv(path_or_buf='../data/gtdb/r207/bac120_nosc_vals_wide.csv')

In [7]:
# Save a second copy of the wide matrix with floats formatted to six
# decimal places. "compressed" here means reduced precision (smaller text),
# not gzip/zip compression -- the output is still a plain CSV.
bac120_nosc_mat.to_csv(
    path_or_buf='../data/gtdb/r207/bac120_nosc_vals_wide_compressed.csv',
    float_format='%.6f',
)