In [1]:
import boto3
import pandas as pd
from ete3 import NCBITaxa
import subprocess
import itertools
import os
import s3fs
import numpy as np
import pdb
import matplotlib.pyplot as plt
from sqlalchemy import create_engine
from lca_functions import *

In [2]:
def get_line_count (folder_name):
    """Return per-file line counts for the entries matched by ``folder_name/*``.

    Runs ``wc -l <folder_name>/*`` through the shell and parses what it prints.

    Args:
        folder_name: path of the directory whose entries are passed to ``wc``.

    Returns:
        pandas.DataFrame with an integer ``line_count`` column and a
        ``filename`` column holding each path as printed by ``wc``.

    Raises:
        subprocess.CalledProcessError: if the ``wc`` command exits non-zero.
    """
    import shlex  # local import: only this function needs it

    # Quote the folder so names containing spaces or shell metacharacters
    # still expand to the intended glob.
    raw_output = subprocess.check_output("wc -l "+shlex.quote(folder_name)+"/*", shell=True)

    # Decode the bytes rather than parsing their repr: the repr approach left a
    # "b'" prefix glued to the first count whenever wc printed it unpadded.
    report_lines = os.fsdecode(raw_output).splitlines()

    # Drop the last two report lines: the trailing "total" summary and the line
    # before it (the last file listed). This mirrors the original slice.
    # NOTE(review): excluding the last listed file may be unintentional - confirm.
    file_lines = report_lines[:-2]

    # Each remaining line is "<count> <path>"; split once on whitespace so
    # left padding is ignored and the path is kept whole.
    rows = [line.split(None, 1) for line in file_lines]
    outdf = pd.DataFrame(rows, columns=["line_count", "filename"])
    outdf["line_count"] = outdf["line_count"].astype("int")
    return (outdf)

# One line-count table per entry of the current working directory whose name
# starts with "CMS00" and does not end with ".m9".
line_counts = [
    get_line_count(entry)
    for entry in os.listdir()
    if entry.startswith("CMS00") and not entry.endswith(".m9")
]

In [30]:
def get_splits (line_count_df):
    """Concatenate the listed files into chunks and write them under ``output/``.

    Files are visited in ascending ``line_count`` order and accumulated into a
    group until the group's total line count reaches twice the largest single
    ``line_count``. Any files remaining after the last full group form one
    final, smaller chunk, so every listed file ends up in exactly one chunk.

    Chunk ``i`` is written to ``output/<dirname of the chunk's first file>/
    blast_nr_<i>.m9``; the directory is created if needed.

    Args:
        line_count_df: DataFrame with a ``line_count`` column and a
            ``filename`` column.

    Returns:
        None. The function is called for its side effect of writing files.

    Raises:
        OSError: if a directory cannot be created or a file cannot be
            read or written.
    """
    import shutil  # local import: only this function needs it

    target_chunk_size = line_count_df["line_count"].max()*2
    df = line_count_df.sort_values(by="line_count")

    groups = []
    curr_group = []
    nrows = 0
    for filename, line_count in zip(df["filename"], df["line_count"]):
        curr_group.append(filename)
        nrows += line_count
        if nrows >= target_chunk_size:
            groups.append(curr_group)
            curr_group = []
            nrows = 0
    # Flush the remainder: without this the files after the last full group
    # would never be written to any chunk.
    if curr_group:
        groups.append(curr_group)

    for i, group in enumerate(groups):
        # Plain concatenation (not os.path.join) keeps the chunk under
        # "output/" even when the source dirname is absolute.
        out_dir = "output/"+os.path.dirname(group[0])
        os.makedirs(out_dir, exist_ok=True)
        with open(out_dir+"/blast_nr_"+str(i)+".m9", "wb") as out_handle:
            for path in group:
                # Stream each file so large inputs are not held in memory.
                with open(path, "rb") as in_handle:
                    shutil.copyfileobj(in_handle, out_handle)

In [31]:
# Write the chunked files for every line-count table; the resulting list holds
# get_splits' return value for each table.
list(map(get_splits, line_counts))

[None, None, None, None, None]

In [47]:
def get_lca_analysis_command (fn, blast_type="nr", read_count_path=None):
    """Build the ``lca_analysis.py`` command line for one BLAST result file.

    The three output paths sit next to ``fn``: each keeps ``fn``'s directory
    part and the last two underscore-separated pieces of its file name, with
    the leading pieces replaced by ``filtered_blast``, ``exclude_contigs`` or
    ``lca`` (e.g. ``dir/blast_nr_0.m9`` -> ``dir/lca_nr_0.m9``). Only the file
    name is rewritten; directory components are left untouched even if they
    contain the same text as the file-name prefix.

    Args:
        fn: path of the BLAST result file passed as ``--fpath``.
        blast_type: value passed as ``--blast_type``.
        read_count_path: value passed as ``--read_count_path``. When None, it
            defaults to ``s3://czbiohub-mosquito/contigs/<parent folder name
            of fn>/contig_stats.json``.

    Returns:
        The command as a single space-separated string.
    """
    basename = os.path.basename(fn)
    # Slice instead of os.path.dirname so the directory part, including its
    # trailing separator, is reproduced exactly as given.
    head = fn[:len(fn) - len(basename)]
    # Last two "_"-separated pieces of the file name, e.g. "nr_0.m9".
    suffix = "_".join(basename.split("_")[-2:])

    def sibling_path (new_prefix):
        # Rewrite only the file name; str.replace on the full path would also
        # alter matching directory names (and garble it for an empty prefix).
        return head + new_prefix + "_" + suffix

    filtered_blast_path = sibling_path("filtered_blast")
    exclude_contigs_path = sibling_path("exclude_contigs")
    lca_path = sibling_path("lca")

    if read_count_path is None:
        dirname = os.path.basename(os.path.dirname(fn))
        read_count_path = "s3://czbiohub-mosquito/contigs/"+dirname+"/contig_stats.json"

    return " ".join([
        "python lca_analysis.py",
        "--blast_type", blast_type,
        "--fpath", fn,
        "--filtered_blast_path", filtered_blast_path,
        "--excluded_contigs_path", exclude_contigs_path,
        "--outpath", lca_path,
        "--read_count_path", read_count_path,
        "--verbose True",
    ])

In [44]:
# Write one lca_analysis.py command per file under "output" whose name starts
# with "blast_nr_", one command per line, into temp_blast_nr_commands.
with open ("temp_blast_nr_commands", "w") as handle:
    handle.writelines(
        "%s\n" % get_lca_analysis_command (os.path.join(root, name), "nr")
        for root, _subdirs, names in os.walk("output")
        for name in names
        if name.startswith("blast_nr_")
    )

In [48]:
# Write one lca_analysis.py command per file under "archive/new_blast_nt_files"
# whose name starts with "new_blast_nt_", one command per line, into
# temp_blast_nt_commands. All nt commands share a single read-count table.
with open ("temp_blast_nt_commands", "w") as handle:
    handle.writelines(
        "%s\n" % get_lca_analysis_command (
            os.path.join(root, name),
            "nt",
            "s3://czbiohub-mosquito/contig_quality_concat/contig_stats_all.tsv",
        )
        for root, _subdirs, names in os.walk("archive/new_blast_nt_files")
        for name in names
        if name.startswith("new_blast_nt_")
    )