In [1]:
import os
import subprocess
from pathlib import Path
import pandas as pd

In [2]:
def load_profile(file):
    """Read a two-column, tab-separated allele profile.

    The first line of the file is treated as a header and discarded; the
    columns are renamed to ``locus_id`` (used as the index) and ``allele_id``.
    Missing allele calls are replaced by empty strings.

    Parameters
    ----------
    file : path or file-like object
        Anything accepted by ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``locus_id`` with a single ``allele_id`` column.
    """
    profile = pd.read_csv(
        file,
        sep='\t',
        header=0,
        names=['locus_id', 'allele_id'],
        index_col=0,
    )
    return profile.fillna('')

def seq_stats(seqfile):
    """Return ``(num_seqs, sum_len, N50)`` for a sequence file via ``seqkit stats``.

    Runs ``seqkit stats -abT <seqfile>`` and reads fields 3, 4 and 12
    (0-based) of the last output line.

    Parameters
    ----------
    seqfile : str or pathlib.Path
        Sequence file handed to seqkit.

    Returns
    -------
    tuple of str
        ``(num_seqs, sum_len, n50)``; the values are returned as the strings
        seqkit printed, not converted to numbers.

    Raises
    ------
    subprocess.CalledProcessError
        If seqkit exits with a non-zero status.
    ValueError
        If seqkit prints nothing, or the last line has fewer than 13 fields.
    """
    # An argument list (no shell) passes the path verbatim, so spaces or shell
    # metacharacters in it are harmless. Dropping the `| tail` pipeline also
    # means a failing seqkit is reported instead of being masked by tail's
    # exit status.
    output = subprocess.check_output(['seqkit', 'stats', '-abT', str(seqfile)])
    rows = [line for line in output.decode().splitlines() if line.strip()]
    if not rows:
        raise ValueError(f'seqkit stats produced no output for {seqfile}')

    # -T output is tab-separated; splitting on tabs (not arbitrary whitespace)
    # keeps the column positions stable when the file name contains spaces.
    fields = [field.strip() for field in rows[-1].split('\t')]
    if len(fields) < 13:
        raise ValueError(f'unexpected seqkit stats output for {seqfile}: {rows[-1]!r}')

    num_seqs, sum_len, n50 = fields[3], fields[4], fields[12]
    return num_seqs, sum_len, n50

def workflow(dirpath):
    """Build the per-sample statistics table for one run directory.

    Expected layout under ``dirpath``:

    * ``reference/profile.tsv`` - reference allele profile
    * ``profile/<sample>.*`` - one allele profile per sample
    * ``assembly_accuracy/<sample>.txt`` - tab-separated accuracy report
    * ``denovo/<sample>/contigs.fasta`` - assembled contigs

    Sample names are split on ``'-'``; the first part becomes ``depth``
    (leading zeros and ``'x'`` removed) and the second ``repeat``,
    e.g. ``'025x-1'`` -> depth ``'25'``, repeat ``'1'``.

    Parameters
    ----------
    dirpath : str or pathlib.Path
        Run directory containing the sub-directories listed above.

    Returns
    -------
    pandas.DataFrame
        One row per sample found in all three result sets (inner merges),
        with ``sample_name``, ``cgmlst_distance``, the accuracy columns
        (``qscore``, ``segment_median_qscore``, ``num_mismatches``,
        ``num_insertions``, ``num_deletions``), ``contig_num``,
        ``genome_size``, ``N50`` (as returned by ``seq_stats``), ``depth``
        and ``repeat``.
    """
    base_dirname = Path(dirpath)
    profile_dirname = base_dirname/'profile'
    reference_dirname = base_dirname/'reference'
    accuracy_dirname = base_dirname/'assembly_accuracy'
    denovo_dirname = base_dirname/'denovo'

    reference_profile = load_profile(reference_dirname/'profile.tsv')
    reference_alleles = reference_profile['allele_id'].values

    # cgMLST distance: number of loci whose allele call differs from the reference.
    distances = []
    for query_filename in profile_dirname.iterdir():
        query = load_profile(query_filename)
        # The comparison below is positional, so line the query up with the
        # reference loci first whenever row order or content differs. Loci
        # missing from the query become '' (same as a missing allele call).
        # Note: reindex raises if the query has duplicate locus ids.
        if not query.index.equals(reference_profile.index):
            query = query.reindex(reference_profile.index).fillna('')
        distance = (query['allele_id'].values != reference_alleles).sum()
        distances.append((query_filename.stem, distance))
    distance_df = pd.DataFrame(distances, columns=['sample_name', 'cgmlst_distance']).sort_values('sample_name')

    # Assembly accuracy: one report per sample, labelled by the file stem.
    accuracy_frames = []
    for report in accuracy_dirname.glob('*.txt'):
        accuracy = pd.read_csv(report, sep='\t', usecols=['qscore', 'segment_median_qscore', 'num_mismatches', 'num_insertions', 'num_deletions'])
        accuracy.index = [report.stem]  # assumes a single data row per report
        accuracy_frames.append(accuracy)
    accuracy_df = pd.concat(accuracy_frames)

    # Assembly statistics: one column per sample, transposed to one row per sample.
    assembly_df = pd.DataFrame(
        data={
            sample_dir.name: seq_stats(sample_dir/'contigs.fasta')
            for sample_dir in denovo_dirname.iterdir()
        },
        index=['contig_num', 'genome_size', 'N50']
    ).T

    df = distance_df.merge(accuracy_df, right_index=True, left_on='sample_name').merge(assembly_df, right_index=True, left_on='sample_name')

    # Iterating over the `.str` accessor (the old tuple-unpacking idiom) is not
    # supported by current pandas; pick the two parts explicitly instead.
    name_parts = df['sample_name'].str.split('-')
    df['depth'] = name_parts.str.get(0)
    df['repeat'] = name_parts.str.get(1)
    # lstrip, not strip: only leading zeros are padding ('025x' -> '25');
    # trailing zeros of the depth value must survive.
    df['depth'] = df['depth'].str.lstrip('0').str.replace('x', '', regex=False)
    return df

In [8]:
# Run directories to process with workflow(), one absolute path per line.
dirpaths = """/media/GenomicResearch/Issue/20210909_estimate_nanopore_depth/wgs-2/20211209_Lis_barcode10_R20-0148
"""

In [9]:
# Build the statistics table for every listed run directory and save it as
# <run directory>/stats.txt (tab-separated, without the index).
# One path per line: splitlines() keeps paths that contain spaces intact,
# and blank lines are skipped.
for dirpath in dirpaths.splitlines():
    dirpath = dirpath.strip()
    if not dirpath:
        continue
    df = workflow(Path(dirpath))
    df.to_csv(Path(dirpath, 'stats.txt'), sep='\t', index=False)



In [21]:
import shutil

In [29]:
# Run directories, one absolute path per line.
# NOTE(review): directory names look like <date>_<barcode>_<sample id> - confirm.
dirpaths = """/media/GenomicResearch/Issue/20210909_estimate_nanopore_depth/wgs-2/20200925_barcode01_R19-2905
/media/GenomicResearch/Issue/20210909_estimate_nanopore_depth/wgs-2/20200925_barcode02_R20-0026
/media/GenomicResearch/Issue/20210909_estimate_nanopore_depth/wgs-2/20200925_barcode04_R20-0088
/media/GenomicResearch/Issue/20210909_estimate_nanopore_depth/wgs-2/20200925_barcode05_R20-0127
/media/GenomicResearch/Issue/20210909_estimate_nanopore_depth/wgs-2/20200925_barcode06_R20-0131
/media/GenomicResearch/Issue/20210909_estimate_nanopore_depth/wgs-2/20200925_barcode10_R20-0150
/media/GenomicResearch/Issue/20210909_estimate_nanopore_depth/wgs-2/20200925_barcode11_R20-0158
/media/GenomicResearch/Issue/20210909_estimate_nanopore_depth/wgs-2/20200925_barcode12_R20-0160
/media/GenomicResearch/Issue/20210909_estimate_nanopore_depth/wgs-2/20211209_Lis_barcode07_R20-0030
/media/GenomicResearch/Issue/20210909_estimate_nanopore_depth/wgs-2/20211209_Lis_barcode08_R20-0140
/media/GenomicResearch/Issue/20210909_estimate_nanopore_depth/wgs-2/20211209_Lis_barcode09_R20-0145
/media/GenomicResearch/Issue/20210909_estimate_nanopore_depth/wgs-2/20211209_Lis_barcode10_R20-0148"""

In [30]:
# Convert the newline-separated path listing into a list of Path objects.
# The isinstance guard makes the cell safe to re-run: once `dirpaths` is
# already a list, it is passed through instead of failing on .splitlines().
if isinstance(dirpaths, str):
    dirpaths = dirpaths.splitlines()
# Blank entries are dropped so they do not turn into Path('.').
dirpaths = [Path(dirpath) for dirpath in dirpaths if str(dirpath).strip()]

In [31]:
# Gather every run's reads.fastq.gz in one directory. Each copy is named after
# the text following the last underscore of the run-directory name,
# e.g. '20200925_barcode01_R19-2905' -> 'R19-2905.fastq.gz'.
outpath = Path('/media/GenomicResearch/Issue/20220215_modification_paper_data/ONT_WGS/Fastq/')

for dirpath in dirpaths:
    src = dirpath.joinpath('reads.fastq.gz')
    dst = outpath.joinpath(dirpath.name.rpartition('_')[2] + '.fastq.gz')
    shutil.copyfile(src, dst)

In [14]:
# Reload a saved tab-separated stats table for inspection.
df = pd.read_table('/media/GenomicResearch/Issue/20210909_estimate_nanopore_depth/wgs-2/20200925_barcode12_R20-0160/stats.txt')

'025x-1'