In [1]:
from io import BytesIO
from pathlib import Path
from itertools import chain

import pandas as pd
from Bio import SeqIO

In [2]:
import sys
sys.path.append('../src')
from utils import run_cmd

In [3]:
def extract_sequence_by_id(input_path, id_list, output_file, seq_id_prefix=''):
    """
    Append the FASTQ records whose ID is in ``id_list`` to ``output_file``.

    Parameters
    ----------
    input_path : str or Path
        A single FASTQ file, or a directory whose ``*.fastq`` files are all
        scanned (in sorted path order).
    id_list : iterable of str
        Record IDs to keep.
    output_file : str or Path
        Destination FASTQ file. It is opened in append mode, so successive
        calls accumulate records in the same file (and repeating a call
        writes the same reads again).
    seq_id_prefix : str, optional
        Text prepended to the ID of every record that is written.
    """
    wanted_ids = set(id_list)
    with open(output_file, 'a') as handle:
        if not wanted_ids:
            # Nothing can match, so skip parsing the input entirely. The
            # output file has still been created by the open() above.
            return
        input_path = Path(input_path)
        if input_path.is_file():
            records = SeqIO.parse(input_path, 'fastq')
        else:
            # Sorted so the output order does not depend on the file system.
            records = chain.from_iterable(
                SeqIO.parse(filepath, 'fastq')
                for filepath in sorted(input_path.glob('*.fastq'))
            )
        for record in records:
            original_id = record.id
            if original_id not in wanted_ids:
                continue
            record.id = seq_id_prefix + original_id
            # Biopython's FASTQ writer uses the description as the title line
            # only when it starts with the record ID; otherwise it writes
            # "<id> <description>". Rename the ID inside the description too,
            # so the old ID is not repeated after the prefixed one.
            description = record.description
            if (description.startswith(original_id)
                    and description.split(None, 1)[:1] == [original_id]):
                record.description = record.id + description[len(original_id):]
            handle.write(record.format('fastq'))

def fx2tab(input_path, threads=8):
    """
    Convert FASTQ to tabular format (read ID and read length) using seqkit.

    Parameters
    ----------
    input_path : str or Path
        A single FASTQ file, or a directory; for a directory the ``*.fastq``
        pattern inside it is handed to seqkit.
    threads : int, optional
        Value passed to seqkit's ``-j`` option. Defaults to 8, the value that
        was previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        Table with the columns ``id`` and ``length`` parsed from seqkit's
        tab-separated standard output.
    """
    input_path = Path(input_path)
    target = input_path if input_path.is_file() else input_path/'*.fastq'
    # -inl: only ID and length.
    # NOTE(review): the directory case relies on run_cmd expanding the glob
    # (presumably by running the command through a shell) - confirm.
    child_process = run_cmd(f"seqkit fx2tab -inl -j {threads} {target}")
    # Read IDs are forced to str so that purely numeric IDs are not parsed as
    # integers, which would break string matching against record IDs later.
    df = pd.read_csv(
        BytesIO(child_process.stdout),
        sep='\t',
        names=['id', 'length'],
        dtype={'id': str},
    )
    return df

def keep_sequence_by_bases_number(input_path, numbases, random_state=None):
    """
    Randomly pick read IDs until their combined length reaches ``numbases``.

    Reads are shuffled and taken in that order; a read is kept while the
    total length of the reads kept before it is still below ``numbases``, so
    the read that crosses the threshold is included.

    Parameters
    ----------
    input_path : str or Path
        FASTQ file or directory, forwarded to ``fx2tab``.
    numbases : int or float
        Target number of bases. Values <= 0 select nothing.
    random_state : optional
        Forwarded to ``DataFrame.sample`` to make the shuffle reproducible.
        The default ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    set
        IDs of the selected reads.
    """
    if numbases <= 0:
        # No read can be selected; avoid running seqkit on the input at all.
        return set()
    table = fx2tab(input_path)
    shuffled = table.sample(frac=1, random_state=random_state)  # random shuffle
    lengths = shuffled['length']
    # Bases accumulated before each read. Read lengths are never negative, so
    # this is non-decreasing and the mask below selects a leading run of rows,
    # exactly like stopping at the first read that fails the test.
    bases_before = lengths.cumsum() - lengths
    return set(shuffled.loc[bases_before < numbases, 'id'])

In [4]:
# Total number of bases targeted for the mixed output; each genome's share
# is a percentage of this value.
final_bases_number = 100_000_000

In [5]:
# Input read locations for the two genomes that are mixed below.
genome_1_input, genome_2_input = map(Path, (
    '/media/NAS/NanoporeOutput/20210414_Bacteria/Barcode/barcode01',
    '/media/NAS/NanoporeOutput/20210414_Bacteria/Barcode/barcode02',
))

In [6]:
# Share (in percent) of final_bases_number to draw from each genome's reads.
genome_1_bases_percentage, genome_2_bases_percentage = 0, 100

In [7]:
# FASTQ file that the selected reads of both genomes are appended to.
# Plain string literal: the former f-prefix had no placeholders to fill.
output_file = '/media/NGS/Data_Analysis/20201130_rapid_diagnostic_with_nanopore/host_contamination_samples/experiment-2/Enterococcus_hormaechei.fastq'
output_file

'/media/NGS/Data_Analysis/20201130_rapid_diagnostic_with_nanopore/host_contamination_samples/experiment-2/Enterococcus_hormaechei.fastq'

In [8]:
# One entry per genome: (input location, share of final_bases_number in
# percent, prefix added to the IDs of the reads that are written).
mix_plan = [
    (genome_1_input, genome_1_bases_percentage, 'THU25713455-'),
    (genome_2_input, genome_2_bases_percentage, 'TUECL19004-'),
]

kept_id_sets = []
for source_path, percentage, id_prefix in mix_plan:
    numbases = percentage/100 * final_bases_number
    selected_ids = keep_sequence_by_bases_number(source_path, numbases)
    # extract_sequence_by_id appends, so both genomes end up in output_file.
    extract_sequence_by_id(source_path, selected_ids, output_file, id_prefix)
    kept_id_sets.append(selected_ids)

genome_1_keep_read_ids, genome_2_keep_read_ids = kept_id_sets

In [None]:
genome_1_seqfile = "/media/NGS/Nanopore_1/20190917_R19_2839/2_unicycler/assembly.fasta"
# Genome size: combined length of every sequence record in the FASTA file.
genome_1_gsize = sum(map(lambda contig: len(contig.seq), SeqIO.parse(genome_1_seqfile, 'fasta')))

In [None]:
genome_2_seqfile = "/media/NGS/Nanopore_1/20200923/contigs/AMR200035.fa"
# Genome size: combined length of every sequence record in the FASTA file.
genome_2_gsize = sum(map(lambda contig: len(contig.seq), SeqIO.parse(genome_2_seqfile, 'fasta')))

In [None]:
# Directory whose entries are each summarised with fx2tab further down.
dirpath = Path(
    '/media/NGS/Data_Analysis/20201130_rapid_diagnostic_with_nanopore/mixed_samples/experiment-1/fastq'
)

In [None]:
# Read-ID prefixes used to tell the two genomes' reads apart.
genome_1_prefix, genome_2_prefix = 'R19-2839', 'AMR200035'

In [None]:
# Genome size for each read-ID prefix.
gsize_by_prefix = {genome_1_prefix: genome_1_gsize, genome_2_prefix: genome_2_gsize}

# For every entry in dirpath: total length of the reads whose ID starts with
# a genome's prefix, divided by that genome's size.
data = {}
for filepath in dirpath.iterdir():
    read_table = fx2tab(filepath)
    data[filepath.stem] = {
        prefix: read_table.loc[read_table['id'].str.startswith(prefix), 'length'].sum()/gsize
        for prefix, gsize in gsize_by_prefix.items()
    }

In [None]:
# One row per sample (sorted by name), one column per genome prefix.
# DataFrame.round() rounds the whole table to 0 decimals in one vectorised
# call, replacing the row-by-row apply(round, axis=1).
df = pd.DataFrame(data).T.sort_index().round()

In [None]:
# Display the rounded per-sample table built in the previous cell.
df

In [None]:
# Save the table as a tab-separated text file.
depth_table_path = '/media/NGS/Data_Analysis/20201130_rapid_diagnostic_with_nanopore/mixed_samples/experiment-1/depth.txt'
df.to_csv(depth_table_path, sep='\t')