In [26]:
import os
import sys
from pathlib import Path
from collections import defaultdict
from concurrent.futures import ProcessPoolExecutor

sys.path.append('../src')
from utils import run_cmd

In [27]:
def fn(raw_1, raw_2, outdir, output='', threads=16):
    """Trim one pair of FASTQ files with fastp and return the output paths.

    Builds a single fastp command line and hands it to ``run_cmd``. Every
    file fastp is asked to write lives in ``outdir`` and starts with the
    ``output`` prefix.

    Parameters
    ----------
    raw_1, raw_2 : str or path-like
        Input FASTQ files, passed to fastp as ``-i`` and ``-I``.
    outdir : str or path-like
        Directory that receives the output files.
    output : str, optional
        File-name prefix shared by all output files (default ``''``).
    threads : int, optional
        Value passed to fastp's ``--thread`` option (default 16).

    Returns
    -------
    tuple of str
        ``(paired_1, paired_2, unpaired)`` -- the paths given to fastp for
        the two paired outputs and the shared unpaired-reads output.
    """
    paired_1 = os.path.join(outdir, output + '_R1.fastq.gz')
    paired_2 = os.path.join(outdir, output + '_R2.fastq.gz')
    # Both --unpaired1 and --unpaired2 point at the same file on purpose.
    unpaired = os.path.join(outdir, output + '_unpaired.fastq.gz')
    # The command below references these two report paths; they were
    # previously undefined, so every call raised NameError before fastp ran.
    # Prefixing them keeps reports of samples trimmed in parallel apart.
    json_report = os.path.join(outdir, output + '_fastp.json')
    html_report = os.path.join(outdir, output + '_fastp.html')
    cmd = f"fastp -i {raw_1} -I {raw_2} -o {paired_1} -O {paired_2} --unpaired1 {unpaired} --unpaired2 {unpaired} "\
          f"--length_required 36 --cut_front 3 --cut_tail 3 --thread {threads} --detect_adapter_for_pe -j {json_report} -h {html_report} -t 1"
    run_cmd(cmd)
    return paired_1, paired_2, unpaired

In [28]:
# Raw R1/R2 FASTQ paths, one entry per file. The value is kept as a single
# newline-terminated string because the following cell splits it into a list.
raw_files = "\n".join((
    "/media/BioNumerics/NGS_Salmonella/R18.0246_S44_L001_R1_001.fastq.gz",
    "/media/BioNumerics/NGS_Salmonella/R18.0246_S44_L001_R2_001.fastq.gz",
    "/media/BioNumerics/NGS_Salmonella/R19.0144_S17_L001_R1_001.fastq.gz",
    "/media/BioNumerics/NGS_Salmonella/R19.0144_S17_L001_R2_001.fastq.gz",
    "/media/BioNumerics/NGS_Salmonella/R21.1368_S45_L001_R1_001.fastq.gz",
    "/media/BioNumerics/NGS_Salmonella/R21.1368_S45_L001_R2_001.fastq.gz",
    "/media/BioNumerics/NGS_Salmonella/R21.1436_S46_L001_R1_001.fastq.gz",
    "/media/BioNumerics/NGS_Salmonella/R21.1436_S46_L001_R2_001.fastq.gz",
)) + "\n"

In [29]:
# Turn the whitespace-separated listing into a list of paths. The isinstance
# guard keeps this cell idempotent: once raw_files is already a list, running
# the cell again used to raise AttributeError (lists have no .split()).
if isinstance(raw_files, str):
    raw_files = raw_files.split()

In [30]:
# Group the FASTQ paths by sample: the key is everything in the file name
# before the first underscore (e.g. 'R18.0246'), and the value collects that
# sample's paths in the order they appear in raw_files.
pairwise = defaultdict(list)
for raw in raw_files:
    sample_id, _sep, _rest = Path(raw).stem.partition('_')
    pairwise[sample_id].append(raw)
pairwise

defaultdict(list,
            {'R18.0246': ['/media/BioNumerics/NGS_Salmonella/R18.0246_S44_L001_R1_001.fastq.gz',
              '/media/BioNumerics/NGS_Salmonella/R18.0246_S44_L001_R2_001.fastq.gz'],
             'R19.0144': ['/media/BioNumerics/NGS_Salmonella/R19.0144_S17_L001_R1_001.fastq.gz',
              '/media/BioNumerics/NGS_Salmonella/R19.0144_S17_L001_R2_001.fastq.gz'],
             'R21.1368': ['/media/BioNumerics/NGS_Salmonella/R21.1368_S45_L001_R1_001.fastq.gz',
              '/media/BioNumerics/NGS_Salmonella/R21.1368_S45_L001_R2_001.fastq.gz'],
             'R21.1436': ['/media/BioNumerics/NGS_Salmonella/R21.1436_S46_L001_R1_001.fastq.gz',
              '/media/BioNumerics/NGS_Salmonella/R21.1436_S46_L001_R2_001.fastq.gz']})

In [31]:
outpath = Path('/media/Central_Lab_Storage/MinION_2022/20220106_Sal/fastq')
# Make sure the destination exists before any worker tries to write into it.
outpath.mkdir(parents=True, exist_ok=True)

# Up to 8 samples are trimmed concurrently, each fastp run being asked for
# 8 threads. Each entry of `pairwise` is unpacked as (first, second) file.
with ProcessPoolExecutor(8) as executor:
    futures = {
        prefix: executor.submit(fn, sr_1, sr_2, outpath, prefix, 8)
        for prefix, (sr_1, sr_2) in pairwise.items()
    }

# Future.result() re-raises any exception that occurred inside the worker.
# The futures used to be discarded, so a failing fn() call went unnoticed.
trimmed = {prefix: future.result() for prefix, future in futures.items()}

In [10]:
import json

In [11]:
# Read the fastp JSON report in one go and parse it into plain Python objects.
data = json.loads(Path('/home/chen1i6c04/fastp.json').read_text())

In [13]:
# List the top-level sections of the loaded report.
data.keys()

dict_keys(['summary', 'filtering_result', 'duplication', 'insert_size', 'adapter_cutting', 'read1_before_filtering', 'read2_before_filtering', 'read1_after_filtering', 'read2_after_filtering', 'command'])

In [25]:
# Show the 'summary' section of the loaded report.
data['summary']

{'fastp_version': '0.22.0',
 'sequencing': 'paired end (301 cycles + 301 cycles)',
 'before_filtering': {'total_reads': 646222,
  'total_bases': 188780197,
  'q20_bases': 156062188,
  'q30_bases': 131278537,
  'q20_rate': 0.826687,
  'q30_rate': 0.695404,
  'read1_mean_length': 291,
  'read2_mean_length': 293,
  'gc_content': 0.523031},
 'after_filtering': {'total_reads': 614714,
  'total_bases': 167212821,
  'q20_bases': 148398255,
  'q30_bases': 127114460,
  'q20_rate': 0.887481,
  'q30_rate': 0.760196,
  'read1_mean_length': 282,
  'read2_mean_length': 261,
  'gc_content': 0.517632}}

In [24]:
# Share of read-2 bases (after filtering) counted under 'q30_bases',
# expressed as a percentage of that read's 'total_bases'.
read2_stats = data['read2_after_filtering']
read2_stats['q30_bases'] / read2_stats['total_bases'] * 100

66.60281937888784