# Process FASTQ files with scikit-bio

[Documentation](http://scikit-bio.org/docs/latest/index.html)

scikit-bio (skbio) versus Biopython — source repositories:
+ https://github.com/biocore/scikit-bio
+ https://github.com/biopython/biopython

In [1]:
import skbio.io
import skbio.alignment
import pathlib

In [2]:
# Collect every gzipped FASTQ file found anywhere below the sample directory.
directory = pathlib.Path('samples/ubiome')
fastq_paths = list(directory.glob('**/*.fastq.gz'))
# Sort in place so the listing (and fastq_paths[0]) is deterministic.
fastq_paths.sort()
fastq_paths

[PosixPath('samples/ubiome/ssr_178900/ssr_178900__R1__L001.fastq.gz'),
 PosixPath('samples/ubiome/ssr_178900/ssr_178900__R1__L002.fastq.gz'),
 PosixPath('samples/ubiome/ssr_178900/ssr_178900__R1__L003.fastq.gz'),
 PosixPath('samples/ubiome/ssr_178900/ssr_178900__R1__L004.fastq.gz'),
 PosixPath('samples/ubiome/ssr_178900/ssr_178900__R2__L001.fastq.gz'),
 PosixPath('samples/ubiome/ssr_178900/ssr_178900__R2__L002.fastq.gz'),
 PosixPath('samples/ubiome/ssr_178900/ssr_178900__R2__L003.fastq.gz'),
 PosixPath('samples/ubiome/ssr_178900/ssr_178900__R2__L004.fastq.gz')]

In [3]:
%%time
path = str(fastq_paths[0])
sequences = skbio.io.read(path, format='fastq', constructor=skbio.DNA)
sequences = list(sequences)

CPU times: user 4.12 s, sys: 24 ms, total: 4.14 s
Wall time: 4.32 s


In [4]:
# Number of records loaded into `sequences` by the preceding read.
len(sequences)

10675

In [5]:
# Show the first record; its repr lists metadata, positional metadata and stats.
sequences[0]

DNA
---------------------------------------------------------------------
Metadata:
    'description': '1:N:0:GATGATGA+CTACCTCG'
    'id': 'NB501532:123:HMH2CAFXX:1:11101:8553:1152'
Positional metadata:
    'quality': <dtype: uint8>
Stats:
    length: 151
    has gaps: False
    has degenerates: False
    has definites: True
    GC-content: 55.63%
---------------------------------------------------------------------
0   AGTGTGCCAG CAGCCGCGGT AATACGAAGG GGGCTAGCGT TGCTCGGAAT GACTGGGCGT
60  AAAGGGCGTG TAGGCGGTTT GTACAGTCAG ATGTGAAATC CCCGGGCTTA ACCTGGGAGC
120 TGCATTTGAT ACGTGCAGAC TAGAGTGTGA G

## After joining paired ends

In [6]:
# Load the joined-reads FASTQ (presumably fastq-join output, judging by the
# file name — confirm) and report how many records it contains.
# NOTE: this rebinds `path` and `sequences` from the earlier cells.
path = './samples/ubiome/ssr_178900/join_paired_ends/fastqjoin.join.fastq'
sequences = [*skbio.io.read(path, format='fastq', constructor=skbio.DNA)]
len(sequences)

93

In [7]:
# Show the record at index 3 of the currently loaded `sequences` list.
sequences[3]

DNA
-----------------------------------------------------
Metadata:
    'description': '1:N:0:GATGATGA+CTACCTCG'
    'id': 'NB501532:123:HMH2CAFXX:1:11108:23979:5586'
Positional metadata:
    'quality': <dtype: uint8>
Stats:
    length: 39
    has gaps: False
    has degenerates: False
    has definites: True
    GC-content: 69.23%
-----------------------------------------------------
0 CGTGTGCCAG CAGCCGCGGA TACCCCTGTA GTCCGGGGA