In [1]:
%%html
<link rel='stylesheet' type='text/css' href='custom.css'/>

In [2]:
!rm data/converted-seqs.fasta data/converted-seqs.qual data/not-yasf.fna

rm: cannot remove ‘data/converted-seqs.fasta’: No such file or directory
rm: cannot remove ‘data/converted-seqs.qual’: No such file or directory
rm: cannot remove ‘data/not-yasf.fna’: No such file or directory


![](assets/logo.svg)
# A Bioinformatics Library for Data Scientists, Students, and Developers

Jai Rideout and Evan Bolyen

> "The first step in developing a new genetic analysis algorithm is to decide how to make the input data file format different from all pre-existing analysis data file formats." - [Law's First Law](http://www.bioinformatics.roslin.ed.ac.uk/lawslaws/)















<span style='line-height:2em; word-spacing:2em'>Axt BAM SAM BED bedGraph bigBed bigGenePred table bigWig Chain GenePred table GFF GTF HAL MAF Microarray Net Personal Genome SNP format PSL VCF WIG  abi ace clustal embl fasta fastq genbank ig imgt nexus phred phylip pir seqxml sff stockholm swiss tab qual uniprot-xml emboss PhyloXML NexML newick CDAO MDL bcf caf gcproj scf SBML lsmat ordination qseq BIOM ASN.1 .2bit .nib ENCODE ... </span>

In [None]:
#TODO: review the list

<span style='line-height:2em; word-spacing:2em'>Axt BAM SAM BED bedGraph bigBed bigGenePred table bigWig Chain GenePred table GFF GTF HAL MAF Microarray Net Personal Genome SNP format PSL VCF WIG  abi ace <span class='supio'>clustal</span> embl <span class='supio'>fasta</span> <span class='supio'>fastq</span> genbank ig imgt nexus phred <span class='supio'>phylip</span> pir seqxml sff stockholm swiss tab qual uniprot-xml emboss PhyloXML NexML <span class='supio'>newick</span> CDAO MDL bcf caf gcproj scf SBML <span class='supio'>lsmat</span> <span class='supio'>ordination</span> <span class='supio'>qseq</span> BIOM ASN.1 .2bit .nib ENCODE ... </span>

## I/O in bioinformatics is hard


- format redundancy (many-to-many)

- Multiple file formats can be **read** into the same object.
- A single object can be **written** in multiple formats.

- format ambiguity

- heterogeneous sources

- efficient format conversion

## How can we solve this?


# I/O Registry


- file format implemented in single submodule
- registry provides simple API to implement format against
- (messy) format logic separate from object implementation

## Format redundancy (many-to-many)


In [3]:
from skbio import DNA

# Build one DNA object from a FASTA file plus a separate QUAL file, and a
# second one from a FASTQ file (quality variant given explicitly).
seq1 = DNA.read('data/seqs.fasta', qual='data/seqs.qual')
seq2 = DNA.read('data/seqs.fastq', variant='illumina1.8')
# Last expression of the cell: display the object read from FASTA + QUAL.
seq1 

DNA
---------------------------------------------------------------------
Metadata:
    'description': ''
    'id': 'M10_68:1:1:19607:29475#0/1'
Positional metadata:
    'quality': <dtype: int64>
Stats:
    length: 152
    has gaps: False
    has degenerates: False
    has non-degenerates: True
    GC-content: 45.39%
---------------------------------------------------------------------
0   GACATAAGGG TGGTTAGTAT ACCGGCAAGG ACGGGGTTAC TAGTGACGTC CTTCCCCGTA
60  TGCCGGGCAA TAATGTTTAT GTTGGTTTCA TGGTTTGGTC TAACTTTACC GCTACTAAAT
120 GCTGCGGATT GGTTTCGCTG AATCAGATTA TT

In [4]:
seq1 == seq2

False

## Format ambiguity

It is often unclear to the user what the format of a file is

Extensions aren't formalized, or do not exist (fasta/fna/txt)

In [5]:
import skbio.io

# Ask the I/O registry to guess the format of a file whose name gives no hint.
# The recorded result is a (format name, dict of detected options) pair.
skbio.io.sniff('data/mystery_file.gz')

('lsmat', {'delimiter': ','})

## Heterogeneous sources

In [6]:
from skbio import TreeNode

# Source #1: an HTTP URL. No format is named; only the target type is given.
# NOTE(review): the URL points at localhost:8888, so this cell presumably
# needs the notebook server to be running on that port — confirm before use.
tree1 = skbio.io.read('http://localhost:8888/files/data/newick.gz', into=TreeNode)
print(tree1.ascii_art())

                    /-a
                   |
          /--------|--b
         |         |
---------|          \-c
         |
          \-d


In [7]:
import io 

# Source #2: an already-open binary file handle instead of a path.
# NOTE(review): the file is named .bz2 and is presumably decompressed
# transparently by the reader — confirm.
with io.open('data/newick.bz2', mode='rb') as open_filehandle:
    tree2 = skbio.io.read(open_filehandle, into=TreeNode)

print(tree2.ascii_art())      

                    /-a
                   |
          /--------|--b
         |         |
---------|          \-c
         |
          \-d


In [8]:
# Source #3: an in-memory list containing the newick text itself.
tree3 = skbio.io.read([u'((a, b, c), d:15):0;'], into=TreeNode)
print(tree3.ascii_art())

                    /-a
                   |
          /--------|--b
         |         |
---------|          \-c
         |
          \-d


## Efficient format conversion

In [9]:
# Read FASTQ without an `into=` target; the result is handed straight to
# skbio.io.write in the following cell.
# NOTE(review): presumably a lazy generator of sequences — confirm.
stream_of_seqs = skbio.io.read("data/seqs.fastq", format='fastq', 
                               variant='illumina1.8')

In [10]:
# Write the sequences as FASTA to the `into` path, with the quality scores
# going to a separate QUAL file at the `qual` path.
skbio.io.write(stream_of_seqs, format='fasta', into='data/converted-seqs.fasta', 
               qual='data/converted-seqs.qual')

'data/converted-seqs.fasta'

In [11]:
!head -2 data/converted-seqs.fasta

>M10_68:1:1:19607:29475#0/1
GACATAAGGGTGGTTAGTATACCGGCAAGGACGGGGTTACTAGTGACGTCCTTCCCCGTATGCCGGGCAATAATGTTTATGTTGGTTTCATGGTTTGGTCTAACTTTACCGCTACTAAATGCTGCGGATTGGTTTCGCTGAATCAGATTATT


In [12]:
!head -2 data/converted-seqs.qual

>M10_68:1:1:19607:29475#0/1
26 31 31 35 28 10 17 32 35 35 27 27 31 27 34 38 38 38 27 27 32 17 34 36 39 37 31 25 25 15 15 8 15 30 35 6 27 6 21 34 31 22 8 13 8 22 32 20 32 36 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2


## Let's make a format!

# YASF (Yet Another Sequence Format)

In [13]:
!cat data/yasf-seq.yml

#YASF
ID: presentation-id-1
Location: Austin, TX
Description: SciPy 2015 is awesome!
Sequence: >
    ACGTGCTAGCTCGATCGCTAGCCCGAGAGCGCAGCGCGCTAGCTCGATCGCGCTAGAGAGAGATCGCGCTAGCGWA

In [14]:
from skbio import DNA
import skbio.io 
import yaml

# Create a format object named 'yasf'; its sniffer()/reader() decorators are
# used below to attach the format's implementation.
yasf = skbio.io.create_format('yasf')

@yasf.sniffer()
def yasf_sniffer(fh):
    """Report whether the open file `fh` looks like a YASF file.

    Only the first line is inspected: it must equal ``#YASF`` once trailing
    whitespace is stripped. Returns a ``(matched, kwargs)`` tuple where
    ``kwargs`` is always an empty dict.
    """
    header = fh.readline()
    is_yasf = header.rstrip() == "#YASF"
    return is_yasf, {}

@yasf.reader(DNA)
def yasf_to_dna(fh):
    """Read a YASF document from the open file `fh` into a ``DNA`` object.

    The document is parsed as YAML and must provide the keys ``Sequence``,
    ``ID``, ``Location`` and ``Description``; a missing key raises
    ``KeyError``. The sequence text becomes the DNA sequence and the other
    three values are stored in its metadata under ``id``, ``location`` and
    ``description``.
    """
    # safe_load only builds plain Python objects, so a file from an untrusted
    # source cannot trigger arbitrary object construction. It also avoids the
    # Loader argument that newer PyYAML releases require for yaml.load().
    record = yaml.safe_load(fh)
    return DNA(record['Sequence'], metadata={
        'id': record['ID'],
        'location': record['Location'],
        'description': record['Description']
    })

# NOTE(review): presumably refreshes the read/write methods on skbio classes
# so that DNA.read (used in the next cell) can handle 'yasf' — confirm.
skbio.io.io_registry.monkey_patch()

In [15]:
# Read the YASF file through the object-oriented API; no format is named.
seq = DNA.read("data/yasf-seq.yml")
seq

DNA
--------------------------------------------------------------------
Metadata:
    'description': 'SciPy 2015 is awesome!'
    'id': 'presentation-id-1'
    'location': 'Austin, TX'
Stats:
    length: 76
    has gaps: False
    has degenerates: True
    has non-degenerates: True
    GC-content: 63.16%
--------------------------------------------------------------------
0  ACGTGCTAGC TCGATCGCTA GCCCGAGAGC GCAGCGCGCT AGCTCGATCG CGCTAGAGAG
60 AGATCGCGCT AGCGWA

In [16]:
# Same file through the procedural API, choosing the target type with `into`.
seq = skbio.io.read("data/yasf-seq.yml", into=DNA)
seq

DNA
--------------------------------------------------------------------
Metadata:
    'description': 'SciPy 2015 is awesome!'
    'id': 'presentation-id-1'
    'location': 'Austin, TX'
Stats:
    length: 76
    has gaps: False
    has degenerates: True
    has non-degenerates: True
    GC-content: 63.16%
--------------------------------------------------------------------
0  ACGTGCTAGC TCGATCGCTA GCCCGAGAGC GCAGCGCGCT AGCTCGATCG CGCTAGAGAG
60 AGATCGCGCT AGCGWA

In [17]:
# Write the sequence that was read from YASF back out as FASTA, then show
# the resulting file.
seq.write("data/not-yasf.fna", format='fasta')
!cat data/not-yasf.fna

>presentation-id-1 SciPy 2015 is awesome!
ACGTGCTAGCTCGATCGCTAGCCCGAGAGCGCAGCGCGCTAGCTCGATCGCGCTAGAGAGAGATCGCGCTAGCGWA


Talk about how developers using scikit-bio can rely on our object model to support current and future file formats

## We are in beta - should you even use our software?

# YES!

## API Lifecycle
![](assets/stability-state-diagram.svg)


### What is stable:

- `skbio.io` 
- `skbio.sequence`

&nbsp;
&nbsp;
### What is left:

- `skbio.alignment`
- `skbio.tree`
- `skbio.diversity`
- `skbio.stats`
- &lt;`your awesome subpackage!`&gt;

## Sequence API: putting the "scikit" in scikit-bio

Interoperability with scipy-stack

"numpythonic" API

performance

## Interoperability with scipy stack

Here's a sequence, now what?

In [30]:
# NOTE(review): the recorded output of this cell is a FileNotFoundError for
# 'data/gapped-seq.fastq'. The following cells all use `seq`, so confirm the
# file exists (or correct the path) and re-run before presenting.
seq = DNA.read('data/gapped-seq.fastq', variant='illumina1.8')
seq

FileNotFoundError: [Errno 2] No such file or directory: 'data/gapped-seq.fastq'

In [25]:
seq.values

array([b'G', b'A', b'C', b'A', b'T', b'A', b'A', b'G', b'G', b'G', b'T',
       b'G', b'G', b'T', b'T', b'A', b'G', b'T', b'A', b'T', b'A', b'C',
       b'C', b'G', b'G', b'C', b'A', b'A', b'G', b'G', b'A', b'C', b'G',
       b'G', b'G', b'G', b'T', b'T', b'A', b'C', b'T', b'A', b'G', b'T',
       b'G', b'A', b'C', b'G', b'T', b'C', b'C', b'T', b'T', b'C', b'C',
       b'C', b'C', b'G', b'T', b'A', b'T', b'G', b'C', b'C', b'G', b'G',
       b'G', b'C', b'A', b'A', b'T', b'A', b'A', b'T', b'G', b'T', b'T',
       b'T', b'A', b'T', b'G', b'T', b'T', b'G', b'G', b'T', b'T', b'T',
       b'C', b'A', b'T', b'G', b'G', b'T', b'T', b'T', b'G', b'G', b'T',
       b'C', b'T', b'A', b'A', b'C', b'T', b'T', b'T', b'A', b'C', b'C',
       b'G', b'C', b'T', b'A', b'C', b'T', b'A', b'A', b'A', b'T', b'G',
       b'C', b'T', b'G', b'C', b'G', b'G', b'A', b'T', b'T', b'G', b'G',
       b'T', b'T', b'T', b'C', b'G', b'C', b'T', b'G', b'A', b'A', b'T',
       b'C', b'A', b'G', b'A', b'T', b'T', b'A', b'

In [26]:
seq.positional_metadata

Unnamed: 0,quality
0,26
1,31
2,31
3,35
4,28
5,10
6,17
7,32
8,35
9,35


In [27]:
seq.gaps()

array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False,

In [28]:
seq[~seq.gaps()]

DNA
---------------------------------------------------------------------
Metadata:
    'description': ''
    'id': 'M10_68:1:1:19607:29475#0/1'
Positional metadata:
    'quality': <dtype: uint8>
Stats:
    length: 152
    has gaps: False
    has degenerates: False
    has non-degenerates: True
    GC-content: 45.39%
---------------------------------------------------------------------
0   GACATAAGGG TGGTTAGTAT ACCGGCAAGG ACGGGGTTAC TAGTGACGTC CTTCCCCGTA
60  TGCCGGGCAA TAATGTTTAT GTTGGTTTCA TGGTTTGGTC TAACTTTACC GCTACTAAAT
120 GCTGCGGATT GGTTTCGCTG AATCAGATTA TT

In [29]:
seq.degap()

DNA
---------------------------------------------------------------------
Metadata:
    'description': ''
    'id': 'M10_68:1:1:19607:29475#0/1'
Positional metadata:
    'quality': <dtype: uint8>
Stats:
    length: 152
    has gaps: False
    has degenerates: False
    has non-degenerates: True
    GC-content: 45.39%
---------------------------------------------------------------------
0   GACATAAGGG TGGTTAGTAT ACCGGCAAGG ACGGGGTTAC TAGTGACGTC CTTCCCCGTA
60  TGCCGGGCAA TAATGTTTAT GTTGGTTTCA TGGTTTGGTC TAACTTTACC GCTACTAAAT
120 GCTGCGGATT GGTTTCGCTG AATCAGATTA TT

## A flexible and powerful API

... demonstrated with a slightly contrived example:

In [None]:
# Imported here because no earlier cell in the notebook brings `np` into scope.
import numpy as np

# Contrived example: for every exon that contains an N-glycosylation motif,
# print where that exon starts in the original sequence.
# "somefile" is a placeholder path.
# NOTE(review): lowercase='exon' presumably yields a boolean 'exon' column in
# the positional metadata — confirm against the reader's documentation.
seq = DNA.read("somefile", lowercase='exon')

# Tag every position with its index in the original sequence.
seq.positional_metadata['position'] = np.arange(len(seq))

# NOTE(review): confirm that iter_contiguous accepts an `ignore` argument in
# the scikit-bio version being presented.
for exon in seq.iter_contiguous('exon',
                                ignore=seq.gaps()):
    translated = exon.translate()
    n_glyco_positions = translated.find_motifs("N-glycosylation")

    # A non-empty selection means at least one motif was found in this exon.
    if translated[n_glyco_positions]:
        # Must use the same column name that was assigned above ('position');
        # .iloc[0] takes the first row regardless of the index labels.
        print(exon.positional_metadata['position'].iloc[0])
