Skip to content

Commit

Permalink
Added compress command
Browse files Browse the repository at this point in the history
  • Loading branch information
fredericlemoine committed Mar 29, 2019
1 parent aa6445c commit a5879e3
Show file tree
Hide file tree
Showing 9 changed files with 347 additions and 1 deletion.
4 changes: 4 additions & 0 deletions Gopkg.toml
Expand Up @@ -2,6 +2,10 @@
name = "github.com/spf13/cobra"
version = "0.0.3"

[[constraint]]
name = "github.com/armon/go-radix"
version = "1.0.0"

[[constraint]]
branch = "master"
name = "github.com/fredericlemoine/cobrashell"
1 change: 1 addition & 0 deletions README.md
Expand Up @@ -122,6 +122,7 @@ You may go to the [doc](docs/index.md) for a more detailed documentation of the
* sites : Removes sites with gaps
* seqs : Removes sequences with gaps
* codonalign: Aligns a given nt fasta file using a corresponding aa alignment (by codons)
* compress: Removes identical patterns/sites from alignment
* compute: Different computations (distances, etc.)
* distances: compute evolutionary distances for nucleotide alignment
* entropy: compute entropy of alignment sites
Expand Down
47 changes: 47 additions & 0 deletions align/align.go
Expand Up @@ -11,6 +11,7 @@ import (
"strings"
"unicode"

"github.com/armon/go-radix"
"github.com/evolbioinfo/goalign/io"
)

Expand All @@ -22,6 +23,9 @@ type Alignment interface {
CharStatsSite(site int) (map[rune]int, error)
Clone() (Alignment, error)
CodonAlign(ntseqs SeqBag) (codonAl *align, err error)
// Remove identical patterns/sites and return number of occurence
// of each pattern (order of patterns/sites may have changed)
Compress() []int
// concatenates the given alignment with this alignment
Concat(Alignment) error
// Compares all sequences to the first one and counts all differences per sequence
Expand Down Expand Up @@ -988,6 +992,49 @@ func (a *align) RandSubAlign(length int) (Alignment, error) {
return subalign, nil
}

/*
Remove identical patterns/sites and return number of occurence
of each pattern (order of patterns/sites may have changed)
*/
func (a *align) Compress() (weights []int) {
var count interface{}
var ok bool
r := radix.New()
npat := 0
// We add new patterns if not already insterted in the radix tree
for site := 0; site < a.Length(); site++ {
pattern := make([]rune, a.NbSequences())
for seq := 0; seq < a.NbSequences(); seq++ {
pattern[seq] = a.seqs[seq].sequence[site]
}
patstring := string(pattern)
if count, ok = r.Get(patstring); !ok {
npat++
count = &struct{ count int }{0}
}
count.(*struct{ count int }).count++
r.Insert(patstring, count)
}
// Init weights
weights = make([]int, npat)
// We add the patterns
npat = 0
r.Walk(func(pattern string, count interface{}) bool {
weights[npat] = count.(*struct{ count int }).count
for seq, c := range pattern {
a.seqs[seq].sequence[npat] = c
}
npat++
return false
})
// We remove what remains of the sequences after al patterns
for seq := 0; seq < a.NbSequences(); seq++ {
a.seqs[seq].sequence = a.seqs[seq].sequence[:npat]
}
a.length = npat
return
}

/*
Concatenates both alignments. It appends the given alignment to this alignment.
If a sequence is present in this alignment and not in c, then it adds a full gap sequence.
Expand Down
68 changes: 68 additions & 0 deletions align/align_test.go
Expand Up @@ -1169,3 +1169,71 @@ func TestDiffCount(t *testing.T) {
}

}

func TestCompress(t *testing.T) {
in := NewAlign(UNKNOWN)
in.AddSequence("Seq0000", "GGTTTTTTTT", "")
in.AddSequence("Seq0001", "TTCCCCACCC", "")
in.AddSequence("Seq0002", "GGTTTTTTTT", "")
in.AddSequence("Seq0003", "GGTTTTTTTT", "")
in.AutoAlphabet()

exp := NewAlign(UNKNOWN)
exp.AddSequence("Seq0000", "GTT", "")
exp.AddSequence("Seq0001", "TAC", "")
exp.AddSequence("Seq0002", "GTT", "")
exp.AddSequence("Seq0003", "GTT", "")
exp.AutoAlphabet()

expw := []int{2, 1, 7}

w := in.Compress()

if len(w) != len(expw) {
t.Error(fmt.Errorf("Number patterns is not what is expected %v vs. %v", w, expw))
}

for i, e := range w {
if e != expw[i] {
t.Error(fmt.Errorf("Pattern %d is different %v vs. %v", i, w, expw))
}
}

if !exp.Identical(in) {
t.Error(fmt.Errorf("Compressed alignment is different from expected \n %s \n vs. \n %s", in.String(), exp.String()))
}
}

func TestCompress2(t *testing.T) {
in := NewAlign(UNKNOWN)
in.AddSequence("Seq0000", "GGGGGGGGGGGGGGGGGGGG", "")
in.AddSequence("Seq0001", "TTTTTTTTTTTTTTTTTTTT", "")
in.AddSequence("Seq0002", "GGGGGGGGGGGGGGGGGGGG", "")
in.AddSequence("Seq0003", "AAAAAAAAAAAAAAAAAAAA", "")
in.AutoAlphabet()

exp := NewAlign(UNKNOWN)
exp.AddSequence("Seq0000", "G", "")
exp.AddSequence("Seq0001", "T", "")
exp.AddSequence("Seq0002", "G", "")
exp.AddSequence("Seq0003", "A", "")
exp.AutoAlphabet()

expw := []int{20}

w := in.Compress()

if len(w) != len(expw) {
t.Error(fmt.Errorf("Number patterns is not what is expected %v vs. %v", w, expw))
}

for i, e := range w {
if e != expw[i] {
t.Error(fmt.Errorf("Pattern %d is different %v vs. %v", i, w, expw))
}
}

if !exp.Identical(in) {
t.Error(fmt.Errorf("Compressed alignment is different from expected \n %s \n vs. \n %s", in.String(), exp.String()))
}
}
89 changes: 89 additions & 0 deletions cmd/compress.go
@@ -0,0 +1,89 @@
package cmd

import (
"fmt"
"os"

"github.com/evolbioinfo/goalign/align"
"github.com/evolbioinfo/goalign/io"
"github.com/spf13/cobra"
)

var compressOutput string
var compressWeightOutput string

var compressCmd = &cobra.Command{
Use: "compress",
Short: "Removes identical patterns/sites from an input alignment",
Long: `Removes identical patterns/sites from an input alignment
And prints in the weight file the number of occurence of each pattern
Example:
ali.phy
1 GGGGGGGGGGGGGGGGGGGG
2 TTTTTTTTTTTTTTTTTTTT
3 GGGGGGGGGGCCCCCCCCCC
4 AAAAAAAAAAAAAAAAAAAA
goalign compress -i ali.phy will produce:
1 GG
2 TT
3 GC
4 AA
and weight file:
10
10
`,
RunE: func(cmd *cobra.Command, args []string) (err error) {
var aligns *align.AlignChannel
var f, wf *os.File

if aligns, err = readalign(infile); err != nil {
io.LogError(err)
return
}
if f, err = openWriteFile(compressOutput); err != nil {
io.LogError(err)
return
}
defer closeWriteFile(f, compressOutput)

if wf, err = openWriteFile(compressWeightOutput); err != nil {
io.LogError(err)
return
}
defer closeWriteFile(f, compressWeightOutput)

for al := range aligns.Achan {
var w []int
if w = al.Compress(); err != nil {
io.LogError(err)
return
} else {
writeAlign(al, f)
writeWeights(w, wf)
}
}

if aligns.Err != nil {
err = aligns.Err
io.LogError(err)
}
return
},
}

func init() {
compressCmd.PersistentFlags().StringVarP(&compressOutput, "output", "o", "stdout", "Compressed output alignment file")
compressCmd.PersistentFlags().StringVar(&compressWeightOutput, "weight-out", "none", "Pattern weight output file")
RootCmd.AddCommand(compressCmd)
}

func writeWeights(weights []int, f *os.File) {
for _, w := range weights {
fmt.Fprintf(f, "%d\n", w)
}
}
49 changes: 49 additions & 0 deletions docs/api/compress.md
@@ -0,0 +1,49 @@
# Goalign: toolkit and api for alignment manipulation

## API

### compress

Remove identical patterns/sites

```go
package main

import (
"bufio"
"fmt"
"io"

"github.com/evolbioinfo/goalign/align"
"github.com/evolbioinfo/goalign/io/fasta"
"github.com/evolbioinfo/goalign/io/utils"
)

func main() {
var fi io.Closer
var r *bufio.Reader
var err error
var al align.Alignment
var weights []int

/* Get reader (plain text or gzip) */
fi, r, err = utils.GetReader("align.fa")
if err != nil {
panic(err)
}

/* Parse Fasta */
al, err = fasta.NewParser(r).Parse()
if err != nil {
panic(err)
}
fi.Close()

/* Compress */
weights = al.Compress()
fmt.Println(fasta.WriteAlignment(al))
for _, w := range weights {
fmt.Println(w)
}
}
```
57 changes: 57 additions & 0 deletions docs/commands/compress.md
@@ -0,0 +1,57 @@
# Goalign: toolkit and api for alignment manipulation

## Commands

### compress
This command removes identical patterns/sites from an input alignment

#### Usage
```
Usage:
goalign compress [flags]
Flags:
-o, --output string Compressed output alignment file (default "stdout")
--weight-out string Pattern weight output file (default "none")
Global Flags:
-i, --align string Alignment input file (default "stdin")
--auto-detect Auto detects input format (overrides -p, -x and -u)
-u, --clustal Alignment is in clustal? default fasta
--input-strict Strict phylip input format (only used with -p)
-x, --nexus Alignment is in nexus? default fasta
--no-block Write Phylip sequences without space separated blocks (only used with -p)
--one-line Write Phylip sequences on 1 line (only used with -p)
--output-strict Strict phylip output format (only used with -p)
-p, --phylip Alignment is in phylip? default fasta
```

#### Examples

```
cat > input <<EOF
4 20
1 GGGGGGGGGGGGGGGGGGGG
2 TTTTTTTTTTTTTTTTTTTT
3 GGGGGGGGGGCCCCTTTTTT
4 AAAAAAAAAAAAAAAAAAAA
EOF
```

goalign compress -i input -p --weight-out wres

should output:
```
4 3
1 GGG
2 TTT
3 CGT
4 AAA
```

and produce a weight file like:
```
4
10
6
```
1 change: 1 addition & 0 deletions docs/index.md
Expand Up @@ -61,6 +61,7 @@ Command | Subcommand |
-- | sites | Removes sequences with gaps
-- | seqs | Removes sites with gaps
[codonalign](commands/codonalign.md) ([api](api/codonalign.md))| | Adds gaps in nt sequences, according to its corresponding protein alignment
[compress](commands/compress.md) ([api](api/compress.md)) | | Removes identical patterns/sites from an input alignment
[compute](commands/compute.md) ([api](api/compute.md)) | | Different computations (distances, entropy, etc.)
-- | distance | Computes distance matrix from inpu alignment
-- | entropy | Computes entropy of sites of a given alignment
Expand Down

0 comments on commit a5879e3

Please sign in to comment.