Skip to content

Commit

Permalink
add tsv and simple names list as input (close #9)
Browse files Browse the repository at this point in the history
  • Loading branch information
dimus committed Dec 28, 2021
1 parent 9df2204 commit 7bcfaab
Show file tree
Hide file tree
Showing 11 changed files with 212 additions and 11,834 deletions.
6 changes: 5 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
# Changelog

## [v0.1.1]

- Add [#9]: accept TSV files and simple lists of names as input.

## [v0.1.0]

- Add [#8]: make command line application.
Expand All @@ -15,7 +19,7 @@

This document follows [changelog guidelines]

[v0.2.0]: https://github.com/gnames/gndiff/compare/v0.1.0...v0.2.0
[v0.1.1]: https://github.com/gnames/gndiff/compare/v0.1.0...v0.1.1
[v0.1.0]: https://github.com/gnames/gndiff/tree/v0.1.0

[#20]: https://github.com/gnames/gndiff/issues/20
Expand Down
85 changes: 81 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,16 +1,93 @@
# `gndiff` app takes two files with scientific names, compares them and returns the result.

## Introduction

It is often useful to compare one check-list to another.

## Usage

### Command Line App
If you need to compare a list of names with a data-set that is imported as a
[GNverifier data-source] use [GNverifier] instead like this:

```bash
gnverifier names.txt -o -s 10
```

where the `-s` flag provides the ID of the required [GNverifier data-source].

### Compare Files

Prepare two files with names. There are 3 possible file formats:

* A simple list of scientific names, one name per line
* Comma-separated or Tab-separated (CSV/TSV) file with a `scientificName` field.

Prepare 2 CSV files with a `scientificName` field. One file contains names
that need to be matched, second file contains reference names.
The first of the two files should contain names that need to be matched.
The second file should contain reference names.

Run
Run command:

```bash
gndiff source.csv reference.csv
```

### Options and flags

According to the POSIX standard, flags and options can be given either before
or after a name-string or file name.

#### help

```bash
gndiff -h
# or
gndiff --help
# or
gndiff
```

#### version

```bash
gndiff -V
# or
gndiff --version
```

#### format

Sets the format of the comparison result and can take the following values:

* `csv`: Comma-separated format
* `tsv`: Tab-separated format
* `compact`: JSON as one line
* `pretty`: JSON in a human-readable format with indentation and line separation.

The default format is CSV.

```bash
gndiff source.txt ref.txt -f pretty
# or
gndiff source.txt ref.txt --format=pretty
```

#### quiet

This flag suppresses the warnings log, showing only the matching results.

```bash
gndiff source.txt ref.txt -q
# or
gndiff source.txt ref.txt --quiet
```

Please note that the matching result uses `STDOUT`, while the log uses `STDERR`,
so a similar result can be achieved by redirecting `STDERR` to `/dev/null`:

```bash
gndiff source.txt ref.txt 2> /dev/null
```


[GNverifier]: https://github.com/gnames/gnverifier
[GNverifier data-source]: https://verifier.globalnames.org/data_sources
20 changes: 19 additions & 1 deletion ent/output/output.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import (

"github.com/gnames/gndiff/ent/record"
"github.com/gnames/gnfmt"
"github.com/gnames/gnlib/ent/verifier"
)

type Output struct {
Expand Down Expand Up @@ -84,9 +85,26 @@ func csvRow(m Match, sep rune) []string {
}
res = append(res, gnfmt.ToCSV(row, sep))
}
if len(r) == 0 {
row := []string{
s.DataSet,
strconv.Itoa(s.Index),
s.ID,
s.Name,
"",
verifier.NoMatch.String(),
"",
"",
"",
"",
}
res = append(res, gnfmt.ToCSV(row, sep))
}
return res
}

// jsonOutput serializes o to JSON using gnfmt.GNjson and returns the result
// as a string. When pretty is true the encoder is configured with
// Pretty: true; otherwise the encoder's default layout is used.
//
// The signature has no error return, so an encoding failure is reported as an
// empty string.
func jsonOutput(o Output, pretty bool) string {
	enc := gnfmt.GNjson{Pretty: pretty}
	res, err := enc.Encode(o)
	if err != nil {
		return ""
	}
	return string(res)
}
8 changes: 4 additions & 4 deletions ent/record/record.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,10 @@ import (
type Record struct {
DataSet string `json:"dataSet"`
Index int `json:"index"`
EditDistance int `json:"editDistance"`
ID string `json:"id"`
EditDistance int `json:"editDistance,omitempty"`
ID string `json:"id,omitempty"`
Name string `json:"name"`
Family string `json:"family"`
MatchType verifier.MatchTypeValue `json:"matchType"`
Family string `json:"family,omitempty"`
MatchType verifier.MatchTypeValue `json:"matchType,omitempty"`
parsed.Parsed `json:"-"`
}
10 changes: 8 additions & 2 deletions gndiff/cmd/root.go
Original file line number Diff line number Diff line change
Expand Up @@ -45,8 +45,14 @@ var rootCmd = &cobra.Command{
Use: "gndiff source_file reference_file",
Short: "Compares two files with scientific names.",
Long: `
Extracts scientific names, their IDs and families from a CSV/TSV file and
prints out a match of a reference data to the source data.
Extracts scientific names, their IDs and families the source and reference
files and prints out a match of a reference data to the source data.
The files can be in comma-separated (CSV), tab-separated (TSV) formats, or
just contain name-strings only (one per line).
TSV/CSV files must contain 'ScientificName' field, 'Family' and 'TaxonID'
fields are also ingested.
`,
// Uncomment the following line if your bare application
// has an action associated with it:
Expand Down
54 changes: 53 additions & 1 deletion io/ingestio/ingestio.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package ingestio

import (
"bufio"
"encoding/csv"
"errors"
"fmt"
Expand Down Expand Up @@ -39,13 +40,28 @@ func (ing ingestio) Records(path string) ([]record.Record, error) {
return nil, err
}
defer f.Close()

records, sep, err := tryNamesOnly(f, fileName)
if err != nil {
return nil, err
}

if records != nil {
return records, nil
}

if sep == rune(0) {
return nil, errors.New("cannot determine field separator")
}

r := csv.NewReader(f)
r.Comma = sep

// skip header
header, err := r.Read()
if err != nil {
return nil, err
}

name, id, family, valid := readHeader(header)
if !valid {
return nil, errors.New("the CSV file needs `scientifiName` field")
Expand Down Expand Up @@ -112,3 +128,39 @@ func parse(recs []record.Record) []record.Record {
}
return res
}

// tryNamesOnly reads f line by line and decides whether it is a plain list of
// names or a delimited file.
//
// The first line is handed to fileSep. If that detects a separator, f is
// rewound to its beginning and the separator is returned together with nil
// records, so the caller can re-read the file as delimited data. Otherwise
// every line becomes a record.Record whose DataSet is fileName, whose Index is
// the zero-based line number and whose Name is the line text; the collected
// records are passed through parse and returned with a zero separator.
//
// A non-nil error is returned when rewinding f or scanning it fails; the
// returned records are nil in that case.
func tryNamesOnly(f *os.File, fileName string) ([]record.Record, rune, error) {
	var res []record.Record
	scanner := bufio.NewScanner(f)

	for count := 0; scanner.Scan(); count++ {
		line := scanner.Text()
		if count == 0 {
			if sep := fileSep(line); sep != rune(0) {
				// The scanner reads the file in buffered chunks, so the file
				// offset is no longer at the start. Rewind, and report a
				// failure instead of letting the caller read from mid-file.
				if _, err := f.Seek(0, io.SeekStart); err != nil {
					return nil, rune(0), err
				}
				return nil, sep, nil
			}
		}

		res = append(res, record.Record{DataSet: fileName, Index: count, Name: line})
	}

	if err := scanner.Err(); err != nil {
		return nil, rune(0), err
	}
	return parse(res), rune(0), nil
}

// fileSep guesses the field separator from a single line of text.
//
// A line containing a tab yields '\t'. A line with no tab yields ',' only
// when it contains a comma and no space. In every other case the zero rune is
// returned, meaning no separator was detected.
func fileSep(s string) rune {
	hasTab := strings.ContainsRune(s, '\t')
	hasComma := strings.ContainsRune(s, ',')
	hasSpace := strings.ContainsRune(s, ' ')

	switch {
	case hasTab:
		return '\t'
	case hasComma && !hasSpace:
		return ','
	default:
		return rune(0)
	}
}
22 changes: 22 additions & 0 deletions io/ingestio/ingestio_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,3 +42,25 @@ func TestRecords(t *testing.T) {
assert.True(t, rec[9].Parsed.Parsed)
assert.Equal(t, "Rhea american nobil", rec[9].Canonical.Stemmed)
}

// TestTSV checks that Records ingests the tab-separated fixture
// "ioc-bird.tsv" and that the third record carries the expected name.
func TestTSV(t *testing.T) {
	cfg := config.New()
	ing := ingestio.New(cfg)

	p := filepath.Join(path, "ioc-bird.tsv")
	rec, err := ing.Records(p)
	assert.Nil(t, err)
	// assert.True does not stop the test, so bail out explicitly: indexing
	// rec[2] on a short slice would panic instead of failing cleanly.
	if !assert.True(t, len(rec) > 2) {
		return
	}
	assert.Equal(t, "Rhea americana (Linnaeus, 1758)", rec[2].Name)
}

// TestNamesList checks that Records ingests the plain list of names in the
// fixture "names.txt" and that the third record carries the expected name.
func TestNamesList(t *testing.T) {
	cfg := config.New()
	ing := ingestio.New(cfg)

	p := filepath.Join(path, "names.txt")
	rec, err := ing.Records(p)
	assert.Nil(t, err)
	// assert.True does not stop the test, so bail out explicitly: indexing
	// rec[2] on a short slice would panic instead of failing cleanly.
	if !assert.True(t, len(rec) > 2) {
		return
	}
	assert.Equal(t, "Rhea americana (Linnaeus, 1758)", rec[2].Name)
}
Loading

0 comments on commit 7bcfaab

Please sign in to comment.