Skip to content

Commit

Permalink
update gse test utils
Browse files Browse the repository at this point in the history
  • Loading branch information
vcaesar committed Apr 3, 2018
1 parent b378813 commit 0306de4
Show file tree
Hide file tree
Showing 2 changed files with 89 additions and 87 deletions.
162 changes: 82 additions & 80 deletions segmenter_test.go
Expand Up @@ -4,116 +4,118 @@ import (
"fmt"
"runtime"
"testing"

"github.com/vcaesar/tt"
)

// prodSeg is a package-level Segmenter shared by tests in this file;
// TestLargeDictionary loads data/dict/dictionary.txt into it.
var (
prodSeg = Segmenter{}
)

// TestGetVer prints the Go runtime version and checks that GetVersion()
// returns a value whose printed form equals the version identifier.
//
// NOTE(review): this text comes from a rendered diff, so each changed line
// appears twice: the old form (expect / plain Println) followed by the new
// form (tt.Expect / labelled Println). Confirm against the real file.
func TestGetVer(t *testing.T) {
fmt.Println(runtime.Version())
fmt.Println("go version: ", runtime.Version())
ver := GetVersion()
expect(t, version, ver)
tt.Expect(t, version, ver)
}

// TestSplit feeds sample texts in several scripts to splitTextToWords and
// compares the result, rendered by bytesToString, with a "/"-separated
// expected string. In the expected strings CJK and Hangul text is listed one
// character per piece, while runs of Latin or Cyrillic letters stay together
// as one piece; ASCII/Latin letters appear lower-cased ("GitHub" -> "github",
// "Cómo" -> "cómo").
//
// NOTE(review): this text comes from a rendered diff, so each assertion
// appears twice: the old expect(...) call followed by the new tt.Expect(...)
// call with the same arguments. Confirm against the real file.
func TestSplit(t *testing.T) {
// Chinese only.
expect(t, "中/国/有/十/三/亿/人/口/",
tt.Expect(t, "中/国/有/十/三/亿/人/口/",
bytesToString(splitTextToWords([]byte(
"中国有十三亿人口"))))

// English sentence with punctuation and spaces kept as separate pieces.
expect(t, "github/ /is/ /a/ /web/-/based/ /hosting/ /service/,/ /for/ /software/ /development/ /projects/./",
tt.Expect(t, "github/ /is/ /a/ /web/-/based/ /hosting/ /service/,/ /for/ /software/ /development/ /projects/./",
bytesToString(splitTextToWords([]byte(
"GitHub is a web-based hosting service, for software development projects."))))

// Mixed Chinese and English.
expect(t, "中/国/雅/虎/yahoo/!/ /china/致/力/于/,/领/先/的/公/益/民/生/门/户/网/站/。/",
tt.Expect(t, "中/国/雅/虎/yahoo/!/ /china/致/力/于/,/领/先/的/公/益/民/生/门/户/网/站/。/",
bytesToString(splitTextToWords([]byte(
"中国雅虎Yahoo! China致力于,领先的公益民生门户网站。"))))

// Japanese (hiragana).
expect(t, "こ/ん/に/ち/は/", bytesToString(splitTextToWords([]byte("こんにちは"))))
tt.Expect(t, "こ/ん/に/ち/は/", bytesToString(splitTextToWords([]byte("こんにちは"))))

// Korean (Hangul).
expect(t, "안/녕/하/세/요/", bytesToString(splitTextToWords([]byte("안녕하세요"))))
tt.Expect(t, "안/녕/하/세/요/", bytesToString(splitTextToWords([]byte("안녕하세요"))))

// Russian (Cyrillic); the expected string keeps the original letter case.
expect(t, "Я/ /тоже/ /рада/ /Вас/ /видеть/", bytesToString(splitTextToWords([]byte("Я тоже рада Вас видеть"))))
tt.Expect(t, "Я/ /тоже/ /рада/ /Вас/ /видеть/", bytesToString(splitTextToWords([]byte("Я тоже рада Вас видеть"))))

// Spanish.
expect(t, "¿/cómo/ /van/ /las/ /cosas/", bytesToString(splitTextToWords([]byte("¿Cómo van las cosas"))))
tt.Expect(t, "¿/cómo/ /van/ /las/ /cosas/", bytesToString(splitTextToWords([]byte("¿Cómo van las cosas"))))

// German.
expect(t, "wie/ /geht/ /es/ /ihnen/", bytesToString(splitTextToWords([]byte("Wie geht es Ihnen"))))
tt.Expect(t, "wie/ /geht/ /es/ /ihnen/", bytesToString(splitTextToWords([]byte("Wie geht es Ihnen"))))

// French.
expect(t, "je/ /suis/ /enchanté/ /de/ /cette/ /pièce/",
tt.Expect(t, "je/ /suis/ /enchanté/ /de/ /cette/ /pièce/",
bytesToString(splitTextToWords([]byte("Je suis enchanté de cette pièce"))))
}

// TestSegment loads two test dictionaries through a single comma-separated
// path string, expects 12 tokens in the resulting dictionary, segments
// "中国有十三亿人口", and checks the ToString rendering plus the start/end
// offsets of the four resulting segments.
//
// The expected offsets (0-6, 6-9, 9-18, 18-24) are consistent with UTF-8 byte
// positions at three bytes per CJK character.
//
// NOTE(review): this text comes from a rendered diff, so each assertion
// appears twice: the old expect(...) lines followed by the new tt.Expect(...)
// lines. Confirm against the real file.
func TestSegment(t *testing.T) {
var seg Segmenter
seg.LoadDict("testdata/test_dict1.txt,testdata/test_dict2.txt")
// seg.LoadDict("testdata/test_dict1.txt", "testdata/test_dict2.txt")
expect(t, "12", seg.dict.NumTokens())
// expect(t, "5", seg.dict.NumTokens())
tt.Expect(t, "12", seg.dict.NumTokens())
// tt.Expect(t, "5", seg.dict.NumTokens())
segments := seg.Segment([]byte("中国有十三亿人口"))
expect(t, "中国/ 有/p3 十三亿/ 人口/p12 ", ToString(segments, false))
// expect(t, "中国/ 有/x 十三亿/ 人口/p12 ", ToString(segments, false))
expect(t, "4", len(segments))
expect(t, "0", segments[0].start)
expect(t, "6", segments[0].end)
expect(t, "6", segments[1].start)
expect(t, "9", segments[1].end)
expect(t, "9", segments[2].start)
expect(t, "18", segments[2].end)
expect(t, "18", segments[3].start)
expect(t, "24", segments[3].end)
tt.Expect(t, "中国/ 有/p3 十三亿/ 人口/p12 ", ToString(segments, false))
// tt.Expect(t, "中国/ 有/x 十三亿/ 人口/p12 ", ToString(segments, false))
tt.Expect(t, "4", len(segments))
tt.Expect(t, "0", segments[0].start)
tt.Expect(t, "6", segments[0].end)
tt.Expect(t, "6", segments[1].start)
tt.Expect(t, "9", segments[1].end)
tt.Expect(t, "9", segments[2].start)
tt.Expect(t, "18", segments[2].end)
tt.Expect(t, "18", segments[3].start)
tt.Expect(t, "24", segments[3].end)
}

// TestSegmentS loads testdata/test_dict.txt and checks the dictionary
// statistics (maxTokenLen 4, totalFrequency 2103, 19 tokens), then segments
// two Chinese sample texts and verifies the ToString rendering, the segment
// count, and the start/end offsets of the leading segments. The first text is
// also run through ModeSegment with its second argument set to true, with the
// same expected rendering as Segment.
//
// NOTE(review): this text comes from a rendered diff, so each assertion
// appears twice: the old expect(...) lines followed by the new tt.Expect(...)
// lines. Confirm against the real file.
func TestSegmentS(t *testing.T) {
var seg Segmenter
seg.LoadDict("testdata/test_dict.txt")

// Dictionary-level statistics read from the loaded dictionary.
dict := seg.Dictionary()
expect(t, "4", dict.maxTokenLen)
expect(t, "2103", dict.totalFrequency)
tt.Expect(t, "4", dict.maxTokenLen)
tt.Expect(t, "2103", dict.totalFrequency)

expect(t, "19", seg.dict.NumTokens())
tt.Expect(t, "19", seg.dict.NumTokens())
text1 := []byte("深圳地王大厦")
segments := seg.Segment([]byte(text1))
expect(t, "深圳/n 地王大厦/n ", ToString(segments, false))
tt.Expect(t, "深圳/n 地王大厦/n ", ToString(segments, false))

segs := seg.ModeSegment([]byte(text1), true)
expect(t, "深圳/n 地王大厦/n ", ToString(segs, false))
tt.Expect(t, "深圳/n 地王大厦/n ", ToString(segs, false))

// Offsets below are consistent with UTF-8 byte positions (3 bytes per character).
expect(t, "2", len(segments))
expect(t, "0", segments[0].start)
expect(t, "6", segments[0].end)
expect(t, "6", segments[1].start)
expect(t, "18", segments[1].end)
tt.Expect(t, "2", len(segments))
tt.Expect(t, "0", segments[0].start)
tt.Expect(t, "6", segments[0].end)
tt.Expect(t, "6", segments[1].start)
tt.Expect(t, "18", segments[1].end)

text2 := []byte("留给真爱你的人")
segments2 := seg.Segment([]byte(text2))
expect(t, "留给/v 真爱/nr 你/x 的/x 人/x ", ToString(segments2, false))
tt.Expect(t, "留给/v 真爱/nr 你/x 的/x 人/x ", ToString(segments2, false))

// Only the first two of the five segments have their offsets checked.
expect(t, "5", len(segments2))
expect(t, "0", segments2[0].start)
expect(t, "6", segments2[0].end)
expect(t, "6", segments2[1].start)
expect(t, "12", segments2[1].end)
tt.Expect(t, "5", len(segments2))
tt.Expect(t, "0", segments2[0].start)
tt.Expect(t, "6", segments2[0].end)
tt.Expect(t, "6", segments2[1].start)
tt.Expect(t, "12", segments2[1].end)
}

// TestSegmentJp loads data/dict/jp/dict.txt, segments "こんにちは世界", and
// checks the ToString and ToSlice renderings, the segment count, and the
// offsets of the first segment. With the second argument set to true, the
// expected ToString/ToSlice output additionally lists the shorter entry
// "こん" ahead of "こんにちは".
//
// NOTE(review): this text comes from a rendered diff, so each assertion
// appears twice: the old expect(...) lines followed by the new tt.Expect(...)
// lines. Confirm against the real file.
func TestSegmentJp(t *testing.T) {
var seg Segmenter
seg.LoadDict("data/dict/jp/dict.txt")
text2 := []byte("こんにちは世界")
segments := seg.Segment([]byte(text2))
expect(t, "こんにちは/感動詞 世界/名詞 ", ToString(segments, false))
expect(t, "2", len(segments))
expect(t, "こん/名詞 こんにちは/感動詞 世界/名詞 ", ToString(segments, true))
expect(t, "[こん こんにちは 世界]", ToSlice(segments, true))
expect(t, "[こんにちは 世界]", ToSlice(segments, false))
expect(t, "2", len(segments))
// 0-15 is consistent with UTF-8 byte offsets for five 3-byte characters.
expect(t, "0", segments[0].start)
expect(t, "15", segments[0].end)
tt.Expect(t, "こんにちは/感動詞 世界/名詞 ", ToString(segments, false))
tt.Expect(t, "2", len(segments))
tt.Expect(t, "こん/名詞 こんにちは/感動詞 世界/名詞 ", ToString(segments, true))
tt.Expect(t, "[こん こんにちは 世界]", ToSlice(segments, true))
tt.Expect(t, "[こんにちは 世界]", ToSlice(segments, false))
tt.Expect(t, "2", len(segments))
tt.Expect(t, "0", segments[0].start)
tt.Expect(t, "15", segments[0].end)
}

func TestDictPaths(t *testing.T) {
paths := DictPaths("./dictDir", "zh,jp")
expect(t, "2", len(paths))
tt.Expect(t, "2", len(paths))
if paths[0] != "dictDir/dict/dictionary.txt" {
t.Errorf("what=\"%s\", got=\"%s\"", "dictDir/dict/dictionary.txt", paths[0])
}
Expand All @@ -129,75 +131,75 @@ func TestSegmentDicts(t *testing.T) {

text1 := []byte("深圳地王大厦")
segments := seg.Segment([]byte(text1))
expect(t, "深圳/ns 地王大厦/n ", ToString(segments, false))
tt.Expect(t, "深圳/ns 地王大厦/n ", ToString(segments, false))

expect(t, "2", len(segments))
expect(t, "0", segments[0].start)
expect(t, "6", segments[0].end)
expect(t, "6", segments[1].start)
expect(t, "18", segments[1].end)
tt.Expect(t, "2", len(segments))
tt.Expect(t, "0", segments[0].start)
tt.Expect(t, "6", segments[0].end)
tt.Expect(t, "6", segments[1].start)
tt.Expect(t, "18", segments[1].end)

text2 := []byte("こんにちは世界")
segments = seg.Segment([]byte(text2))
expect(t, "こんにちは/感動詞 世界/n ", ToString(segments, false))
expect(t, "2", len(segments))
expect(t, "こん/名詞 こんにちは/感動詞 世界/n ", ToString(segments, true))
expect(t, "2", len(segments))
expect(t, "0", segments[0].start)
expect(t, "15", segments[0].end)
tt.Expect(t, "こんにちは/感動詞 世界/n ", ToString(segments, false))
tt.Expect(t, "2", len(segments))
tt.Expect(t, "こん/名詞 こんにちは/感動詞 世界/n ", ToString(segments, true))
tt.Expect(t, "2", len(segments))
tt.Expect(t, "0", segments[0].start)
tt.Expect(t, "15", segments[0].end)

expect(t, "0", segments[0].Start())
expect(t, "15", segments[0].End())
tt.Expect(t, "0", segments[0].Start())
tt.Expect(t, "15", segments[0].End())

token := segments[0].Token()
expect(t, "こんにちは", token.Text())
expect(t, "5704", token.Frequency())
expect(t, "感動詞", token.Pos())
tt.Expect(t, "こんにちは", token.Text())
tt.Expect(t, "5704", token.Frequency())
tt.Expect(t, "感動詞", token.Pos())

tseg := token.Segments()
expect(t, "0", tseg[0].Start())
expect(t, "6", tseg[0].End())
tt.Expect(t, "0", tseg[0].Start())
tt.Expect(t, "6", tseg[0].End())
}

// TestLargeDictionary loads data/dict/dictionary.txt into the shared prodSeg
// segmenter and checks the ToString rendering of Segment and internalSegment
// for two Chinese inputs. For the longer input, internalSegment called with
// true is expected to yield two segments and with false a single segment;
// ToString with its second argument set to true is expected to list the
// nested shorter entries as well.
//
// NOTE(review): this text comes from a rendered diff, so the opening line of
// each assertion appears twice: the old expect(...) form followed by the new
// tt.Expect(...) form. Confirm against the real file.
func TestLargeDictionary(t *testing.T) {
prodSeg.LoadDict("data/dict/dictionary.txt")
expect(t, "中国/ns 人口/n ", ToString(prodSeg.Segment(
tt.Expect(t, "中国/ns 人口/n ", ToString(prodSeg.Segment(
[]byte("中国人口")), false))

expect(t, "中国/ns 人口/n ", ToString(prodSeg.internalSegment(
tt.Expect(t, "中国/ns 人口/n ", ToString(prodSeg.internalSegment(
[]byte("中国人口"), false), false))

expect(t, "中国/ns 人口/n ", ToString(prodSeg.internalSegment(
tt.Expect(t, "中国/ns 人口/n ", ToString(prodSeg.internalSegment(
[]byte("中国人口"), true), false))

expect(t, "中华人民共和国/ns 中央人民政府/nt ", ToString(prodSeg.internalSegment(
tt.Expect(t, "中华人民共和国/ns 中央人民政府/nt ", ToString(prodSeg.internalSegment(
[]byte("中华人民共和国中央人民政府"), true), false))

expect(t, "中华人民共和国中央人民政府/nt ", ToString(prodSeg.internalSegment(
tt.Expect(t, "中华人民共和国中央人民政府/nt ", ToString(prodSeg.internalSegment(
[]byte("中华人民共和国中央人民政府"), false), false))

expect(t, "中华/nz 人民/n 共和/nz 共和国/ns 人民共和国/nt 中华人民共和国/ns 中央/n 人民/n 政府/n 人民政府/nt 中央人民政府/nt 中华人民共和国中央人民政府/nt ", ToString(prodSeg.Segment(
tt.Expect(t, "中华/nz 人民/n 共和/nz 共和国/ns 人民共和国/nt 中华人民共和国/ns 中央/n 人民/n 政府/n 人民政府/nt 中央人民政府/nt 中华人民共和国中央人民政府/nt ", ToString(prodSeg.Segment(
[]byte("中华人民共和国中央人民政府")), true))
}

// func TestLoadDictionary(t *testing.T) {
// var seg Segmenter
// seg.LoadDict()
// expect(t, "中国/ns 人口/n ", ToString(prodSeg.Segment(
// tt.Expect(t, "中国/ns 人口/n ", ToString(prodSeg.Segment(
// []byte("中国人口")), false))

// expect(t, "中国/ns 人口/n ", ToString(prodSeg.internalSegment(
// tt.Expect(t, "中国/ns 人口/n ", ToString(prodSeg.internalSegment(
// []byte("中国人口"), false), false))

// expect(t, "中国/ns 人口/n ", ToString(prodSeg.internalSegment(
// tt.Expect(t, "中国/ns 人口/n ", ToString(prodSeg.internalSegment(
// []byte("中国人口"), true), false))

// expect(t, "中华人民共和国/ns 中央人民政府/nt ", ToString(prodSeg.internalSegment(
// tt.Expect(t, "中华人民共和国/ns 中央人民政府/nt ", ToString(prodSeg.internalSegment(
// []byte("中华人民共和国中央人民政府"), true), false))

// expect(t, "中华人民共和国中央人民政府/nt ", ToString(prodSeg.internalSegment(
// tt.Expect(t, "中华人民共和国中央人民政府/nt ", ToString(prodSeg.internalSegment(
// []byte("中华人民共和国中央人民政府"), false), false))

// expect(t, "中华/nz 人民/n 共和/nz 共和国/ns 人民共和国/nt 中华人民共和国/ns 中央/n 人民/n 政府/n 人民政府/nt 中央人民政府/nt 中华人民共和国中央人民政府/nt ", ToString(prodSeg.Segment(
// tt.Expect(t, "中华/nz 人民/n 共和/nz 共和国/ns 人民共和国/nt 中华人民共和国/ns 中央/n 人民/n 政府/n 人民政府/nt 中央人民政府/nt 中华人民共和国中央人民政府/nt ", ToString(prodSeg.Segment(
// []byte("中华人民共和国中央人民政府")), true))
// }
14 changes: 7 additions & 7 deletions test_utils.go
Expand Up @@ -5,13 +5,6 @@ import (
"testing"
)

// expect reports a test error when the fmt.Sprint rendering of actual differs
// from the expected string.
//
// Parameters:
//   - t: the running test; a mismatch is reported with t.Errorf, so the test
//     is marked failed but keeps executing.
//   - expect: the expected textual form.
//   - actual: any value; it is formatted with fmt.Sprint before comparison.
func expect(t *testing.T, expect string, actual interface{}) {
	// Mark this function as a helper so a failure is attributed to the
	// calling assertion's line rather than to the t.Errorf line below.
	t.Helper()

	if got := fmt.Sprint(actual); got != expect {
		t.Errorf("期待值=\"%s\", 实际=\"%s\"", expect, got)
	}
}

func printTokens(tokens []*Token, numTokens int) (output string) {
for iToken := 0; iToken < numTokens; iToken++ {
for _, word := range tokens[iToken].text {
Expand All @@ -36,3 +29,10 @@ func bytesToString(bytes []Text) (output string) {
}
return
}

// expect reports a test error when the fmt.Sprint rendering of actual differs
// from the expected string.
//
// Parameters:
//   - t: the running test; a mismatch is reported with t.Errorf, so the test
//     is marked failed but keeps executing.
//   - expect: the expected textual form.
//   - actual: any value; it is formatted with fmt.Sprint before comparison.
func expect(t *testing.T, expect string, actual interface{}) {
	// Mark this function as a helper so a failure is attributed to the
	// calling assertion's line rather than to the t.Errorf line below.
	t.Helper()

	if got := fmt.Sprint(actual); got != expect {
		t.Errorf("期待值=\"%s\", 实际=\"%s\"", expect, got)
	}
}

0 comments on commit 0306de4

Please sign in to comment.