Permalink
Browse files

simplify

  • Loading branch information...
client9 committed Aug 13, 2016
1 parent 4b25821 commit b2043629b216d171d0e175a3a1fa683204f96353
Showing with 39 additions and 61 deletions.
  1. +3 −0 falsepositives_test.go
  2. +24 −46 notwords.go
  3. +4 −2 notwords_test.go
  4. +5 −0 reddit/findtypo.go
  5. +1 −1 replace.go
  6. +2 −12 url.go
View
@@ -6,6 +6,9 @@ import (
func TestFalsePositives(t *testing.T) {
cases := []string{
"infinitie.net",
"foo summaries\n",
"thru",
"publically",
"6YUO5", // base64
"cleaner", // triggered by "cleane->cleanser" and partial word FP
View
@@ -2,55 +2,12 @@ package misspell
import (
"bytes"
"regexp"
"strings"
)
// Functions to remove non-words such as URLs, file paths, etc.
// This needs auditing as I believe it is wrong
// enURLChar reports whether c belongs to the set of characters this package
// treats as part of a URL: ASCII letters, ASCII digits, and the punctuation
// listed in the switch below.
func enURLChar(c rune) bool {
	// Alphanumerics first: these are the common case.
	switch {
	case 'a' <= c && c <= 'z':
		return true
	case 'A' <= c && c <= 'Z':
		return true
	case '0' <= c && c <= '9':
		return true
	}

	// Punctuation accepted inside a URL-like token.
	switch c {
	case '-', '_', '\\', '.', ':', ';', '/', '~', '%', '*', '$', '[', ']', '?', '#', '!':
		return true
	}
	return false
}
// enNotURLChar reports whether c is rejected by enURLChar. It has the
// func(rune) bool shape expected by strings.IndexFunc.
func enNotURLChar(c rune) bool {
	isURLChar := enURLChar(c)
	return !isURLChar
}
// RemoveURL attempts to strip away obvious URLs.
//
// It repeatedly looks for the substring "http" and drops everything from
// there up to (but not including) the first character rejected by
// enURLChar, inserting a single space in its place. If no such character
// follows, the remainder of the string is dropped. The result is returned
// once no "http" is left.
func RemoveURL(s string) string {
	for {
		start := strings.Index(s, "http")
		if start < 0 {
			return s
		}

		head, tail := s[:start], s[start:]
		stop := strings.IndexFunc(tail, enNotURLChar)
		if stop < 0 {
			// The URL-like run reaches the end of the string: keep only
			// what came before it.
			s = head
			continue
		}
		s = head + " " + tail[stop:]
	}
}
// reEmail matches email-like strings: one or more of letters, digits and
// "_.%+-", then "@", then one or more of letters, digits, "-" and ".",
// then a "." followed by 2 to 6 letters.
var reEmail = regexp.MustCompile(`[a-zA-Z0-9_.%+-]+@[a-zA-Z0-9-.]+\.[a-zA-Z]{2,6}`)
// reHost matches host-like strings: one or more of letters, digits, "-"
// and ".", then a "." followed by 2 to 6 letters.
var reHost = regexp.MustCompile(`[a-zA-Z0-9-.]+\.[a-zA-Z]{2,6}`)
// RemovePath attempts to strip away embedded file system paths, e.g.
// /foo/bar or /static/myimg.png
@@ -96,3 +53,24 @@ func RemovePath(s string) string {
}
return out.String()
}
// replaceWithBlanks returns a string made only of spaces whose length in
// bytes equals len(s), so byte offsets into the result line up with
// offsets into the input.
func replaceWithBlanks(s string) string {
	width := len(s) // counted in bytes, not runes
	if width == 0 {
		return ""
	}
	return strings.Repeat(" ", width)
}
// RemoveEmail blanks out email-like strings, e.g. "nickg+junk@xfoobar.com"
// or "nickg@xyz.abc123.biz". Each match of reEmail is replaced with the
// output of replaceWithBlanks.
func RemoveEmail(s string) string {
	blanked := reEmail.ReplaceAllStringFunc(s, replaceWithBlanks)
	return blanked
}
// RemoveHost blanks out host-like strings, e.g. "foobar.com" or
// "abc123.fo1231.biz". Each match of reHost is replaced with the output of
// replaceWithBlanks.
func RemoveHost(s string) string {
	blanked := reHost.ReplaceAllStringFunc(s, replaceWithBlanks)
	return blanked
}
// RemoveNotWords blanks out all the not words
func RemoveNotWords(s string) string {
// do most selective/specific first
return RemoveHost(RemoveEmail(RemovePath(StripURL(s))))
}
View
@@ -4,7 +4,7 @@ import (
"testing"
)
func TestRemovePath(t *testing.T) {
func TestNotWords(t *testing.T) {
cases := []struct {
word string
want string
@@ -13,9 +13,11 @@ func TestRemovePath(t *testing.T) {
{"X/foo/bar abc", "X/foo/bar abc"},
{"[/foo/bar] abc", "[ ] abc"},
{"/", "/"},
{"x nickg@client9.xxx y", "x y"},
{"x infinitie.net y", "x y"},
}
for pos, tt := range cases {
got := RemovePath(tt.word)
got := RemoveNotWords(tt.word)
if got != tt.want {
t.Errorf("%d want %q got %q", pos, tt.want, got)
}
View
@@ -433,6 +433,11 @@ var badWord = map[string]bool{
}
var badTypo = map[string]bool{
"summaries": true, // correct!
"encompase": true, // should be corrected to encompass
"transitionend": true, // JS event
"responders": true,
"parenthesised": true, // word
"wraping": true, // wrapping or warping"
"deleters": true,
"invalide": true, // French
View
@@ -40,7 +40,7 @@ func recheckLine(s string, rep *strings.Replacer, corrected map[string]bool) (st
diffs := []Diff{}
out := ""
first := 0
redacted := RemovePath(StripURL(s))
redacted := RemoveNotWords(s)
idx := wordRegexp.FindAllStringIndex(redacted, -1)
for _, ab := range idx {
View
14 url.go
@@ -8,20 +8,10 @@ import (
//
// original @imme_emosol (54 chars) has trouble with dashes in hostname
// @(https?|ftp)://(-\.)?([^\s/?\.#-]+\.?)+(/[^\s]*)?$@iS
var urlRE = regexp.MustCompile(`(?i)(https?|ftp)://(-\.)?([^\s/?\.#]+\.?)+(/[^\s]*)?`)
var reURL = regexp.MustCompile(`(?i)(https?|ftp)://(-\.)?([^\s/?\.#]+\.?)+(/[^\s]*)?`)
// StripURL attemps to replace URLs with blank spaces, e.g.
// "xxx http://foo.com/ yyy -> "xxx yyyy"
func StripURL(s string) string {
out := []byte(s)
matches := urlRE.FindAllIndex(out, -1)
if len(matches) == 0 {
return s
}
for _, idx := range matches {
for j := idx[0]; j < idx[1]; j++ {
out[j] = ' '
}
}
return string(out)
return reURL.ReplaceAllStringFunc(s, replaceWithBlanks)
}

0 comments on commit b204362

Please sign in to comment.