Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Change behavior to output link after inner a tag text #1

Merged
merged 5 commits into from
Oct 27, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
name: Run Tests

on:
- push

jobs:

build:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3

- name: Set up Go
uses: actions/setup-go@v3
with:
go-version: 1.19

- name: Test
run: go test -v .
24 changes: 18 additions & 6 deletions html2text.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ const (
)

var lbr = WIN_LBR
var badTagnamesRE = regexp.MustCompile(`^(head|script|style|a)($|\s+)`)
var badTagnamesRE = regexp.MustCompile(`^(head|script|style)($|\s+)`)
var linkTagRE = regexp.MustCompile(`a.*href=('([^']*?)'|"([^"]*?)")`)
var badLinkHrefRE = regexp.MustCompile(`javascript:`)
var headersRE = regexp.MustCompile(`^(\/)?h[1-6]`)
Expand Down Expand Up @@ -118,6 +118,8 @@ func HTML2Text(html string) string {
inEnt := false
badTagStackDepth := 0 // if == 1 it means we are inside <head>...</head>
shouldOutput := true
// maintain a stack of <a> tag href links and output it after the tag's inner text
hrefs := []string{}
// new line cannot be printed at the beginning or
// for <p> after a new line created by previous <p></p>
canPrintNewline := false
Expand Down Expand Up @@ -199,11 +201,18 @@ func HTML2Text(html string) string {
outBuf.WriteString(lbr + lbr)
}
canPrintNewline = false
} else if badTagnamesRE.MatchString(tagNameLowercase) {
// unwanted block
badTagStackDepth++

} else if tagNameLowercase == "/a" {
// end of link
// links can be empty can happen if the link matches the badLinkHrefRE
if len(hrefs) > 0 {
outBuf.WriteString(" <")
outBuf.WriteString(HTMLEntitiesToText(hrefs[0]))
outBuf.WriteString(">")
hrefs = hrefs[1:]
}
} else if linkTagRE.MatchString(tagNameLowercase) {
// parse link href
// add special handling for a tags
m := linkTagRE.FindStringSubmatch(tag)
if len(m) == 4 {
link := m[2]
Expand All @@ -212,9 +221,12 @@ func HTML2Text(html string) string {
}

if !badLinkHrefRE.MatchString(link) {
outBuf.WriteString(HTMLEntitiesToText(link))
hrefs = append(hrefs, link)
}
}
} else if badTagnamesRE.MatchString(tagNameLowercase) {
// unwanted block
badTagStackDepth++
} else if len(tagNameLowercase) > 0 && tagNameLowercase[0] == '/' &&
badTagnamesRE.MatchString(tagNameLowercase[1:]) {
// end of unwanted block
Expand Down
15 changes: 8 additions & 7 deletions html2text_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,14 @@ func TestHTML2Text(t *testing.T) {
Convey("Links", func() {
So(HTML2Text(`<div></div>`), ShouldEqual, "")
So(HTML2Text(`<div>simple text</div>`), ShouldEqual, "simple text")
So(HTML2Text(`click <a href="test">here</a>`), ShouldEqual, "click test")
So(HTML2Text(`click <a class="x" href="test">here</a>`), ShouldEqual, "click test")
So(HTML2Text(`click <a href="ents/&apos;x&apos;">here</a>`), ShouldEqual, "click ents/'x'")
So(HTML2Text(`click <a href="javascript:void(0)">here</a>`), ShouldEqual, "click ")
So(HTML2Text(`click <a href="test"><span>here</span> or here</a>`), ShouldEqual, "click test")
So(HTML2Text(`click <a href="http://bit.ly/2n4wXRs">news</a>`), ShouldEqual, "click http://bit.ly/2n4wXRs")
So(HTML2Text(`<a rel="mw:WikiLink" href="/wiki/yet#English" title="yet">yet</a>, <a rel="mw:WikiLink" href="/wiki/not_yet#English" title="not yet">not yet</a>`), ShouldEqual, "/wiki/yet#English, /wiki/not_yet#English")
So(HTML2Text(`click <a href="test">here</a>`), ShouldEqual, "click here <test>")
So(HTML2Text(`click <a class="x" href="test">here</a>`), ShouldEqual, "click here <test>")
So(HTML2Text(`click <a href="ents/&apos;x&apos;">here</a>`), ShouldEqual, "click here <ents/'x'>")
So(HTML2Text(`click <a href="javascript:void(0)">here</a>`), ShouldEqual, "click here")
So(HTML2Text(`click <a href="test"><span>here</span> or here</a>`), ShouldEqual, "click here or here <test>")
So(HTML2Text(`click <a href="http://bit.ly/2n4wXRs">news</a>`), ShouldEqual, "click news <http://bit.ly/2n4wXRs>")
So(HTML2Text(`<a rel="mw:WikiLink" href="/wiki/yet#English" title="yet">yet</a>, <a rel="mw:WikiLink" href="/wiki/not_yet#English" title="not yet">not yet</a>`), ShouldEqual, "yet </wiki/yet#English>, not yet </wiki/not_yet#English>")
So(HTML2Text(`click <a href="one">here<a href="two"> or</a><span> here</span></a>`), ShouldEqual, "click here or <one> here <two>")
})

Convey("Inlines", func() {
Expand Down