This repository has been archived by the owner on May 11, 2022. It is now read-only.
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Basic support for incoming HTML email: extract text from HTML
Our old assumption, that every email would have a text/plain section, turns out to be wrong. Almost all email does, but occasionally I get email that only contains HTML. This is not a solid, long-term solution, just a stopgap until I can ship Scramble 2.
- Loading branch information
Showing
7 changed files
with
151 additions
and
17 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
package scramble | ||
|
||
import "testing" | ||
|
||
// TestPlainTextFromHTML checks extractTextFromHTML against a table of
// {HTML input, expected plain-text output} pairs.
//
// Per the inline HTML comments and the expected values, the cases cover:
//   - doctypes and HTML comments are stripped
//   - style tags (and their contents) are stripped
//   - divs and paragraphs each end with a newline, spans do not
//   - an anchor is rendered as "( link to <href> ) <link text>"
func TestPlainTextFromHTML(t *testing.T) { | ||
pairs := [...][2]string{ | ||
{ | ||
` | ||
<!DOCTYPE html> | ||
<!-- doctypes and comments are stripped --> | ||
<div>Hello World</div> | ||
`, | ||
`Hello World`, | ||
}, | ||
{ | ||
` | ||
<!-- style tags are stripped --> | ||
<style> | ||
body { font-size:10em } | ||
</style> | ||
<!-- divs and paragraphs get a newline, spans don't --> | ||
<div>This Is Just To Say</div> | ||
<br /> | ||
<p>I have <span>eaten</span></p> | ||
<p>the plums</p> | ||
<p>that were in</p> | ||
<p>the icebox</p> | ||
`, | ||
`This Is Just To Say | ||
I have eaten | ||
the plums | ||
that were in | ||
the icebox`, | ||
}, | ||
{` | ||
<div>Hello World</div> | ||
<div>Click here: <a href="https://google.com">Google</a></div> | ||
`, | ||
`Hello World | ||
Click here: ( link to https://google.com ) Google`, | ||
}} | ||
// pair[0] is the HTML fed to the extractor, pair[1] the exact text expected back.
for _, pair := range pairs { | ||
in, out := pair[0], pair[1] | ||
// A case fails when extraction returns an error or the text is not an exact match.
// NOTE(review): the failure message prints in, x and out but not err, so an
// error-only failure is reported without the error value.
if x, err := extractTextFromHTML(in); err != nil || x != out { | ||
t.Errorf("extractTextFromHTML on input:\n'%s'\nproduced output:\n'%s'\nshould be:\n'%s'", in, x, out) | ||
} | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters