Skip to content
This repository has been archived by the owner on May 11, 2022. It is now read-only.

Commit

Permalink
Basic support for incoming HTML email: extract text from HTML
Browse files Browse the repository at this point in the history
Our old assumption, that every email would have a text/plain section, turns out to be wrong.

Almost all email does, but occasionally I get email that only contains HTML.

This is not a solid, long-term solution, just a stopgap until I can ship Scramble 2.
  • Loading branch information
dcposch committed Nov 22, 2015
1 parent b5f5cf1 commit 33275fa
Show file tree
Hide file tree
Showing 7 changed files with 151 additions and 17 deletions.
6 changes: 3 additions & 3 deletions src/scramble/crypto.go
Expand Up @@ -2,11 +2,11 @@ package scramble

import (
"bytes"
"code.google.com/p/go.crypto/openpgp"
"code.google.com/p/go.crypto/openpgp/armor"
"code.google.com/p/go.crypto/openpgp/packet"
"crypto/sha1"
"encoding/base32"
"golang.org/x/crypto/openpgp"
"golang.org/x/crypto/openpgp/armor"
"golang.org/x/crypto/openpgp/packet"
"io"
"strings"
)
Expand Down
14 changes: 7 additions & 7 deletions src/scramble/migrations.go
Expand Up @@ -32,7 +32,7 @@ func migrateDb() {
// create the table, if needed
_, err := db.Exec(`create table if not exists migration (
version int not null
) engine=InnoDB`)
) collate=ascii_bin`)
if err != nil {
panic(err)
}
Expand Down Expand Up @@ -71,7 +71,7 @@ func migrateCreateUser() error {
primary key (token),
unique index (public_hash)
)`)
) collate=ascii_bin`)
return err
}

Expand All @@ -92,7 +92,7 @@ func migrateCreateEmail() error {
primary key (message_id, pub_hash_to),
index (pub_hash_to, box),
index (pub_hash_from)
)`)
) collate=ascii_bin`)
return err
}

Expand Down Expand Up @@ -163,7 +163,7 @@ func migrateEmailRefactor() error {
cipher_body LONGTEXT NOT NULL,
primary key (message_id)
)`)
) collate=ascii_bin`)
if err != nil {
return err
}
Expand All @@ -178,7 +178,7 @@ func migrateEmailRefactor() error {
PRIMARY KEY (id),
INDEX (address, box, unix_time),
INDEX (address, message_id)
)`)
) collate=ascii_bin`)
if err != nil {
return err
}
Expand Down Expand Up @@ -273,7 +273,7 @@ func migrateCreateNameResolution() error {
host VARCHAR(255),
hash CHAR(16),
index (host, name)
)`)
) collate=ascii_bin`)
if err != nil {
return err
}
Expand Down Expand Up @@ -417,7 +417,7 @@ func migrateCreateMxHosts() error {
unix_time BIGINT NOT NULL,
PRIMARY KEY (host)
)`)
) collate=ascii_bin`)
return err
}

Expand Down
2 changes: 1 addition & 1 deletion src/scramble/notary.go
Expand Up @@ -3,7 +3,7 @@
package scramble

import (
"code.google.com/p/go.crypto/openpgp"
"golang.org/x/crypto/openpgp"
"io/ioutil"
"log"
"net/http"
Expand Down
93 changes: 90 additions & 3 deletions src/scramble/smtp_saver.go
Expand Up @@ -11,15 +11,19 @@ package scramble

import (
"bytes"
"code.google.com/p/go.crypto/openpgp"
"code.google.com/p/go.crypto/openpgp/armor"
"golang.org/x/crypto/openpgp"
"golang.org/x/crypto/openpgp/armor"
"golang.org/x/net/html"
"log"
"net/mail"
"regexp"
"strings"
)

// Package-level regular expressions, compiled once at start-up.
var (
	// regexSMTPTemplatep matches a single ASCII-armored PGP message, from its
	// BEGIN line through the nearest END line. (?s) lets "." span newlines.
	regexSMTPTemplatep = regexp.MustCompile(`(?s)-----BEGIN PGP MESSAGE-----.*?-----END PGP MESSAGE-----`)

	// regexWhitespace matches a run of one or more whitespace characters.
	regexWhitespace = regexp.MustCompile(`\s+`)

	// regexAllWhitespace matches a string that is empty or entirely whitespace.
	regexAllWhitespace = regexp.MustCompile(`^\s*$`)

	// regexTrailingSpace matches one or more spaces at the end of any line;
	// (?m) makes "$" match at every line end rather than only at end of text.
	regexTrailingSpace = regexp.MustCompile(`(?m) +$`)
)

func StartSMTPSaver() {
// start some savemail workers
Expand Down Expand Up @@ -69,7 +73,11 @@ func deliverMailLocally(msg *SMTPMessage) error {
var textBody string
if msg.data.textBody == "" && msg.data.body != "" {
// HTML email, blank body with file attachments, etc
textBody = "(Sorry, Scramble only supports plain text email for now.)"
var err error
textBody, err = extractTextFromHTML(msg.data.body)
if err != nil {
return err
}
} else {
textBody = msg.data.textBody
}
Expand Down Expand Up @@ -117,6 +125,85 @@ func deliverMailLocally(msg *SMTPMessage) error {
return nil
}

// extractTextFromHTML extracts reasonably readable plain text from an HTML email.
//
// Note this is NOT an HTML sanitizer and the output is NOT safe to display as HTML.
// The output should be displayed only as plain text.
//
// Conversion rules:
//   - <br> (plain or self-closing) becomes a newline
//   - <a href="X"> is rendered as "( link to X ) " ahead of the link text
//   - the contents of <style> elements are dropped
//   - whitespace-only text is dropped; in other text, every run of whitespace
//     collapses to a single space
//   - closing span, a, th and td tags emit a space; every other closing tag
//     emits a newline
//
// Tokenizing stops at the first tokenizer error (which includes reaching the
// end of the input) and whatever was collected up to that point is returned,
// with trailing spaces stripped from each line and surrounding whitespace
// trimmed.
//
// The returned error is always nil. It is kept in the signature so existing
// callers that check it keep compiling.
func extractTextFromHTML(dirtyHTML string) (string, error) {
	// Writes to a bytes.Buffer always return a nil error, so the results of
	// WriteString are deliberately not checked below.
	var buffer bytes.Buffer

	// inStyle is true between a <style> tag and its matching </style>.
	// Tracking it explicitly (rather than remembering the last start tag)
	// ensures that text following </style> is not mistaken for CSS and dropped.
	inStyle := false

	z := html.NewTokenizer(strings.NewReader(dirtyHTML))
tokens:
	for {
		switch z.Next() {
		case html.ErrorToken:
			// HTML email is sometimes malformed...
			// if there's a parse error, just return what we've parsed so far.
			break tokens

		case html.SelfClosingTagToken:
			tagName, _ := z.TagName()
			switch string(tagName) {
			case "br":
				buffer.WriteString("\n")
			case "style":
				inStyle = true
			}

		case html.StartTagToken:
			tagName, hasAttr := z.TagName()
			switch string(tagName) {
			case "style":
				inStyle = true
			case "br":
				// handle line breaks
				buffer.WriteString("\n")
			case "a":
				// handle links: find the href attribute, if any
				href := ""
				for hasAttr {
					var attrKey, attrVal []byte
					attrKey, attrVal, hasAttr = z.TagAttr()
					if string(attrKey) == "href" {
						href = string(attrVal)
					}
				}
				// write the link target out as text
				// very basic. user can decide whether to copy to URL bar
				if href != "" {
					buffer.WriteString("( link to " + href + " ) ")
				}
			}

		case html.TextToken:
			if inStyle {
				// ignore the contents of style tags
				continue
			}
			textStr := string(z.Text())
			if regexAllWhitespace.MatchString(textStr) {
				// ignore runs of just whitespace
				continue
			}
			buffer.WriteString(regexWhitespace.ReplaceAllString(textStr, " "))

		case html.EndTagToken:
			tagName, _ := z.TagName()
			name := string(tagName)
			if name == "style" {
				inStyle = false
			}
			switch name {
			case "span", "a", "th", "td":
				// inline-ish elements: keep following text on the same line
				buffer.WriteString(" ")
			default:
				buffer.WriteString("\n")
			}
		}
	}

	// strip trailing spaces line by line, then trim the whole result
	ret := regexTrailingSpace.ReplaceAllString(buffer.String(), "")
	return strings.TrimSpace(ret), nil
}

func joinAddresses(addrs []*mail.Address) string {
var strs []string
for _, addr := range addrs {
Expand Down
49 changes: 49 additions & 0 deletions src/scramble/smtp_saver_test.go
@@ -0,0 +1,49 @@
package scramble

import "testing"

// TestPlainTextFromHTML feeds small HTML documents to extractTextFromHTML and
// compares the result against the expected plain text for each one.
func TestPlainTextFromHTML(t *testing.T) {
	cases := []struct {
		html string // input document
		want string // expected plain-text rendering
	}{
		{
			html: `
<!DOCTYPE html>
<!-- doctypes and comments are stripped -->
<div>Hello World</div>
`,
			want: `Hello World`,
		},
		{
			// NOTE(review): the extractor appears to emit a newline for both
			// </div> and <br />, which would put a blank line after the first
			// line of the expected text below -- confirm this expectation.
			html: `
<!-- style tags are stripped -->
<style>
body { font-size:10em }
</style>
<!-- divs and paragraphs get a newline, spans don't -->
<div>This Is Just To Say</div>
<br />
<p>I have <span>eaten</span></p>
<p>the plums</p>
<p>that were in</p>
<p>the icebox</p>
`,
			want: `This Is Just To Say
I have eaten
the plums
that were in
the icebox`,
		},
		{
			html: `
<div>Hello World</div>
<div>Click here: <a href="https://google.com">Google</a></div>
`,
			want: `Hello World
Click here: ( link to https://google.com ) Google`,
		},
	}

	for _, tc := range cases {
		got, err := extractTextFromHTML(tc.html)
		if err == nil && got == tc.want {
			continue
		}
		t.Errorf("extractTextFromHTML on input:\n'%s'\nproduced output:\n'%s'\nshould be:\n'%s'", tc.html, got, tc.want)
	}
}
2 changes: 1 addition & 1 deletion src/scramble/smtp_server.go
Expand Up @@ -21,7 +21,6 @@ package scramble
import (
"bufio"
"bytes"
_ "code.google.com/p/go.crypto/ripemd160"
"compress/zlib"
"crypto/md5"
"encoding/base64"
Expand All @@ -30,6 +29,7 @@ import (
"fmt"
iconv "github.com/sloonz/go-iconv"
qprintable "github.com/sloonz/go-qprintable"
_ "golang.org/x/crypto/ripemd160"
"io"
"io/ioutil"
"log"
Expand Down
2 changes: 0 additions & 2 deletions static/js/app.js
Expand Up @@ -2425,8 +2425,6 @@ function initPGP() {
openpgp.init();
return true;
} else {
alert("Sorry, you'll need a modern browser to use Scramble.\n"+
"Use Chrome >= 11, Safari >= 3.1 or Firefox >= 21");
return false;
}
}
Expand Down

0 comments on commit 33275fa

Please sign in to comment.