Basic support for incoming HTML email: extract text from HTML

Our old assumption, that every email would have a text/plain section, turns out to be wrong. Almost all email does, but occasionally I get email that only contains HTML. This is not a solid, long-term solution, just a stopgap until I can ship Scramble 2.
dcposch · Nov 22, 2015 · 33275fa · 33275fa
1 parent b5f5cf1
commit 33275fa
Show file tree

Hide file tree

Showing 7 changed files with 151 additions and 17 deletions.
diff --git a/src/scramble/crypto.go b/src/scramble/crypto.go
@@ -2,11 +2,11 @@ package scramble
 
 import (
 	"bytes"
-	"code.google.com/p/go.crypto/openpgp"
-	"code.google.com/p/go.crypto/openpgp/armor"
-	"code.google.com/p/go.crypto/openpgp/packet"
 	"crypto/sha1"
 	"encoding/base32"
+	"golang.org/x/crypto/openpgp"
+	"golang.org/x/crypto/openpgp/armor"
+	"golang.org/x/crypto/openpgp/packet"
 	"io"
 	"strings"
 )

diff --git a/src/scramble/migrations.go b/src/scramble/migrations.go
@@ -32,7 +32,7 @@ func migrateDb() {
 	// create the table, if needed
 	_, err := db.Exec(`create table if not exists migration (
         version int not null
-    ) engine=InnoDB`)
+    ) collate=ascii_bin`)
 	if err != nil {
 		panic(err)
 	}
@@ -71,7 +71,7 @@ func migrateCreateUser() error {
 
         primary key (token),
         unique index (public_hash)
-    )`)
+    ) collate=ascii_bin`)
 	return err
 }
 
@@ -92,7 +92,7 @@ func migrateCreateEmail() error {
         primary key (message_id, pub_hash_to),
         index (pub_hash_to, box),
         index (pub_hash_from)
-    )`)
+    ) collate=ascii_bin`)
 	return err
 }
 
@@ -163,7 +163,7 @@ func migrateEmailRefactor() error {
         cipher_body    LONGTEXT NOT NULL,
 
         primary key (message_id)
-    )`)
+    ) collate=ascii_bin`)
 	if err != nil {
 		return err
 	}
@@ -178,7 +178,7 @@ func migrateEmailRefactor() error {
         PRIMARY KEY (id),
         INDEX (address, box, unix_time),
         INDEX (address, message_id)
-    )`)
+    ) collate=ascii_bin`)
 	if err != nil {
 		return err
 	}
@@ -273,7 +273,7 @@ func migrateCreateNameResolution() error {
 		host           VARCHAR(255),
 		hash           CHAR(16),
 		index (host, name)
-    )`)
+    ) collate=ascii_bin`)
 	if err != nil {
 		return err
 	}
@@ -417,7 +417,7 @@ func migrateCreateMxHosts() error {
         unix_time    BIGINT NOT NULL,
 
         PRIMARY KEY (host)
-    )`)
+    ) collate=ascii_bin`)
 	return err
 }
 

diff --git a/src/scramble/notary.go b/src/scramble/notary.go
@@ -3,7 +3,7 @@
 package scramble
 
 import (
-	"code.google.com/p/go.crypto/openpgp"
+	"golang.org/x/crypto/openpgp"
 	"io/ioutil"
 	"log"
 	"net/http"

diff --git a/src/scramble/smtp_saver.go b/src/scramble/smtp_saver.go
@@ -11,15 +11,19 @@ package scramble
 
 import (
 	"bytes"
-	"code.google.com/p/go.crypto/openpgp"
-	"code.google.com/p/go.crypto/openpgp/armor"
+	"golang.org/x/crypto/openpgp"
+	"golang.org/x/crypto/openpgp/armor"
+	"golang.org/x/net/html"
 	"log"
 	"net/mail"
 	"regexp"
 	"strings"
 )
 
 var regexSMTPTemplatep = regexp.MustCompile(`(?s)-----BEGIN PGP MESSAGE-----.*?-----END PGP MESSAGE-----`)
+var regexWhitespace = regexp.MustCompile(`\s+`)
+var regexAllWhitespace = regexp.MustCompile(`^\s*$`)
+var regexTrailingSpace = regexp.MustCompile(`(?m) +$`)
 
 func StartSMTPSaver() {
 	// start some savemail workers
@@ -69,7 +73,11 @@ func deliverMailLocally(msg *SMTPMessage) error {
 		var textBody string
 		if msg.data.textBody == "" && msg.data.body != "" {
 			// HTML email, blank body with file attachments, etc
-			textBody = "(Sorry, Scramble only supports plain text email for now.)"
+			var err error
+			textBody, err = extractTextFromHTML(msg.data.body)
+			if err != nil {
+				return err
+			}
 		} else {
 			textBody = msg.data.textBody
 		}
@@ -117,6 +125,85 @@ func deliverMailLocally(msg *SMTPMessage) error {
 	return nil
 }
 
+// extractTextFromHTML extracts reasonably readable plain text from an HTML email.
+// Note this is NOT an HTML sanitizer and the output is NOT safe to display as HTML.
+// The output should be displayed only as plain text.
+//
+// Line breaks are written for <br> and for most closing tags; closing span, a,
+// th and td tags produce a space instead. Link targets are written inline as
+// "( link to URL ) " ahead of the link text, and the contents of <style>
+// elements are dropped. Runs of whitespace in text collapse to one space.
+//
+// The returned error is always nil: bytes.Buffer writes do not fail, and a
+// tokenizer error (including plain end of input) just ends the scan, so
+// malformed HTML yields whatever text was extracted up to that point.
+func extractTextFromHTML(dirtyHTML string) (string, error) {
+	var buffer bytes.Buffer
+	// True between <style> and </style>. An explicit flag (not "last opened tag")
+	// keeps text that directly follows </style> from being dropped with the CSS.
+	inStyle := false
+
+	z := html.NewTokenizer(strings.NewReader(dirtyHTML))
+	for tokenType := z.Next(); tokenType != html.ErrorToken; tokenType = z.Next() {
+		switch tokenType {
+		case html.SelfClosingTagToken:
+			tagName, _ := z.TagName()
+			switch string(tagName) {
+			case "br":
+				buffer.WriteString("\n")
+			case "style":
+				inStyle = true
+			}
+		case html.StartTagToken:
+			tagName, hasAttr := z.TagName()
+			switch string(tagName) {
+			case "br":
+				buffer.WriteString("\n")
+			case "style":
+				inStyle = true
+			case "a":
+				href := ""
+				for hasAttr {
+					var attrKey, attrVal []byte
+					attrKey, attrVal, hasAttr = z.TagAttr()
+					if string(attrKey) == "href" {
+						href = string(attrVal)
+					}
+				}
+				// Write the link target out as text. Very basic: the user
+				// can decide whether to copy it to the URL bar.
+				if href != "" {
+					buffer.WriteString("( link to " + href + " ) ")
+				}
+			}
+		case html.TextToken:
+			if inStyle {
+				continue
+			}
+			textStr := string(z.Text())
+			if regexAllWhitespace.MatchString(textStr) {
+				// ignore runs of just whitespace
+				continue
+			}
+			buffer.WriteString(regexWhitespace.ReplaceAllString(textStr, " "))
+		case html.EndTagToken:
+			tagName, _ := z.TagName()
+			switch string(tagName) {
+			case "span", "a", "th", "td":
+				buffer.WriteString(" ")
+			case "style":
+				inStyle = false
+				buffer.WriteString("\n")
+			default:
+				buffer.WriteString("\n")
+			}
+		}
+	}
+
+	ret := regexTrailingSpace.ReplaceAllString(buffer.String(), "")
+	return strings.TrimSpace(ret), nil
+}
+
 func joinAddresses(addrs []*mail.Address) string {
 	var strs []string
 	for _, addr := range addrs {

diff --git a/src/scramble/smtp_saver_test.go b/src/scramble/smtp_saver_test.go
@@ -0,0 +1,49 @@
+package scramble
+
+import "testing"
+
+// TestPlainTextFromHTML checks extractTextFromHTML against hand-written HTML
+// snippets paired with the exact plain text each one should produce.
+func TestPlainTextFromHTML(t *testing.T) {
+	pairs := [...][2]string{
+		{`
+<!DOCTYPE html>
+<!-- doctypes and comments are stripped -->
+<div>Hello World</div>
+`,
+			`Hello World`,
+		},
+		{`
+<!-- style tags are stripped -->
+<style>
+body { font-size:10em }
+</style>
+<!-- divs and paragraphs get a newline, spans don't -->
+<div>This Is Just To Say</div>
+<br />
+<p>I have <span>eaten</span></p>
+<p>the plums</p>
+<p>that were in</p>
+<p>the icebox</p>
+`,
+			`This Is Just To Say
+
+I have eaten
+the plums
+that were in
+the icebox`,
+		},
+		{`
+<div>Hello World</div>
+<div>Click here: <a href="https://google.com">Google</a></div>
+`,
+			`Hello World
+Click here: ( link to https://google.com ) Google`,
+		}}
+	for _, pair := range pairs {
+		in, want := pair[0], pair[1]
+		if got, err := extractTextFromHTML(in); err != nil || got != want {
+			t.Errorf("extractTextFromHTML on input:\n'%s'\nproduced output:\n'%s'\nshould be:\n'%s'\nerror: %v", in, got, want, err)
+		}
+	}
+}
diff --git a/src/scramble/smtp_server.go b/src/scramble/smtp_server.go
@@ -21,7 +21,6 @@ package scramble
 import (
 	"bufio"
 	"bytes"
-	_ "code.google.com/p/go.crypto/ripemd160"
 	"compress/zlib"
 	"crypto/md5"
 	"encoding/base64"
@@ -30,6 +29,7 @@ import (
 	"fmt"
 	iconv "github.com/sloonz/go-iconv"
 	qprintable "github.com/sloonz/go-qprintable"
+	_ "golang.org/x/crypto/ripemd160"
 	"io"
 	"io/ioutil"
 	"log"

diff --git a/static/js/app.js b/static/js/app.js
@@ -2425,8 +2425,6 @@ function initPGP() {
     openpgp.init();
     return true;
   } else {
-    alert("Sorry, you'll need a modern browser to use Scramble.\n"+
-          "Use Chrome >= 11, Safari >= 3.1 or Firefox >= 21");
     return false;
   }   
 }