Skip to content

Commit

Permalink
New singlefile processor, step 1
Browse files Browse the repository at this point in the history
  • Loading branch information
derfenix committed Apr 15, 2023
1 parent 1f3e5ec commit c0f3ea3
Showing 1 changed file with 54 additions and 181 deletions.
235 changes: 54 additions & 181 deletions adapters/processors/singlefile.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,21 +3,14 @@ package processors
import (
"bytes"
"context"
"encoding/base64"
"fmt"
"io"
"net/http"
"net/url"
"strings"

"golang.org/x/net/html"
"golang.org/x/net/html/atom"

"github.com/derfenix/webarchive/entity"
"golang.org/x/net/html"
)

const defaultEncoding = "utf-8"

// NewSingleFile returns a SingleFile processor that issues its HTTP requests
// through the given client.
func NewSingleFile(client *http.Client) *SingleFile {
	processor := new(SingleFile)
	processor.client = client

	return processor
}
Expand All @@ -27,35 +20,21 @@ type SingleFile struct {
}

func (s *SingleFile) Process(ctx context.Context, url string) ([]entity.File, error) {
req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
if err != nil {
return nil, fmt.Errorf("new request: %w", err)
}

response, err := s.client.Do(req)
response, err := s.get(ctx, url)
if err != nil {
return nil, fmt.Errorf("do request: %w", err)
}

if response.StatusCode != http.StatusOK {
return nil, fmt.Errorf("want status 200, got %d", response.StatusCode)
}

if response.Body == nil {
return nil, fmt.Errorf("empty response body")
return nil, err
}

defer func() {
_ = response.Body.Close()
}()

htmlNode, err := html.Parse(response.Body)
if err != nil {
_ = response.Body.Close()
return nil, fmt.Errorf("parse response body: %w", err)
}

if err := s.crawl(ctx, htmlNode, baseURL(url), getEncoding(response)); err != nil {
return nil, fmt.Errorf("crawl: %w", err)
_ = response.Body.Close()

if err := s.process(ctx, htmlNode, url, response.Header); err != nil {
return nil, fmt.Errorf("process: %w", err)
}

buf := bytes.NewBuffer(nil)
Expand All @@ -68,186 +47,80 @@ func (s *SingleFile) Process(ctx context.Context, url string) ([]entity.File, er
return []entity.File{htmlFile}, nil
}

func (s *SingleFile) crawl(ctx context.Context, node *html.Node, baseURL string, encoding string) error {
if node.Data == "head" {
s.setCharset(node, encoding)
func (s *SingleFile) get(ctx context.Context, url string) (*http.Response, error) {
req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
if err != nil {
return nil, fmt.Errorf("new request: %w", err)
}

for child := node.FirstChild; child != nil; child = child.NextSibling {
if child.Type == html.ElementNode {
if err := s.findAndReplaceResources(ctx, child, baseURL); err != nil {
return err
}
}

if err := s.crawl(ctx, child, baseURL, encoding); err != nil {
return fmt.Errorf("crawl child %s: %w", child.Data, err)
}
response, err := s.client.Do(req)
if err != nil {
return nil, fmt.Errorf("do request: %w", err)
}

return nil
}

func (s *SingleFile) findAndReplaceResources(ctx context.Context, node *html.Node, baseURL string) error {
switch node.DataAtom {
case atom.Img, atom.Image, atom.Script, atom.Style:
err := s.replaceResource(ctx, node, baseURL)
if err != nil {
return err
}

case atom.Link:
for _, attribute := range node.Attr {
if attribute.Key == "rel" && (attribute.Val == "stylesheet") {
if err := s.replaceResource(ctx, node, baseURL); err != nil {
return err
}
}
}
if response.StatusCode != http.StatusOK {
return nil, fmt.Errorf("want status 200, got %d", response.StatusCode)
}

return nil
}

func (s *SingleFile) replaceResource(ctx context.Context, node *html.Node, baseURL string) error {
for i, attribute := range node.Attr {
if attribute.Key == "src" || attribute.Key == "href" {
raw, contentType := s.loadResource(ctx, attribute.Val, baseURL)
setResource(raw, attribute, contentType, node)

node.Attr[i] = attribute
}
if response.Body == nil {
return nil, fmt.Errorf("empty response body")
}

return nil
}

func setResource(raw []byte, attribute html.Attribute, contentType string, node *html.Node) {
if len(raw) == 0 {
attribute.Val = ""
} else {
if strings.HasPrefix(contentType, "image") {
encoded := make([]byte, base64.StdEncoding.EncodedLen(len(raw)))
base64.StdEncoding.Encode(encoded, raw)
attribute.Val = fmt.Sprintf("data:%s;base64, %s", contentType, encoded)
} else {
attribute.Val = ""
var atomValue atom.Atom
var data string

for _, attr := range node.Attr {
if attr.Key == "type" {
switch attr.Val {
case "script":
atomValue = atom.Script
data = "script"
case "stylesheet":
atomValue = atom.Style
data = "style"
}
}
}
newNode := &html.Node{
NextSibling: node.NextSibling,
Type: html.ElementNode,
DataAtom: atomValue,
Data: data,
}
newNode.AppendChild(&html.Node{
Type: html.RawNode,
DataAtom: atom.Data,
Data: string(raw),
})
node.NextSibling = newNode
}
}
return response, nil
}

func (s *SingleFile) loadResource(ctx context.Context, val, baseURL string) ([]byte, string) {
if !strings.HasPrefix(val, "http://") && !strings.HasPrefix(val, "https://") {
var err error
val, err = url.JoinPath(baseURL, val)
if err != nil {
return nil, ""
}
val, err = url.PathUnescape(val)
if err != nil {
return nil, ""
}
}

req, err := http.NewRequestWithContext(ctx, http.MethodGet, val, nil)
func (s *SingleFile) process(ctx context.Context, node *html.Node, pageURL string, headers http.Header) error {
parsedURL, err := url.Parse(pageURL)
if err != nil {
return nil, ""
return fmt.Errorf("parse page url: %w", err)
}

response, err := s.client.Do(req)
if err != nil {
return nil, ""
}
baseURL := fmt.Sprintf("%s://%s", parsedURL.Scheme, parsedURL.Host)

defer func() {
if response.Body != nil {
_ = response.Body.Close()
}
}()
for child := node.FirstChild; child != nil; child = child.NextSibling {
var err error
switch child.Data {
case "head":
err = s.processHead(ctx, child, baseURL)

if response.StatusCode != http.StatusOK {
return []byte{}, ""
}
case "body":
err = s.processBody(ctx, child, baseURL)
}

raw, err := io.ReadAll(response.Body)
if err != nil {
return nil, ""
if err != nil {
return err
}
}

return raw, response.Header.Get("Content-Type")
return nil
}

func (s *SingleFile) setCharset(node *html.Node, encoding string) {
var charsetExists bool

func (s *SingleFile) processHead(ctx context.Context, node *html.Node, baseURL string) error {
for child := node.FirstChild; child != nil; child = child.NextSibling {
if child.Data == "meta" {
for _, attribute := range child.Attr {
if attribute.Key == "charset" {
charsetExists = true
}
switch child.Data {
case "link":
if err := s.processHref(ctx, child.Attr, baseURL); err != nil {
return fmt.Errorf("process link %s: %w", child.Attr, err)
}

case "script":
if err := s.processSrc(ctx, child.Attr, baseURL); err != nil {
return fmt.Errorf("process script %s: %w", child.Attr, err)
}
}
}

if !charsetExists {
node.AppendChild(&html.Node{
Type: html.ElementNode,
DataAtom: atom.Meta,
Data: "meta",
Attr: []html.Attribute{
{
Key: "charset",
Val: encoding,
},
},
})
}
return nil
}

func baseURL(val string) string {
parsed, err := url.Parse(val)
if err != nil {
return val
}

return fmt.Sprintf("%s://%s", parsed.Scheme, parsed.Host)
// processBody is invoked by process for the "body" child of the parsed
// document. It is a placeholder for now: every argument is ignored and nil is
// always returned.
//
// NOTE(review): the url parameter shadows the imported net/url package inside
// this function; sibling methods call the same value baseURL — consider
// renaming when the body is implemented.
func (s *SingleFile) processBody(ctx context.Context, child *html.Node, url string) error {
// Nothing is processed yet.
return nil
}

func getEncoding(response *http.Response) string {
_, encoding, found := strings.Cut(response.Header.Get("Content-Type"), "charset=")
if !found {
return defaultEncoding
}

encoding = strings.TrimSpace(encoding)
// processHref is invoked by processHead with the attributes of each "link"
// element. It is a placeholder for now: every argument is ignored and nil is
// always returned, so no attribute is modified.
func (s *SingleFile) processHref(ctx context.Context, attrs []html.Attribute, baseURL string) error {
// Nothing is processed yet.
return nil
}

return encoding
// processSrc is invoked by processHead with the attributes of each "script"
// element. It is a placeholder for now: every argument is ignored and nil is
// always returned, so no attribute is modified.
func (s *SingleFile) processSrc(ctx context.Context, attrs []html.Attribute, baseURL string) error {
// Nothing is processed yet.
return nil
}

0 comments on commit c0f3ea3

Please sign in to comment.