Skip to content

Commit

Permalink
Unify manganelos grabber into plainHTML one
Browse files Browse the repository at this point in the history
This removes manganelo's specific grabber, adding support to more sites
via plainhtml.

- added support for mangamonks
- added support for mangapanda
- fix support for ajax pagination in sites like mangajar

refs #26
  • Loading branch information
elboletaire committed Jan 5, 2024
1 parent 2610da0 commit 25a3b4d
Show file tree
Hide file tree
Showing 4 changed files with 110 additions and 226 deletions.
215 changes: 0 additions & 215 deletions grabber/manganelo.go

This file was deleted.

102 changes: 100 additions & 2 deletions grabber/plainhtml.go
Original file line number Diff line number Diff line change
@@ -1,13 +1,15 @@
package grabber

import (
"fmt"
"regexp"
"strconv"
"strings"

"github.com/PuerkitoBio/goquery"
"github.com/elboletaire/manga-downloader/http"
"github.com/fatih/color"
"golang.org/x/net/html"
)

// PlainHTML is a grabber for plain HTML pages, including sites that paginate their chapter list via ajax
Expand All @@ -25,6 +27,9 @@ type SiteSelector struct {
Chapter string
ChapterTitle string
Image string
Ajax string
AjaxRows string
AjaxNext string
}

// PlainHTMLChapter represents a PlainHTML Chapter
Expand Down Expand Up @@ -56,6 +61,33 @@ func (m *PlainHTML) Test() (bool, error) {
ChapterTitle: ".text-gray-500",
Image: "picture img",
},
// manganelo/manganato
{
Title: "h1",
Rows: "div.panel-story-chapter-list .row-content-chapter li",
Chapter: "a",
ChapterTitle: "a",
Link: "a",
Image: "div.container-chapter-reader img",
},
// manganelos/mangapanda
{
Title: "h1",
Rows: "#examples div.chapter-list .row",
Chapter: "a",
ChapterTitle: "a",
Link: "a",
Image: "div.container-chapter-reader img",
},
// mangakakalot
{
Title: "h1",
Rows: "div.chapter-list .row",
Chapter: "a",
ChapterTitle: "a",
Link: "a",
Image: "div.container-chapter-reader img,#vungdoc img",
},
// asuratoon.com
{
Title: "h1",
Expand All @@ -73,6 +105,18 @@ func (m *PlainHTML) Test() (bool, error) {
ChapterTitle: "a",
Link: "a",
Image: "#chapter-slider .carousel-item img",
Ajax: ".chapters-infinite-pagination .pagination .page-item",
AjaxRows: ".chapter-list-container .chapter-item",
AjaxNext: "ul.pagination .page-item:not(.disabled):last-child",
},
// mangamonks
{
Title: "h3.info-title",
Rows: "#chapter .chapter-list li",
Chapter: ".chapter-number",
ChapterTitle: ".chapter-number",
Link: "a",
Image: "#imageContainer img",
},
}

Expand All @@ -87,6 +131,45 @@ func (m *PlainHTML) Test() (bool, error) {
}
}

// some sites have ajax pagination
if m.site.Ajax != "" && m.doc.Find(m.site.Ajax).Length() > 0 {
var err error
var fetchChaps func(page int)
rows := &goquery.Selection{
Nodes: []*html.Node{},
}

fetchChaps = func(page int) {
rbody, err := http.Get(http.RequestParams{
URL: fmt.Sprintf("%s/chaptersList?page=%d", m.URL, page),
})
if err != nil {
return
}
defer rbody.Close()

doc, err := goquery.NewDocumentFromReader(rbody)
if err != nil {
return
}

rows = rows.AddNodes(doc.Find(m.site.AjaxRows).Nodes...)

if doc.Find(m.site.AjaxNext).Length() > 0 {
fetchChaps(page + 1)
}
}

fetchChaps(1)
if err != nil {
return false, err
}

m.rows = rows

return m.rows.Length() > 0, nil
}

if m.rows == nil {
return false, nil
}
Expand All @@ -98,7 +181,7 @@ func (m *PlainHTML) Test() (bool, error) {
func (m PlainHTML) FetchTitle() (string, error) {
	// the text of the node matched by the site's title selector is passed
	// through sanitizeTitle before being returned; the error is always nil
	title := m.doc.Find(m.site.Title)

	return sanitizeTitle(title.Text()), nil
}

// FetchChapters returns a slice of chapters
Expand Down Expand Up @@ -183,8 +266,14 @@ func (m PlainHTML) FetchChapter(f Filterable) (*Chapter, error) {
}

func getPlainHTMLImageURL(selector string, doc *goquery.Document) []string {
// some sites store a plain text array with the urls into a hidden layer
pimages := doc.Find("#arraydata")
if pimages.Length() == 1 {
return strings.Split(pimages.Text(), ",")
}

// images are inside picture objects
pimages := doc.Find(selector)
pimages = doc.Find(selector)
imgs := []string{}
pimages.Each(func(i int, s *goquery.Selection) {
src := s.AttrOr("src", "")
Expand All @@ -196,3 +285,12 @@ func getPlainHTMLImageURL(selector string, doc *goquery.Document) []string {

return imgs
}

// titleSpacesRe matches runs of whitespace. It is compiled once at package
// scope rather than on every sanitizeTitle call.
var titleSpacesRe = regexp.MustCompile(`\s+`)

// sanitizeTitle collapses every run of whitespace in title into a single
// space and trims leading and trailing whitespace from the result.
func sanitizeTitle(title string) string {
	return strings.TrimSpace(titleSpacesRe.ReplaceAllString(title, " "))
}
1 change: 0 additions & 1 deletion grabber/site.go
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,6 @@ func (g *Grabber) IdentifySite() (Site, []error) {
&Inmanga{Grabber: g},
&Mangadex{Grabber: g},
&Tcb{Grabber: g},
&Manganelo{Grabber: g},
}
var errs []error

Expand Down

0 comments on commit 25a3b4d

Please sign in to comment.