diff --git a/grabber/manganelo.go b/grabber/manganelo.go deleted file mode 100644 index 58ecb72..0000000 --- a/grabber/manganelo.go +++ /dev/null @@ -1,215 +0,0 @@ -package grabber - -import ( - "fmt" - "regexp" - "strconv" - "strings" - - "github.com/PuerkitoBio/goquery" - "github.com/elboletaire/manga-downloader/http" - "github.com/fatih/color" - "golang.org/x/net/html" -) - -// Manganelo is a grabber for manganelo and similar pages -type Manganelo struct { - *Grabber - doc *goquery.Document - rows *goquery.Selection -} - -// ManganeloChapter represents a Manganelo Chapter -type ManganeloChapter struct { - Chapter - URL string -} - -// Test returns true if the URL is a valid Manganelo URL -func (m *Manganelo) Test() (bool, error) { - body, err := http.Get(http.RequestParams{ - URL: m.URL, - }) - if err != nil { - return false, err - } - m.doc, err = goquery.NewDocumentFromReader(body) - if err != nil { - return false, err - } - - // order is important, since some sites have very similar selectors - selectors := []string{ - // manganelo/manganato style - "div.panel-story-chapter-list .row-content-chapter li", - // manganelos style (not using the parent id returns more stuff) - "#examples div.chapter-list .row", - // mangakakalot style - "div.chapter-list .row", - } - - // mangajar has ajax pagination - if m.doc.Find(".chapters-infinite-pagination .pagination .page-item").Length() > 0 { - var err error - var fetchChaps func(page int) - rows := &goquery.Selection{ - Nodes: []*html.Node{}, - } - - fetchChaps = func(page int) { - rbody, err := http.Get(http.RequestParams{ - URL: fmt.Sprintf("%s/chaptersList?page=%d", m.URL, page), - }) - if err != nil { - return - } - defer rbody.Close() - - doc, err := goquery.NewDocumentFromReader(rbody) - if err != nil { - return - } - - rows = rows.AddNodes(doc.Find(".chapter-list-container .chapter-item").Nodes...) - - if doc.Find("ul.pagination .page-item:not(.disabled):last-child").Length() > 0 { - fetchChaps(page + 1) - } - } - - fetchChaps(1) - if err != nil { - return false, err - } - - m.rows = rows - - return m.rows.Length() > 0, nil - } - - // for the same priority reasons, we need to iterate over the selectors - // using a simple `,` joining all selectors would return missmatches - for _, selector := range selectors { - rows := m.doc.Find(selector) - if rows.Length() > 0 { - m.rows = rows - break - } - } - - if m.rows == nil { - return false, nil - } - - return m.rows.Length() > 0, nil -} - -// Ttitle returns the manga title -func (m Manganelo) FetchTitle() (string, error) { - title := m.doc.Find("h1") - - // mangajar has the name inside span.post-name - if title.Children().HasClass("post-name") { - title = title.Find(".post-name") - } - - return title.Text(), nil -} - -// FetchChapters returns a slice of chapters -func (m Manganelo) FetchChapters() (chapters Filterables, errs []error) { - m.rows.Each(func(i int, s *goquery.Selection) { - re := regexp.MustCompile(`Chapter\s*(\d+\.?\d*)`) - chap := re.FindStringSubmatch(s.Find("a").Text()) - // if the chapter has no number, we skip it (usually it's an announcement from the site) - if len(chap) == 0 { - return - } - - num := chap[1] - number, err := strconv.ParseFloat(num, 64) - if err != nil { - errs = append(errs, err) - return - } - u := s.Find("a").AttrOr("href", "") - if !strings.HasPrefix(u, "http") { - u = m.BaseUrl() + u - } - chapter := &ManganeloChapter{ - Chapter{ - Number: number, - Title: s.Find("a").Text(), - }, - u, - } - - chapters = append(chapters, chapter) - }) - - return -} - -// FetchChapter fetches a chapter and its pages -func (m Manganelo) FetchChapter(f Filterable) (*Chapter, error) { - mchap := f.(*ManganeloChapter) - body, err := http.Get(http.RequestParams{ - URL: mchap.URL, - }) - if err != nil { - return nil, err - } - defer body.Close() - doc, err := goquery.NewDocumentFromReader(body) - if err != nil { - return nil, err - } - - pimages := getImageUrls(doc) - - chapter := &Chapter{ - Title: f.GetTitle(), - Number: f.GetNumber(), - PagesCount: int64(len(pimages)), - Language: "en", - } - - for i, img := range pimages { - if img == "" { - // this error is not critical and is not from our side, so just log it out - color.Yellow("page %d of %s has no URL to fetch from 😕 (will be ignored)", i, chapter.GetTitle()) - continue - } - if !strings.HasPrefix(img, "http") { - img = m.BaseUrl() + img - } - page := Page{ - Number: int64(i), - URL: img, - } - chapter.Pages = append(chapter.Pages, page) - } - - return chapter, nil -} - -func getImageUrls(doc *goquery.Document) []string { - // some sites store a plain text array with the urls into a hidden layer - pimages := doc.Find("#arraydata") - if pimages.Length() == 1 { - return strings.Split(pimages.Text(), ",") - } - - // others just have the images - pimages = doc.Find("div.container-chapter-reader img, .chapter-images img") - imgs := []string{} - pimages.Each(func(i int, s *goquery.Selection) { - src := s.AttrOr("src", "") - if src == "" || strings.HasPrefix(src, "data:image") { - src = s.AttrOr("data-src", "") - } - imgs = append(imgs, src) - }) - - return imgs -} diff --git a/grabber/plainhtml.go b/grabber/plainhtml.go index 8acad95..df6ee9e 100644 --- a/grabber/plainhtml.go +++ b/grabber/plainhtml.go @@ -1,6 +1,7 @@ package grabber import ( + "fmt" "regexp" "strconv" "strings" @@ -8,6 +9,7 @@ import ( "github.com/PuerkitoBio/goquery" "github.com/elboletaire/manga-downloader/http" "github.com/fatih/color" + "golang.org/x/net/html" ) // PlainHTML is a grabber for any plain HTML page (with no ajax pagination whatsoever) @@ -25,6 +27,9 @@ type SiteSelector struct { Chapter string ChapterTitle string Image string + Ajax string + AjaxRows string + AjaxNext string } // PlainHTMLChapter represents a PlainHTML Chapter @@ -56,6 +61,33 @@ func (m *PlainHTML) Test() (bool, error) { ChapterTitle: ".text-gray-500", Image: "picture img", }, + // manganelo/manganato + { + Title: "h1", + Rows: "div.panel-story-chapter-list .row-content-chapter li", + Chapter: "a", + ChapterTitle: "a", + Link: "a", + Image: "div.container-chapter-reader img", + }, + // manganelos/mangapanda + { + Title: "h1", + Rows: "#examples div.chapter-list .row", + Chapter: "a", + ChapterTitle: "a", + Link: "a", + Image: "div.container-chapter-reader img", + }, + // mangakakalot + { + Title: "h1", + Rows: "div.chapter-list .row", + Chapter: "a", + ChapterTitle: "a", + Link: "a", + Image: "div.container-chapter-reader img,#vungdoc img", + }, // asuratoon.com { Title: "h1", @@ -73,6 +105,18 @@ func (m *PlainHTML) Test() (bool, error) { ChapterTitle: "a", Link: "a", Image: "#chapter-slider .carousel-item img", + Ajax: ".chapters-infinite-pagination .pagination .page-item", + AjaxRows: ".chapter-list-container .chapter-item", + AjaxNext: "ul.pagination .page-item:not(.disabled):last-child", + }, + // mangamonks + { + Title: "h3.info-title", + Rows: "#chapter .chapter-list li", + Chapter: ".chapter-number", + ChapterTitle: ".chapter-number", + Link: "a", + Image: "#imageContainer img", }, } @@ -87,6 +131,45 @@ func (m *PlainHTML) Test() (bool, error) { } } + // some sites have ajax pagination + if m.site.Ajax != "" && m.doc.Find(m.site.Ajax).Length() > 0 { + var err error + var fetchChaps func(page int) + rows := &goquery.Selection{ + Nodes: []*html.Node{}, + } + + fetchChaps = func(page int) { + rbody, err := http.Get(http.RequestParams{ + URL: fmt.Sprintf("%s/chaptersList?page=%d", m.URL, page), + }) + if err != nil { + return + } + defer rbody.Close() + + doc, err := goquery.NewDocumentFromReader(rbody) + if err != nil { + return + } + + rows = rows.AddNodes(doc.Find(m.site.AjaxRows).Nodes...) + + if doc.Find(m.site.AjaxNext).Length() > 0 { + fetchChaps(page + 1) + } + } + + fetchChaps(1) + if err != nil { + return false, err + } + + m.rows = rows + + return m.rows.Length() > 0, nil + } + if m.rows == nil { return false, nil } @@ -98,7 +181,7 @@ func (m *PlainHTML) Test() (bool, error) { func (m PlainHTML) FetchTitle() (string, error) { title := m.doc.Find(m.site.Title) - return title.Text(), nil + return sanitizeTitle(title.Text()), nil } // FetchChapters returns a slice of chapters @@ -183,8 +266,14 @@ func (m PlainHTML) FetchChapter(f Filterable) (*Chapter, error) { } func getPlainHTMLImageURL(selector string, doc *goquery.Document) []string { + // some sites store a plain text array with the urls into a hidden layer + pimages := doc.Find("#arraydata") + if pimages.Length() == 1 { + return strings.Split(pimages.Text(), ",") + } + // images are inside picture objects - pimages := doc.Find(selector) + pimages = doc.Find(selector) imgs := []string{} pimages.Each(func(i int, s *goquery.Selection) { src := s.AttrOr("src", "") @@ -196,3 +285,12 @@ func getPlainHTMLImageURL(selector string, doc *goquery.Document) []string { return imgs } + +// sanitizeTitle sanitizes titles, trimming and removing extra spaces from titles +func sanitizeTitle(title string) string { + spaces := regexp.MustCompile(`\s+`) + title = spaces.ReplaceAllString(title, " ") + title = strings.TrimSpace(title) + + return title +} diff --git a/grabber/site.go b/grabber/site.go index 2d700e7..b20096d 100644 --- a/grabber/site.go +++ b/grabber/site.go @@ -70,7 +70,6 @@ func (g *Grabber) IdentifySite() (Site, []error) { &Inmanga{Grabber: g}, &Mangadex{Grabber: g}, &Tcb{Grabber: g}, - &Manganelo{Grabber: g}, } var errs []error diff --git a/makefile b/makefile index 37c4f8f..034a0f7 100644 --- a/makefile +++ b/makefile @@ -33,14 +33,7 @@ else go test -v ./... endif -grabber: grabber/manganelo grabber/inmanga grabber/mangadex grabber/tcb grabber/html - -grabber/manganelo: - go run . https://mangakakalot.com/manga/vd921334 7 - go run . https://ww5.manganelo.tv/manga/manga-aa951409 3 - go run . https://readmangabat.com/read-ov357862 23 - go run . https://chapmanganato.com/manga-aa951409 50 - go run . https://h.mangabat.com/read-tc397521 5 +grabber: grabber/inmanga grabber/mangadex grabber/tcb grabber/html grabber/inmanga: go run . https://inmanga.com/ver/manga/One-Piece/dfc7ecb5-e9b3-4aa5-a61b-a498993cd935 1 @@ -50,9 +43,18 @@ grabber/mangadex: grabber/tcb: go run . https://www.tcbscans.net/manga/one-piece/ 5 + go run . https://ww1.tcbscans.org/manga/ao-ashi/ 203 go run . https://lscomic.com/manga/peerless-dad/ 285 grabber/html: go run . https://tcbscans.com/mangas/5/one-piece 1100 + go run . https://mangakakalot.com/manga/vd921334 7 + go run . https://ww7.mangakakalot.tv/chapter/manga-hj984766/chapter-86 86 + go run . https://ww5.manganelo.tv/manga/manga-aa951409 3 go run . https://asuratoon.com/manga/0435219386-return-of-the-sss-class-ranker/ 85 go run . https://mangajar.pro/manga/haite-kudasai-takamine-san 43 + go run . https://chapmanganato.com/manga-aa951409 50 + go run . https://readmangabat.com/read-ov357862 23 + go run . https://h.mangabat.com/read-tc397521 5 + go run . https://mangapanda.in/manga/dragon-ball-super-color-%E2%AD%90%E2%AD%90%E2%AD%90%E2%AD%90%E2%AD%90 90 + go run . https://mangamonks.com/manga/mashle 155