Skip to content

Commit

Permalink
feat(scraper): ✨ add 'tag' support
Browse files (browse the repository at this point in the history)
Tags are now scraped alongside genres and returned in the same `genres` list, so each book includes more sub-genres.

BREAKING CHANGE: the `type` field is now either 'genre' or 'tag'. The previous positional values ('parent'/'child' for books, '1st'/'2nd'/'3rd' for authors) have been removed, since they are no longer relevant.
  • Loading branch information
djdembeck committed Oct 2, 2021
1 parent c7b6984 commit 5f0466e
Show file tree
Hide file tree
Showing 6 changed files with 47 additions and 32 deletions.
1 change: 1 addition & 0 deletions package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

7 changes: 3 additions & 4 deletions src/helpers/authors/audibleScrape.ts
Original file line number Diff line number Diff line change
Expand Up @@ -23,21 +23,20 @@ class ScrapeHelper {
* @param {NodeListOf<Element>} genres selected source from categoriesLabel
* @returns {GenreInterface[]}
*/
collectGenres (genres: cheerio.Cheerio<cheerio.Element>[]): GenreInterface[] | undefined {
collectGenres (genres: cheerio.Cheerio<cheerio.Element>[], type: string): GenreInterface[] | undefined {
// Check and label each genre
const genreArr: GenreInterface[] | undefined = genres.map((genre, index): any => {
let thisGenre = {} as GenreInterface
let asin: string
let href: string
const types: Array<string> = ['1st', '2nd', '3rd']
if (genre.attr('href')) {
href = genre.attr('href')!
asin = this.getAsinFromUrl(href)
if (genre.text() && asin) {
thisGenre = {
asin: asin,
name: genre.children().text(),
type: types[index]
type: type
}
}
return thisGenre
Expand Down Expand Up @@ -100,7 +99,7 @@ class ScrapeHelper {
const genres = $('div.contentPositionClass div.bc-box a.bc-color-link')
.toArray()
.map(element => $(element))
returnJson.genres = this.collectGenres(genres)
returnJson.genres = this.collectGenres(genres, 'genre')
} catch (err) {
console.log(`Genres not available on: ${this.asin}`)
}
Expand Down
41 changes: 28 additions & 13 deletions src/helpers/books/audibleScrape.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import fetch from 'isomorphic-fetch'
// For HTML scraping
import * as cheerio from 'cheerio'
import SharedHelper from '.././shared'
import { htmlToText } from 'html-to-text'

class ScrapeHelper {
asin: string;
Expand All @@ -22,21 +23,24 @@ class ScrapeHelper {
* @param {NodeListOf<Element>} genres selected source from categoriesLabel
* @returns {GenreInterface[]}
*/
collectGenres (genres: cheerio.Cheerio<cheerio.Element>[]): GenreInterface[] | undefined {
collectGenres (genres: cheerio.Cheerio<cheerio.Element>[], type: string): GenreInterface[] | undefined {
// Check and label each genre
const genreArr: GenreInterface[] | undefined = genres.map((genre, index): any => {
let thisGenre = {} as GenreInterface
let asin: string
let href: string
const types: Array<string> = ['parent', 'child']
// Only proceed if there's an ID to use
if (genre.attr('href')) {
href = genre.attr('href')!
asin = this.getAsinFromUrl(href)
const href = genre.attr('href')!
const asin = this.getAsinFromUrl(href)
// Verify existence of name and valid ID
if (genre.text() && asin) {
const cleanedName = htmlToText(
genre.text(),
{ wordwrap: false }
)
thisGenre = {
asin: asin,
name: genre.text(),
type: types[index]
name: cleanedName,
type: type
}
}
return thisGenre
Expand Down Expand Up @@ -126,16 +130,27 @@ class ScrapeHelper {
.toArray()
.map(element => dom(element))

const tags = dom('div.bc-chip-group a')
.toArray()
.map(element => dom(element))

const returnJson = {
genres: Array<GenreInterface>(genres.length),
genres: Array<GenreInterface>(genres.length + tags.length),
series: Array<SeriesInterface>(series.length)
} as HtmlBookInterface

// Genres
// Combine genres and tags
if (genres.length) {
returnJson.genres = this.collectGenres(genres)
} else {
console.log(`Genres not available on: ${this.asin}`)
let genreArr = this.collectGenres(
genres,
'genre'
) as any
// Tags.
if (tags.length) {
const tagArr = this.collectGenres(tags, 'tag')
genreArr = genreArr.concat(tagArr)
}
returnJson.genres = genreArr as GenreInterface[]
}

// Series
Expand Down
6 changes: 3 additions & 3 deletions tests/audible/authors/audibleScrape.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ describe('When scraping Andy Weir from Audible', () => {
})

it('returned genre 1 type', () => {
expect(response.genres![0].type).toBe('1st')
expect(response.genres![0].type).toBe('genre')
})

it('returned genre 2 asin', () => {
Expand All @@ -55,7 +55,7 @@ describe('When scraping Andy Weir from Audible', () => {
})

it('returned genre 2 type', () => {
expect(response.genres![1].type).toBe('2nd')
expect(response.genres![1].type).toBe('genre')
})

it('returned genre 3 asin', () => {
Expand All @@ -67,6 +67,6 @@ describe('When scraping Andy Weir from Audible', () => {
})

it('returned genre 3 type', () => {
expect(response.genres![2].type).toBe('3rd')
expect(response.genres![2].type).toBe('genre')
})
})
16 changes: 8 additions & 8 deletions tests/audible/books/audibleScrape.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,8 @@ describe('When scraping Project Hail Mary genres from Audible', () => {
})
})

it('returned 2 genres', () => {
expect(response.genres?.length).toBe(2)
it('returned 5 genres', () => {
expect(response.genres?.length).toBe(5)
})

it('returned genre 1 asin', () => {
Expand All @@ -34,7 +34,7 @@ describe('When scraping Project Hail Mary genres from Audible', () => {
})

it('returned genre 1 type', () => {
expect(response.genres![0].type).toBe('parent')
expect(response.genres![0].type).toBe('genre')
})

it('returned genre 2 asin', () => {
Expand All @@ -46,7 +46,7 @@ describe('When scraping Project Hail Mary genres from Audible', () => {
})

it('returned genre 2 type', () => {
expect(response.genres![1].type).toBe('child')
expect(response.genres![1].type).toBe('genre')
})

it('returned 0 series', () => {
Expand Down Expand Up @@ -89,7 +89,7 @@ describe('When scraping Scorcerers Stone genres/series from Audible', () => {
})

it('returned genre 1 type', () => {
expect(response.genres![0].type).toBe('parent')
expect(response.genres![0].type).toBe('genre')
})

it('returned genre 2 asin', () => {
Expand All @@ -101,7 +101,7 @@ describe('When scraping Scorcerers Stone genres/series from Audible', () => {
})

it('returned genre 2 type', () => {
expect(response.genres![1].type).toBe('child')
expect(response.genres![1].type).toBe('genre')
})

it('returned 2 series', () => {
Expand Down Expand Up @@ -176,7 +176,7 @@ describe('When fetching The Coldest Case from Audible API', () => {
})

it('returned genre 1 type', () => {
expect(response.genres![0].type).toBe('parent')
expect(response.genres![0].type).toBe('genre')
})

it('returned genre 2 asin', () => {
Expand All @@ -188,7 +188,7 @@ describe('When fetching The Coldest Case from Audible API', () => {
})

it('returned genre 2 type', () => {
expect(response.genres![1].type).toBe('child')
expect(response.genres![1].type).toBe('genre')
})

it('returned 1 series', () => {
Expand Down
8 changes: 4 additions & 4 deletions tests/audible/books/audibleStitch.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ describe('When stitching together Scorcerers Stone from Audible', () => {
})

it('returned genre 1 type', () => {
expect(response.genres![0].type).toBe('parent')
expect(response.genres![0].type).toBe('genre')
})

it('returned genre 2 asin', () => {
Expand All @@ -113,7 +113,7 @@ describe('When stitching together Scorcerers Stone from Audible', () => {
})

it('returned genre 2 type', () => {
expect(response.genres![1].type).toBe('child')
expect(response.genres![1].type).toBe('genre')
})

it('returned a primary series asin', () => {
Expand Down Expand Up @@ -263,7 +263,7 @@ describe('When stitching together The Coldest Case from Audible', () => {
})

it('returned genre 1 type', () => {
expect(response.genres![0].type).toBe('parent')
expect(response.genres![0].type).toBe('genre')
})

it('returned genre 2 asin', () => {
Expand All @@ -275,7 +275,7 @@ describe('When stitching together The Coldest Case from Audible', () => {
})

it('returned genre 2 type', () => {
expect(response.genres![1].type).toBe('child')
expect(response.genres![1].type).toBe('genre')
})

it('returned a primary series asin', () => {
Expand Down

0 comments on commit 5f0466e

Please sign in to comment.