Skip to content

Commit

Permalink
feat(scraper): ✨ add scraping portion of Author
Browse files Browse the repository at this point in the history
Still to do: create the author record when a book is fetched, support author updates, etc.

This commit moves GenreInterface to Audible, from Books, since this is a common interface type
  • Loading branch information
djdembeck committed Oct 1, 2021
1 parent 783fa22 commit 2d533bd
Show file tree
Hide file tree
Showing 8 changed files with 225 additions and 10 deletions.
1 change: 0 additions & 1 deletion src/config/models/Author.ts
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@ const authorSchema = schema({
types.objectId()
),
description: types.string({ required: true }),
formatType: types.string(),
genres: types.array(
types.object(
{
Expand Down
49 changes: 49 additions & 0 deletions src/config/routes/authors/show.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
import ScrapeHelper from '../../../helpers/authors/audibleScrape'
import SharedHelper from '../../../helpers/shared'
import Author from '../../models/Author'

/**
 * Registers the `GET /authors/:asin` route.
 *
 * Lookup order for an author: Redis cache, then the database, then a fresh
 * scrape of the author page. A database hit is written back to Redis; a
 * scraped author is inserted into the database and then cached.
 *
 * @param fastify Fastify instance; `fastify.redis` is read when a request is handled
 * @param options plugin options (unused)
 */
async function routes (fastify, options) {
  fastify.get('/authors/:asin', async (request, reply) => {
    const { asin } = request.params

    // First, check ASIN validity
    const commonHelpers = new SharedHelper()
    if (!commonHelpers.checkAsinValidity(asin)) {
      // Mark this as a client error; otherwise the thrown error is reported as a 500
      reply.code(400)
      throw new Error('Bad ASIN')
    }

    const { redis } = fastify
    const cacheKey = `author-${asin}`

    // 1. Cache. The stored value is a JSON string, parsed exactly once here.
    const findInRedis = await redis.get(cacheKey)
    if (findInRedis) {
      return JSON.parse(findInRedis)
    }

    // 2. Database. Only queried on a cache miss.
    const findInDb = await Author.findOne({ asin })
    if (findInDb) {
      redis.set(cacheKey, JSON.stringify(findInDb, null, 2))
      return findInDb
    }

    // 3. Scrape the author page and parse the result
    const scraper = new ScrapeHelper(asin)
    const scraperRes = await scraper.fetchBook()
    const parseScraper = await scraper.parseResponse(scraperRes)

    if (parseScraper === undefined) {
      // Nothing could be scraped: answer with an explicit 404 instead of
      // resolving the async handler with `undefined`
      reply.code(404)
      throw new Error(`No author found for ASIN ${asin}`)
    }

    const newDbItem = await Author.insertOne(parseScraper)
    redis.set(cacheKey, JSON.stringify(newDbItem, null, 2))
    return parseScraper
  })
}

export default routes
3 changes: 2 additions & 1 deletion src/helpers/audibleScrape.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
// Import interfaces
import { HtmlBookInterface, GenreInterface, SeriesInterface } from '../interfaces/books/index'
import { HtmlBookInterface, SeriesInterface } from '../interfaces/books/index'
import { GenreInterface } from '../interfaces/audible'
import fetch from 'isomorphic-fetch'
// For HTML scraping
import * as cheerio from 'cheerio'
Expand Down
152 changes: 152 additions & 0 deletions src/helpers/authors/audibleScrape.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
// Import interfaces
import { AuthorInterface } from '../../interfaces/people/index'
import { GenreInterface } from '../../interfaces/audible'
import fetch from 'isomorphic-fetch'
// For HTML scraping
import * as cheerio from 'cheerio'
import SharedHelper from '../shared'
import { htmlToText } from 'html-to-text'

/**
 * Scrapes an author page and converts it into an AuthorInterface.
 *
 * Usage: call `fetchBook()` to download the page, then pass its result to
 * `parseResponse()`.
 */
class ScrapeHelper {
  asin: string;
  reqUrl: string;

  /**
   * @param asin ASIN of the author; stored and used to build the request URL
   */
  constructor (asin: string) {
    this.asin = asin
    const helper = new SharedHelper()
    const baseDomain: string = 'https://www.audible.com'
    const baseUrl: string = 'author'
    this.reqUrl = helper.buildUrl(asin, baseDomain, baseUrl)
  }

  /**
   * Converts genre links found on the page into genre objects.
   *
   * Links without an `href`, without a parsable ASIN, or without text are
   * skipped (and logged where applicable), so the returned array only ever
   * contains complete entries. One bad link does not discard the others.
   *
   * @param genres cheerio-wrapped anchor elements, one per genre link
   * @returns the genres that could be read; empty when none could
   */
  collectGenres (genres: cheerio.Cheerio<cheerio.Element>[]): GenreInterface[] | undefined {
    // The label is chosen by the link's position in the input list
    const types: Array<string> = ['1st', '2nd', '3rd']
    const genreArr: GenreInterface[] = []

    genres.forEach((genre, index) => {
      const href = genre.attr('href')
      if (!href) {
        console.log(`Genre ${index} asin not available on: ${this.asin}`)
        return
      }

      let asin: string
      try {
        asin = this.getAsinFromUrl(href)
      } catch (err) {
        // Link has no recognisable ASIN: skip just this genre
        console.log(`Genre ${index} asin not available on: ${this.asin}`)
        return
      }

      if (genre.text() && asin) {
        genreArr.push({
          asin: asin,
          name: genre.children().text(),
          type: types[index]
        })
      }
    })

    return genreArr
  }

  /**
   * Fetches the author page at `reqUrl` and loads it into cheerio.
   *
   * @returns the loaded document, or undefined when the response is not OK
   * (the failure is logged unless the status is 404)
   */
  async fetchBook (): Promise<cheerio.CheerioAPI | undefined> {
    const response = await fetch(this.reqUrl)
    if (!response.ok) {
      const message = `An error has occured while scraping HTML ${response.status}: ${this.reqUrl}`
      if (response.status !== 404) {
        console.log(message)
      }
      return undefined
    } else {
      const text = await response.text()
      const dom = cheerio.load(text)
      return dom
    }
  }

  /**
   * Parses the fetched author page into an author object.
   *
   * Each field (description, genres, image, name) is extracted independently;
   * a field that cannot be read is logged and left unset rather than failing
   * the whole parse.
   *
   * @param $ the loaded page as returned by `fetchBook()`
   * @returns the parsed author, or undefined when no page was supplied
   */
  async parseResponse ($: cheerio.CheerioAPI | undefined): Promise<AuthorInterface | undefined> {
    // Base undefined check
    if (!$) {
      return undefined
    }

    const returnJson = {} as AuthorInterface

    // ID
    returnJson.asin = this.asin

    // Bio.
    try {
      returnJson.description = htmlToText(
        $('div.bc-expander-content').children().text(),
        { wordwrap: false }
      )
    } catch (err) {
      console.log(`Bio not available on: ${this.asin}`)
    }

    // Genres.
    try {
      const genres = $('div.contentPositionClass div.bc-box a.bc-color-link')
        .toArray()
        .map(element => $(element))
      returnJson.genres = this.collectGenres(genres)
    } catch (err) {
      console.log(`Genres not available on: ${this.asin}`)
    }

    // Image.
    try {
      // Strip the 120x120 sizing segment from the image URL
      returnJson.image = $('img.author-image-outline')[0].attribs.src.replace('__01_SX120_CR0,0,120,120__.', '')
    } catch (err) {
      console.log(`Image not available on: ${this.asin}`)
    }

    // Name.
    try {
      // Workaround data error: https://github.com/cheeriojs/cheerio/issues/1854
      returnJson.name = ($('h1.bc-text-bold')[0].children[0] as any).data
    } catch (err) {
      console.error(err)
    }

    console.log(returnJson)
    return returnJson
  }

  // Helpers
  /**
   * Extracts the ASIN from a URL.
   *
   * The pattern requires the ASIN to be followed by a query string (`?`).
   *
   * @param url string to extract the ASIN from
   * @returns the first ASIN-like match in the URL
   * @throws Error when the URL contains no match
   */
  getAsinFromUrl (url: string): string {
    const asinRegex = /[0-9A-Z]{9}.+?(?=\?)/gm
    const matches = url.match(asinRegex)
    if (!matches) {
      throw new Error(`No ASIN found in URL: ${url}`)
    }
    return matches[0]
  }

  /**
   * Searches the input for book-position text such as "Book 3".
   *
   * NOTE(review): despite the declared `string` return type, this returns the
   * raw result of `html.match(...)` with a global pattern. For a string input
   * that is an array of all matches, or null when there are none. Confirm
   * what callers expect before tightening the type.
   *
   * @param html text to search; presumably a string, it only needs a `match` method
   * @returns the result of matching the book-position pattern
   */
  getBookFromHTML (html): string {
    const bookRegex = /(Book ?(\d*\.)?\d+[+-]?[\d]?)/gm
    const matches = html.match(bookRegex)
    return matches
  }
}

export default ScrapeHelper
6 changes: 6 additions & 0 deletions src/interfaces/audible/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,12 @@ interface Codecs {
name: string
}

/**
 * A genre entry attached to scraped data.
 * The field notes below describe how the author scraper fills it in.
 */
export interface GenreInterface {
// ASIN extracted from the genre link's URL
asin: string;
// Display text of the genre link
name: string;
// Positional label; the author scraper assigns '1st', '2nd' or '3rd' by link index
type: string;
}

interface RatingItems {
average_rating: number;
display_average_rating: string;
Expand Down
7 changes: 1 addition & 6 deletions src/interfaces/books/index.ts
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import { GenreInterface } from '../audible'
import { AuthorInterface, NarratorInterface } from '../people/index'

export interface ApiSingleChapterInterface {
Expand All @@ -17,12 +18,6 @@ export interface ApiChapterInterface {
runtimeLengthSec: number
}

/** A genre entry: its ASIN, display name and a type label. */
export interface GenreInterface {
// ASIN identifying the genre
asin: string,
// Display name of the genre
name: string,
// Label describing the kind/position of the genre
type: string
}

export interface SeriesInterface {
asin?: string,
name: string,
Expand Down
7 changes: 6 additions & 1 deletion src/interfaces/people/index.ts
Original file line number Diff line number Diff line change
@@ -1,9 +1,14 @@
import { GenreInterface } from '../audible'

/** Base shape shared by the people interfaces in this file (authors, narrators). */
interface Person {
// Display name of the person
name: string;
}

/**
 * An author. Only `name` (from Person) is required; the remaining fields are
 * filled in when the author's page has been scraped.
 */
export interface AuthorInterface extends Person {
  /** ASIN of the author */
  asin?: string;
  /** Biography text taken from the author page */
  description?: string;
  /** Genres linked from the author page */
  genres?: GenreInterface[];
  /** URL of the author's image */
  image?: string;
}

/** A narrator; currently adds no fields beyond those of Person. */
export interface NarratorInterface extends Person {}
10 changes: 9 additions & 1 deletion src/server.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,11 @@
import { fastify } from 'fastify'
// Book routes
import showBook from './config/routes/books/show'
import deleteBook from './config/routes/books/delete'
import showChapter from './config/routes/books/chapters/show'
// Author routes
import showAuthor from './config/routes/authors/show'
// System imports
import { fastify } from 'fastify'
import { connect, disconnect } from './config/papr'

// Heroku or local port
Expand All @@ -14,9 +18,13 @@ const server = fastify({
}
})

// Route plugins, registered in order: the three book routes, then the author route
for (const routePlugin of [showBook, showChapter, deleteBook, showAuthor]) {
  server.register(routePlugin)
}

// Redis plugin, configured from REDIS_URL
server.register(require('fastify-redis'), { url: REDIS_URL })

server.listen(port, host, async (err, address) => {
Expand Down

0 comments on commit 2d533bd

Please sign in to comment.