-
Notifications
You must be signed in to change notification settings - Fork 5
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat(scraper): ✨ add scraping portion of Author
Still need to implement author creation on book fetch, author update, etc. This commit moves GenreInterface from Books to Audible, since it is a common interface type.
- Loading branch information
Showing
8 changed files
with
225 additions
and
10 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
import ScrapeHelper from '../../../helpers/authors/audibleScrape' | ||
import SharedHelper from '../../../helpers/shared' | ||
import Author from '../../models/Author' | ||
|
||
/**
 * Registers the `GET /authors/:asin` route on the given Fastify instance.
 *
 * Lookup order for an author:
 *   1. Redis, under the key `author-<asin>` (value is stored as JSON text)
 *   2. The `Author` model
 *   3. A fresh scrape via `ScrapeHelper`, whose result is inserted into the
 *      `Author` model and then cached in Redis
 *
 * Each step only runs when the previous one found nothing, so a cache hit
 * never touches the database.
 *
 * @param fastify Fastify instance; must expose a `redis` client with promise-returning `get`/`set`
 * @param options plugin options (unused)
 */
async function routes (fastify, options) {
    fastify.get('/authors/:asin', async (request, reply) => {
        const { asin } = request.params

        // First, check ASIN validity
        const commonHelpers = new SharedHelper()
        if (!commonHelpers.checkAsinValidity(asin)) {
            throw new Error('Bad ASIN')
        }

        const { redis } = fastify
        const cacheKey = `author-${asin}`

        // 1. Cache: the stored value is a JSON string, so parse it before returning
        const findInRedis = await redis.get(cacheKey)
        if (findInRedis) {
            return JSON.parse(findInRedis)
        }

        // 2. Database: only queried on a cache miss
        const findInDb = await Author.findOne({ asin })
        if (findInDb) {
            // Cache write is intentionally not awaited so it does not delay the response
            redis.set(cacheKey, JSON.stringify(findInDb, null, 2))
            return findInDb
        }

        // 3. Scrape: fetch the page, then parse it
        const scraper = new ScrapeHelper(asin)
        const scraperRes = await scraper.fetchBook()
        const parseScraper = await scraper.parseResponse(scraperRes)

        // Nothing could be scraped (e.g. the page request failed)
        if (parseScraper === undefined) {
            return undefined
        }

        const newDbItem = await Author.insertOne(parseScraper)
        redis.set(cacheKey, JSON.stringify(newDbItem, null, 2))
        return parseScraper
    })
}
|
||
export default routes |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,152 @@ | ||
// Import interfaces | ||
import { AuthorInterface } from '../../interfaces/people/index' | ||
import { GenreInterface } from '../../interfaces/audible' | ||
import fetch from 'isomorphic-fetch' | ||
// For HTML scraping | ||
import * as cheerio from 'cheerio' | ||
import SharedHelper from '../shared' | ||
import { htmlToText } from 'html-to-text' | ||
|
||
/**
 * Fetches an Audible author page and extracts author details from its HTML.
 */
class ScrapeHelper {
    asin: string;
    reqUrl: string;

    /**
     * @param {string} asin ASIN of the author; also used to build the request URL
     */
    constructor (asin: string) {
        this.asin = asin
        const helper = new SharedHelper()
        const baseDomain: string = 'https://www.audible.com'
        const baseUrl: string = 'author'
        this.reqUrl = helper.buildUrl(asin, baseDomain, baseUrl)
    }

    /**
     * Converts the genre links found on the page into genre objects.
     *
     * A link is skipped when it has no `href`, no text, or when no ASIN can be
     * extracted from its `href`; skipped links never appear in the result.
     * The `type` label is taken from the link's position in the input
     * ('1st', '2nd', '3rd'), so it is unaffected by skipped links.
     *
     * @param genres cheerio-wrapped anchor elements, one per genre link
     * @returns {GenreInterface[]} the genres that could be fully resolved (possibly empty)
     */
    collectGenres (genres: cheerio.Cheerio<cheerio.Element>[]): GenreInterface[] | undefined {
        const types: Array<string> = ['1st', '2nd', '3rd']
        const genreArr: GenreInterface[] = []

        genres.forEach((genre, index) => {
            const href = genre.attr('href')
            if (!href) {
                console.log(`Genre ${index} asin not available on: ${this.asin}`)
                return
            }

            const asin = this.getAsinFromUrl(href)
            // Only keep genres that have both a label and a resolvable ASIN
            if (!genre.text() || !asin) {
                return
            }

            genreArr.push({
                asin: asin,
                name: genre.children().text(),
                type: types[index]
            })
        })

        return genreArr
    }

    /**
     * Fetches the author page and loads it into cheerio.
     * Non-OK responses yield `undefined`; they are logged unless the status is 404.
     * @returns {Promise<cheerio.CheerioAPI | undefined>} the loaded page, or undefined on a non-OK response
     */
    async fetchBook (): Promise<cheerio.CheerioAPI | undefined> {
        const response = await fetch(this.reqUrl)
        if (!response.ok) {
            const message = `An error has occured while scraping HTML ${response.status}: ${this.reqUrl}`
            if (response.status !== 404) {
                console.log(message)
            }
            return undefined
        } else {
            const text = await response.text()
            const dom = cheerio.load(text)
            return dom
        }
    }

    /**
     * Parses the fetched author page into an author object.
     * Each field (description, genres, image, name) is extracted independently;
     * a failure on one field is logged and leaves that field unset.
     * @param $ the loaded page as returned by fetchBook
     * @returns {Promise<AuthorInterface | undefined>} the author, or undefined when no page was given
     */
    async parseResponse ($: cheerio.CheerioAPI | undefined): Promise<AuthorInterface | undefined> {
        // Base undefined check
        if (!$) {
            return undefined
        }

        const returnJson = {} as AuthorInterface

        // ID
        returnJson.asin = this.asin

        // Bio.
        try {
            returnJson.description = htmlToText(
                $('div.bc-expander-content').children().text(),
                { wordwrap: false }
            )
        } catch (err) {
            console.log(`Bio not available on: ${this.asin}`)
        }

        // Genres.
        try {
            const genres = $('div.contentPositionClass div.bc-box a.bc-color-link')
                .toArray()
                .map(element => $(element))
            returnJson.genres = this.collectGenres(genres)
        } catch (err) {
            console.log(`Genres not available on: ${this.asin}`)
        }

        // Image.
        try {
            // We'll ask for a *slightly* larger than postage-stamp-sized pic...
            returnJson.image = $('img.author-image-outline')[0].attribs.src.replace('__01_SX120_CR0,0,120,120__.', '')
        } catch (err) {
            console.log(`Image not available on: ${this.asin}`)
        }

        // Name.
        try {
            // Workaround data error: https://github.com/cheeriojs/cheerio/issues/1854
            returnJson.name = ($('h1.bc-text-bold')[0].children[0] as any).data
        } catch (err) {
            console.error(err)
        }

        return returnJson
    }

    // Helpers
    /**
     * Regex to return just the ASIN from the given URL.
     * The pattern only matches when the ASIN is followed by a `?` (query string).
     * @param {string} url string to extract ASIN from
     * @returns {string} the first match, or an empty string when the URL contains none
     */
    getAsinFromUrl (url: string): string {
        const asinRegex = /[0-9A-Z]{9}.+?(?=\?)/gm
        const matches = url.match(asinRegex)
        return matches ? matches[0] : ''
    }

    /**
     * Regex to find book positions (text like "Book 3") in the given input.
     * NOTE(review): the value returned is the raw result of `html.match(...)` with a
     * global regex, i.e. whatever `match` yields (an array of matches or null for a
     * string input), not a single cleaned string - confirm against callers.
     * @param html value to search; must provide a `match` method
     * @returns the result of matching the book-position regex against `html`
     */
    getBookFromHTML (html): string {
        const bookRegex = /(Book ?(\d*\.)?\d+[+-]?[\d]?)/gm
        const matches = html.match(bookRegex)
        return matches
    }
}
|
||
export default ScrapeHelper |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,9 +1,14 @@ | ||
import { GenreInterface } from '../audible' | ||
|
||
/** Fields common to every person record. */
interface Person {
    name: string;
}

/**
 * An author. Only `name` (from Person) is required; the remaining fields
 * are optional.
 */
export interface AuthorInterface extends Person {
    asin?: string;
    description?: string;
    genres?: GenreInterface[];
    image?: string;
}

/** A narrator; currently carries no fields beyond those of Person. */
export interface NarratorInterface extends Person {}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters