Skip to content

Commit

Permalink
Merge branch 'dev' into rework/schema
Browse files Browse the repository at this point in the history
  • Loading branch information
jess-sys committed Mar 15, 2021
2 parents 827b29b + 37816fd commit 61ca5f3
Show file tree
Hide file tree
Showing 23 changed files with 811 additions and 33 deletions.
31 changes: 31 additions & 0 deletions doc/developers/PARSING.md
Original file line number Diff line number Diff line change
Expand Up @@ -201,6 +201,37 @@ export default async function parseAsCSV(pipeline: Pipeline, options?: OptionsPa
}
```

#### Special cases

##### Wait for complete data (not recommended)

**WARNING: Be careful, this practice bypasses Node.js stream processing, as it loads all stream data into memory during
parsing**

This special case must only be used for formats that concern small data sets / files; otherwise, prefer the Node.js
stream method.

```typescript
import { ParsingOptions } from '../../types/Parsing'

export type OptionsParseAsJSON =
& ParsingOptions

/**
* Parse given Pipeline result stream as JSON format.
*
* Skeleton for a "wait for complete data" parser: the whole stream is read
* into memory first, then handed to a parsing library.
*/
export default async function parseAsJSON(pipeline: Pipeline, options?: OptionsParseAsJSON): Promise<string> {
// Run the Pipeline and wait for the complete stream data before returning
// WARNING: All stream data is held in memory until return,
// which can crash the process with a heavy data set
const completeData = await pipeline.toString()

// Pass the complete data to a parsing library or use your own
// Don't forget the options parameter
// Then return the result of the parsing
}
```

### Parser class method

After you add your new parsing utility function you must add it to the Parser class to be able to use it from getters (
Expand Down
2 changes: 2 additions & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,12 @@
"dependencies": {
"class-transformer": "^0.4.0",
"class-validator": "^0.13.1",
"@types/jsdom": "^16.2.7",
"@types/yauzl": "^2.9.1",
"cross-env": "^7.0.3",
"dotenv": "^8.2.0",
"express": "^4.17.1",
"jsdom": "^16.5.1",
"ts-node": "^9.1.1",
"typescript": "^4.1.5",
"yauzl": "^2.10.0"
Expand Down
15 changes: 10 additions & 5 deletions src/classes/Archive/Archive.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@ import Standardizer from '../Standardizer/Standardizer'
import Services from '../../types/Services'
import Config from '../../modules/Config'

export const PLUGINS_DIR = 'plugins'

export const OUTPUT_DIR = Config.archiveOutputDir

export default abstract class Archive {
Expand Down Expand Up @@ -39,7 +41,7 @@ export default abstract class Archive {
/**
* Explore non extracted archive to guess the source service
*/
abstract identifyService(): Promise<Archive>
abstract identifyService(): Promise<boolean>

/**
* Identify archive file format
Expand Down Expand Up @@ -70,18 +72,21 @@ export default abstract class Archive {
* List all Archive plugins contained in the plugins sub-directory asynchronously
*/
static getPlugins(): Promise<Array<typeof Archive>> {
return fs.promises.readdir(path.resolve(__dirname, 'services'))
.then(dirContent => dirContent.map(service => import(path.resolve(__dirname, 'services', service))))
return fs.promises.readdir(path.resolve(__dirname, PLUGINS_DIR))
.then(dirContent => dirContent.map(
service => import(path.resolve(__dirname, PLUGINS_DIR, service))
.then(importedModule => importedModule.default),
))
.then(promiseArr => Promise.all(promiseArr))
}

/**
* List all Archive plugins contained in the plugins sub-directory synchronously
*/
static getPluginsSync(): Array<typeof Archive> {
return fs.readdirSync(path.resolve(__dirname, 'services')).map(
return fs.readdirSync(path.resolve(__dirname, PLUGINS_DIR)).map(
// eslint-disable-next-line import/no-dynamic-require,global-require
service => require(path.resolve(__dirname, 'services', service)),
service => require(path.resolve(__dirname, PLUGINS_DIR, service)).default,
)
}
}
4 changes: 2 additions & 2 deletions src/classes/Archive/Unknown.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@ import Standardizer from '../Standardizer/Standardizer'
import UnknownStandardizer from '../Standardizer/Unknown'

export default class Unknown extends Archive {
identifyService(): Promise<Archive> {
return Promise.resolve(this)
identifyService(): Promise<boolean> {
return Promise.resolve(true)
}

get service(): Services {
Expand Down
21 changes: 21 additions & 0 deletions src/classes/Archive/plugins/Discord.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
import Archive from '../Archive'
import Services from '../../../types/Services'
import Standardizer from '../../Standardizer/Standardizer'
import DiscordStandardizer from '../../Standardizer/plugins/Discord/Discord'

/**
 * Archive plugin for the Discord service.
 */
export default class Discord extends Archive {
  /**
   * Detection of Discord archives is not implemented yet.
   *
   * The failure is reported as a rejected promise rather than a synchronous
   * throw, so callers that chain `.then()` / `.catch()` on the result, or
   * aggregate it with other promises, observe it as a regular rejection.
   *
   * @returns A promise that always rejects with a "Not implemented" error
   */
  identifyService(): Promise<boolean> {
    return Promise.reject(new Error('Not implemented'))
  }

  /**
   * Service handled by this plugin.
   */
  get service(): Services {
    return Services.DISCORD
  }

  /**
   * Standardizer built from the extracted archive path.
   *
   * @throws Error when the archive has not been extracted
   */
  get standardizer(): Standardizer {
    if (!this.isExtracted) {
      throw new Error('Archive not extracted')
    }
    // NOTE(review): assumes isExtracted implies extractedArchivePath is set — confirm in Archive
    return new DiscordStandardizer(this.extractedArchivePath!)
  }
}
21 changes: 21 additions & 0 deletions src/classes/Archive/plugins/Facebook.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
import Archive from '../Archive'
import Services from '../../../types/Services'
import Standardizer from '../../Standardizer/Standardizer'
import FacebookStandardizer from '../../Standardizer/plugins/Facebook/Facebook'

/**
 * Archive plugin for the Facebook service.
 */
export default class Facebook extends Archive {
  /**
   * Detection of Facebook archives is not implemented yet.
   *
   * The return type is `Promise<boolean>`, matching the abstract
   * `identifyService` contract declared on Archive. The failure is reported
   * as a rejected promise rather than a synchronous throw, so callers that
   * chain `.then()` / `.catch()` on the result, or aggregate it with other
   * promises, observe it as a regular rejection.
   *
   * @returns A promise that always rejects with a "Not implemented" error
   */
  identifyService(): Promise<boolean> {
    return Promise.reject(new Error('Not implemented'))
  }

  /**
   * Service handled by this plugin.
   */
  get service(): Services {
    return Services.FACEBOOK
  }

  /**
   * Standardizer built from the extracted archive path.
   *
   * @throws Error when the archive has not been extracted
   */
  get standardizer(): Standardizer {
    if (!this.isExtracted) {
      throw new Error('Archive not extracted')
    }
    // NOTE(review): assumes isExtracted implies extractedArchivePath is set — confirm in Archive
    return new FacebookStandardizer(this.extractedArchivePath!)
  }
}
21 changes: 21 additions & 0 deletions src/classes/Archive/plugins/Google.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
import Archive from '../Archive'
import Services from '../../../types/Services'
import Standardizer from '../../Standardizer/Standardizer'
import GoogleStandardizer from '../../Standardizer/plugins/Google/Google'

/**
 * Archive plugin for the Google service.
 */
export default class Google extends Archive {
  /**
   * Detection of Google archives is not implemented yet.
   *
   * The failure is reported as a rejected promise rather than a synchronous
   * throw, so callers that chain `.then()` / `.catch()` on the result, or
   * aggregate it with other promises, observe it as a regular rejection.
   *
   * @returns A promise that always rejects with a "Not implemented" error
   */
  identifyService(): Promise<boolean> {
    return Promise.reject(new Error('Not implemented'))
  }

  /**
   * Service handled by this plugin.
   */
  get service(): Services {
    return Services.GOOGLE
  }

  /**
   * Standardizer built from the extracted archive path.
   *
   * @throws Error when the archive has not been extracted
   */
  get standardizer(): Standardizer {
    if (!this.isExtracted) {
      throw new Error('Archive not extracted')
    }
    // NOTE(review): assumes isExtracted implies extractedArchivePath is set — confirm in Archive
    return new GoogleStandardizer(this.extractedArchivePath!)
  }
}
21 changes: 21 additions & 0 deletions src/classes/Archive/plugins/Reddit.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
import Archive from '../Archive'
import Services from '../../../types/Services'
import Standardizer from '../../Standardizer/Standardizer'
import RedditStandardizer from '../../Standardizer/plugins/Reddit/Reddit'

/**
 * Archive plugin for the Reddit service.
 */
export default class Reddit extends Archive {
  /**
   * Detection of Reddit archives is not implemented yet.
   *
   * The failure is reported as a rejected promise rather than a synchronous
   * throw, so callers that chain `.then()` / `.catch()` on the result, or
   * aggregate it with other promises, observe it as a regular rejection.
   *
   * @returns A promise that always rejects with a "Not implemented" error
   */
  identifyService(): Promise<boolean> {
    return Promise.reject(new Error('Not implemented'))
  }

  /**
   * Service handled by this plugin.
   */
  get service(): Services {
    return Services.REDDIT
  }

  /**
   * Standardizer built from the extracted archive path.
   *
   * @throws Error when the archive has not been extracted
   */
  get standardizer(): Standardizer {
    if (!this.isExtracted) {
      throw new Error('Archive not extracted')
    }
    // NOTE(review): assumes isExtracted implies extractedArchivePath is set — confirm in Archive
    return new RedditStandardizer(this.extractedArchivePath!)
  }
}
14 changes: 9 additions & 5 deletions src/classes/ArchiveFactory.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,14 @@ export default class ArchiveFactory {

outputDir: string

archivePlugins: Array<Archive> = Archive.getPluginsSync()
// @ts-ignore
.map(archivePlugin => new archivePlugin(this.path, this.outputDir))
archivePlugins: Array<Archive>

constructor(archivePath: string, outputDir?: string) {
this.path = archivePath
this.outputDir = outputDir ?? OUTPUT_DIR
this.archivePlugins = Archive.getPluginsSync()
// @ts-ignore
.map(archivePlugin => new archivePlugin(this.path, this.outputDir))
}

async identify(): Promise<Services> {
Expand All @@ -28,9 +29,12 @@ export default class ArchiveFactory {

async getArchivePlugin(): Promise<Archive> {
return Promise.any(
this.archivePlugins.map(plugin => plugin.identifyService()),
this.archivePlugins.map(
plugin => plugin
.identifyService()
.then(result => (result ? plugin : Promise.reject())),
),
)
// Fall back to the Unknown archive plugin if no other plugin identifies the archive
.catch(() => new Unknown(this.path))
}

Expand Down
3 changes: 2 additions & 1 deletion src/classes/Parser.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import path from 'path'
import { JSDOM } from 'jsdom'
import { PaginationOptions, ParsingOptions, Preprocessor, PreprocessorOptions } from '../types/Parsing'
import listFiles, { OptionsListFiles } from '../modules/Parsing/listFiles'
import parseDir, { OptionsParseDir } from '../modules/Parsing/parseDir'
Expand Down Expand Up @@ -127,7 +128,7 @@ export default class Parser {
* Parse HTML file from given path
* Throws an error if the file can't be accessed or if parsing fails
*/
async parseAsHTML(relativeFilePath: string, options?: OptionsParseAsHTML & PreprocessorOptions): Promise<any> {
async parseAsHTML(relativeFilePath: string, options?: OptionsParseAsHTML & PreprocessorOptions): Promise<JSDOM> {
const mergedOptions = this.mergeOptions(options)

return parseAsHTML(
Expand Down
13 changes: 9 additions & 4 deletions src/classes/Standardizer/Standardizer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@ export interface GetterOptions {
parsingOptions?: ParsingOptions & PaginationOptions
}

export const PLUGINS_DIR = 'plugins'

export default abstract class Standardizer {
path: string

Expand Down Expand Up @@ -129,18 +131,21 @@ export default abstract class Standardizer {
* List all Standardizer plugins contained in the plugins sub-directory asynchronously
*/
static getPlugins(): Promise<Array<typeof Standardizer>> {
return fs.promises.readdir(path.resolve(__dirname, 'services'))
.then(dirContent => dirContent.map(service => import(path.resolve(__dirname, 'services', service))))
return fs.promises.readdir(path.resolve(__dirname, PLUGINS_DIR))
.then(dirContent => dirContent.map(
service => import(path.resolve(__dirname, PLUGINS_DIR, service))
.then(importedModule => importedModule.default),
))
.then(promiseArr => Promise.all(promiseArr))
}

/**
* List all Standardizer plugins contained in the plugins sub-directory synchronously
*/
static getPluginsSync(): Array<typeof Standardizer> {
return fs.readdirSync(path.resolve(__dirname, 'services')).map(
return fs.readdirSync(path.resolve(__dirname, PLUGINS_DIR)).map(
// eslint-disable-next-line import/no-dynamic-require,global-require
service => require(path.resolve(__dirname, 'services', service)),
service => require(path.resolve(__dirname, PLUGINS_DIR, service)).default,
)
}
}
16 changes: 16 additions & 0 deletions src/classes/Standardizer/plugins/Discord/Discord.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
import Standardizer from '../../Standardizer'
import Services from '../../../../types/Services'

/**
 * Standardizer plugin for the Discord service.
 * Declares no sub-services and no sub-standardizers.
 */
export default class Discord extends Standardizer {
  /**
   * Service this standardizer is bound to.
   */
  get service(): Services {
    return Services.DISCORD
  }

  /**
   * Sub-services of Discord: always an empty list.
   */
  get subServices(): Array<Services> {
    const noSubServices: Array<Services> = []
    return noSubServices
  }

  /**
   * Standardizers for the sub-services: always an empty list.
   */
  get subStandardizers(): Array<Standardizer> {
    const noSubStandardizers: Array<Standardizer> = []
    return noSubStandardizers
  }
}
Empty file.
16 changes: 16 additions & 0 deletions src/classes/Standardizer/plugins/Facebook/Facebook.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
import Standardizer from '../../Standardizer'
import Services from '../../../../types/Services'

/**
 * Standardizer plugin for the Facebook service.
 * Declares no sub-services and no sub-standardizers.
 */
export default class Facebook extends Standardizer {
  /**
   * Service this standardizer is bound to.
   */
  get service(): Services {
    return Services.FACEBOOK
  }

  /**
   * Sub-services of Facebook: always an empty list.
   */
  get subServices(): Array<Services> {
    const noSubServices: Array<Services> = []
    return noSubServices
  }

  /**
   * Standardizers for the sub-services: always an empty list.
   */
  get subStandardizers(): Array<Standardizer> {
    const noSubStandardizers: Array<Standardizer> = []
    return noSubStandardizers
  }
}
Empty file.
16 changes: 16 additions & 0 deletions src/classes/Standardizer/plugins/Google/Google.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
import Standardizer from '../../Standardizer'
import Services from '../../../../types/Services'

/**
 * Standardizer plugin for the Google service.
 * Declares no sub-services and no sub-standardizers.
 */
export default class Google extends Standardizer {
  /**
   * Service this standardizer is bound to.
   */
  get service(): Services {
    return Services.GOOGLE
  }

  /**
   * Sub-services of Google: always an empty list.
   */
  get subServices(): Array<Services> {
    const noSubServices: Array<Services> = []
    return noSubServices
  }

  /**
   * Standardizers for the sub-services: always an empty list.
   */
  get subStandardizers(): Array<Standardizer> {
    const noSubStandardizers: Array<Standardizer> = []
    return noSubStandardizers
  }
}
Empty file.
16 changes: 16 additions & 0 deletions src/classes/Standardizer/plugins/Reddit/Reddit.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
import Standardizer from '../../Standardizer'
import Services from '../../../../types/Services'

/**
 * Standardizer plugin for the Reddit service.
 * Declares no sub-services and no sub-standardizers.
 */
export default class Reddit extends Standardizer {
  /**
   * Service this standardizer is bound to.
   */
  get service(): Services {
    return Services.REDDIT
  }

  /**
   * Sub-services of Reddit: always an empty list.
   */
  get subServices(): Array<Services> {
    const noSubServices: Array<Services> = []
    return noSubServices
  }

  /**
   * Standardizers for the sub-services: always an empty list.
   */
  get subStandardizers(): Array<Standardizer> {
    const noSubStandardizers: Array<Standardizer> = []
    return noSubStandardizers
  }
}
Empty file.
5 changes: 3 additions & 2 deletions src/classes/StandardizerFactory.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,12 @@ import Standardizer from './Standardizer/Standardizer'
export default class StandardizerFactory {
path: string

// @ts-ignore
standardizers: Array<Standardizer> = Standardizer.getPluginsSync().map(standardizer => new standardizer(this.path))
standardizers: Array<Standardizer>

constructor(extractedArchivePath: string) {
this.path = extractedArchivePath
// @ts-ignore
this.standardizers = Standardizer.getPluginsSync().map(standardizer => new standardizer(this.path))
}

getStandardizerFromService(service: Services): Standardizer | undefined {
Expand Down
6 changes: 4 additions & 2 deletions src/modules/Parsing/parseAsHTML.ts
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import { JSDOM } from 'jsdom'
import { ParsingOptions } from '../../types/Parsing'
import Pipeline from '../../classes/Pipeline'

Expand All @@ -6,6 +7,7 @@ export type OptionsParseAsHTML = ParsingOptions
/**
* Parse given Pipeline result stream as HTML format
*/
export default async function parseAsHTML(pipeline: Pipeline, options?: OptionsParseAsHTML): Promise<any> {
return Promise.reject(new Error('Not implemented'))
export default async function parseAsHTML(pipeline: Pipeline, options?: OptionsParseAsHTML): Promise<JSDOM> {
const completeData = await pipeline.toString()
return new JSDOM(completeData)
}
8 changes: 7 additions & 1 deletion src/modules/Parsing/parseFile.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import parseAsCSV from './parseAsCSV'
import parseAsMBOX from './parseAsMBOX'
import parseAsVCARD from './parseAsVCARD'
import parseAsICS from './parseAsICS'
import Pipeline from '../../classes/Pipeline'

/**
* List all file extensions for a file type
Expand Down Expand Up @@ -44,5 +45,10 @@ export type OptionsParseFile = ParsingOptions
* Throws an error if the file can't be accessed or if parsing fails
*/
export default async function parseFile<T = any>(filePath: string, options?: OptionsParseFile): Promise<T> {
return Promise.reject(new Error('Not implemented'))
const extension = filePath.split('.').pop()?.toLowerCase() ?? ''
const result = ParserTypes.find(([extensions]) => extensions.includes(extension))
if (!result) {
return parseAsText(Pipeline.fromFile(filePath), options) as any
}
return result?.[1](Pipeline.fromFile(filePath), options)
}
Loading

0 comments on commit 61ca5f3

Please sign in to comment.