Skip to content

Commit

Permalink
Add CDS script
Browse files Browse the repository at this point in the history
  • Loading branch information
JackNeto committed Dec 5, 2019
1 parent 9239297 commit 225ad11
Show file tree
Hide file tree
Showing 7 changed files with 410 additions and 114 deletions.
63 changes: 63 additions & 0 deletions bin/csd-scraper.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
/* eslint-disable array-callback-return */
/* eslint-disable prefer-destructuring */
/* eslint-disable no-plusplus */
/* eslint-disable prefer-arrow-callback */
/* eslint-disable func-names */
/* eslint-disable prefer-template */
const fs = require('fs')
// User-agent string handed to CasperJS through pageSettings (identifies as Firefox 23 on macOS).
const FIREFOX_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:23.0) Gecko/20130404 Firefox/23.0'

// Options for the single CasperJS instance that every step below is queued on.
const casperOptions = {
  logLevel: 'debug',
  verbose: false,
  pageSettings: {
    userAgent: FIREFOX_USER_AGENT
  }
}

const casper = require('casper').create(casperOptions)

// https://www.methodsandtools.com/tools/casperjs.php
// Address of the listing view; the query string selects the "All-2018" category
// and asks for up to 3000 entries starting at the first one.
const baseUrl = 'https://services.cds.ca/applications/taxforms/taxforms.nsf'
const url = baseUrl.concat('/PROCESSED-EN-')
const params = 'OpenView&Start=1&Count=3000&RestrictToCategory=All-2018'

// First step: click the "I accept" link on the page that opens.
function acceptTerms() {
  this.echo('Clicking I accept link')
  this.clickLabel('I accept', 'a')
}

casper.start(url.concat('?', params), acceptTerms)

// Second step: read every row of the #taxlist table, save the extracted
// records to downloads/data.js as JSON, then download each linked document
// into downloads/.
casper.then(function () {
  const tableId = '#taxlist'
  this.echo('Loading table data ...')
  if (this.exists(tableId)) {
    this.echo('Found ' + tableId + ' table', 'INFO')
  } else {
    this.echo(tableId + ' not found', 'ERROR')
  }

  // The callback below runs inside the page, so it only sees the DOM and
  // must return plain serialisable values.
  const data = this.evaluate(function () {
    const tableRows = document.querySelectorAll('#taxlist > tbody > tr')
    const records = []
    Array.prototype.forEach.call(tableRows, function (row) {
      const dateCell = row.querySelector('td:nth-child(1) span.Date')
      const typeCell = row.querySelector('td:nth-child(5) span.Type')
      const linkCell = row.querySelector('td:nth-child(6) a')
      // Skip rows missing any expected cell; dereferencing a null cell
      // would throw and discard the records of every other row as well.
      if (dateCell && typeCell && linkCell) {
        records.push({
          date: dateCell.textContent,
          docType: typeCell.textContent,
          docUrl: linkCell.href
        })
      }
    })
    return records
  })

  // evaluate() may hand back a non-array (e.g. null) when the in-page code
  // fails; stop here instead of writing "null" to disk and crashing below.
  if (!Array.isArray(data)) {
    this.echo('Could not read rows from ' + tableId, 'ERROR')
    return
  }

  fs.write('downloads/data.js', JSON.stringify(data), 'w')

  data.forEach(function (entry, index) {
    console.log(index + 1 + '/' + data.length)
    const docUrl = entry.docUrl
    // The file name is everything after the last "/" of the document URL.
    const filename = docUrl.substring(docUrl.lastIndexOf('/') + 1)
    casper.download(docUrl, 'downloads/' + filename)
  })
})

// Print the URL of each resource request reported through the onResourceRequested option.
function logRequestedResource(_casper, requestData) {
  console.log('[LOAD]', requestData.url)
}

casper.options.onResourceRequested = logRequestedResource

// Completion callback passed to casper.run: report that the run is done and quit.
function finish() {
  this.echo('Done')
  this.exit()
}

casper.run(finish)
146 changes: 146 additions & 0 deletions bin/xsl-scraper.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,146 @@
/* eslint-disable no-plusplus */
/* eslint-disable camelcase */
const xlsx = require('xlsx')
const fs = require('fs')
const path = require('path')

// Folder holding the spreadsheets to parse: "downloads" under the current working directory.
const DOCUMENT_ROOT = path.resolve(process.cwd(), 'downloads')

// Set this to null to parse every .xls document found in DOCUMENT_ROOT
// Set this to a filename (inside DOCUMENT_ROOT) to parse only that document
const PARSE_SINGLE_DOCUMENT = null
// const PARSE_SINGLE_DOCUMENT = 'CDSP-AWES3N_T5013_R15_TY2018_2018_02_28_15_31_24.xls'
// Only documents of this type are parsed; parseXlsDocument compares it against 'T5013', 'T5' and 'T3'.
const PARSE_ONLY_DOCUMENT_TYPE = 'T3'

/**
 * Extract the header fields and the per-distribution figures from a T3 worksheet.
 *
 * Distributions are read one column at a time starting at column D, for at
 * most 14 columns (D..Q). Reading stops at the first column that has no cell
 * in row 19.
 *
 * @param {Object} ws - Worksheet as a map from cell address (e.g. "D19") to a
 *   cell object; each cell's value is read from its `v` property.
 * @returns {Object} `{ year, name, symbol, website, calculationMethod, distributions }`.
 *   Any field whose cell is missing is `undefined`, so callers can validate the
 *   result instead of hitting a TypeError part-way through a document.
 */
// eslint-disable-next-line camelcase
const parse_T3 = (ws) => {
  console.log('parse_T3')

  // Value of the cell at `address`, or undefined when the sheet has no such cell.
  const valueAt = (address) => (ws[address] ? ws[address].v : undefined)

  const distributionColumns = Array.from({ length: 14 }, (_, i) => String.fromCharCode('D'.charCodeAt(0) + i))

  // Row 19 marks a populated distribution column; keep only the leading run of them.
  const firstEmpty = distributionColumns.findIndex((column) => !ws[`${column}19`])
  const usedColumns = firstEmpty === -1 ? distributionColumns : distributionColumns.slice(0, firstEmpty)

  const distributions = usedColumns.map((column) => ({
    totalAmountPerUnit: valueAt(`${column}19`),
    paymentDate: valueAt(`${column}21`),
    capitalGains: valueAt(`${column}25`),
    amountOfEligibleDividends: valueAt(`${column}26`),
    amountOfNonEligibleDividends: valueAt(`${column}27`),
    foreignBusinessIncome: valueAt(`${column}28`),
    foreignNonBusinessIncome: valueAt(`${column}29`),
    otherInvestmentIncome: valueAt(`${column}30`),
    returnOfCapital: valueAt(`${column}32`)
  }))

  // G15 holds 1 for 'RATE'; any other value means 'PER CENT'. Without the cell
  // the method is unknown, so it is left undefined rather than guessed.
  let calculationMethod
  if (ws.G15) {
    calculationMethod = ws.G15.v === 1 ? 'RATE' : 'PER CENT'
  }

  return {
    year: valueAt('A2'),
    name: valueAt('C5'),
    symbol: valueAt('M5'),
    website: valueAt('M6'),
    calculationMethod,
    distributions
  }
}

/**
 * Extract the header fields and per-distribution figures from a T5013 worksheet.
 *
 * Distribution columns start at E and at most 14 of them are read; reading
 * stops at the first column without a cell in row 21.
 *
 * @param {Object} ws - Worksheet as a map from cell address to a cell object
 *   whose value is read from `v`.
 * @returns {Object} `{ year, name, website, calculationMethod, distributions }`.
 * @throws {TypeError} When cell A2, C7 or P5 is missing.
 */
// eslint-disable-next-line camelcase
const parse_T5013_R5 = (ws) => {
  console.log('parse_T5013_R5')
  const firstColumnCode = 'E'.charCodeAt(0)
  const distributions = []

  for (let offset = 0; offset < 14; offset += 1) {
    const letter = String.fromCharCode(firstColumnCode + offset)
    const dateCell = ws[`${letter}21`]
    // A missing payment-date cell marks the end of the populated columns.
    if (!dateCell) {
      break
    }
    const capitalCell = ws[`${letter}42`]
    distributions.push({
      paymentDate: dateCell.v,
      returnOfCapital: capitalCell ? capitalCell.v : undefined
    })
  }

  const websiteCell = ws.M13
  return {
    year: ws.A2.v,
    name: ws.C7.v,
    website: websiteCell ? websiteCell.v : undefined,
    // P5 holds 1 for the year-end position method; anything else is treated as distribution.
    calculationMethod: ws.P5.v === 1 ? 'YEAR-END POSITION' : 'DISTRIBUTION',
    distributions
  }
}

/**
 * Placeholder parser for T5 worksheets: the worksheet is not read yet and a
 * fixed result is returned.
 *
 * @param {Object} ws - Worksheet (currently unused).
 * @returns {Object} Always `{ year: 1 }`.
 */
// eslint-disable-next-line camelcase
const parse_T5 = (ws) => {
  console.log('parse_T5')
  const placeholder = { year: 1 }
  return placeholder
}

/**
 * Extract only the header fields of a T3 worksheet (no distributions).
 *
 * @param {Object} ws - Worksheet as a map from cell address to a cell object
 *   whose value is read from `v`.
 * @returns {Object} `{ year, name, symbol, website, calculationMethod }`;
 *   `website` is undefined when cell M6 is missing.
 * @throws {TypeError} When cell A2, C5, M5 or G15 is missing.
 */
// eslint-disable-next-line camelcase
const parse_T3_French = (ws) => {
  console.log('parse_T3_French')
  const { A2, C5, M5, M6, G15 } = ws
  const year = A2.v
  const name = C5.v
  const symbol = M5.v
  const website = M6 ? M6.v : undefined
  // G15 holds 1 for 'RATE'; any other value means 'PER CENT'.
  const calculationMethod = G15.v === 1 ? 'RATE' : 'PER CENT'
  return { year, name, symbol, website, calculationMethod }
}

/**
 * Read one spreadsheet, pick a parser from the text in cell A1 of its first
 * sheet, and print the parsed data.
 *
 * Documents whose type differs from PARSE_ONLY_DOCUMENT_TYPE are skipped
 * silently. The process is terminated with exit status 1 when the A1 header is
 * not a known document type (including a workbook with no first sheet or no A1
 * cell) or when the parsed data has no `name`.
 *
 * @param {string} folder - Directory containing the document.
 * @param {string} filename - Name of the .xls file inside `folder`.
 * @returns {undefined}
 */
const parseXlsDocument = (folder, filename) => {
  const wb = xlsx.readFile(path.join(folder, filename))
  const ws = wb.Sheets[wb.SheetNames[0]]

  // A1 header text -> document type and the parser that handles it.
  // The French trust statement is routed to the same parser as the English one.
  const parsersByHeader = new Map([
    ['T5013/R15 Form', { type: 'T5013', parse: parse_T5013_R5 }],
    ['Statement of Split Shares Income Allocations and Designations', { type: 'T5', parse: parse_T5 }],
    ['Statement of Trust Income Allocations and Designations', { type: 'T3', parse: parse_T3 }],
    ['ETAT DES REVENUS DE FIDUCIE (REPARTITION ET ALLOCATION)', { type: 'T3', parse: parse_T3 }]
  ])

  // Guard the lookups so an empty workbook lands in the unknown-type branch
  // instead of throwing a TypeError.
  const headerCell = ws ? ws.A1 : undefined
  const entry = parsersByHeader.get(headerCell ? headerCell.v : undefined)

  if (!entry) {
    console.log('Unknown xsl document type')
    console.log(headerCell)
    // Non-zero status so shells and CI see the failure.
    process.exit(1)
    return
  }

  // NOTE(review): a falsy PARSE_ONLY_DOCUMENT_TYPE matches no type, so nothing
  // is parsed; confirm whether null was meant to select every type.
  if (entry.type !== PARSE_ONLY_DOCUMENT_TYPE) {
    return
  }

  console.log('Parsing: ', filename)
  const data = entry.parse(ws)
  console.log(data)
  if (!data.name) {
    console.log('ERROR: unexpected data')
    process.exit(1)
  }
}

// True for file names whose extension is exactly ".xls".
const isXlsFile = (filename) => path.extname(filename) === '.xls'

// readdir callback: rethrow a listing error, otherwise parse each .xls entry in turn.
const parseAllDocuments = (error, entries) => {
  if (error) {
    throw error
  }
  entries.filter(isXlsFile).forEach((filename) => {
    parseXlsDocument(DOCUMENT_ROOT, filename)
  })
}

// Entry point: parse every spreadsheet in DOCUMENT_ROOT, or only the one
// named by PARSE_SINGLE_DOCUMENT when it is set.
if (!PARSE_SINGLE_DOCUMENT) {
  fs.readdir(DOCUMENT_ROOT, parseAllDocuments)
} else {
  parseXlsDocument(DOCUMENT_ROOT, PARSE_SINGLE_DOCUMENT)
}

0 comments on commit 225ad11

Please sign in to comment.