Permalink
Browse files

making routes explicitly regexes, and fixing js object titles

Route strings weren't being converted to regexes, so the alternations (`|`) in the JS route string were never matched.
  • Loading branch information...
1 parent 2db21d7 commit 2ddd04ee1fe9b02e480f9bb2ed2eb0f301e1205d @rgarcia rgarcia committed Dec 11, 2011
Showing with 4,034 additions and 1,530 deletions.
  1. +1 −1 scraper/css-mdn/scrape.js
  2. +1 −1 scraper/dom-mdn/scrape.js
  3. +1 −1 scraper/html-mdn/scrape.js
  4. +3 −1 scraper/js-mdn/scrape.js
  5. +4,028 −1,526 static/data/js-mdn.json
@@ -41,7 +41,7 @@ requirejs([
// so in addition to not visiting the same url twice, keep this list to prevent visiting the same title twice
var titles = [];
- spidey.route('developer.mozilla.org', '/en/CSS/*', function ($, url) {
+ spidey.route('developer.mozilla.org', /\/en\/CSS\/*/, function ($, url) {
if ( _.indexOf(blacklist,url) !== -1 ) return;
visitLinks($);
@@ -40,7 +40,7 @@ requirejs([
// so in addition to not visiting the same url twice, keep this list to prevent visiting the same title twice
var titles = [];
- spidey.route('developer.mozilla.org', '\/en\/DOM/*', function ($, url) {
+ spidey.route('developer.mozilla.org', /\/en\/DOM\/*/, function ($, url) {
if ( _.indexOf(blacklist,url) !== -1 ) return;
visitLinks($);
@@ -40,7 +40,7 @@ requirejs([
// so in addition to not visiting the same url twice, keep this list to prevent visiting the same title twice
var titles = [];
- spidey.route('developer.mozilla.org', '/en/HTML/Element/*', function ($, url) {
+ spidey.route('developer.mozilla.org', /\/en\/HTML\/Element\/*/, function ($, url) {
if ( _.indexOf(blacklist,url) !== -1 ) return;
visitLinks($);
View
@@ -40,14 +40,16 @@ requirejs([
// so in addition to not visiting the same url twice, keep this list to prevent visiting the same title twice
var titles = [];
- spidey.route('developer.mozilla.org', '(\/en\/JavaScript_typed_arrays|\/en\/JavaScript\/Reference\/(Global_Objects|Statement|Operators))\/*', function ($, url) {
+ spidey.route('developer.mozilla.org', /(\/en\/JavaScript_typed_arrays|\/en\/JavaScript\/Reference\/(Global_Objects|Statement|Operators))\/*/, function ($, url) {
if ( _.indexOf(blacklist,url) !== -1 ) return;
visitLinks($);
console.log('---------');
console.log('scraping:',url);
var title = $('article .page-title h1').text().trim();
+ if ( /Global_Objects/.test(url) && url.split('Global_Objects/').length > 1)
+ title = url.split('Global_Objects/')[1].replace(/\//g, '.');
if ( title === '' || title === null ) {
console.log('ERROR: could not get title, skipping');
return;
Oops, something went wrong.

0 comments on commit 2ddd04e

Please sign in to comment.