Skip to content

Commit

Permalink
Make routes explicit regexes and fix JS object titles
Browse files Browse the repository at this point in the history
Route strings weren't being converted to regexes, so the alternations (ORs) in the JS route string were never picked up.
  • Loading branch information
rgarcia committed Dec 11, 2011
1 parent 2db21d7 commit 2ddd04e
Show file tree
Hide file tree
Showing 5 changed files with 4,034 additions and 1,530 deletions.
2 changes: 1 addition & 1 deletion scraper/css-mdn/scrape.js
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ requirejs([
// so in addition to not visiting the same url twice, keep this list to prevent visiting the same title twice
var titles = [];

spidey.route('developer.mozilla.org', '/en/CSS/*', function ($, url) {
spidey.route('developer.mozilla.org', /\/en\/CSS\/*/, function ($, url) {
if ( _.indexOf(blacklist,url) !== -1 ) return;
visitLinks($);

Expand Down
2 changes: 1 addition & 1 deletion scraper/dom-mdn/scrape.js
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ requirejs([
// so in addition to not visiting the same url twice, keep this list to prevent visiting the same title twice
var titles = [];

spidey.route('developer.mozilla.org', '\/en\/DOM/*', function ($, url) {
spidey.route('developer.mozilla.org', /\/en\/DOM\/*/, function ($, url) {
if ( _.indexOf(blacklist,url) !== -1 ) return;
visitLinks($);

Expand Down
2 changes: 1 addition & 1 deletion scraper/html-mdn/scrape.js
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ requirejs([
// so in addition to not visiting the same url twice, keep this list to prevent visiting the same title twice
var titles = [];

spidey.route('developer.mozilla.org', '/en/HTML/Element/*', function ($, url) {
spidey.route('developer.mozilla.org', /\/en\/HTML\/Element\/*/, function ($, url) {
if ( _.indexOf(blacklist,url) !== -1 ) return;
visitLinks($);

Expand Down
4 changes: 3 additions & 1 deletion scraper/js-mdn/scrape.js
Original file line number Diff line number Diff line change
Expand Up @@ -40,14 +40,16 @@ requirejs([
// so in addition to not visiting the same url twice, keep this list to prevent visiting the same title twice
var titles = [];

spidey.route('developer.mozilla.org', '(\/en\/JavaScript_typed_arrays|\/en\/JavaScript\/Reference\/(Global_Objects|Statement|Operators))\/*', function ($, url) {
spidey.route('developer.mozilla.org', /(\/en\/JavaScript_typed_arrays|\/en\/JavaScript\/Reference\/(Global_Objects|Statement|Operators))\/*/, function ($, url) {
if ( _.indexOf(blacklist,url) !== -1 ) return;
visitLinks($);

console.log('---------');
console.log('scraping:',url);

var title = $('article .page-title h1').text().trim();
if ( /Global_Objects/.test(url) && url.split('Global_Objects/').length > 1)
title = url.split('Global_Objects/')[1].replace(/\//g, '.');
if ( title === '' || title === null ) {
console.log('ERROR: could not get title, skipping');
return;
Expand Down
Loading

0 comments on commit 2ddd04e

Please sign in to comment.