Skip to content

Commit

Permalink
Adds the possibility to crawl concurrently
Browse files Browse the repository at this point in the history
Number of concurrent crawls is set via `apply_filters( 'wp2static_concurrent_crawl_rate', 1 )`
  • Loading branch information
palmiak authored and john-shaffer committed Nov 6, 2021
1 parent 4997967 commit 1d5ffe1
Show file tree
Hide file tree
Showing 2 changed files with 128 additions and 77 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
- [#829](https://github.com/leonstafford/wp2static/pull/829) Move options labels and definitions out of the db and into code. @john-shaffer
- [#826](https://github.com/leonstafford/wp2static/pull/826) Allow multiple redirects and report on redirects in wp-cli. @bookwyrm, @jhatmaker
- [28fc58e5](https://github.com/leonstafford/wp2static/commit/28fc58e5f7694129e5919530adcd6c57435391fb) Add warning-level log messages. @john-shaffer
- [#834](https://github.com/leonstafford/wp2static/pull/834) Implement concurrent crawling. @palmiak

## WP2Static 7.1.7 (2021-09-04)

Expand Down
204 changes: 127 additions & 77 deletions src/Crawler.php
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,10 @@
use WP2StaticGuzzleHttp\Client;
use WP2StaticGuzzleHttp\Psr7\Request;
use WP2StaticGuzzleHttp\Psr7\Response;
use WP2StaticGuzzleHttp\Exception\TooManyRedirectsException;
use Psr\Http\Message\ResponseInterface;
use WP2StaticGuzzleHttp\Exception\RequestException;
use WP2StaticGuzzleHttp\Exception\TooManyRedirectsException;
use WP2StaticGuzzleHttp\Pool;

// HTTP status codes the crawler treats as redirects: for these responses the
// body is discarded and the redirect target is recorded in CrawlCache instead.
define( 'WP2STATIC_REDIRECT_CODES', [ 301, 302, 303, 307, 308 ] );

Expand All @@ -27,6 +29,16 @@ class Crawler {
*/
private $site_path;

/**
 * Count of responses processed by the `fulfilled` callback in crawlSite().
 *
 * Held on the instance rather than in a local variable so the Pool callback
 * can increment it through $this. Used for the progress log emitted every
 * 300 responses and passed to the `wp2static_crawling_complete` action.
 *
 * @var integer
 */
private $crawled = 0;

/**
 * Count of responses for which CrawlCache::getUrl() returned a truthy value
 * for the path and computed page hash (only checked when the crawl cache is
 * enabled). Reported in the logs as "skipped (cached)".
 *
 * NOTE(review): the `fulfilled` callback does not return early after
 * incrementing this counter, so a cache hit still appears to be saved and
 * re-added to the cache — confirm whether "skipped" is still accurate.
 *
 * @var integer
 */
private $cache_hits = 0;

/**
* Crawler constructor
*/
Expand Down Expand Up @@ -77,9 +89,6 @@ public static function wp2staticCrawl( string $static_site_path, string $crawler
* Crawls URLs in WordPressSite, saving them to StaticSite
*/
public function crawlSite( string $static_site_path ) : void {
$crawled = 0;
$cache_hits = 0;

WsLog::l( 'Starting to crawl detected URLs.' );

$site_host = parse_url( $this->site_path, PHP_URL_HOST );
Expand All @@ -104,97 +113,138 @@ public function crawlSite( string $static_site_path ) : void {
*/

$crawlable_paths = CrawlQueue::getCrawlablePaths();
$urls = [];

foreach ( $crawlable_paths as $root_relative_path ) {
$absolute_uri = new URL( $this->site_path . $root_relative_path );
$url = $absolute_uri->get();
$urls[] = [
'url' => $absolute_uri->get(),
'path' => $root_relative_path,
];
}

$response = $this->crawlURL( $url );
$headers = [];

if ( ! $response ) {
continue;
}
$auth_user = CoreOptions::getValue( 'basicAuthUser' );

$crawled_contents = (string) $response->getBody();
$status_code = $response->getStatusCode();
if ( $auth_user ) {
$auth_password = CoreOptions::getValue( 'basicAuthPassword' );

if ( $status_code === 404 ) {
WsLog::l( '404 for URL ' . $root_relative_path );
CrawlCache::rmUrl( $root_relative_path );
$crawled_contents = null;
} elseif ( in_array( $status_code, WP2STATIC_REDIRECT_CODES ) ) {
$crawled_contents = null;
if ( $auth_password ) {
$headers['auth'] = [ $auth_user, $auth_password ];
}
}

$redirect_to = null;

if ( in_array( $status_code, WP2STATIC_REDIRECT_CODES ) ) {
$effective_url = $url;

// returns as string
$redirect_history =
$response->getHeaderLine( 'X-Guzzle-Redirect-History' );

if ( $redirect_history ) {
$redirects = explode( ', ', $redirect_history );
$effective_url = end( $redirects );
}

$redirect_to =
(string) str_replace( $site_urls, '', $effective_url );
$page_hash = md5( $status_code . $redirect_to );
} elseif ( ! is_null( $crawled_contents ) ) {
$page_hash = md5( $crawled_contents );
} else {
$page_hash = md5( (string) $status_code );
$requests = function ( $urls ) use ( $headers ) {
foreach ( $urls as $url ) {
yield new Request( 'GET', $url['url'], $headers );
}
};

// TODO: as John mentioned, we're only skipping the saving,
// not crawling here. Let's look at improving that... or speeding
// up with async requests, at least
if ( $use_crawl_cache ) {
// if not already cached
if ( CrawlCache::getUrl( $root_relative_path, $page_hash ) ) {
$cache_hits++;

continue;
}
}
$pool = new Pool(
$this->client,
$requests( $urls ),
[
'concurrency' => apply_filters( 'wp2static_concurrent_crawl_rate', 1 ),
'fulfilled' => function ( Response $response, $index ) use (
$urls, $use_crawl_cache, $site_urls
) {
$root_relative_path = $urls[ $index ]['path'];
$crawled_contents = (string) $response->getBody();
$status_code = $response->getStatusCode();

if ( $status_code === 404 ) {
WsLog::l( '404 for URL ' . $root_relative_path );
CrawlCache::rmUrl( $root_relative_path );
$crawled_contents = null;
} elseif ( in_array( $status_code, WP2STATIC_REDIRECT_CODES ) ) {
$crawled_contents = null;
}

$redirect_to = null;

if ( in_array( $status_code, WP2STATIC_REDIRECT_CODES ) ) {
$effective_url = $urls[ $index ]['url'];

// returns as string
$redirect_history =
$response->getHeaderLine( 'X-Guzzle-Redirect-History' );

if ( $redirect_history ) {
$redirects = explode( ', ', $redirect_history );
$effective_url = end( $redirects );
}

$redirect_to =
(string) str_replace( $site_urls, '', $effective_url );
$page_hash = md5( $status_code . $redirect_to );
} elseif ( ! is_null( $crawled_contents ) ) {
$page_hash = md5( $crawled_contents );
} else {
$page_hash = md5( (string) $status_code );
}

// TODO: as John mentioned, we're only skipping the saving,
// not crawling here. Let's look at improving that... or speeding
// up with async requests, at least
if ( $use_crawl_cache ) {
// if not already cached
if ( CrawlCache::getUrl( $root_relative_path, $page_hash ) ) {
$this->cache_hits++;
}
}

$this->crawled++;

if ( $crawled_contents ) {
// do some magic here - naive: if URL ends in /, save to /index.html
// TODO: will need love for example, XML files
// check content type, serve .xml/rss, etc instead
if ( mb_substr( $root_relative_path, -1 ) === '/' ) {
StaticSite::add(
$root_relative_path . 'index.html',
$crawled_contents
);
} else {
StaticSite::add( $root_relative_path, $crawled_contents );
}
}

CrawlCache::addUrl(
$root_relative_path,
$page_hash,
$status_code,
$redirect_to
);

// incrementally log crawl progress
if ( $this->crawled % 300 === 0 ) {
$notice = "Crawling progress: $this->crawled crawled," .
" $this->cache_hits skipped (cached).";
WsLog::l( $notice );
}
},
'rejected' => function ( RequestException $reason, $index ) use ( $urls ) {
$root_relative_path = $urls[ $index ]['path'];
WsLog::l( 'Failed ' . $root_relative_path );
},
]
);

$crawled++;

if ( $crawled_contents ) {
// do some magic here - naive: if URL ends in /, save to /index.html
// TODO: will need love for example, XML files
// check content type, serve .xml/rss, etc instead
if ( mb_substr( $root_relative_path, -1 ) === '/' ) {
StaticSite::add( $root_relative_path . 'index.html', $crawled_contents );
} else {
StaticSite::add( $root_relative_path, $crawled_contents );
}
}
// Initiate the transfers and create a promise
$promise = $pool->promise();

CrawlCache::addUrl(
$root_relative_path,
$page_hash,
$status_code,
$redirect_to
);

// incrementally log crawl progress
if ( $crawled % 300 === 0 ) {
$notice = "Crawling progress: $crawled crawled, $cache_hits skipped (cached).";
WsLog::l( $notice );
}
}
// Force the pool of requests to complete.
$promise->wait();

WsLog::l(
"Crawling complete. $crawled crawled, $cache_hits skipped (cached)."
"Crawling complete. $this->crawled crawled, $this->cache_hits skipped (cached)."
);

$args = [
'staticSitePath' => $static_site_path,
'crawled' => $crawled,
'cache_hits' => $cache_hits,
'crawled' => $this->crawled,
'cache_hits' => $this->cache_hits,
];

do_action( 'wp2static_crawling_complete', $args );
Expand Down

0 comments on commit 1d5ffe1

Please sign in to comment.