Adds the ability to crawl concurrently (#834)
The number of concurrent crawls is set via
`apply_filters( 'wp2static_crawl_concurrency', 1 )`.
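
For example, a site could raise the limit by hooking the filter from a small mu-plugin or a theme's functions.php. The filter name comes from this commit; the returned value of 5 is only an illustrative assumption:

```php
<?php
// Hypothetical mu-plugin snippet: raise WP2Static's crawl concurrency
// from the default of 1 to 5 simultaneous requests.
add_filter( 'wp2static_crawl_concurrency', function ( $concurrency ) {
    // $concurrency arrives as the saved crawlConcurrency option value;
    // return a higher number to let the crawler send more requests at once.
    return 5;
} );
```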
palmiak committed Nov 6, 2021
1 parent 4997967, commit bfb48da
Showing 5 changed files with 162 additions and 77 deletions.
CHANGELOG.md: 2 additions & 0 deletions
@@ -14,6 +14,8 @@
- [#829](https://github.com/leonstafford/wp2static/pull/829) Move options labels and definitions out of the db and into code. @john-shaffer
- [#826](https://github.com/leonstafford/wp2static/pull/826) Allow multiple redirects and report on redirects in wp-cli. @bookwyrm, @jhatmaker
- [28fc58e5](https://github.com/leonstafford/wp2static/commit/28fc58e5f7694129e5919530adcd6c57435391fb) Add warning-level log messages. @john-shaffer
- [#834](https://github.com/leonstafford/wp2static/pull/834) Implement concurrent crawling. @palmiak
- Deprecate Crawler::crawlURL.

## WP2Static 7.1.7 (2021-09-04)

phpstan.neon: 1 addition & 1 deletion
@@ -22,7 +22,7 @@ parameters:
count: 6
- message: '#^In method "WP2Static\\\S+::\S+", you should not use the \$_(GET|POST) superglobal#'
path: src/CoreOptions.php
count: 22
count: 23
- message: '#^In method "WP2Static\\\S+::\S+", you should not use the \$_(GET|POST) superglobal#'
path: src/ViewRenderer.php
count: 32
src/CoreOptions.php: 13 additions & 0 deletions
@@ -196,6 +196,12 @@ public static function optionSpecs() : array {
),

// Advanced options
self::makeOptionSpec(
'crawlConcurrency',
'1',
'Crawl Concurrency',
'The maximum number of files that will be crawled at the same time.'
),
self::makeOptionSpec(
'skipURLRewrite',
'0',
@@ -599,6 +605,13 @@ public static function savePosted( string $screen = 'core' ) : void {

break;
case 'advanced':
$crawl_concurrency = intval( $_POST['crawlConcurrency'] );
$wpdb->update(
$table_name,
[ 'value' => $crawl_concurrency < 1 ? 1 : $crawl_concurrency ],
[ 'name' => 'crawlConcurrency' ]
);

$wpdb->update(
$table_name,
[ 'value' => isset( $_POST['skipURLRewrite'] ) ? 1 : 0 ],
src/Crawler.php: 130 additions & 76 deletions
@@ -11,8 +11,10 @@
use WP2StaticGuzzleHttp\Client;
use WP2StaticGuzzleHttp\Psr7\Request;
use WP2StaticGuzzleHttp\Psr7\Response;
use WP2StaticGuzzleHttp\Exception\TooManyRedirectsException;
use Psr\Http\Message\ResponseInterface;
use WP2StaticGuzzleHttp\Exception\RequestException;
use WP2StaticGuzzleHttp\Exception\TooManyRedirectsException;
use WP2StaticGuzzleHttp\Pool;

define( 'WP2STATIC_REDIRECT_CODES', [ 301, 302, 303, 307, 308 ] );

@@ -27,6 +29,16 @@ class Crawler {
*/
private $site_path;

/**
* @var integer
*/
private $crawled = 0;

/**
* @var integer
*/
private $cache_hits = 0;

/**
* Crawler constructor
*/
@@ -77,9 +89,6 @@ public static function wp2staticCrawl( string $static_site_path, string $crawler
* Crawls URLs in WordPressSite, saving them to StaticSite
*/
public function crawlSite( string $static_site_path ) : void {
$crawled = 0;
$cache_hits = 0;

WsLog::l( 'Starting to crawl detected URLs.' );

$site_host = parse_url( $this->site_path, PHP_URL_HOST );
@@ -104,108 +113,153 @@ public function crawlSite( string $static_site_path ) : void {
*/

$crawlable_paths = CrawlQueue::getCrawlablePaths();
$urls = [];

foreach ( $crawlable_paths as $root_relative_path ) {
$absolute_uri = new URL( $this->site_path . $root_relative_path );
$url = $absolute_uri->get();
$urls[] = [
'url' => $absolute_uri->get(),
'path' => $root_relative_path,
];
}

$response = $this->crawlURL( $url );
$headers = [];

if ( ! $response ) {
continue;
}
$auth_user = CoreOptions::getValue( 'basicAuthUser' );

$crawled_contents = (string) $response->getBody();
$status_code = $response->getStatusCode();
if ( $auth_user ) {
$auth_password = CoreOptions::getValue( 'basicAuthPassword' );

if ( $status_code === 404 ) {
WsLog::l( '404 for URL ' . $root_relative_path );
CrawlCache::rmUrl( $root_relative_path );
$crawled_contents = null;
} elseif ( in_array( $status_code, WP2STATIC_REDIRECT_CODES ) ) {
$crawled_contents = null;
if ( $auth_password ) {
$headers['auth'] = [ $auth_user, $auth_password ];
}
}

$redirect_to = null;

if ( in_array( $status_code, WP2STATIC_REDIRECT_CODES ) ) {
$effective_url = $url;

// returns as string
$redirect_history =
$response->getHeaderLine( 'X-Guzzle-Redirect-History' );

if ( $redirect_history ) {
$redirects = explode( ', ', $redirect_history );
$effective_url = end( $redirects );
}

$redirect_to =
(string) str_replace( $site_urls, '', $effective_url );
$page_hash = md5( $status_code . $redirect_to );
} elseif ( ! is_null( $crawled_contents ) ) {
$page_hash = md5( $crawled_contents );
} else {
$page_hash = md5( (string) $status_code );
$requests = function ( $urls ) use ( $headers ) {
foreach ( $urls as $url ) {
yield new Request( 'GET', $url['url'], $headers );
}
};

// TODO: as John mentioned, we're only skipping the saving,
// not crawling here. Let's look at improving that... or speeding
// up with async requests, at least
if ( $use_crawl_cache ) {
// if not already cached
if ( CrawlCache::getUrl( $root_relative_path, $page_hash ) ) {
$cache_hits++;
$concurrency = intval( CoreOptions::getValue( 'crawlConcurrency' ) );
$concurrency = apply_filters( 'wp2static_crawl_concurrency', $concurrency );

continue;
}
}
$pool = new Pool(
$this->client,
$requests( $urls ),
[
'concurrency' => $concurrency,
'fulfilled' => function ( Response $response, $index ) use (
$urls, $use_crawl_cache, $site_urls
) {
$root_relative_path = $urls[ $index ]['path'];
$crawled_contents = (string) $response->getBody();
$status_code = $response->getStatusCode();

if ( $status_code === 404 ) {
WsLog::l( '404 for URL ' . $root_relative_path );
CrawlCache::rmUrl( $root_relative_path );
$crawled_contents = null;
} elseif ( in_array( $status_code, WP2STATIC_REDIRECT_CODES ) ) {
$crawled_contents = null;
}

$redirect_to = null;

if ( in_array( $status_code, WP2STATIC_REDIRECT_CODES ) ) {
$effective_url = $urls[ $index ]['url'];

// returns as string
$redirect_history =
$response->getHeaderLine( 'X-Guzzle-Redirect-History' );

if ( $redirect_history ) {
$redirects = explode( ', ', $redirect_history );
$effective_url = end( $redirects );
}

$redirect_to =
(string) str_replace( $site_urls, '', $effective_url );
$page_hash = md5( $status_code . $redirect_to );
} elseif ( ! is_null( $crawled_contents ) ) {
$page_hash = md5( $crawled_contents );
} else {
$page_hash = md5( (string) $status_code );
}

if ( $use_crawl_cache ) {
// if not already cached
if ( CrawlCache::getUrl( $root_relative_path, $page_hash ) ) {
$this->cache_hits++;
}
}

$this->crawled++;

if ( $crawled_contents ) {
// do some magic here - naive: if URL ends in /, save to /index.html
// TODO: will need love for example, XML files
// check content type, serve .xml/rss, etc instead
if ( mb_substr( $root_relative_path, -1 ) === '/' ) {
StaticSite::add(
$root_relative_path . 'index.html',
$crawled_contents
);
} else {
StaticSite::add( $root_relative_path, $crawled_contents );
}
}

CrawlCache::addUrl(
$root_relative_path,
$page_hash,
$status_code,
$redirect_to
);

// incrementally log crawl progress
if ( $this->crawled % 300 === 0 ) {
$notice = "Crawling progress: $this->crawled crawled," .
" $this->cache_hits skipped (cached).";
WsLog::l( $notice );
}
},
'rejected' => function ( RequestException $reason, $index ) use ( $urls ) {
$root_relative_path = $urls[ $index ]['path'];
WsLog::l( 'Failed ' . $root_relative_path );
},
]
);

$crawled++;

if ( $crawled_contents ) {
// do some magic here - naive: if URL ends in /, save to /index.html
// TODO: will need love for example, XML files
// check content type, serve .xml/rss, etc instead
if ( mb_substr( $root_relative_path, -1 ) === '/' ) {
StaticSite::add( $root_relative_path . 'index.html', $crawled_contents );
} else {
StaticSite::add( $root_relative_path, $crawled_contents );
}
}
// Initiate the transfers and create a promise
$promise = $pool->promise();

CrawlCache::addUrl(
$root_relative_path,
$page_hash,
$status_code,
$redirect_to
);

// incrementally log crawl progress
if ( $crawled % 300 === 0 ) {
$notice = "Crawling progress: $crawled crawled, $cache_hits skipped (cached).";
WsLog::l( $notice );
}
}
// Force the pool of requests to complete.
$promise->wait();

WsLog::l(
"Crawling complete. $crawled crawled, $cache_hits skipped (cached)."
"Crawling complete. $this->crawled crawled, $this->cache_hits skipped (cached)."
);

$args = [
'staticSitePath' => $static_site_path,
'crawled' => $crawled,
'cache_hits' => $cache_hits,
'crawled' => $this->crawled,
'cache_hits' => $this->cache_hits,
];

do_action( 'wp2static_crawling_complete', $args );
}

/**
* @deprecated
*
* Crawls a string of full URL within WordPressSite
*
* @return ResponseInterface|null response object
*/
public function crawlURL( string $url ) : ?ResponseInterface {
WsLog::w( 'WP2Static Crawler::crawlURL is deprecated.' );

$headers = [];
$response = null;

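The concurrency itself is handled by Guzzle's request pool. For readers unfamiliar with that pattern, here is a minimal standalone sketch of how a pool drives a set of GET requests. It is not WP2Static code: the URLs and concurrency value are illustrative, and it uses the plain GuzzleHttp namespaces rather than the plugin's scoped WP2StaticGuzzleHttp prefix.

```php
<?php
// Minimal sketch of the GuzzleHttp\Pool pattern used by the new crawlSite().
require 'vendor/autoload.php';

use GuzzleHttp\Client;
use GuzzleHttp\Exception\RequestException;
use GuzzleHttp\Pool;
use GuzzleHttp\Psr7\Request;
use GuzzleHttp\Psr7\Response;

$client = new Client();
$urls   = [ 'https://example.com/', 'https://example.com/about/' ];

// Generator that lazily yields one request per URL, so the pool never
// needs to build the whole request list up front.
$requests = function ( array $urls ) {
    foreach ( $urls as $url ) {
        yield new Request( 'GET', $url );
    }
};

$pool = new Pool(
    $client,
    $requests( $urls ),
    [
        'concurrency' => 5, // how many requests may be in flight at once
        'fulfilled'   => function ( Response $response, $index ) use ( $urls ) {
            echo $urls[ $index ] . ' => ' . $response->getStatusCode() . PHP_EOL;
        },
        'rejected'    => function ( RequestException $reason, $index ) use ( $urls ) {
            echo 'Failed ' . $urls[ $index ] . PHP_EOL;
        },
    ]
);

// Start the transfers and block until every request has settled.
$pool->promise()->wait();
```

Because the promise only resolves after every fulfilled or rejected callback has run, the crawler can keep its progress counters as instance properties and log the totals once the pool completes.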
views/advanced-options-page.php: 16 additions & 0 deletions
@@ -19,6 +19,22 @@

<table class="widefat striped">
<tbody>
<tr>
<td style="width:50%;">
<label
for="<?php echo $view['coreOptions']['crawlConcurrency']->name; ?>"
><b><?php echo $view['coreOptions']['crawlConcurrency']->label; ?></b></label>
<br/><?php echo $view['coreOptions']['crawlConcurrency']->description; ?>
</td>
<td>
<input
id="<?php echo $view['coreOptions']['crawlConcurrency']->name; ?>"
name="<?php echo $view['coreOptions']['crawlConcurrency']->name; ?>"
value="<?php echo (int) $view['coreOptions']['crawlConcurrency']->value; ?>"
type="number"
/>
</td>
</tr>
<tr>
<td style="width:50%;">
<label
