Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Concurrently crawling #834

Merged
merged 1 commit into from
Nov 6, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@
- [#829](https://github.com/leonstafford/wp2static/pull/829) Move options labels and definitions out of the db and into code. @john-shaffer
- [#826](https://github.com/leonstafford/wp2static/pull/826) Allow multiple redirects and report on redirects in wp-cli. @bookwyrm, @jhatmaker
- [28fc58e5](https://github.com/leonstafford/wp2static/commit/28fc58e5f7694129e5919530adcd6c57435391fb) Add warning-level log messages. @john-shaffer
- [#834](https://github.com/leonstafford/wp2static/pull/834) Implement concurrent crawling. @palmiak
- Deprecate `Crawler::crawlURL`; `Crawler::crawlSite` now issues its requests through a concurrent request pool instead.

## WP2Static 7.1.7 (2021-09-04)

Expand Down
2 changes: 1 addition & 1 deletion phpstan.neon
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ parameters:
count: 6
- message: '#^In method "WP2Static\\\S+::\S+", you should not use the \$_(GET|POST) superglobal#'
path: src/CoreOptions.php
count: 22
count: 23
- message: '#^In method "WP2Static\\\S+::\S+", you should not use the \$_(GET|POST) superglobal#'
path: src/ViewRenderer.php
count: 32
Expand Down
13 changes: 13 additions & 0 deletions src/CoreOptions.php
Original file line number Diff line number Diff line change
Expand Up @@ -196,6 +196,12 @@ public static function optionSpecs() : array {
),

// Advanced options
self::makeOptionSpec(
'crawlConcurrency',
'1',
'Crawl Concurrency',
'The maximum number of files that will be crawled at the same time.'
),
self::makeOptionSpec(
'skipURLRewrite',
'0',
Expand Down Expand Up @@ -599,6 +605,13 @@ public static function savePosted( string $screen = 'core' ) : void {

break;
case 'advanced':
// The field can be absent from the request, so fall back to 1 instead of
// reading an undefined index. Values below 1 are raised to 1 so the stored
// setting always allows at least one request at a time.
$crawl_concurrency = max( 1, intval( $_POST['crawlConcurrency'] ?? 1 ) );
$wpdb->update(
    $table_name,
    [ 'value' => $crawl_concurrency ],
    [ 'name' => 'crawlConcurrency' ]
);

$wpdb->update(
$table_name,
[ 'value' => isset( $_POST['skipURLRewrite'] ) ? 1 : 0 ],
Expand Down
206 changes: 130 additions & 76 deletions src/Crawler.php
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,10 @@
use WP2StaticGuzzleHttp\Client;
use WP2StaticGuzzleHttp\Psr7\Request;
use WP2StaticGuzzleHttp\Psr7\Response;
use WP2StaticGuzzleHttp\Exception\TooManyRedirectsException;
use Psr\Http\Message\ResponseInterface;
use WP2StaticGuzzleHttp\Exception\RequestException;
use WP2StaticGuzzleHttp\Exception\TooManyRedirectsException;
use WP2StaticGuzzleHttp\Pool;

define( 'WP2STATIC_REDIRECT_CODES', [ 301, 302, 303, 307, 308 ] );

Expand All @@ -27,6 +29,16 @@ class Crawler {
*/
private $site_path;

/**
 * Running count of URLs handled by the crawl's success callback.
 *
 * Incremented once per fulfilled response and reported in the periodic
 * progress log, the completion log and the `wp2static_crawling_complete`
 * action arguments.
 *
 * @var int
 */
private $crawled = 0;

/**
 * Running count of responses for which CrawlCache::getUrl() reported a
 * match for the path and page hash.
 *
 * Only incremented when the crawl cache is in use; reported alongside
 * $crawled as "skipped (cached)".
 *
 * @var int
 */
private $cache_hits = 0;

/**
* Crawler constructor
*/
Expand Down Expand Up @@ -77,9 +89,6 @@ public static function wp2staticCrawl( string $static_site_path, string $crawler
* Crawls URLs in WordPressSite, saving them to StaticSite
*/
public function crawlSite( string $static_site_path ) : void {
$crawled = 0;
$cache_hits = 0;

WsLog::l( 'Starting to crawl detected URLs.' );

$site_host = parse_url( $this->site_path, PHP_URL_HOST );
Expand All @@ -104,108 +113,153 @@ public function crawlSite( string $static_site_path ) : void {
*/

$crawlable_paths = CrawlQueue::getCrawlablePaths();
$urls = [];

foreach ( $crawlable_paths as $root_relative_path ) {
$absolute_uri = new URL( $this->site_path . $root_relative_path );
$url = $absolute_uri->get();
$urls[] = [
'url' => $absolute_uri->get(),
'path' => $root_relative_path,
];
}

$response = $this->crawlURL( $url );
$headers = [];

if ( ! $response ) {
continue;
}
$auth_user = CoreOptions::getValue( 'basicAuthUser' );

$crawled_contents = (string) $response->getBody();
$status_code = $response->getStatusCode();
if ( $auth_user ) {
$auth_password = CoreOptions::getValue( 'basicAuthPassword' );

if ( $status_code === 404 ) {
WsLog::l( '404 for URL ' . $root_relative_path );
CrawlCache::rmUrl( $root_relative_path );
$crawled_contents = null;
} elseif ( in_array( $status_code, WP2STATIC_REDIRECT_CODES ) ) {
$crawled_contents = null;
if ( $auth_password ) {
$headers['auth'] = [ $auth_user, $auth_password ];
}
}

$redirect_to = null;

if ( in_array( $status_code, WP2STATIC_REDIRECT_CODES ) ) {
$effective_url = $url;

// returns as string
$redirect_history =
$response->getHeaderLine( 'X-Guzzle-Redirect-History' );

if ( $redirect_history ) {
$redirects = explode( ', ', $redirect_history );
$effective_url = end( $redirects );
}

$redirect_to =
(string) str_replace( $site_urls, '', $effective_url );
$page_hash = md5( $status_code . $redirect_to );
} elseif ( ! is_null( $crawled_contents ) ) {
$page_hash = md5( $crawled_contents );
} else {
$page_hash = md5( (string) $status_code );
$requests = function ( $urls ) use ( $headers ) {
foreach ( $urls as $url ) {
yield new Request( 'GET', $url['url'], $headers );
}
};

// TODO: as John mentioned, we're only skipping the saving,
// not crawling here. Let's look at improving that... or speeding
// up with async requests, at least
if ( $use_crawl_cache ) {
// if not already cached
if ( CrawlCache::getUrl( $root_relative_path, $page_hash ) ) {
$cache_hits++;
$concurrency = intval( CoreOptions::getValue( 'crawlConcurrency' ) );
$concurrency = apply_filters( 'wp2static_crawl_concurrency', $concurrency );

continue;
}
}
// Build the request pool. Responses arrive out of order, so each callback
// uses the request's $index to look its URL/path pair back up in $urls.
$pool = new Pool(
    $this->client,
    $requests( $urls ),
    [
        // The stored option is clamped on save, but the filter (or a missing
        // option) can still yield 0 or a negative number. Guzzle appears to
        // treat a falsy concurrency as "no limit", so enforce a floor of 1.
        'concurrency' => max( 1, (int) $concurrency ),

        /*
         * Handles a completed response: derives a hash for the result,
         * skips unchanged (cached) pages, otherwise saves the content to
         * the static site and records the URL in the crawl cache.
         */
        'fulfilled' => function ( Response $response, $index ) use (
            $urls, $use_crawl_cache, $site_urls
        ) {
            $root_relative_path = $urls[ $index ]['path'];
            $crawled_contents = (string) $response->getBody();
            $status_code = $response->getStatusCode();
            $is_redirect = in_array( $status_code, WP2STATIC_REDIRECT_CODES, true );

            if ( $status_code === 404 ) {
                WsLog::l( '404 for URL ' . $root_relative_path );
                CrawlCache::rmUrl( $root_relative_path );
                $crawled_contents = null;
            } elseif ( $is_redirect ) {
                // Redirects are recorded in the cache, not saved as files.
                $crawled_contents = null;
            }

            $redirect_to = null;

            if ( $is_redirect ) {
                $effective_url = $urls[ $index ]['url'];

                // returns as string
                $redirect_history =
                    $response->getHeaderLine( 'X-Guzzle-Redirect-History' );

                if ( $redirect_history ) {
                    $redirects = explode( ', ', $redirect_history );
                    $effective_url = end( $redirects );
                }

                $redirect_to =
                    (string) str_replace( $site_urls, '', $effective_url );
                $page_hash = md5( $status_code . $redirect_to );
            } elseif ( ! is_null( $crawled_contents ) ) {
                $page_hash = md5( $crawled_contents );
            } else {
                $page_hash = md5( (string) $status_code );
            }

            // A cache hit means this exact result was already processed:
            // count it and stop, so the file is not rewritten and the URL
            // is not counted as crawled. This mirrors the `continue` the
            // sequential loop used before requests were pooled.
            if (
                $use_crawl_cache &&
                CrawlCache::getUrl( $root_relative_path, $page_hash )
            ) {
                $this->cache_hits++;

                return;
            }

            $this->crawled++;

            if ( $crawled_contents ) {
                // do some magic here - naive: if URL ends in /, save to /index.html
                // TODO: will need love for example, XML files
                // check content type, serve .xml/rss, etc instead
                if ( mb_substr( $root_relative_path, -1 ) === '/' ) {
                    StaticSite::add(
                        $root_relative_path . 'index.html',
                        $crawled_contents
                    );
                } else {
                    StaticSite::add( $root_relative_path, $crawled_contents );
                }
            }

            CrawlCache::addUrl(
                $root_relative_path,
                $page_hash,
                $status_code,
                $redirect_to
            );

            // incrementally log crawl progress
            if ( $this->crawled % 300 === 0 ) {
                $notice = "Crawling progress: $this->crawled crawled," .
                    " $this->cache_hits skipped (cached).";
                WsLog::l( $notice );
            }
        },

        /*
         * Handles a failed request. The reason is accepted as any
         * Throwable so that a failure type other than RequestException
         * cannot raise a TypeError inside the handler; the reason's
         * message is logged with the path to make failures diagnosable.
         */
        'rejected' => function ( \Throwable $reason, $index ) use ( $urls ) {
            $root_relative_path = $urls[ $index ]['path'];
            WsLog::l( 'Failed ' . $root_relative_path . ': ' . $reason->getMessage() );
        },
    ]
);

$crawled++;

if ( $crawled_contents ) {
// do some magic here - naive: if URL ends in /, save to /index.html
// TODO: will need love for example, XML files
// check content type, serve .xml/rss, etc instead
if ( mb_substr( $root_relative_path, -1 ) === '/' ) {
StaticSite::add( $root_relative_path . 'index.html', $crawled_contents );
} else {
StaticSite::add( $root_relative_path, $crawled_contents );
}
}
// Initiate the transfers and create a promise
$promise = $pool->promise();

CrawlCache::addUrl(
$root_relative_path,
$page_hash,
$status_code,
$redirect_to
);

// incrementally log crawl progress
if ( $crawled % 300 === 0 ) {
$notice = "Crawling progress: $crawled crawled, $cache_hits skipped (cached).";
WsLog::l( $notice );
}
}
// Force the pool of requests to complete.
$promise->wait();

WsLog::l(
"Crawling complete. $crawled crawled, $cache_hits skipped (cached)."
"Crawling complete. $this->crawled crawled, $this->cache_hits skipped (cached)."
);

$args = [
'staticSitePath' => $static_site_path,
'crawled' => $crawled,
'cache_hits' => $cache_hits,
'crawled' => $this->crawled,
'cache_hits' => $this->cache_hits,
];

do_action( 'wp2static_crawling_complete', $args );
}

/**
* @deprecated
*
* Crawls a string of full URL within WordPressSite
*
* @return ResponseInterface|null response object
*/
public function crawlURL( string $url ) : ?ResponseInterface {
WsLog::w( 'WP2Static Crawler::crawlURL is deprecated.' );

$headers = [];
$response = null;

Expand Down
16 changes: 16 additions & 0 deletions views/advanced-options-page.php
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,22 @@

<table class="widefat striped">
<tbody>
<?php
/*
 * Crawl Concurrency setting row. The save handler stores any value below 1
 * as 1, so the input advertises the same lower bound and whole-number steps
 * to keep what the browser accepts in line with what is stored.
 */
?>
<tr>
    <td style="width:50%;">
        <label
            for="<?php echo $view['coreOptions']['crawlConcurrency']->name; ?>"
        ><b><?php echo $view['coreOptions']['crawlConcurrency']->label; ?></b></label>
        <br/><?php echo $view['coreOptions']['crawlConcurrency']->description; ?>
    </td>
    <td>
        <input
            id="<?php echo $view['coreOptions']['crawlConcurrency']->name; ?>"
            name="<?php echo $view['coreOptions']['crawlConcurrency']->name; ?>"
            value="<?php echo (int) $view['coreOptions']['crawlConcurrency']->value; ?>"
            type="number"
            min="1"
            step="1"
        />
    </td>
</tr>
<tr>
<td style="width:50%;">
<label
Expand Down