From 163efa4f2719bd3a52b025b8219247054378e1da Mon Sep 17 00:00:00 2001 From: Leon Date: Wed, 10 Jun 2020 03:40:43 +0930 Subject: [PATCH 01/39] 85 dont detect assets for initial filelist --- src/FilesHelper.php | 199 -------------------------------------------- 1 file changed, 199 deletions(-) diff --git a/src/FilesHelper.php b/src/FilesHelper.php index a5c12016..a4bd0b6c 100755 --- a/src/FilesHelper.php +++ b/src/FilesHelper.php @@ -34,161 +34,6 @@ public static function delete_dir_with_files( string $dir ) : bool { return $successfully_removed; } - /** - * Detect Active Plugin CSS URLs - * - * @return string[] list of URLs - */ - public static function getPluginCSSURLs() : array { - $files = []; - - $plugins_path = trailingslashit( WP_PLUGIN_DIR ); - $plugins_url = trailingslashit( plugins_url() ); - $active_plugins = get_option( 'active_plugins' ); - - $active_plugin_dirs = array_map( - function ( $active_plugin ) use ( $plugins_path ) { - $plugin_base = dirname( $active_plugin ); - - // exclude SSG plugin dirs and known uploads dir excludables - $exclude_plugins = [ - 'simplerstatic', - 'static-html-output-plugin', - 'wp2static', - ]; - - foreach ( $exclude_plugins as $exclude_plugin ) { - if ( strpos( $plugin_base, $exclude_plugin ) !== false ) { - return; - } - } - - return $plugins_path . $plugin_base; - }, - $active_plugins - ); - - foreach ( $active_plugin_dirs as $active_plugin_dir ) { - - if ( is_dir( $active_plugin_dir ) ) { - $iterator = new RecursiveIteratorIterator( - new RecursiveDirectoryIterator( - $active_plugin_dir, - RecursiveDirectoryIterator::SKIP_DOTS - ) - ); - - foreach ( $iterator as $filename => $file_object ) { - // exclude vendor dirs - if ( strpos( strtolower( $filename ), 'vendor' ) !== false ) { - continue; - } - - // exclude likely admin area assets - if ( strpos( strtolower( $filename ), 'admin' ) !== false ) { - continue; - } - - $extension = pathinfo( $filename, PATHINFO_EXTENSION ); - - if ( $extension !== 'css' ) { - continue; - } - - // Standardise all paths to use / (Windows support) - // TODO: should come earlier in chain - $filename = wp_normalize_path( $filename ); - - $detected_filename = - str_replace( - $plugins_path, - $plugins_url, - $filename - ); - - $detected_filename = - str_replace( - get_home_url(), - '', - $detected_filename - ); - - if ( is_string( $detected_filename ) ) { - array_push( - $files, - $detected_filename - ); - } - } - } - } - - return $files; - } - - /** - * @return string[] list of URLs - */ - public static function getThemeFiles( string $theme_type ) : array { - $wp_site = new WPSite(); - - $files = []; - $template_path = ''; - $template_url = ''; - - if ( $theme_type === 'parent' ) { - $template_path = $wp_site->parent_theme_path; - $template_url = get_template_directory_uri(); - } else { - $template_path = $wp_site->child_theme_path; - $template_url = get_stylesheet_directory_uri(); - } - - $directory = $template_path; - - if ( is_dir( $directory ) ) { - $iterator = new RecursiveIteratorIterator( - new RecursiveDirectoryIterator( - $directory, - RecursiveDirectoryIterator::SKIP_DOTS - ) - ); - - foreach ( $iterator as $filename => $file_object ) { - // $path_crawlable = self::filePathLooksCrawlable( $filename ); - // for theme files, let's just grab CSS files, as these will yield other link - $extension = pathinfo( $filename, PATHINFO_EXTENSION ); - - if ( $extension !== 'css' ) { - continue; - } - - $detected_filename = - str_replace( - $template_path, - $template_url, - $filename - ); - - $detected_filename = - str_replace( - get_home_url(), - '', - $detected_filename - ); - - if ( is_string( $detected_filename ) ) { - array_push( - $files, - $detected_filename - ); - } - } - } - - return $files; - } - /** * @return string[] list of URLs */ @@ -197,17 +42,6 @@ public static function detectVendorFiles( string $wp_site_url ) : array { $vendor_files = []; - if ( class_exists( '\\Elementor\Api' ) ) { - $elementor_font_dir = WP_PLUGIN_DIR . - '/elementor/assets/lib/font-awesome'; - - $elementor_urls = self::getListOfLocalFilesByUrl( - $elementor_font_dir - ); - - $vendor_files = array_merge( $vendor_files, $elementor_urls ); - } - if ( defined( 'WPSEO_VERSION' ) ) { $yoast_sitemaps = [ '/sitemap_index.xml', @@ -220,26 +54,6 @@ public static function detectVendorFiles( string $wp_site_url ) : array { $vendor_files = array_merge( $vendor_files, $yoast_sitemaps ); } - if ( class_exists( 'autoptimizeMain' ) ) { - $autoptimize_cache_dir = - $wp_site->wp_content_path . '/cache/autoptimize'; - - // get difference between home and wp-contents URL - $prefix = str_replace( - $wp_site->site_url, - '/', - $wp_site->wp_content_url - ); - - $autoptimize_urls = self::getAutoptimizeCacheFiles( - $autoptimize_cache_dir, - $wp_site->wp_content_path, - $prefix - ); - - $vendor_files = array_merge( $vendor_files, $autoptimize_urls ); - } - if ( class_exists( 'Custom_Permalinks' ) ) { global $wpdb; @@ -271,16 +85,6 @@ public static function detectVendorFiles( string $wp_site_url ) : array { } } - if ( class_exists( 'molongui_authorship' ) ) { - $molongui_path = WP_PLUGIN_DIR . '/molongui-authorship'; - - $molongui_urls = self::getListOfLocalFilesByUrl( - $molongui_path - ); - - $vendor_files = array_merge( $vendor_files, $molongui_urls ); - } - return $vendor_files; } @@ -506,9 +310,6 @@ public static function buildInitialFileList( $url_queue = array_merge( $url_queue, - self::getThemeFiles( 'parent' ), - self::getThemeFiles( 'child' ), - self::getPluginCSSURLs(), self::detectVendorFiles( $wp_site->site_url ), self::getAllWPPostURLs( $base_url ), self::getDateArchiveURLs() From a35334e33e1518ef72dc5e175c7537ad040a64fd Mon Sep 17 00:00:00 2001 From: Leon Date: Wed, 10 Jun 2020 03:50:13 +0930 Subject: [PATCH 02/39] 85 use fixed name for export dir --- src/Archive.php | 54 ++----------------------------------- src/ArchiveProcessor.php | 1 - src/Deployer.php | 1 - src/Exporter.php | 21 +-------------- src/FilesHelper.php | 3 ++- src/SiteCrawler.php | 19 +------------ src/SitePublisher.php | 6 +---- views/options-page-js.phtml | 14 +++------- 8 files changed, 10 insertions(+), 109 deletions(-) diff --git a/src/Archive.php b/src/Archive.php index 776b0de9..2fa4f5c3 100755 --- a/src/Archive.php +++ b/src/Archive.php @@ -17,62 +17,12 @@ public function __construct() { [ 'wpenv' ] ); - $this->path = ''; - $this->name = ''; - } - - public function setToCurrentArchive() : void { - $handle = fopen( - $this->settings['wp_uploads_path'] . - '/WP2STATIC-CURRENT-ARCHIVE.txt', - 'r' - ); - - if ( ! is_resource( $handle ) ) { - return; - } - - $path = stream_get_line( $handle, 0 ); - - if ( ! $path ) { - return; - } - - $this->path = $path; + $this->path = $this->settings['wp_uploads_path'] . '/static-html-output/'; $this->name = basename( $this->path ); } - public function currentArchiveExists() : bool { - return is_file( - $this->settings['wp_uploads_path'] . - '/WP2STATIC-CURRENT-ARCHIVE.txt' - ); - } - public function create() : void { - $this->name = $this->settings['wp_uploads_path'] . - '/wp-static-html-output-' . time(); - - $this->path = $this->name . '/'; - $this->name = basename( $this->path ); - - if ( wp_mkdir_p( $this->path ) ) { - $result = file_put_contents( - $this->settings['wp_uploads_path'] . - '/WP2STATIC-CURRENT-ARCHIVE.txt', - $this->path - ); - - if ( ! $result ) { - WsLog::l( 'USER WORKING DIRECTORY NOT WRITABLE' ); - } - - chmod( - $this->settings['wp_uploads_path'] . - '/WP2STATIC-CURRENT-ARCHIVE.txt', - 0664 - ); - } else { + if ( ! wp_mkdir_p( $this->path ) ) { WsLog::l( "Couldn't create archive directory at $this->path" ); } } diff --git a/src/ArchiveProcessor.php b/src/ArchiveProcessor.php index 3c363c7e..2e3bd5d8 100755 --- a/src/ArchiveProcessor.php +++ b/src/ArchiveProcessor.php @@ -19,7 +19,6 @@ class ArchiveProcessor extends StaticHTMLOutput { public function __construct() { $this->archive = new Archive(); - $this->archive->setToCurrentArchive(); $this->loadSettings( [ diff --git a/src/Deployer.php b/src/Deployer.php index ec8c2265..648a9b37 100755 --- a/src/Deployer.php +++ b/src/Deployer.php @@ -125,7 +125,6 @@ public function finalizeDeployment( string $deploy_result = '' ) : string { public function triggerPostDeployHooks() : void { $this->archive = new Archive(); - $this->archive->setToCurrentArchive(); do_action( 'statichtmloutput_post_deploy_trigger', $this->archive ); } diff --git a/src/Exporter.php b/src/Exporter.php index 59d0ac8a..7952e32e 100755 --- a/src/Exporter.php +++ b/src/Exporter.php @@ -41,25 +41,6 @@ public function pre_export_cleanup() : void { } public function cleanup_working_files() : void { - // keep log files here for debugging - // skip first export state - if ( is_file( - $this->settings['wp_uploads_path'] . - '/WP2STATIC-CURRENT-ARCHIVE.txt' - ) ) { - - $handle = fopen( - $this->settings['wp_uploads_path'] . - '/WP2STATIC-CURRENT-ARCHIVE.txt', - 'r' - ); - - if ( is_resource( $handle ) ) { - // TODO: looks like a random place for this... - $this->settings['archive_dir'] = stream_get_line( $handle, 0 ); - } - } - $files_to_clean = [ '/WP-STATIC-2ND-CRAWL-LIST.txt', '/WP-STATIC-CRAWLED-LINKS.txt', @@ -111,7 +92,7 @@ public function cleanup_leftover_archives() : void { ); foreach ( $leftover_files as $filename ) { - if ( strpos( $filename, 'wp-static-html-output-' ) !== false ) { + if ( strpos( $filename, 'static-html-output' ) !== false ) { $deletion_target = $this->settings['wp_uploads_path'] . '/' . $filename; if ( is_dir( $deletion_target ) ) { diff --git a/src/FilesHelper.php b/src/FilesHelper.php index a4bd0b6c..a3b64aa9 100755 --- a/src/FilesHelper.php +++ b/src/FilesHelper.php @@ -243,7 +243,8 @@ public static function filePathLooksCrawlable( string $file_name ) : bool { 'wp2static-crawled-site', 'thumbs.db', 'vendor', - 'wp-static-html-output', // exclude earlier version exports + 'wp-static-html-output', + 'static-html-output', ]; foreach ( $filenames_to_ignore as $ignorable ) { diff --git a/src/SiteCrawler.php b/src/SiteCrawler.php index 6beab1ba..75c5588b 100755 --- a/src/SiteCrawler.php +++ b/src/SiteCrawler.php @@ -307,24 +307,7 @@ public function crawlABitMore() : void { chmod( $this->list_of_urls_to_crawl_path, 0664 ); - // TODO: required in saving/copying, but not here? optimize... - $handle = fopen( - $this->settings['wp_uploads_path'] . - '/WP2STATIC-CURRENT-ARCHIVE.txt', - 'r' - ); - - if ( ! is_resource( $handle ) ) { - return; - } - - $line = stream_get_line( $handle, 0 ); - - if ( ! is_string( $line ) ) { - return; - } - - $this->archive_dir = $line; + $this->archive_dir = $this->settings['wp_uploads_path'] . '/static-html-output/'; $total_urls_path = $this->settings['wp_uploads_path'] . '/WP-STATIC-INITIAL-CRAWL-TOTAL.txt'; diff --git a/src/SitePublisher.php b/src/SitePublisher.php index ca2b4104..847f9b6e 100755 --- a/src/SitePublisher.php +++ b/src/SitePublisher.php @@ -65,7 +65,6 @@ public function loadSettings( string $deploy_method ) : void { public function loadArchive() : void { $this->archive = new Archive(); - $this->archive->setToCurrentArchive(); } public function bootstrap() : void { @@ -73,10 +72,7 @@ public function bootstrap() : void { $this->settings['wp_uploads_path'] . '/WP2STATIC-FILES-TO-DEPLOY.txt'; - $this->archive_dir = (string) file_get_contents( - $this->settings['wp_uploads_path'] . - '/WP2STATIC-CURRENT-ARCHIVE.txt' - ); + $this->archive_dir = $this->settings['wp_uploads_path'] . '/static-html-output/'; } public function pauseBetweenAPICalls() : void { diff --git a/views/options-page-js.phtml b/views/options-page-js.phtml index 795c7787..276e44df 100755 --- a/views/options-page-js.phtml +++ b/views/options-page-js.phtml @@ -195,18 +195,10 @@ jQuery(document).ready(function($){ } else { // if zip was selected, call to get zip name and enable the button with the link to download if (current_deployment_method === 'zip') { + zipURL = wp_site.uploads_url + '/static-html-output.zip?cacheBuster=' + Date.now(); + zipURL = zipURL.replace('https://', '//').replace('http://', '//'); + $('#downloadZIP').attr('href', zipURL); $('#downloadZIP').show(); - - $.get( wp_site.uploads_url + '/WP2STATIC-CURRENT-ARCHIVE.txt?cacheBuster=' + Date.now(), - function( archive_path ) { - path_segments = archive_path.split('/'); - current_archive.name = path_segments[path_segments.length - 2]; - zipURL = wp_site.uploads_url + '/' + current_archive.name + '.zip'; - zipURL = zipURL.replace('https://', '//').replace('http://', '//'); - $('#downloadZIP').attr('href', zipURL); - }, - 'text' - ); } else { // for other methods, show the Go to my static site link $('#goToMyStaticSite').attr('href', $('#baseUrl').val()); From 3c720191df9edf290a327a647fa32949bb886521 Mon Sep 17 00:00:00 2001 From: Leon Date: Wed, 10 Jun 2020 06:04:14 +0930 Subject: [PATCH 03/39] wip CrawlQueue;CrawlLog --- src/CSSProcessor.php | 18 ++---- src/Controller.php | 3 + src/CrawlLog.php | 130 ++++++++++++++++++++++++++++++++++++++++++ src/CrawlQueue.php | 120 ++++++++++++++++++++++++++++++++++++++ src/Exclusions.php | 54 ++++++++++++++++++ src/Exporter.php | 90 ++++------------------------- src/FilesHelper.php | 9 +-- src/HTMLProcessor.php | 18 ++---- 8 files changed, 334 insertions(+), 108 deletions(-) create mode 100644 src/CrawlLog.php create mode 100644 src/CrawlQueue.php create mode 100644 src/Exclusions.php diff --git a/src/CSSProcessor.php b/src/CSSProcessor.php index c79d8747..2807cae7 100755 --- a/src/CSSProcessor.php +++ b/src/CSSProcessor.php @@ -431,19 +431,13 @@ public function writeDiscoveredURLs() : void { } } - file_put_contents( - $this->wp_uploads_path . - '/WP-STATIC-DISCOVERED-URLS.txt', - PHP_EOL . - implode( PHP_EOL, array_unique( $this->discovered_urls ) ), - FILE_APPEND | LOCK_EX - ); + // TODO: check for existing URLs in CrawlLog and only add non-processed to CrawlQueue + $unique_urls = array_unique( $this->discovered_urls ); + array_filter( $unique_urls ); + sort( $unique_urls ); - chmod( - $this->wp_uploads_path . - '/WP-STATIC-DISCOVERED-URLS.txt', - 0664 - ); + // TODO: also add new URLs to CrawlLog + CrawlQueue::addUrls( $unique_urls ); } public function isValidURL( string $url ) : bool { diff --git a/src/Controller.php b/src/Controller.php index a9d2a69d..653c9f04 100755 --- a/src/Controller.php +++ b/src/Controller.php @@ -110,6 +110,9 @@ public function setDefaultOptions() : void { public function activate_for_single_site() : void { $this->setDefaultOptions(); + CrawlQueue::createTable(); + CrawlLog::createTable(); + Exclusions::createTable(); } /** diff --git a/src/CrawlLog.php b/src/CrawlLog.php new file mode 100644 index 00000000..bdeb0891 --- /dev/null +++ b/src/CrawlLog.php @@ -0,0 +1,130 @@ +prefix . 'statichtmloutput_crawl_log'; + + $charset_collate = $wpdb->get_charset_collate(); + + /** + * Detected/discovered URLs added with initial status of 0 + * and will be updated with response code after crawling + */ + $sql = "CREATE TABLE $table_name ( + id mediumint(9) NOT NULL AUTO_INCREMENT, + url VARCHAR(2083) NOT NULL, + note TEXT NOT NULL, + status SMALLINT DEFAULT 0 NOT NULL, + PRIMARY KEY (id) + ) $charset_collate;"; + + require_once ABSPATH . 'wp-admin/includes/upgrade.php'; + dbDelta( $sql ); + } + + /** + * Add all Urls to log + * + * @param string[] $urls List of URLs to log info for + */ + public static function addUrls( array $urls, string $note, int $status = 0 ) : void { + global $wpdb; + + $table_name = $wpdb->prefix . 'statichtmloutput_crawl_log'; + + $placeholders = []; + $values = []; + + foreach ( $urls as $url ) { + if ( ! $url ) { + continue; + } + + $placeholders[] = '(%s)'; + $values[] = rawurldecode( $url ); + $placeholders[] = '(%s)'; + $values[] = $note; + $placeholders[] = '(%d)'; + $values[] = $status; + } + + $query_string = + 'INSERT INTO ' . $table_name . ' (url) VALUES ' . + implode( ', ', $placeholders ); + $query = $wpdb->prepare( $query_string, $values ); + + $wpdb->query( $query ); + } + + /** + * Get all crawlable URLs + * + * @return string[] All crawlable URLs + */ + public static function getCrawlablePaths() : array { + global $wpdb; + $urls = []; + + $table_name = $wpdb->prefix . 'statichtmloutput_crawl_log'; + + $rows = $wpdb->get_results( "SELECT url FROM $table_name ORDER by url ASC" ); + + foreach ( $rows as $row ) { + $urls[] = $row->url; + } + + return $urls; + } + + /** + * Get total crawlable URLs + * + * @return int Total crawlable URLs + */ + public static function getTotalCrawlableURLs() : int { + global $wpdb; + + $table_name = $wpdb->prefix . 'statichtmloutput_crawl_log'; + + $total_crawl_log = $wpdb->get_var( "SELECT COUNT(*) FROM $table_name" ); + + return $total_crawl_log; + } + + /** + * Clear CrawlQueue via truncate or deletion + */ + public static function truncate() : void { + WsLog::l( 'Deleting CrawlQueue (Detected URLs)' ); + + global $wpdb; + + $table_name = $wpdb->prefix . 'statichtmloutput_crawl_log'; + + $wpdb->query( "TRUNCATE TABLE $table_name" ); + + $total_crawl_log = self::getTotalCrawlableURLs(); + + if ( $total_crawl_log > 0 ) { + WsLog::l( 'failed to truncate CrawlQueue: try deleting instead' ); + } + } + + /** + * Count URLs in Crawl Queue + */ + public static function getTotal() : int { + global $wpdb; + + $table_name = $wpdb->prefix . 'statichtmloutput_crawl_log'; + + $total = $wpdb->get_var( "SELECT count(*) FROM $table_name" ); + + return $total; + } +} diff --git a/src/CrawlQueue.php b/src/CrawlQueue.php new file mode 100644 index 00000000..2ab0f024 --- /dev/null +++ b/src/CrawlQueue.php @@ -0,0 +1,120 @@ +prefix . 'statichtmloutput_urls'; + + $charset_collate = $wpdb->get_charset_collate(); + + $sql = "CREATE TABLE $table_name ( + id mediumint(9) NOT NULL AUTO_INCREMENT, + url VARCHAR(2083) NOT NULL, + PRIMARY KEY (id) + ) $charset_collate;"; + + require_once ABSPATH . 'wp-admin/includes/upgrade.php'; + dbDelta( $sql ); + } + + /** + * Add all Urls to queue + * + * @param string[] $urls List of URLs to crawl + */ + public static function addUrls( array $urls ) : void { + global $wpdb; + + $table_name = $wpdb->prefix . 'statichtmloutput_urls'; + + $placeholders = []; + $values = []; + + foreach ( $urls as $url ) { + if ( ! $url ) { + continue; + } + + $placeholders[] = '(%s)'; + $values[] = rawurldecode( $url ); + } + + $query_string = + 'INSERT INTO ' . $table_name . ' (url) VALUES ' . + implode( ', ', $placeholders ); + $query = $wpdb->prepare( $query_string, $values ); + + $wpdb->query( $query ); + } + + /** + * Get all crawlable URLs + * + * @return string[] All crawlable URLs + */ + public static function getCrawlablePaths() : array { + global $wpdb; + $urls = []; + + $table_name = $wpdb->prefix . 'statichtmloutput_urls'; + + $rows = $wpdb->get_results( "SELECT url FROM $table_name ORDER by url ASC" ); + + foreach ( $rows as $row ) { + $urls[] = $row->url; + } + + return $urls; + } + + /** + * Get total crawlable URLs + * + * @return int Total crawlable URLs + */ + public static function getTotalCrawlableURLs() : int { + global $wpdb; + + $table_name = $wpdb->prefix . 'statichtmloutput_urls'; + + $total_urls = $wpdb->get_var( "SELECT COUNT(*) FROM $table_name" ); + + return $total_urls; + } + + /** + * Clear CrawlQueue via truncate or deletion + */ + public static function truncate() : void { + WsLog::l( 'Deleting CrawlQueue (Detected URLs)' ); + + global $wpdb; + + $table_name = $wpdb->prefix . 'statichtmloutput_urls'; + + $wpdb->query( "TRUNCATE TABLE $table_name" ); + + $total_urls = self::getTotalCrawlableURLs(); + + if ( $total_urls > 0 ) { + WsLog::l( 'failed to truncate CrawlQueue: try deleting instead' ); + } + } + + /** + * Count URLs in Crawl Queue + */ + public static function getTotal() : int { + global $wpdb; + + $table_name = $wpdb->prefix . 'statichtmloutput_urls'; + + $total = $wpdb->get_var( "SELECT count(*) FROM $table_name" ); + + return $total; + } +} diff --git a/src/Exclusions.php b/src/Exclusions.php new file mode 100644 index 00000000..36e436c5 --- /dev/null +++ b/src/Exclusions.php @@ -0,0 +1,54 @@ +prefix . 'statichtmloutput_exclusions'; + + $charset_collate = $wpdb->get_charset_collate(); + + $sql = "CREATE TABLE $table_name ( + id mediumint(9) NOT NULL AUTO_INCREMENT, + pattern VARCHAR(2083) NOT NULL, + PRIMARY KEY (id) + ) $charset_collate;"; + + require_once ABSPATH . 'wp-admin/includes/upgrade.php'; + dbDelta( $sql ); + } + + /** + * Add all Urls to queue + * + * @param string[] $urls List of URLs to crawl + */ + public static function addPatterns( array $patterns ) : void { + global $wpdb; + + $table_name = $wpdb->prefix . 'statichtmloutput_exclusions'; + + $placeholders = []; + $values = []; + + foreach ( $patterns as $pattern ) { + if ( ! $pattern ) { + continue; + } + + $placeholders[] = '(%s)'; + $values[] = $pattern; + } + + $query_string = + 'INSERT INTO ' . $table_name . ' (pattern) VALUES ' . + implode( ', ', $placeholders ); + $query = $wpdb->prepare( $query_string, $values ); + + $wpdb->query( $query ); + } + +} diff --git a/src/Exporter.php b/src/Exporter.php index 7952e32e..91c1aa26 100755 --- a/src/Exporter.php +++ b/src/Exporter.php @@ -111,44 +111,13 @@ public function cleanup_leftover_archives() : void { } public function generateModifiedFileList() : void { - // preserve the initial crawl list, to be used in debugging + more - copy( - $this->settings['wp_uploads_path'] . - '/WP-STATIC-INITIAL-CRAWL-LIST.txt', - $this->settings['wp_uploads_path'] . - '/WP-STATIC-MODIFIED-CRAWL-LIST.txt' - ); - - chmod( - $this->settings['wp_uploads_path'] . - '/WP-STATIC-MODIFIED-CRAWL-LIST.txt', - 0664 - ); - - // if no excludes or includes, just copy to new target + // if no excludes or includes, no changes to CrawlLog if ( ! isset( $this->settings['excludeURLs'] ) && ! isset( $this->settings['additionalUrls'] ) ) { - copy( - $this->settings['wp_uploads_path'] . - '/WP-STATIC-INITIAL-CRAWL-LIST.txt', - $this->settings['wp_uploads_path'] . - '/WP-STATIC-FINAL-CRAWL-LIST.txt' - ); - return; } - $modified_crawl_list = []; - - // load crawl list into array - $crawl_list = file( - $this->settings['wp_uploads_path'] . - '/WP-STATIC-MODIFIED-CRAWL-LIST.txt' - ); - - if ( ! $crawl_list ) { - return; - } + // TODO: inclusions get added to CrawlQueue if not in CrawlLog // applying exclusions before inclusions if ( isset( $this->settings['excludeURLs'] ) ) { @@ -157,64 +126,29 @@ public function generateModifiedFileList() : void { str_replace( "\r", '', $this->settings['excludeURLs'] ) ); - // iterate through crawl list and add any that aren't excluded - foreach ( $crawl_list as $url_to_crawl ) { - $url_to_crawl = trim( $url_to_crawl ); - $match = false; - - foreach ( $exclusions as $exclusion ) { - $exclusion = trim( $exclusion ); - - if ( $exclusion != '' ) { - if ( strpos( $url_to_crawl, $exclusion ) !== false ) { - WsLog::l( "Excluding $url_to_crawl because of rule $exclusion" ); - - $match = true; - } - } - - if ( ! $match ) { - $modified_crawl_list[] = $url_to_crawl; - } - } - } - } else { - $modified_crawl_list = $crawl_list; + Exclusions::addPatterns( $exclusions ); } if ( isset( $this->settings['additionalUrls'] ) ) { - $inclusions = explode( + $inclusion_cadidates = explode( "\n", str_replace( "\r", '', $this->settings['additionalUrls'] ) ); + // check inclusion isn't already in CrawlLog, else inesert unique into CrawlQueue + $inclusions = []; + foreach ( $inclusions as $inclusion ) { $inclusion = trim( $inclusion ); - $inclusion = $inclusion; - $modified_crawl_list[] = $inclusion; + if ( ! CrawlLog::hasUrl( $inclusion ) ) { + $inclusions[] = $inclusion; + } } - } - if ( ! is_array( $modified_crawl_list ) ) { - return; + CrawlLog::addUrls( $inclusions, 'Included by user' ); + CrawlQueue::addUrls( $inclusions ); } - - $modified_crawl_list = array_unique( $modified_crawl_list ); - - $str = implode( PHP_EOL, $modified_crawl_list ); - - file_put_contents( - $this->settings['wp_uploads_path'] . - '/WP-STATIC-FINAL-CRAWL-LIST.txt', - $str - ); - - chmod( - $this->settings['wp_uploads_path'] . - '/WP-STATIC-FINAL-CRAWL-LIST.txt', - 0664 - ); } } diff --git a/src/FilesHelper.php b/src/FilesHelper.php index a3b64aa9..734711a0 100755 --- a/src/FilesHelper.php +++ b/src/FilesHelper.php @@ -324,16 +324,13 @@ public static function buildInitialFileList( ); $unique_urls = array_unique( $url_queue ); + array_filter( $unique_urls ); sort( $unique_urls ); $initial_crawl_list_total = count( $unique_urls ); - $str = implode( "\n", $unique_urls ); - - file_put_contents( - $uploads_path . '/WP-STATIC-INITIAL-CRAWL-LIST.txt', - $str - ); + // TODO: also add to CrawlLog + CrawlQueue::addUrls( $unique_urls ); file_put_contents( $uploads_path . '/WP-STATIC-INITIAL-CRAWL-TOTAL.txt', diff --git a/src/HTMLProcessor.php b/src/HTMLProcessor.php index 672ca3c6..2c806ad7 100755 --- a/src/HTMLProcessor.php +++ b/src/HTMLProcessor.php @@ -859,19 +859,13 @@ public function writeDiscoveredURLs() : void { } } - file_put_contents( - $this->wp_uploads_path . - '/WP-STATIC-DISCOVERED-URLS.txt', - PHP_EOL . - implode( PHP_EOL, array_unique( $this->discovered_urls ) ), - FILE_APPEND | LOCK_EX - ); + // TODO: check for existing URLs in CrawlLog and only add non-processed to CrawlQueue + $unique_urls = array_unique( $this->discovered_urls ); + array_filter( $unique_urls ); + sort( $unique_urls ); - chmod( - $this->wp_uploads_path . - '/WP-STATIC-DISCOVERED-URLS.txt', - 0664 - ); + // TODO: also add new URLs to CrawlLog + CrawlQueue::addUrls( $unique_urls ); } // make link absolute, using current page to determine full path From a3423b42712fb5c59fe42a83a4361ef22fdc95c6 Mon Sep 17 00:00:00 2001 From: Leon Date: Sat, 13 Jun 2020 02:15:41 +0930 Subject: [PATCH 04/39] bump deps;fix version --- composer.lock | 20 ++++++++++---------- static-html-output-plugin.php | 2 +- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/composer.lock b/composer.lock index 4376e0e1..385ba62f 100644 --- a/composer.lock +++ b/composer.lock @@ -955,16 +955,16 @@ }, { "name": "php-stubs/wordpress-stubs", - "version": "v5.4.1", + "version": "v5.4.2", "source": { "type": "git", "url": "https://github.com/php-stubs/wordpress-stubs.git", - "reference": "726e541337276f1648ef860efcfe7298bce3a1c5" + "reference": "38b0963698ca5858658a5b09198062411f22932a" }, "dist": { "type": "zip", - "url": "https://api.github.com/repos/php-stubs/wordpress-stubs/zipball/726e541337276f1648ef860efcfe7298bce3a1c5", - "reference": "726e541337276f1648ef860efcfe7298bce3a1c5", + "url": "https://api.github.com/repos/php-stubs/wordpress-stubs/zipball/38b0963698ca5858658a5b09198062411f22932a", + "reference": "38b0963698ca5858658a5b09198062411f22932a", "shasum": "" }, "replace": { @@ -991,7 +991,7 @@ "static analysis", "wordpress" ], - "time": "2020-04-29T23:43:44+00:00" + "time": "2020-06-11T14:56:54+00:00" }, { "name": "phpcompatibility/php-compatibility", @@ -1264,16 +1264,16 @@ }, { "name": "phpstan/phpstan", - "version": "0.12.27", + "version": "0.12.28", "source": { "type": "git", "url": "https://github.com/phpstan/phpstan.git", - "reference": "2abbd3253e38a258137f647f4e5fdbcb13142c3e" + "reference": "76c0c4ec90b1eed66fa4855b8b4b53fa9054353f" }, "dist": { "type": "zip", - "url": "https://api.github.com/repos/phpstan/phpstan/zipball/2abbd3253e38a258137f647f4e5fdbcb13142c3e", - "reference": "2abbd3253e38a258137f647f4e5fdbcb13142c3e", + "url": "https://api.github.com/repos/phpstan/phpstan/zipball/76c0c4ec90b1eed66fa4855b8b4b53fa9054353f", + "reference": "76c0c4ec90b1eed66fa4855b8b4b53fa9054353f", "shasum": "" }, "require": { @@ -1316,7 +1316,7 @@ "type": "tidelift" } ], - "time": "2020-06-08T21:28:12+00:00" + "time": "2020-06-10T06:20:14+00:00" }, { "name": "phpunit/php-code-coverage", diff --git a/static-html-output-plugin.php b/static-html-output-plugin.php index 4a68eab2..42c72052 100755 --- a/static-html-output-plugin.php +++ b/static-html-output-plugin.php @@ -3,7 +3,7 @@ * Plugin Name: Static HTML Output * Plugin URI: https://statichtmloutput.com * Description: Security & Performance via static website publishing. - * Version: 6.6.17 + * Version: 6.6.18 * Author: Leon Stafford * Author URI: https://leonstafford.github.io * Text Domain: static-html-output-plugin From 89cd2c304e1d564ee8acbf075e51258e2a1c927e Mon Sep 17 00:00:00 2001 From: Leon Date: Sat, 13 Jun 2020 02:39:40 +0930 Subject: [PATCH 05/39] wip adjust CrawlQueue population;logging --- src/Controller.php | 9 +-- src/FilesHelper.php | 3 + src/Logger.php | 112 ++++++++++++++++++++++++++++++++++++ src/WsLog.php | 35 ----------- views/options-page-js.phtml | 41 ------------- views/tab_crawling.phtml | 6 +- 6 files changed, 123 insertions(+), 83 deletions(-) create mode 100755 src/Logger.php delete mode 100755 src/WsLog.php diff --git a/src/Controller.php b/src/Controller.php index 653c9f04..19dc4252 100755 --- a/src/Controller.php +++ b/src/Controller.php @@ -109,6 +109,7 @@ public function setDefaultOptions() : void { } public function activate_for_single_site() : void { + Logger::createTable(); $this->setDefaultOptions(); CrawlQueue::createTable(); CrawlLog::createTable(); @@ -197,8 +198,6 @@ public function generate_filelist_preview() : void { PostSettings::get( $target_settings ); } - $plugin_hook = 'statichtmloutput'; - $initial_file_list_count = FilesHelper::buildInitialFileList( true, @@ -206,14 +205,12 @@ public function generate_filelist_preview() : void { $this->wp_site->uploads_url, $this->settings ); - - if ( ! defined( 'WP_CLI' ) ) { - echo $initial_file_list_count; - } } public static function renderOptionsPage() : void { $instance = self::getInstance(); + $instance->generate_filelist_preview(); + $instance->total_detected_urls = CrawlQueue::getTotal(); $instance->wp_site = new WPSite(); $instance->current_archive = ''; diff --git a/src/FilesHelper.php b/src/FilesHelper.php index 734711a0..d804d55a 100755 --- a/src/FilesHelper.php +++ b/src/FilesHelper.php @@ -297,6 +297,9 @@ public static function buildInitialFileList( string $uploads_url, array $settings ) : int { + // clear CrawlQueue before rebuilding list + CrawlQueue::truncate(); + $wp_site = new WPSite(); $base_url = untrailingslashit( home_url() ); diff --git a/src/Logger.php b/src/Logger.php new file mode 100755 index 00000000..827ea7b6 --- /dev/null +++ b/src/Logger.php @@ -0,0 +1,112 @@ +prefix . 'statichtmloutput_log'; + + $wpdb->insert( + $table_name, + [ + 'log' => $text, + ] + ); + } + + public static function createTable() : void { + global $wpdb; + + $table_name = $wpdb->prefix . 'statichtmloutput_log'; + + $charset_collate = $wpdb->get_charset_collate(); + + $sql = "CREATE TABLE $table_name ( + id mediumint(9) NOT NULL AUTO_INCREMENT, + time TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP, + log TEXT NOT NULL, + PRIMARY KEY (id) + ) $charset_collate;"; + + require_once ABSPATH . 'wp-admin/includes/upgrade.php'; + dbDelta( $sql ); + } + + /** + * Log multiple lines at once + * + * @param string[] $lines List of lines to log + */ + public static function lines( array $lines ) : void { + global $wpdb; + + $table_name = $wpdb->prefix . 'statichtmloutput_log'; + + $current_time = current_time( 'mysql' ); + + $query = "INSERT INTO $table_name (log) VALUES "; + + foreach ( $lines as $line ) { + $query .= "('$line'),"; + } + + $query = rtrim( $query, ',' ); + + $wpdb->query( $query ); + } + + /** + * Get all log lines + * + * @return mixed[] array of Log items + */ + public static function getAll() : array { + global $wpdb; + $logs = []; + + $table_name = $wpdb->prefix . 'statichtmloutput_log'; + + $rows = $wpdb->get_results( "SELECT time, log FROM $table_name ORDER BY id DESC" ); + + foreach ( $rows as $row ) { + $logs[] = $row; + } + + return $logs; + } + + /** + * Poll latest log lines + */ + public static function poll() : string { + global $wpdb; + $logs = ''; + + $table_name = $wpdb->prefix . 'statichtmloutput_log'; + + $rows = $wpdb->get_results( "SELECT time, log FROM $table_name ORDER BY id DESC" ); + + foreach ( $rows as $row ) { + $logs .= $row->time . ': ' . $row->log . PHP_EOL; + } + + return $logs; + } + + /** + * Clear Log via truncation + */ + public static function truncate() : void { + global $wpdb; + + $table_name = $wpdb->prefix . 'statichtmloutput_log'; + + $wpdb->query( "TRUNCATE TABLE $table_name" ); + + self::l( 'Deleted all Logs' ); + } +} + diff --git a/src/WsLog.php b/src/WsLog.php deleted file mode 100755 index 7ba400c2..00000000 --- a/src/WsLog.php +++ /dev/null @@ -1,35 +0,0 @@ - diff --git a/views/tab_crawling.phtml b/views/tab_crawling.phtml index 01e7f02b..b6866898 100755 --- a/views/tab_crawling.phtml +++ b/views/tab_crawling.phtml @@ -45,7 +45,11 @@ Generating initial file list -

+ +

total_detected_urls; ?> + + URLs were detected on your site that will be used to initiate the crawl. Other URLs will be discovered while crawling.

+

From 04fa65e38df560509a32cf8710f714e301fa5b8a Mon Sep 17 00:00:00 2001 From: Leon Date: Sat, 13 Jun 2020 02:40:11 +0930 Subject: [PATCH 06/39] WsLog to Logger --- src/Archive.php | 2 +- src/ArchiveProcessor.php | 18 +++++++++--------- src/BunnyCDN.php | 12 ++++++------ src/Controller.php | 16 ++++++++-------- src/CrawlLog.php | 4 ++-- src/CrawlQueue.php | 4 ++-- src/FileCopier.php | 4 ++-- src/FileWriter.php | 6 +++--- src/GitHub.php | 8 ++++---- src/GitLab.php | 2 +- src/Netlify.php | 2 +- src/S3.php | 10 +++++----- src/SiteCrawler.php | 16 ++++++++-------- src/SitePublisher.php | 6 +++--- tools/V6Cleanup.php | 8 ++++---- 15 files changed, 59 insertions(+), 59 deletions(-) diff --git a/src/Archive.php b/src/Archive.php index 2fa4f5c3..12a41071 100755 --- a/src/Archive.php +++ b/src/Archive.php @@ -23,7 +23,7 @@ public function __construct() { public function create() : void { if ( ! wp_mkdir_p( $this->path ) ) { - WsLog::l( "Couldn't create archive directory at $this->path" ); + Logger::l( "Couldn't create archive directory at $this->path" ); } } } diff --git a/src/ArchiveProcessor.php b/src/ArchiveProcessor.php index 2e3bd5d8..091d1e42 100755 --- a/src/ArchiveProcessor.php +++ b/src/ArchiveProcessor.php @@ -35,7 +35,7 @@ public function __construct() { public function renameWPDirectory( string $source, string $target ) : void { if ( empty( $source ) || empty( $target ) ) { - WsLog::l( + Logger::l( 'Failed trying to rename: ' . 'Source: ' . $source . ' to: ' . $target @@ -53,7 +53,7 @@ public function renameWPDirectory( string $source, string $target ) : void { $original_dir ); } else { - WsLog::l( + Logger::l( 'Trying to rename non-existent directory: ' . $original_dir ); @@ -167,7 +167,7 @@ public function copyStaticSiteToPublicFolder() : void { } else { if ( wp_mkdir_p( $target_folder ) ) { if ( ! $this->put_safety_file( $target_folder ) ) { - WsLog::l( + Logger::l( 'Couldn\'t put safety file in ' . 'Target Directory' . $target_folder @@ -176,7 +176,7 @@ public function copyStaticSiteToPublicFolder() : void { die(); } } else { - WsLog::l( + Logger::l( 'Couldn\'t create Target Directory: ' . $target_folder ); @@ -188,7 +188,7 @@ public function copyStaticSiteToPublicFolder() : void { // CHECK #2: check directory empty and add safety file if ( $directory_empty ) { if ( ! $this->put_safety_file( $target_folder ) ) { - WsLog::l( + Logger::l( 'Couldn\'t put safety file in ' . 'Target Directory' . $target_folder @@ -208,7 +208,7 @@ public function copyStaticSiteToPublicFolder() : void { ); if ( ! $this->put_safety_file( $target_folder ) ) { - WsLog::l( + Logger::l( 'Couldn\'t put safety file in ' . 'Target Directory' . $target_folder @@ -217,7 +217,7 @@ public function copyStaticSiteToPublicFolder() : void { die(); } } else { - WsLog::l( + Logger::l( 'Target Directory wasn\'t empty ' . 'or didn\'t contain safety file ' . $target_folder @@ -261,7 +261,7 @@ public function create_zip() : void { $zip_archive = new ZipArchive(); if ( $zip_archive->open( $temp_zip, ZIPARCHIVE::CREATE ) !== true ) { - WsLog::l( 'Could not create archive' ); + Logger::l( 'Could not create archive' ); return; } @@ -283,7 +283,7 @@ public function create_zip() : void { str_replace( $this->archive->path, '', $filename ) ) ) { - WsLog::l( 'Could not add file: ' . $filename ); + Logger::l( 'Could not add file: ' . $filename ); return; } } diff --git a/src/BunnyCDN.php b/src/BunnyCDN.php index 2968382f..5a8e2c07 100755 --- a/src/BunnyCDN.php +++ b/src/BunnyCDN.php @@ -147,8 +147,8 @@ public function purge_all_cache() : void { echo 'SUCCESS'; } } catch ( StaticHTMLOutputException $e ) { - WsLog::l( 'BUNNYCDN PURGE CACHE: error encountered' ); - WsLog::l( $e ); + Logger::l( 'BUNNYCDN PURGE CACHE: error encountered' ); + Logger::l( $e ); throw new StaticHTMLOutputException( $e ); } } @@ -178,8 +178,8 @@ public function test_deploy() : void { } } catch ( StaticHTMLOutputException $e ) { - WsLog::l( 'BUNNYCDN TEST EXPORT: error encountered' ); - WsLog::l( $e ); + Logger::l( 'BUNNYCDN TEST EXPORT: error encountered' ); + Logger::l( $e ); throw new StaticHTMLOutputException( $e ); } @@ -213,8 +213,8 @@ public function createFileInBunnyCDN() : void { } } catch ( StaticHTMLOutputException $e ) { - WsLog::l( 'BUNNYCDN EXPORT: error encountered' ); - WsLog::l( $e ); + Logger::l( 'BUNNYCDN EXPORT: error encountered' ); + Logger::l( $e ); $this->handleException( $e ); } } diff --git a/src/Controller.php b/src/Controller.php index 19dc4252..f519de41 100755 --- a/src/Controller.php +++ b/src/Controller.php @@ -265,7 +265,7 @@ public function prepare_for_export() : void { public function reset_default_settings() : void { if ( ! delete_option( 'statichtmloutput-options' ) ) { - WsLog::l( 'Error resetting options to defaults' ); + Logger::l( 'Error resetting options to defaults' ); echo 'ERROR'; } @@ -346,30 +346,30 @@ public function logEnvironmentalInfo() : void { $info[] = 'SERVER SOFTWARE ' . $_SERVER['SERVER_SOFTWARE']; } - WsLog::l( implode( PHP_EOL, $info ) ); + Logger::l( implode( PHP_EOL, $info ) ); - WsLog::l( 'Active plugins:' ); + Logger::l( 'Active plugins:' ); $active_plugins = get_option( 'active_plugins' ); foreach ( $active_plugins as $active_plugin ) { - WsLog::l( $active_plugin ); + Logger::l( $active_plugin ); } - WsLog::l( 'Plugin options:' ); + Logger::l( 'Plugin options:' ); $options = $this->options->getAllOptions( false ); foreach ( $options as $key => $value ) { - WsLog::l( "{$value['Option name']}: {$value['Value']}" ); + Logger::l( "{$value['Option name']}: {$value['Value']}" ); } - WsLog::l( 'Installed extensions:' ); + Logger::l( 'Installed extensions:' ); $extensions = get_loaded_extensions(); foreach ( $extensions as $extension ) { - WsLog::l( $extension ); + Logger::l( $extension ); } } } diff --git a/src/CrawlLog.php b/src/CrawlLog.php index bdeb0891..2c27acdd 100644 --- a/src/CrawlLog.php +++ b/src/CrawlLog.php @@ -100,7 +100,7 @@ public static function getTotalCrawlableURLs() : int { * Clear CrawlQueue via truncate or deletion */ public static function truncate() : void { - WsLog::l( 'Deleting CrawlQueue (Detected URLs)' ); + Logger::l( 'Deleting CrawlQueue (Detected URLs)' ); global $wpdb; @@ -111,7 +111,7 @@ public static function truncate() : void { $total_crawl_log = self::getTotalCrawlableURLs(); if ( $total_crawl_log > 0 ) { - WsLog::l( 'failed to truncate CrawlQueue: try deleting instead' ); + Logger::l( 'failed to truncate CrawlQueue: try deleting instead' ); } } diff --git a/src/CrawlQueue.php b/src/CrawlQueue.php index 2ab0f024..c281fbe2 100644 --- a/src/CrawlQueue.php +++ b/src/CrawlQueue.php @@ -90,7 +90,7 @@ public static function getTotalCrawlableURLs() : int { * Clear CrawlQueue via truncate or deletion */ public static function truncate() : void { - WsLog::l( 'Deleting CrawlQueue (Detected URLs)' ); + Logger::l( 'Deleting CrawlQueue (Detected URLs)' ); global $wpdb; @@ -101,7 +101,7 @@ public static function truncate() : void { $total_urls = self::getTotalCrawlableURLs(); if ( $total_urls > 0 ) { - WsLog::l( 'failed to truncate CrawlQueue: try deleting instead' ); + Logger::l( 'failed to truncate CrawlQueue: try deleting instead' ); } } diff --git a/src/FileCopier.php b/src/FileCopier.php index cd6b903d..a6535a31 100755 --- a/src/FileCopier.php +++ b/src/FileCopier.php @@ -44,7 +44,7 @@ public function getLocalFileForURL() : string { if ( is_file( $local_file ) ) { return $local_file; } else { - WsLog::l( + Logger::l( 'ERROR: trying to copy local file: ' . $local_file . ' for URL: ' . $this->url . ' (FILE NOT FOUND/UNREADABLE)' @@ -98,7 +98,7 @@ public function copyFile( string $archive_dir ) : void { if ( is_file( $local_file ) ) { copy( $local_file, $filename ); } else { - WsLog::l( + Logger::l( 'ERROR: trying to copy local file: ' . $local_file . ' to: ' . $filename . ' in archive dir: ' . $archive_dir . diff --git a/src/FileWriter.php b/src/FileWriter.php index 7a165ec3..1c71d0e8 100755 --- a/src/FileWriter.php +++ b/src/FileWriter.php @@ -112,17 +112,17 @@ public function saveFile( string $archive_dir ) : void { $write_result = file_put_contents( $filename, $file_contents ); if ( ! $write_result ) { - WsLog::l( "Failed saving $this->url to $filename" ); + Logger::l( "Failed saving $this->url to $filename" ); return; } $modified = chmod( $filename, 0664 ); if ( ! $modified ) { - WsLog::l( "Failed chmod'ing $filename" ); + Logger::l( "Failed chmod'ing $filename" ); } } else { - WsLog::l( "Not saving empty file $this->url" ); + Logger::l( "Not saving empty file $this->url" ); } } } diff --git a/src/GitHub.php b/src/GitHub.php index 6ca6cb26..bbd1c09f 100755 --- a/src/GitHub.php +++ b/src/GitHub.php @@ -178,13 +178,13 @@ public function test_upload() : void { $good_response_codes = [ 200, 201, 301, 302, 304 ]; if ( ! in_array( $status_code, $good_response_codes ) ) { - WsLog::l( "BAD RESPONSE STATUS ($status_code)" ); + Logger::l( "BAD RESPONSE STATUS ($status_code)" ); throw new StaticHTMLOutputException( 'GitHub API bad response status' ); } } catch ( StaticHTMLOutputException $e ) { - WsLog::l( 'GITHUB EXPORT: error encountered' ); - WsLog::l( $e ); + Logger::l( 'GITHUB EXPORT: error encountered' ); + Logger::l( $e ); throw new StaticHTMLOutputException( $e ); } @@ -244,7 +244,7 @@ public function fileExistsInGitHub() : bool { $commit_message = ''; if ( ! empty( $this->existing_file_object ) ) { - WsLog::l( "{$this->target_path} path exists in GitHub" ); + Logger::l( "{$this->target_path} path exists in GitHub" ); return true; } diff --git a/src/GitLab.php b/src/GitLab.php index 81527428..8e44d17b 100755 --- a/src/GitLab.php +++ b/src/GitLab.php @@ -206,7 +206,7 @@ public function getRepositoryTree( int $page ) : void { $good_response_codes = [ '200', '201', '301', '302', '304' ]; if ( ! in_array( $client->status_code, $good_response_codes ) ) { - WsLog::l( 'BAD RESPONSE STATUS (' . $client->status_code . '): ' ); + Logger::l( 'BAD RESPONSE STATUS (' . $client->status_code . '): ' ); throw new StaticHTMLOutputException( 'GitLab API bad response status' ); } diff --git a/src/Netlify.php b/src/Netlify.php index dd36d2ee..ae2f39f7 100755 --- a/src/Netlify.php +++ b/src/Netlify.php @@ -106,7 +106,7 @@ public function test_netlify() : void { } } else { $err = 'BAD RESPONSE STATUS FROM NETLIFY API'; - WsLog::l( $err ); + Logger::l( $err ); throw new StaticHTMLOutputException( $err ); } } catch ( StaticHTMLOutputException $e ) { diff --git a/src/S3.php b/src/S3.php index dbe99656..50640b95 100755 --- a/src/S3.php +++ b/src/S3.php @@ -130,7 +130,7 @@ public function test_s3() : void { echo 'SUCCESS'; } } catch ( StaticHTMLOutputException $e ) { - WsLog::l( 'S3 TEST ERROR RETURNED: ' . $e ); + Logger::l( 'S3 TEST ERROR RETURNED: ' . $e ); throw new StaticHTMLOutputException( $e ); } } @@ -260,10 +260,10 @@ public function put_s3_object( } public function cloudfront_invalidate_all_items() : void { - WsLog::l( 'Invalidating all CloudFront items' ); + Logger::l( 'Invalidating all CloudFront items' ); if ( ! isset( $this->settings['cfDistributionId'] ) ) { - WsLog::l( 'no Cloudfront ID found' ); + Logger::l( 'no Cloudfront ID found' ); if ( ! defined( 'WP_CLI' ) ) { echo 'SUCCESS'; } @@ -306,7 +306,7 @@ public function cloudfront_invalidate_all_items() : void { ); if ( ! $fp ) { - WsLog::l( "CLOUDFRONT CONNECTION ERROR: {$errno} {$errstr}" ); + Logger::l( "CLOUDFRONT CONNECTION ERROR: {$errno} {$errstr}" ); die( "Connection failed: {$errno} {$errstr}\n" ); } @@ -317,7 +317,7 @@ public function cloudfront_invalidate_all_items() : void { $resp .= fgets( $fp, 1024 ); } - WsLog::l( "CloudFront response body: {$resp}" ); + Logger::l( "CloudFront response body: {$resp}" ); fclose( $fp ); diff --git a/src/SiteCrawler.php b/src/SiteCrawler.php index 75c5588b..6a582980 100755 --- a/src/SiteCrawler.php +++ b/src/SiteCrawler.php @@ -211,7 +211,7 @@ public function crawl_discovered_links() : void { '/WP-STATIC-FINAL-2ND-CRAWL-LIST.txt'; if ( ! is_file( $this->list_of_urls_to_crawl_path ) ) { - WsLog::l( + Logger::l( 'ERROR: LIST OF URLS TO CRAWL NOT FOUND AT: ' . $this->list_of_urls_to_crawl_path ); @@ -244,7 +244,7 @@ public function crawl_site() : void { '/WP-STATIC-FINAL-CRAWL-LIST.txt'; if ( ! is_file( $this->list_of_urls_to_crawl_path ) ) { - WsLog::l( + Logger::l( 'ERROR: LIST OF URLS TO CRAWL NOT FOUND AT: ' . $this->list_of_urls_to_crawl_path ); @@ -278,7 +278,7 @@ public function crawlABitMore() : void { $total_links = count( $this->urls_to_crawl ); if ( $total_links < 1 ) { - WsLog::l( + Logger::l( 'ERROR: LIST OF URLS TO CRAWL NOT FOUND AT: ' . $this->list_of_urls_to_crawl_path ); @@ -348,7 +348,7 @@ public function crawlABitMore() : void { ); } - WsLog::l( + Logger::l( 'Exclusion rules ' . implode( PHP_EOL, $exclusions ) ); @@ -362,7 +362,7 @@ public function crawlABitMore() : void { $exclusion = trim( $exclusion ); if ( $exclusion != '' ) { if ( false !== strpos( $this->url, $exclusion ) ) { - WsLog::l( + Logger::l( 'Excluding ' . $this->url . ' because of rule ' . $exclusion ); @@ -452,7 +452,7 @@ public function loadFileForProcessing() : bool { $good_response_codes = [ 200, 201, 301, 302, 304 ]; if ( ! in_array( $status_code, $good_response_codes ) ) { - WsLog::l( + Logger::l( 'BAD RESPONSE STATUS (' . $status_code . '): ' . $this->url ); @@ -677,7 +677,7 @@ public function detectFileType() : void { } elseif ( stripos( $type, 'application/json' ) !== false ) { $this->file_type = 'json'; } else { - WsLog::l( + Logger::l( 'no filetype inferred from content-type: ' . $this->curl_content_type . ' url: ' . $this->url @@ -692,7 +692,7 @@ public function detectFileType() : void { public function checkForCurlErrors( string $response, $curl_handle ) : void { if ( ! $response ) { $response = curl_error( $curl_handle ); - WsLog::l( + Logger::l( 'cURL error:' . stripslashes( $response ) ); diff --git a/src/SitePublisher.php b/src/SitePublisher.php index 847f9b6e..f422b6ac 100755 --- a/src/SitePublisher.php +++ b/src/SitePublisher.php @@ -373,8 +373,8 @@ public function uploadsCompleted() : bool { * @throws StaticHTMLOutputException */ public function handleException( string $e ) : void { - WsLog::l( 'Deployment: error encountered' ); - WsLog::l( $e ); + Logger::l( 'Deployment: error encountered' ); + Logger::l( $e ); throw new StaticHTMLOutputException( $e ); } @@ -384,7 +384,7 @@ public function handleException( string $e ) : void { */ public function checkForValidResponses( int $code, array $good_codes ) : void { if ( ! in_array( $code, $good_codes ) ) { - WsLog::l( + Logger::l( 'BAD RESPONSE STATUS FROM API (' . $code . ')' ); diff --git a/tools/V6Cleanup.php b/tools/V6Cleanup.php index 72717e8b..25a8c444 100755 --- a/tools/V6Cleanup.php +++ b/tools/V6Cleanup.php @@ -9,7 +9,7 @@ public static function cleanup() : void { $deleted_v6_options = delete_option( 'wp2static-options' ); if ( $deleted_v6_options ) { - WsLog::l( 'Deleted Version 6 options from DB' ); + Logger::l( 'Deleted Version 6 options from DB' ); } $v6_txt_files = [ @@ -32,7 +32,7 @@ public static function cleanup() : void { $deleted_file = unlink( SiteInfo::getPath( 'uploads' ) . $txt_file ); if ( $deleted_file ) { - WsLog::l( 'Deleted Version 6 text file: ' . $txt_file ); + Logger::l( 'Deleted Version 6 text file: ' . $txt_file ); } } } @@ -44,7 +44,7 @@ public static function cleanup() : void { $deleted_zip = unlink( $v6_zip_file ); if ( $deleted_zip ) { - WsLog::l( 'Deleted Version 6 zip file: ' . $v6_zip_file ); + Logger::l( 'Deleted Version 6 zip file: ' . $v6_zip_file ); } } } @@ -54,7 +54,7 @@ public static function cleanup() : void { if ( is_array( $v6_archives ) ) { foreach ( $v6_archives as $v6_archive ) { if ( is_dir( $v6_archive ) ) { - WsLog::l( 'Deleting Version 6 archive: ' . $v6_archive ); + Logger::l( 'Deleting Version 6 archive: ' . $v6_archive ); FilesHelper::delete_dir_with_files( $v6_archive ); } } From 3523c65f144198c6b5ef42c68f47fab503982168 Mon Sep 17 00:00:00 2001 From: Leon Date: Sat, 13 Jun 2020 02:51:52 +0930 Subject: [PATCH 07/39] show detected urls immediately --- src/Controller.php | 2 +- statichtmloutput.css | 6 +----- views/options-page.phtml | 6 ++++-- views/tab_crawling.phtml | 2 +- 4 files changed, 7 insertions(+), 9 deletions(-) diff --git a/src/Controller.php b/src/Controller.php index f519de41..9c89914d 100755 --- a/src/Controller.php +++ b/src/Controller.php @@ -210,7 +210,6 @@ public function generate_filelist_preview() : void { public static function renderOptionsPage() : void { $instance = self::getInstance(); $instance->generate_filelist_preview(); - $instance->total_detected_urls = CrawlQueue::getTotal(); $instance->wp_site = new WPSite(); $instance->current_archive = ''; @@ -225,6 +224,7 @@ public static function renderOptionsPage() : void { ->setTemplate( 'options-page' ) ->assign( 'wp_site', $instance->wp_site ) ->assign( 'options', $instance->options ) + ->assign( 'total_detected_urls', CrawlQueue::getTotal() ) ->assign( 'onceAction', self::HOOK . '-options' ) ->render(); } diff --git a/statichtmloutput.css b/statichtmloutput.css index fe34f432..1859e3e8 100755 --- a/statichtmloutput.css +++ b/statichtmloutput.css @@ -51,10 +51,6 @@ div.postbox div.inside { width: 100%; } -#progress { - display: none; -} - #progress-container { min-height: 40px; position: absolute; @@ -90,7 +86,7 @@ div.postbox div.inside { height: 18px; width: 18px; position: relative; - display: inline-block; + display: none; margin-top: 0px; margin-right: 10px; text-align: center; diff --git a/views/options-page.phtml b/views/options-page.phtml index 6503919d..d002363b 100755 --- a/views/options-page.phtml +++ b/views/options-page.phtml @@ -139,7 +139,7 @@ $tpl = new StaticHTMLOutput\TemplateHelper();
onceAction) ?> - + @@ -160,7 +160,9 @@ $tpl = new StaticHTMLOutput\TemplateHelper();
-
Starting export
+
+ total_detected_urls; ?> URLs were detected. The rest of your site's URLs will be discovered during crawling. +

diff --git a/views/tab_crawling.phtml b/views/tab_crawling.phtml index b6866898..83ceaa87 100755 --- a/views/tab_crawling.phtml +++ b/views/tab_crawling.phtml @@ -51,7 +51,7 @@ URLs were detected on your site that will be used to initiate the crawl. Other URLs will be discovered while crawling.


- + Preview initial crawl list From 8821fb3376fb0602231ab27b0054eb4e2687b43c Mon Sep 17 00:00:00 2001 From: Leon Date: Sat, 13 Jun 2020 03:32:18 +0930 Subject: [PATCH 08/39] wip crawling from DB --- src/CSSProcessor.php | 12 -- src/CrawlQueue.php | 4 +- src/HTMLProcessor.php | 12 -- src/SiteCrawler.php | 240 +++++++++------------------------- static-html-output-plugin.php | 4 - views/options-page-js.phtml | 3 +- 6 files changed, 63 insertions(+), 212 deletions(-) diff --git a/src/CSSProcessor.php b/src/CSSProcessor.php index 2807cae7..b241e322 100755 --- a/src/CSSProcessor.php +++ b/src/CSSProcessor.php @@ -419,18 +419,6 @@ public function addDiscoveredURL( string $url ) : void { } public function writeDiscoveredURLs() : void { - $ajax_method = filter_input( INPUT_POST, 'ajax_action' ); - - if ( $ajax_method === 'crawl_again' ) { - return; - } - - if ( defined( 'WP_CLI' ) ) { - if ( defined( 'CRAWLING_DISCOVERED' ) ) { - return; - } - } - // TODO: check for existing URLs in CrawlLog and only add non-processed to CrawlQueue $unique_urls = array_unique( $this->discovered_urls ); array_filter( $unique_urls ); diff --git a/src/CrawlQueue.php b/src/CrawlQueue.php index c281fbe2..1be254a1 100644 --- a/src/CrawlQueue.php +++ b/src/CrawlQueue.php @@ -56,13 +56,13 @@ public static function addUrls( array $urls ) : void { * * @return string[] All crawlable URLs */ - public static function getCrawlablePaths() : array { + public static function getCrawlablePaths( int $limit = 500 ) : array { global $wpdb; $urls = []; $table_name = $wpdb->prefix . 'statichtmloutput_urls'; - $rows = $wpdb->get_results( "SELECT url FROM $table_name ORDER by url ASC" ); + $rows = $wpdb->get_results( "SELECT url FROM $table_name ORDER by url ASC LIMIT $limit" ); foreach ( $rows as $row ) { $urls[] = $row->url; diff --git a/src/HTMLProcessor.php b/src/HTMLProcessor.php index 2c806ad7..dceeac5b 100755 --- a/src/HTMLProcessor.php +++ b/src/HTMLProcessor.php @@ -847,18 +847,6 @@ public function processMeta( DOMElement $element ) : void { } public function writeDiscoveredURLs() : void { - $ajax_method = filter_input( INPUT_POST, 'ajax_action' ); - - if ( $ajax_method === 'crawl_again' ) { - return; - } - - if ( defined( 'WP_CLI' ) ) { - if ( defined( 'CRAWLING_DISCOVERED' ) ) { - return; - } - } - // TODO: check for existing URLs in CrawlLog and only add non-processed to CrawlQueue $unique_urls = array_unique( $this->discovered_urls ); array_filter( $unique_urls ); diff --git a/src/SiteCrawler.php b/src/SiteCrawler.php index 6a582980..855ebb16 100755 --- a/src/SiteCrawler.php +++ b/src/SiteCrawler.php @@ -90,16 +90,6 @@ public function __construct() { $this->archive_dir = ''; $this->list_of_urls_to_crawl_path = ''; $this->urls_to_crawl = []; - - if ( ! defined( 'WP_CLI' ) ) { - // @codingStandardsIgnoreStart - if ( $_POST['ajax_action'] === 'crawl_again' ) { - $this->crawl_discovered_links(); - } elseif ( $_POST['ajax_action'] === 'crawl_site' ) { - $this->crawl_site(); - } - // @codingStandardsIgnoreEnd - } } public function generate_discovered_links_list() : void { @@ -192,70 +182,12 @@ public function generate_discovered_links_list() : void { ); } - public function crawl_discovered_links() : void { - if ( defined( 'WP_CLI' ) && ! defined( 'CRAWLING_DISCOVERED' ) ) { - define( 'CRAWLING_DISCOVERED', true ); - } - - $second_crawl_file_path = $this->settings['wp_uploads_path'] . - '/WP-STATIC-2ND-CRAWL-LIST.txt'; - - // NOTE: the first iteration of the 2nd crawl phase, - // the list of URLs for 2nd crawl is prepared - if ( ! is_file( $second_crawl_file_path ) ) { - $this->generate_discovered_links_list(); - } - - $this->list_of_urls_to_crawl_path = - $this->settings['wp_uploads_path'] . - '/WP-STATIC-FINAL-2ND-CRAWL-LIST.txt'; - - if ( ! is_file( $this->list_of_urls_to_crawl_path ) ) { - Logger::l( - 'ERROR: LIST OF URLS TO CRAWL NOT FOUND AT: ' . - $this->list_of_urls_to_crawl_path - ); - die(); - } else { - if ( filesize( $this->list_of_urls_to_crawl_path ) ) { - $this->crawlABitMore(); - } else { - if ( ! defined( 'WP_CLI' ) ) { - echo 'SUCCESS'; - } - } - } - } - public function crawl_site() : void { - // crude detection for CLI export to use 2nd crawl phase - $this->list_of_urls_to_crawl_path = - $this->settings['wp_uploads_path'] . - '/WP-STATIC-FINAL-2ND-CRAWL-LIST.txt'; - - if ( is_file( $this->list_of_urls_to_crawl_path ) ) { - $this->crawl_discovered_links(); - - return; - } - - $this->list_of_urls_to_crawl_path = - $this->settings['wp_uploads_path'] . - '/WP-STATIC-FINAL-CRAWL-LIST.txt'; - - if ( ! is_file( $this->list_of_urls_to_crawl_path ) ) { - Logger::l( - 'ERROR: LIST OF URLS TO CRAWL NOT FOUND AT: ' . - $this->list_of_urls_to_crawl_path - ); - die(); + if ( CrawlQueue::getTotal() > 0 ) { + $this->crawlABitMore(); } else { - if ( filesize( $this->list_of_urls_to_crawl_path ) ) { - $this->crawlABitMore(); - } else { - if ( ! defined( 'WP_CLI' ) ) { - echo 'SUCCESS'; - } + if ( ! defined( 'WP_CLI' ) ) { + echo 'SUCCESS'; } } } @@ -263,140 +195,88 @@ public function crawl_site() : void { public function crawlABitMore() : void { $batch_of_links_to_crawl = []; - $crawl_list = file( - $this->list_of_urls_to_crawl_path, - FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES - ); + $crawl_list = CrawlQueue::getCrawlablePaths(); if ( ! $crawl_list ) { return; } - $this->urls_to_crawl = $crawl_list; + // get total CrawlQueue + $total_urls = CrawlQueue::getTotal(); - if ( is_array( $this->urls_to_crawl ) ) { - $total_links = count( $this->urls_to_crawl ); + // get batch size (smaller of total urls or crawl_increment) + $batch_size = min( $total_urls, $this->settings['crawl_increment'] ); - if ( $total_links < 1 ) { - Logger::l( - 'ERROR: LIST OF URLS TO CRAWL NOT FOUND AT: ' . - $this->list_of_urls_to_crawl_path - ); - die(); - } + // fetch just amount of URLs needed (limit to crawl_increment) + $this->urls_to_crawl = CrawlQueue::getCrawlablePaths( $batch_size ); - if ( $this->settings['crawl_increment'] > $total_links ) { - $this->settings['crawl_increment'] = $total_links; - } - - for ( $i = 0; $i < $this->settings['crawl_increment']; $i++ ) { - $link_from_crawl_list = array_shift( $this->urls_to_crawl ); + $this->archive_dir = $this->settings['wp_uploads_path'] . '/static-html-output/'; - if ( $link_from_crawl_list ) { - $batch_of_links_to_crawl[] = $link_from_crawl_list; - } - } + // TODO: modify this to show Detected / Crawled URL progress + // if ( defined( 'WP_CLI' ) && empty( $this->progress_bar ) ) { + // $this->progress_bar = + // \WP_CLI\Utils\make_progress_bar( 'Crawling site', $total_urls_to_crawl ); + // } - $this->remaining_urls_to_crawl = count( $this->urls_to_crawl ); + // TODO: add these to Exclusions table + $exclusions = [ 'wp-json' ]; - // resave crawl list file, minus those from this batch - file_put_contents( - $this->list_of_urls_to_crawl_path, - implode( "\r\n", $this->urls_to_crawl ) + if ( isset( $this->settings['excludeURLs'] ) ) { + $user_exclusions = explode( + "\n", + str_replace( "\r", '', $this->settings['excludeURLs'] ) ); - chmod( $this->list_of_urls_to_crawl_path, 0664 ); - - $this->archive_dir = $this->settings['wp_uploads_path'] . '/static-html-output/'; - - $total_urls_path = $this->settings['wp_uploads_path'] . - '/WP-STATIC-INITIAL-CRAWL-TOTAL.txt'; - - // TODO: avoid mutation - // @codingStandardsIgnoreStart - if ( - defined( 'CRAWLING_DISCOVERED' ) || - ( isset( $_POST['ajax_action'] ) && - $_POST['ajax_action'] == 'crawl_again' - ) - ) { - $total_urls_path = $this->settings['wp_uploads_path'] . - '/WP-STATIC-DISCOVERED-URLS-TOTAL.txt'; - } - // @codingStandardsIgnoreEnd - - $total_urls_to_crawl = (int) file_get_contents( $total_urls_path ); - - if ( defined( 'WP_CLI' ) && empty( $this->progress_bar ) ) { - $this->progress_bar = - \WP_CLI\Utils\make_progress_bar( 'Crawling site', $total_urls_to_crawl ); - } - - $batch_index = 0; - - $exclusions = [ 'wp-json' ]; - - if ( isset( $this->settings['excludeURLs'] ) ) { - $user_exclusions = explode( - "\n", - str_replace( "\r", '', $this->settings['excludeURLs'] ) - ); - - $exclusions = array_merge( - $exclusions, - $user_exclusions - ); - } - - Logger::l( - 'Exclusion rules ' . implode( PHP_EOL, $exclusions ) + $exclusions = array_merge( + $exclusions, + $user_exclusions ); + } - foreach ( $batch_of_links_to_crawl as $link_to_crawl ) { - $this->url = $link_to_crawl; + Logger::l( + 'Exclusion rules ' . implode( PHP_EOL, $exclusions ) + ); + + foreach ( $this->urls_to_crawl as $link_to_crawl ) { + $this->url = $link_to_crawl; - $this->full_url = $this->settings['wp_site_url'] . - ltrim( $this->url, '/' ); + $this->full_url = $this->settings['wp_site_url'] . + ltrim( $this->url, '/' ); - foreach ( $exclusions as $exclusion ) { - $exclusion = trim( $exclusion ); - if ( $exclusion != '' ) { - if ( false !== strpos( $this->url, $exclusion ) ) { - Logger::l( - 'Excluding ' . $this->url . - ' because of rule ' . $exclusion - ); + foreach ( $exclusions as $exclusion ) { + $exclusion = trim( $exclusion ); + if ( $exclusion != '' ) { + if ( false !== strpos( $this->url, $exclusion ) ) { + Logger::l( + 'Excluding ' . $this->url . + ' because of rule ' . $exclusion + ); - if ( ! empty( $this->progress_bar ) ) { - $this->progress_bar->tick(); - } + // TODO: reimplement progress bar + // if ( ! empty( $this->progress_bar ) ) { + // $this->progress_bar->tick(); + // } - // skip the outer foreach loop - continue 2; - } + // skip the outer foreach loop + continue 2; } } + } - $this->file_extension = $this->getExtensionFromURL(); - - if ( $this->loadFileForProcessing() ) { - $this->saveFile(); - } + $this->file_extension = $this->getExtensionFromURL(); - $batch_index++; + if ( $this->loadFileForProcessing() ) { + $this->saveFile(); + } - $completed_urls = - $total_urls_to_crawl - - $this->remaining_urls_to_crawl - - count( $batch_of_links_to_crawl ) + - $batch_index; + // TODO: get crawl status and remove URL from CrawlQueue - ProgressLog::l( $completed_urls, $total_urls_to_crawl ); + // ProgressLog::l( $completed_urls, $total_urls_to_crawl ); - if ( ! empty( $this->progress_bar ) ) { - $this->progress_bar->tick(); - } - } + // TODO: reimplement progress bar + // if ( ! empty( $this->progress_bar ) ) { + // $this->progress_bar->tick(); + // } } $this->checkIfMoreCrawlingNeeded(); diff --git a/static-html-output-plugin.php b/static-html-output-plugin.php index 42c72052..8bda36d9 100755 --- a/static-html-output-plugin.php +++ b/static-html-output-plugin.php @@ -80,10 +80,6 @@ function static_html_output_ajax() { return null; } elseif ( strpos( $ajax_method, 'crawl' ) !== false ) { $class = new StaticHTMLOutput\SiteCrawler(); - // crawl_again is used to detemine 2nd run of crawling - if ( $ajax_method === 'crawl_again' ) { - $ajax_method = 'crawl_discovered_links'; - } } elseif ( strpos( $ajax_method, 'bitbucket' ) !== false ) { $class = new StaticHTMLOutput\BitBucket(); diff --git a/views/options-page-js.phtml b/views/options-page-js.phtml index efd0e567..8bbd2bea 100755 --- a/views/options-page-js.phtml +++ b/views/options-page-js.phtml @@ -31,7 +31,6 @@ var nonLocalhostDomainRE = /^[^\s\.]+\.\S{2,}$/; var pollingIntervalID = ''; var status_descriptions = { 'crawl_site' : 'Crawling initial file list', - 'crawl_again' : 'Crawling discovered URLs', 'post_process_archive_dir' : 'Processing the crawled files', 'post_export_teardown' : 'Cleaning up after processing', 'netlify_do_export' : 'Deploying to Netlify', @@ -278,7 +277,7 @@ jQuery(document).ready(function($){ dataType: 'html', method: 'POST', success: function(serverResponse) { - doAJAXExport('crawl_site', 'crawl_again', 'post_process_archive_dir'); + doAJAXExport('crawl_site', 'post_process_archive_dir'); }, error: ajaxErrorHandler }); From 43b2aad4d8c92d9d387b85916db0f18624dc14d4 Mon Sep 17 00:00:00 2001 From: Leon Date: Sat, 13 Jun 2020 04:03:09 +0930 Subject: [PATCH 09/39] wip using CrawlLog --- src/CSSProcessor.php | 4 ++++ src/CrawlLog.php | 6 ++---- src/FilesHelper.php | 2 ++ src/HTMLProcessor.php | 25 +++++++++++++++++++++---- src/SiteCrawler.php | 10 ++++------ 5 files changed, 33 insertions(+), 14 deletions(-) diff --git a/src/CSSProcessor.php b/src/CSSProcessor.php index b241e322..3e80bdf1 100755 --- a/src/CSSProcessor.php +++ b/src/CSSProcessor.php @@ -424,6 +424,10 @@ public function writeDiscoveredURLs() : void { array_filter( $unique_urls ); sort( $unique_urls ); + if ( ! $unique_urls ) { + return; + } + // TODO: also add new URLs to CrawlLog CrawlQueue::addUrls( $unique_urls ); } diff --git a/src/CrawlLog.php b/src/CrawlLog.php index 2c27acdd..0425aeaf 100644 --- a/src/CrawlLog.php +++ b/src/CrawlLog.php @@ -45,16 +45,14 @@ public static function addUrls( array $urls, string $note, int $status = 0 ) : v continue; } - $placeholders[] = '(%s)'; + $placeholders[] = '(%s, %s, %d)'; $values[] = rawurldecode( $url ); - $placeholders[] = '(%s)'; $values[] = $note; - $placeholders[] = '(%d)'; $values[] = $status; } $query_string = - 'INSERT INTO ' . $table_name . ' (url) VALUES ' . + 'INSERT INTO ' . $table_name . ' (url, note, status) VALUES ' . implode( ', ', $placeholders ); $query = $wpdb->prepare( $query_string, $values ); diff --git a/src/FilesHelper.php b/src/FilesHelper.php index d804d55a..ea465347 100755 --- a/src/FilesHelper.php +++ b/src/FilesHelper.php @@ -299,6 +299,7 @@ public static function buildInitialFileList( ) : int { // clear CrawlQueue before rebuilding list CrawlQueue::truncate(); + CrawlLog::truncate(); $wp_site = new WPSite(); @@ -334,6 +335,7 @@ public static function buildInitialFileList( // TODO: also add to CrawlLog CrawlQueue::addUrls( $unique_urls ); + CrawlLog::addUrls( $unique_urls, 'initial_crawl_list', 0 ); file_put_contents( $uploads_path . '/WP-STATIC-INITIAL-CRAWL-TOTAL.txt', diff --git a/src/HTMLProcessor.php b/src/HTMLProcessor.php index dceeac5b..173ac572 100755 --- a/src/HTMLProcessor.php +++ b/src/HTMLProcessor.php @@ -848,12 +848,29 @@ public function processMeta( DOMElement $element ) : void { public function writeDiscoveredURLs() : void { // TODO: check for existing URLs in CrawlLog and only add non-processed to CrawlQueue - $unique_urls = array_unique( $this->discovered_urls ); - array_filter( $unique_urls ); - sort( $unique_urls ); + $discovered_urls = array_unique( $this->discovered_urls ); + array_filter( $discovered_urls ); + sort( $discovered_urls ); + + if ( ! $discovered_urls ) { + return; + } + + // get all from CrawlLog + $known_urls = CrawlLog::getCrawlablePaths(); + + // filter only new URLs + $new_urls = array_diff( $discovered_urls, $known_urls ); + + if ( ! $new_urls ) { + return; + } + + $page_url = (string) parse_url( $this->page_url, PHP_URL_PATH ); // TODO: also add new URLs to CrawlLog - CrawlQueue::addUrls( $unique_urls ); + CrawlLog::addUrls( $new_urls, 'discovered on: ' . $page_url , 0 ); + CrawlQueue::addUrls( $new_urls ); } // make link absolute, using current page to determine full path diff --git a/src/SiteCrawler.php b/src/SiteCrawler.php index 855ebb16..0f6e1fc8 100755 --- a/src/SiteCrawler.php +++ b/src/SiteCrawler.php @@ -62,10 +62,6 @@ class SiteCrawler extends StaticHTMLOutput { * @var string */ public $crawled_links_file; - /** - * @var int - */ - public $remaining_urls_to_crawl; public function __construct() { $this->loadSettings( @@ -499,9 +495,11 @@ public function loadFileForProcessing() : bool { } public function checkIfMoreCrawlingNeeded() : void { - if ( $this->remaining_urls_to_crawl > 0 ) { + $remaining_urls_to_crawl = CrawlQueue::getTotal(); + + if ( $remaining_urls_to_crawl > 0 ) { if ( ! defined( 'WP_CLI' ) ) { - echo $this->remaining_urls_to_crawl; + echo $remaining_urls_to_crawl; } else { $this->crawl_site(); } From 87128bdb8c772c1dcc8e982bcedff1aa5569d719 Mon Sep 17 00:00:00 2001 From: Leon Date: Sat, 13 Jun 2020 05:26:40 +0930 Subject: [PATCH 10/39] wip infinite crawling --- src/CSSProcessor.php | 102 ++++++++++++++++++++++-------------------- src/CrawlLog.php | 25 +++++++++-- src/CrawlQueue.php | 19 ++++++++ src/HTMLProcessor.php | 11 ++++- src/SiteCrawler.php | 45 +++++++++---------- 5 files changed, 126 insertions(+), 76 deletions(-) diff --git a/src/CSSProcessor.php b/src/CSSProcessor.php index 3e80bdf1..31481226 100755 --- a/src/CSSProcessor.php +++ b/src/CSSProcessor.php @@ -67,10 +67,6 @@ class CSSProcessor extends StaticHTMLOutput { * @var string[] */ public $discovered_urls; - /** - * @var bool - */ - public $harvest_new_urls; /** * @var string[] */ @@ -126,7 +122,6 @@ public function processCSS( string $css_document, string $page_url ) : bool { $css_parser = new Sabberworm\CSS\Parser( $this->raw_css ); $this->css_doc = $css_parser->parse(); $this->page_url = new Net_URL2( $page_url ); - $this->detectIfURLsShouldBeHarvested(); $this->discovered_urls = []; $this->urls_to_rewrite = []; @@ -358,20 +353,6 @@ public function rewriteSiteURLsToPlaceholder( return $rewritten_source; } - public function detectIfURLsShouldBeHarvested() : void { - if ( defined( 'WP_CLI' ) ) { - if ( defined( 'CRAWLING_DISCOVERED' ) ) { - return; - } else { - $this->harvest_new_urls = true; - } - } else { - $ajax_method = filter_input( INPUT_POST, 'ajax_action' ); - - $this->harvest_new_urls = $ajax_method === 'crawl_site'; - } - } - public function addDiscoveredURL( string $url ) : void { // only discover assets, not HTML/XML. etc $extension = pathinfo( $url, PATHINFO_EXTENSION ); @@ -384,52 +365,77 @@ public function addDiscoveredURL( string $url ) : void { $url = strtok( $url, '#' ); $url = trim( (string) strtok( (string) $url, '?' ) ); - if ( ! $url ) { + if ( trim( (string) $url ) === '') { return; } - if ( $this->harvest_new_urls ) { - if ( ! $this->isValidURL( $url ) ) { - return; - } + if ( ! $url ) { + return; + } - if ( $this->isInternalLink( $url ) ) { - // get FQU resolved to this page - $url = $this->page_url->resolve( $url ); + if ( ! $this->isValidURL( $url ) ) { + return; + } - $discovered_url_without_site_url = - str_replace( - rtrim( $this->wp_site_url, '/' ), - '', - $url - ); + if ( $this->isInternalLink( $url ) ) { + // get FQU resolved to this page + $url = $this->page_url->resolve( $url ); + + $discovered_url_without_site_url = + str_replace( + rtrim( $this->wp_site_url, '/' ), + '', + $url + ); + + $discovered_url_without_site_url = + str_replace( + rtrim( $this->placeholder_url, '/' ), + '', + $discovered_url_without_site_url + ); + + if ( is_string( $discovered_url_without_site_url ) ) { + // ignore empty or root / (duct tapes issue with / being repeatedly added) + if ( trim( $discovered_url_without_site_url ) === '/') { + return; + } - $discovered_url_without_site_url = - str_replace( - rtrim( $this->placeholder_url, '/' ), - '', - $discovered_url_without_site_url - ); + error_log('adding discovered via CSS ' . $discovered_url_without_site_url . PHP_EOL); - if ( is_string( $discovered_url_without_site_url ) ) { - $this->discovered_urls[] = $discovered_url_without_site_url; - } + $this->discovered_urls[] = $discovered_url_without_site_url; } } } public function writeDiscoveredURLs() : void { - // TODO: check for existing URLs in CrawlLog and only add non-processed to CrawlQueue - $unique_urls = array_unique( $this->discovered_urls ); - array_filter( $unique_urls ); - sort( $unique_urls ); + $discovered_urls = array_unique( $this->discovered_urls ); + array_filter( $discovered_urls ); + sort( $discovered_urls ); - if ( ! $unique_urls ) { + if ( ! $discovered_urls ) { return; } + // get all from CrawlLog + $known_urls = CrawlLog::getCrawlablePaths(); + + // filter only new URLs + $new_urls = array_diff( $discovered_urls, $known_urls ); + + if ( ! $new_urls ) { + return; + } + + $page_url = (string) parse_url( $this->page_url, PHP_URL_PATH ); + + error_log( $page_url . PHP_EOL); + error_log( 'new urls from CSS' . PHP_EOL); + error_log( print_r( $new_urls, true ) . PHP_EOL); + // TODO: also add new URLs to CrawlLog - CrawlQueue::addUrls( $unique_urls ); + CrawlLog::addUrls( $new_urls, 'discovered on: ' . $page_url , 0 ); + CrawlQueue::addUrls( $new_urls ); } public function isValidURL( string $url ) : bool { diff --git a/src/CrawlLog.php b/src/CrawlLog.php index 0425aeaf..84f52e27 100644 --- a/src/CrawlLog.php +++ b/src/CrawlLog.php @@ -95,10 +95,10 @@ public static function getTotalCrawlableURLs() : int { } /** - * Clear CrawlQueue via truncate or deletion + * Clear CrawlCrawl Log via truncate or deletion */ public static function truncate() : void { - Logger::l( 'Deleting CrawlQueue (Detected URLs)' ); + Logger::l( 'Deleting CrawlCrawl Log' ); global $wpdb; @@ -109,12 +109,12 @@ public static function truncate() : void { $total_crawl_log = self::getTotalCrawlableURLs(); if ( $total_crawl_log > 0 ) { - Logger::l( 'failed to truncate CrawlQueue: try deleting instead' ); + Logger::l( 'failed to truncate CrawlCrawl Log: try deleting instead' ); } } /** - * Count URLs in Crawl Queue + * Count URLs in Crawl Log */ public static function getTotal() : int { global $wpdb; @@ -125,4 +125,21 @@ public static function getTotal() : int { return $total; } + + /** + * Update URL status + */ + public static function updateStatus( string $url, int $status) : void { + global $wpdb; + + $table_name = $wpdb->prefix . 'statichtmloutput_crawl_log'; + + error_log("updating status $status $url" . PHP_EOL); + + $wpdb->update( + $table_name, + [ 'status' => $status ], + [ 'url' => $url ] + ); + } } diff --git a/src/CrawlQueue.php b/src/CrawlQueue.php index 1be254a1..f0ceccb1 100644 --- a/src/CrawlQueue.php +++ b/src/CrawlQueue.php @@ -117,4 +117,23 @@ public static function getTotal() : int { return $total; } + + /** + * Remove single URL from CrawlQueue + */ + public static function removeURL( string $url) : void { + global $wpdb; + + error_log('Deleting ' . $url . PHP_EOL); + + $table_name = $wpdb->prefix . 'statichtmloutput_urls'; + + $result = $wpdb->delete( + $table_name, + [ 'url' => $url ] + ); + + error_log( $result ? 'deleted ' . $result . ' rows' : 'failed to delete' ); + error_log( PHP_EOL ); + } } diff --git a/src/HTMLProcessor.php b/src/HTMLProcessor.php index 173ac572..5c135c29 100755 --- a/src/HTMLProcessor.php +++ b/src/HTMLProcessor.php @@ -624,6 +624,12 @@ public function addDiscoveredURL( string $url ) : void { return; } + if ( trim( $path ) === '/' ) { + return; + } + + error_log('adding discovered via HTML ' . $path . PHP_EOL); + $this->discovered_urls[] = $path; } } @@ -847,7 +853,6 @@ public function processMeta( DOMElement $element ) : void { } public function writeDiscoveredURLs() : void { - // TODO: check for existing URLs in CrawlLog and only add non-processed to CrawlQueue $discovered_urls = array_unique( $this->discovered_urls ); array_filter( $discovered_urls ); sort( $discovered_urls ); @@ -868,6 +873,10 @@ public function writeDiscoveredURLs() : void { $page_url = (string) parse_url( $this->page_url, PHP_URL_PATH ); + error_log( $page_url . PHP_EOL); + error_log( 'new urls from HTML' . PHP_EOL); + error_log( print_r( $new_urls, true ) . PHP_EOL); + // TODO: also add new URLs to CrawlLog CrawlLog::addUrls( $new_urls, 'discovered on: ' . $page_url , 0 ); CrawlQueue::addUrls( $new_urls ); diff --git a/src/SiteCrawler.php b/src/SiteCrawler.php index 0f6e1fc8..48a079a7 100755 --- a/src/SiteCrawler.php +++ b/src/SiteCrawler.php @@ -248,6 +248,16 @@ public function crawlABitMore() : void { ' because of rule ' . $exclusion ); + $url_path = (string) parse_url( $this->url, PHP_URL_PATH ); + + if ( ! $url_path ) { + continue 2; + } + + // TODO: dummy status to denote skipped due to exclusion rule + CrawlLog::updateStatus( $url_path, 777 ); + CrawlQueue::removeURL( $url_path ); + // TODO: reimplement progress bar // if ( ! empty( $this->progress_bar ) ) { // $this->progress_bar->tick(); @@ -327,33 +337,22 @@ public function loadFileForProcessing() : bool { $good_response_codes = [ 200, 201, 301, 302, 304 ]; - if ( ! in_array( $status_code, $good_response_codes ) ) { - Logger::l( - 'BAD RESPONSE STATUS (' . $status_code . '): ' . $this->url - ); + $url_path = (string) parse_url( $this->url, PHP_URL_PATH ); - file_put_contents( - $this->settings['wp_uploads_path'] . - '/WP-STATIC-404-LOG.txt', - $status_code . ':' . $this->url . PHP_EOL, - FILE_APPEND | LOCK_EX - ); + if ( ! $url_path ) { + return false; + } - chmod( - $this->settings['wp_uploads_path'] . - '/WP-STATIC-404-LOG.txt', - 0664 - ); + CrawlLog::updateStatus( $url_path, $status_code ); + CrawlQueue::removeURL( $url_path ); - return false; - } else { - file_put_contents( - $this->crawled_links_file, - $this->url . PHP_EOL, - FILE_APPEND | LOCK_EX - ); + error_log('Queue:' . PHP_EOL); + error_log( print_r( CrawlQueue::getCrawlablePaths(), true ) . PHP_EOL); + + if ( ! in_array( $status_code, $good_response_codes ) ) { + Logger::l( "BAD RESPONSE STATUS ($status_code): $this->url" ); - chmod( $this->crawled_links_file, 0664 ); + return false; } $base_url = $this->settings['baseUrl']; From 5ccba11a4fb6ba51cd0566dbadb1795cbd17a517 Mon Sep 17 00:00:00 2001 From: Leon Date: Sat, 13 Jun 2020 05:44:26 +0930 Subject: [PATCH 11/39] rm pagination detection --- src/FilesHelper.php | 143 -------------------------------------------- 1 file changed, 143 deletions(-) diff --git a/src/FilesHelper.php b/src/FilesHelper.php index ea465347..69b54c0d 100755 --- a/src/FilesHelper.php +++ b/src/FilesHelper.php @@ -509,27 +509,6 @@ public static function getAllWPPostURLs( string $wp_site_url ) : array { } } - // get all pagination links for each category - $category_pagination_urls = - self::getPaginationURLsForCategories( $category_links ); - - // get all pagination links for each post_type - $post_pagination_urls = - self::getPaginationURLsForPosts( - array_unique( $unique_post_types ) - ); - - // get all comment links - $comment_pagination_urls = - self::getPaginationURLsForComments( $wp_site_url ); - - $post_urls = array_merge( - $post_urls, - $post_pagination_urls, - $category_pagination_urls, - $comment_pagination_urls - ); - return $post_urls; } @@ -566,127 +545,5 @@ function ( string $url ) { return $cleaned_urls; } - - /** - * @param string[] $post_types to get pagination URLs from - * @return string[] list of URLs - */ - public static function getPaginationURLsForPosts( array $post_types ) : array { - global $wpdb, $wp_rewrite; - $wp_site = new WPSite(); - $pagination_base = $wp_rewrite->pagination_base; - $default_posts_per_page = get_option( 'posts_per_page' ); - $urls_to_include = []; - - foreach ( $post_types as $post_type ) { - $query = " - SELECT ID,post_type - FROM %s - WHERE post_status = '%s' - AND post_type = '%s'"; - - $count = $wpdb->get_results( - sprintf( - $query, - $wpdb->posts, - 'publish', - $post_type - ) - ); - - $post_type_obj = get_post_type_object( $post_type ); - - if ( ! $post_type_obj || ! isset( $post_type_obj->labels->name ) ) { - continue; - } - - $plural_form = strtolower( $post_type_obj->labels->name ); - $count = $wpdb->num_rows; - $total_pages = ceil( $count / $default_posts_per_page ); - $archive_link = get_post_type_archive_link( $post_type ); - - // only use pagination base when post type is page - for ( $page = 2; $page <= $total_pages; $page++ ) { - if ( $post_type === 'page' ) { - $pagination_url = - // TODO: check this against custom post types - // "/{$plural_form}/{$pagination_base}/{$page}"; - "/{$pagination_base}/{$page}"; - } else { - $pagination_url = - "/{$archive_link}/{$pagination_base}/{$page}"; - } - - $urls_to_include[] = str_replace( - $wp_site->site_url, - '/', - $pagination_url - ); - } - } - - return $urls_to_include; - } - - /** - * @param mixed[] $categories with total counts - * @return string[] list of URLs - */ - public static function getPaginationURLsForCategories( array $categories ) : array { - if ( ! $categories ) { - return []; - } - - global $wp_rewrite; - - $urls_to_include = []; - $pagination_base = $wp_rewrite->pagination_base; - $default_posts_per_page = get_option( 'posts_per_page' ); - - foreach ( $categories as $term => $total_posts ) { - $total_pages = ceil( $total_posts / $default_posts_per_page ); - - for ( $page = 2; $page <= $total_pages; $page++ ) { - $urls_to_include[] = - "{$term}/{$pagination_base}/{$page}"; - } - } - - return $urls_to_include; - } - - /** - * @return string[] list of URLs - */ - public static function getPaginationURLsForComments( string $wp_site_url ) : array { - global $wp_rewrite; - - $comments_pagination_base = $wp_rewrite->comments_pagination_base; - - $comments = get_comments(); - - if ( ! is_array( $comments ) ) { - return []; - } - - $urls_to_include = []; - - foreach ( $comments as $comment ) { - $comment_url = get_comment_link( $comment->comment_ID ); - $comment_url = strtok( $comment_url, '#' ); - - if ( ! is_string( $comment_url ) ) { - continue; - } - - $urls_to_include[] = str_replace( - $wp_site_url, - '', - $comment_url - ); - } - - return array_unique( $urls_to_include ); - } } From 3e41c39ebbb554064877ee358c55e17be04f22dd Mon Sep 17 00:00:00 2001 From: Leon Date: Sat, 13 Jun 2020 22:35:31 +0930 Subject: [PATCH 12/39] generate and view crawl queue --- src/CSSProcessor.php | 11 ++--------- src/Controller.php | 18 +++++++++++++++--- src/CrawlLog.php | 2 -- src/CrawlQueue.php | 5 ----- src/HTMLProcessor.php | 6 ------ src/SiteCrawler.php | 3 --- src/ViewRenderer.php | 18 ++++++++++++++++++ static-html-output-plugin.php | 2 +- views/crawl-queue-page.php | 23 +++++++++++++++++++++++ views/options-page-js.phtml | 3 ++- views/options-page.phtml | 7 ++++++- views/tab_crawling.phtml | 7 ++++++- 12 files changed, 73 insertions(+), 32 deletions(-) create mode 100644 src/ViewRenderer.php create mode 100755 views/crawl-queue-page.php diff --git a/src/CSSProcessor.php b/src/CSSProcessor.php index 31481226..36f5ce1b 100755 --- a/src/CSSProcessor.php +++ b/src/CSSProcessor.php @@ -401,8 +401,6 @@ public function addDiscoveredURL( string $url ) : void { return; } - error_log('adding discovered via CSS ' . $discovered_url_without_site_url . PHP_EOL); - $this->discovered_urls[] = $discovered_url_without_site_url; } } @@ -418,9 +416,9 @@ public function writeDiscoveredURLs() : void { } // get all from CrawlLog - $known_urls = CrawlLog::getCrawlablePaths(); + $known_urls = CrawlLog::getCrawlablePaths(); - // filter only new URLs + // filter only new URLs $new_urls = array_diff( $discovered_urls, $known_urls ); if ( ! $new_urls ) { @@ -429,11 +427,6 @@ public function writeDiscoveredURLs() : void { $page_url = (string) parse_url( $this->page_url, PHP_URL_PATH ); - error_log( $page_url . PHP_EOL); - error_log( 'new urls from CSS' . PHP_EOL); - error_log( print_r( $new_urls, true ) . PHP_EOL); - - // TODO: also add new URLs to CrawlLog CrawlLog::addUrls( $new_urls, 'discovered on: ' . $page_url , 0 ); CrawlQueue::addUrls( $new_urls ); } diff --git a/src/Controller.php b/src/Controller.php index 9c89914d..fae9fbeb 100755 --- a/src/Controller.php +++ b/src/Controller.php @@ -64,6 +64,7 @@ public static function init( string $bootstrap_file ) : Controller { add_filter( 'custom_menu_order', '__return_true' ); add_filter( 'menu_order', [ 'StaticHTMLOutput\Controller', 'set_menu_order' ] ); } + return $instance; } @@ -147,7 +148,14 @@ public static function activate( $network_wide ) : void { } public static function registerOptionsPage() : void { - $plugins_url = plugin_dir_url( dirname( __FILE__ ) ); + add_submenu_page( + '', + 'Static HTML Output Crawl Queue', + 'Crawl Queue', + 'manage_options', + 'statichtmloutput-crawl-queue', + [ 'StaticHTMLOutput\ViewRenderer', 'renderCrawlQueue' ] + ); $page = add_menu_page( 'Static HTML', @@ -182,7 +190,7 @@ public function finalize_deployment() : void { echo 'SUCCESS'; } - public function generate_filelist_preview() : void { + public function detect_urls() : void { $this->wp_site = new WPSite(); $target_settings = [ @@ -205,11 +213,15 @@ public function generate_filelist_preview() : void { $this->wp_site->uploads_url, $this->settings ); + + if ( ! defined( 'WP_CLI' ) ) { + echo 'SUCCESS'; + } } public static function renderOptionsPage() : void { $instance = self::getInstance(); - $instance->generate_filelist_preview(); + $instance->detect_urls(); $instance->wp_site = new WPSite(); $instance->current_archive = ''; diff --git a/src/CrawlLog.php b/src/CrawlLog.php index 84f52e27..84b2b01d 100644 --- a/src/CrawlLog.php +++ b/src/CrawlLog.php @@ -134,8 +134,6 @@ public static function updateStatus( string $url, int $status) : void { $table_name = $wpdb->prefix . 'statichtmloutput_crawl_log'; - error_log("updating status $status $url" . PHP_EOL); - $wpdb->update( $table_name, [ 'status' => $status ], diff --git a/src/CrawlQueue.php b/src/CrawlQueue.php index f0ceccb1..c3bab471 100644 --- a/src/CrawlQueue.php +++ b/src/CrawlQueue.php @@ -124,16 +124,11 @@ public static function getTotal() : int { public static function removeURL( string $url) : void { global $wpdb; - error_log('Deleting ' . $url . PHP_EOL); - $table_name = $wpdb->prefix . 'statichtmloutput_urls'; $result = $wpdb->delete( $table_name, [ 'url' => $url ] ); - - error_log( $result ? 'deleted ' . $result . ' rows' : 'failed to delete' ); - error_log( PHP_EOL ); } } diff --git a/src/HTMLProcessor.php b/src/HTMLProcessor.php index 5c135c29..3afd8ac6 100755 --- a/src/HTMLProcessor.php +++ b/src/HTMLProcessor.php @@ -628,8 +628,6 @@ public function addDiscoveredURL( string $url ) : void { return; } - error_log('adding discovered via HTML ' . $path . PHP_EOL); - $this->discovered_urls[] = $path; } } @@ -873,10 +871,6 @@ public function writeDiscoveredURLs() : void { $page_url = (string) parse_url( $this->page_url, PHP_URL_PATH ); - error_log( $page_url . PHP_EOL); - error_log( 'new urls from HTML' . PHP_EOL); - error_log( print_r( $new_urls, true ) . PHP_EOL); - // TODO: also add new URLs to CrawlLog CrawlLog::addUrls( $new_urls, 'discovered on: ' . $page_url , 0 ); CrawlQueue::addUrls( $new_urls ); diff --git a/src/SiteCrawler.php b/src/SiteCrawler.php index 48a079a7..21771845 100755 --- a/src/SiteCrawler.php +++ b/src/SiteCrawler.php @@ -346,9 +346,6 @@ public function loadFileForProcessing() : bool { CrawlLog::updateStatus( $url_path, $status_code ); CrawlQueue::removeURL( $url_path ); - error_log('Queue:' . PHP_EOL); - error_log( print_r( CrawlQueue::getCrawlablePaths(), true ) . PHP_EOL); - if ( ! in_array( $status_code, $good_response_codes ) ) { Logger::l( "BAD RESPONSE STATUS ($status_code): $this->url" ); diff --git a/src/ViewRenderer.php b/src/ViewRenderer.php new file mode 100644 index 00000000..5a01d513 --- /dev/null +++ b/src/ViewRenderer.php @@ -0,0 +1,18 @@ + + + + + + + + + + + + + + + + + + + + + +
URLs in Crawl Queue
Crawl queue is empty.
+ diff --git a/views/options-page-js.phtml b/views/options-page-js.phtml index 8bbd2bea..3eed7915 100755 --- a/views/options-page-js.phtml +++ b/views/options-page-js.phtml @@ -30,6 +30,7 @@ var localhostDomainRE = /^localhost[\:?\d]*(?:[^\:?\d]\S*)?$/ var nonLocalhostDomainRE = /^[^\s\.]+\.\S{2,}$/; var pollingIntervalID = ''; var status_descriptions = { + 'detect_urls' : 'Redetecting initial crawl URLs', 'crawl_site' : 'Crawling initial file list', 'post_process_archive_dir' : 'Processing the crawled files', 'post_export_teardown' : 'Cleaning up after processing', @@ -277,7 +278,7 @@ jQuery(document).ready(function($){ dataType: 'html', method: 'POST', success: function(serverResponse) { - doAJAXExport('crawl_site', 'post_process_archive_dir'); + doAJAXExport('detect_urls', 'crawl_site', 'post_process_archive_dir'); }, error: ajaxErrorHandler }); diff --git a/views/options-page.phtml b/views/options-page.phtml index d002363b..2e3df81d 100755 --- a/views/options-page.phtml +++ b/views/options-page.phtml @@ -161,7 +161,12 @@ $tpl = new StaticHTMLOutput\TemplateHelper();
- total_detected_urls; ?> URLs were detected. The rest of your site's URLs will be discovered during crawling. + + total_detected_urls; ?> URLs + + were detected. The rest of your site's URLs will be discovered during crawling.
diff --git a/views/tab_crawling.phtml b/views/tab_crawling.phtml index 83ceaa87..359f999e 100755 --- a/views/tab_crawling.phtml +++ b/views/tab_crawling.phtml @@ -51,7 +51,12 @@ URLs were detected on your site that will be used to initiate the crawl. Other URLs will be discovered while crawling.


- Preview initial crawl list + + Show Detected URLs + From 6931aafff77dc79ba57c7f90638d94127611181a Mon Sep 17 00:00:00 2001 From: Leon Date: Sun, 14 Jun 2020 00:05:32 +0930 Subject: [PATCH 13/39] show incremental crawl progress --- src/Controller.php | 16 ++++++---------- src/CrawlLog.php | 18 ++++++++++++++++++ static-html-output-plugin.php | 20 ++++++++++++++++++++ views/options-page-js.phtml | 29 +++++++++++------------------ 4 files changed, 55 insertions(+), 28 deletions(-) diff --git a/src/Controller.php b/src/Controller.php index fae9fbeb..f7ea79d7 100755 --- a/src/Controller.php +++ b/src/Controller.php @@ -68,7 +68,6 @@ public static function init( string $bootstrap_file ) : Controller { return $instance; } - /** * Adjusts position of dashboard menu icons * @@ -110,6 +109,8 @@ public function setDefaultOptions() : void { } public function activate_for_single_site() : void { + // add_action( 'init', [ 'StaticHTMLOutput\Controller', 'add_custom_routes' ], 0 ); + Logger::createTable(); $this->setDefaultOptions(); CrawlQueue::createTable(); @@ -148,15 +149,6 @@ public static function activate( $network_wide ) : void { } public static function registerOptionsPage() : void { - add_submenu_page( - '', - 'Static HTML Output Crawl Queue', - 'Crawl Queue', - 'manage_options', - 'statichtmloutput-crawl-queue', - [ 'StaticHTMLOutput\ViewRenderer', 'renderCrawlQueue' ] - ); - $page = add_menu_page( 'Static HTML', 'Static HTML', @@ -227,6 +219,10 @@ public static function renderOptionsPage() : void { $instance->view ->setTemplate( 'options-page-js' ) + ->assign( + 'crawl_progress_url', + admin_url('admin.php?page=statichtmloutput&statichtmloutput-crawl-progress=1') + ) ->assign( 'options', $instance->options ) ->assign( 'wp_site', $instance->wp_site ) ->assign( 'onceAction', self::HOOK . '-options' ) diff --git a/src/CrawlLog.php b/src/CrawlLog.php index 84b2b01d..f08f8702 100644 --- a/src/CrawlLog.php +++ b/src/CrawlLog.php @@ -94,6 +94,24 @@ public static function getTotalCrawlableURLs() : int { return $total_crawl_log; } + /** + * Get crawled URLs + * + * @return int Total crawled URLs + */ + public static function getTotalCrawledURLs() : int { + global $wpdb; + + $table_name = $wpdb->prefix . 'statichtmloutput_crawl_log'; + + $total_crawl_log = + $wpdb->get_var( + "SELECT COUNT(*) FROM $table_name WHERE status > 0" + ); + + return $total_crawl_log; + } + /** * Clear CrawlCrawl Log via truncate or deletion */ diff --git a/static-html-output-plugin.php b/static-html-output-plugin.php index bd1bbcab..9a3eb466 100755 --- a/static-html-output-plugin.php +++ b/static-html-output-plugin.php @@ -25,6 +25,24 @@ StaticHTMLOutput\Controller::init( __FILE__ ); +$crawl_progress = filter_input( INPUT_GET, 'statichtmloutput-crawl-progress' ); + +if ( $crawl_progress ) { + if ( ! is_admin() ) { + wp_send_json( [ 'message' => 'Not permitted' ], 403 ); + } + + $detected_urls = StaticHTMLOutput\CrawlLog::getTotalCrawlableURLs(); + $crawled_urls = StaticHTMLOutput\CrawlLog::getTotalCrawledURLs(); + + $json_response = [ + 'detected' => $detected_urls, + 'crawled' => $crawled_urls + ]; + + wp_send_json( $json_response, 200 ); +} + function static_html_output_action_links( $links ) { $settings_link = 'Settings'; array_unshift( $links, $settings_link ); @@ -46,6 +64,8 @@ function wp_static_html_output_server_side_export() { 0 ); + + add_filter( 'plugin_action_links_' . plugin_basename( __FILE__ ), 'static_html_output_action_links' diff --git a/views/options-page-js.phtml b/views/options-page-js.phtml index 3eed7915..716d7737 100755 --- a/views/options-page-js.phtml +++ b/views/options-page-js.phtml @@ -1,6 +1,8 @@