diff --git a/inc/indexer.php b/inc/indexer.php index 7cddb7c545..0fbd939be6 100644 --- a/inc/indexer.php +++ b/inc/indexer.php @@ -54,7 +54,7 @@ * Version of the indexer taking into consideration the external tokenizer. * The indexer is only compatible with data written by the same version. * - * Triggers INDEXER_VERSION_GET + * @triggers INDEXER_VERSION_GET * Plugins that modify what gets indexed should hook this event and * add their version info to the event data like so: * $data[$plugin_name] = $plugin_version; @@ -66,10 +66,7 @@ function idx_get_version(){ static $indexer_version = null; if ($indexer_version == null) { global $conf; - if($conf['external_tokenizer']) - $version = INDEXER_VERSION . '+' . trim($conf['tokenizer_cmd']); - else - $version = INDEXER_VERSION; + $version = INDEXER_VERSION; // DokuWiki version is included for the convenience of plugins $data = array('dokuwiki'=>$version); @@ -405,6 +402,10 @@ public function deletePage($page) { * * TODO: does this also need &$stopwords ? * + * @triggers INDEXER_TEXT_PREPARE + * This event allows plugins to modify the text before it gets tokenized. + * Plugins intercepting this event should also intercept INDEXER_VERSION_GET + * * @param string $text plain text * @param boolean $wc are wildcards allowed? * @return array list of words in the text @@ -417,16 +418,18 @@ public function tokenizer($text, $wc=false) { $wc = ($wc) ? 
'' : '\*'; $stopwords =& idx_get_stopwords(); - if ($conf['external_tokenizer'] && $conf['tokenizer_cmd'] != '') { - if (0 == io_exec($conf['tokenizer_cmd'], $text, $output)) - $text = $output; - } else { + // prepare the text to be tokenized + $evt = new Doku_Event('INDEXER_TEXT_PREPARE', $text); + if ($evt->advise_before(true)) { if (preg_match('/[^0-9A-Za-z ]/u', $text)) { // handle asian chars as single words (may fail on older PHP version) $asia = @preg_replace('/('.IDX_ASIAN.')/u', ' \1 ', $text); if (!is_null($asia)) $text = $asia; // recover from regexp falure } } + $evt->advise_after(); + unset($evt); + $text = strtr($text, array( "\r" => ' ', diff --git a/lib/plugins/config/lang/en/lang.php b/lib/plugins/config/lang/en/lang.php index d7a5448502..18bfb56fa6 100644 --- a/lib/plugins/config/lang/en/lang.php +++ b/lib/plugins/config/lang/en/lang.php @@ -142,8 +142,6 @@ $lang['renderer__core'] = '%s (dokuwiki core)'; $lang['renderer__plugin'] = '%s (plugin)'; $lang['rememberme'] = 'Allow permanent login cookies (remember me)'; -$lang['external_tokenizer'] = 'Use an external program to split pages into words for indexing'; -$lang['tokenizer_cmd'] = 'Command line to start the external tokenizer'; $lang['rss_type'] = 'XML feed type'; $lang['rss_linkto'] = 'XML feed links to'; diff --git a/lib/plugins/config/settings/config.metadata.php b/lib/plugins/config/settings/config.metadata.php index ca2cd0c12e..af7e63a618 100644 --- a/lib/plugins/config/settings/config.metadata.php +++ b/lib/plugins/config/settings/config.metadata.php @@ -194,8 +194,6 @@ $meta['xsendfile'] = array('multichoice','_choices' => array(0,1,2,3)); $meta['renderer_xhtml'] = array('renderer','_format' => 'xhtml','_choices' => array('xhtml')); $meta['readdircache'] = array('numeric'); -$meta['external_tokenizer'] = array('onoff'); -$meta['tokenizer_cmd'] = array('string'); $meta['_network'] = array('fieldset'); $meta['proxy____host'] = array('string','_pattern' => '#^(|[a-z0-9\-\.+]+)$#i');