Merge branch 'indexer_rewrite'

dokuwiki · Mar 7, 2011 · d8e733e · d8e733e
2 parents dea1115 + ad79cb7
commit d8e733e
Show file tree

Hide file tree

Showing 11 changed files with 1,229 additions and 644 deletions.
diff --git a/bin/indexer.php b/bin/indexer.php
@@ -13,10 +13,6 @@
 require_once(DOKU_INC.'inc/cliopts.php');
 session_write_close();
 
-// Version tag used to force rebuild on upgrade
-// Need to keep in sync with lib/exe/indexer.php
-if(!defined('INDEXER_VERSION')) define('INDEXER_VERSION', 2);
-
 // handle options
 $short_opts = 'hcuq';
 $long_opts  = array('help', 'clear', 'update', 'quiet');
@@ -28,6 +24,7 @@
 }
 $CLEAR = false;
 $QUIET = false;
+$INDEXER = null;
 foreach ($OPTS->options as $key => $val) {
     switch ($key) {
         case 'h':
@@ -70,6 +67,9 @@ function _usage() {
 
 function _update(){
     global $conf;
+    global $INDEXER;
+
+    $INDEXER = idx_get_indexer();
 
     $data = array();
     _quietecho("Searching pages... ");
@@ -82,25 +82,21 @@ function _update(){
 }
 
 function _index($id){
+    global $INDEXER;
     global $CLEAR;
+    global $QUIET;
 
     // when clearing, remove the page's .indexed tag file before re-adding the page
-    if(!$CLEAR){
+    if($CLEAR){
         $idxtag = metaFN($id,'.indexed');
         if(@file_exists($idxtag)){
-            if(io_readFile($idxtag) >= INDEXER_VERSION){
-                $last = @filemtime(metaFN($id,'.indexed'));
-                if($last > @filemtime(wikiFN($id))) return;
-            }
+            @unlink($idxtag);
         }
     }
 
-    _lock();
     _quietecho("$id... ");
-    idx_addPage($id);
-    io_saveFile(metaFN($id,'.indexed'),INDEXER_VERSION);
+    idx_addPage($id, !$QUIET);
     _quietecho("done.\n");
-    _unlock();
 }
 
 /**
@@ -145,7 +141,7 @@ function _clearindex(){
     _lock();
     _quietecho("Clearing index... ");
     io_saveFile($conf['indexdir'].'/page.idx','');
-    io_saveFile($conf['indexdir'].'/title.idx','');
+    //io_saveFile($conf['indexdir'].'/title.idx','');
     $dir = @opendir($conf['indexdir']);
     if($dir!==false){
         while(($f = readdir($dir)) !== false){
@@ -154,6 +150,7 @@ function _clearindex(){
                 @unlink($conf['indexdir']."/$f");
         }
     }
+    @unlink($conf['indexdir'].'/lengths.idx');
     _quietecho("done.\n");
     _unlock();
 }

diff --git a/conf/dokuwiki.php b/conf/dokuwiki.php
@@ -134,6 +134,8 @@
 $conf['xsendfile']   = 0;                //Use X-Sendfile (1 = lighttpd, 2 = standard)
 $conf['renderer_xhtml'] = 'xhtml';       //renderer to use for main page generation
 $conf['rememberme'] = 1;                 //Enable/disable remember me on login
+$conf['external_tokenizer'] = 0;         //Use an external program to split pages into words for indexing
+$conf['tokenizer_cmd'] = '/usr/bin/mecab -O wakati';
 
 //Set target to use when creating links - leave empty for same window
 $conf['target']['wiki']      = '';

diff --git a/inc/Sitemapper.php b/inc/Sitemapper.php
@@ -45,7 +45,7 @@ public function generate(){
 
         dbglog("Sitemapper::generate(): using $sitemap"); // FIXME: Only in debug mode
 
-        $pages = idx_getIndex('page', '');
+        $pages = idx_get_indexer()->getPages();
         dbglog('Sitemapper::generate(): creating sitemap using '.count($pages).' pages');
         $items = array();
 

diff --git a/inc/fulltext.php b/inc/fulltext.php
@@ -36,19 +36,21 @@ function ft_pageSearch($query,&$highlight){
  * @author Kazutaka Miyasaka <kazmiya@gmail.com>
  */
 function _ft_pageSearch(&$data) {
+    $Indexer = idx_get_indexer();
+
     // parse the given query
-    $q = ft_queryParser($data['query']);
+    $q = ft_queryParser($Indexer, $data['query']);
     $data['highlight'] = $q['highlight'];
 
     if (empty($q['parsed_ary'])) return array();
 
     // lookup all words found in the query
-    $lookup = idx_lookup($q['words']);
+    $lookup = $Indexer->lookup($q['words']);
 
     // get all pages in this dokuwiki site (!: includes nonexistent pages)
     $pages_all = array();
-    foreach (idx_getIndex('page', '') as $id) {
-        $pages_all[trim($id)] = 0; // base: 0 hit
+    foreach ($Indexer->getPages() as $id) {
+        $pages_all[$id] = 0; // base: 0 hit
     }
 
     // process the query
@@ -122,29 +124,12 @@ function _ft_pageSearch(&$data) {
 /**
  * Returns the backlinks for a given page
  *
- * Does a quick lookup with the fulltext index, then
- * evaluates the instructions of the found pages
+ * Uses the metadata index.
  */
 function ft_backlinks($id){
-    global $conf;
-    $swfile   = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt';
-    $stopwords = @file_exists($swfile) ? file($swfile) : array();
-
     $result = array();
 
-    // quick lookup of the pagename
-    $page    = noNS($id);
-    $matches = idx_lookup(idx_tokenizer($page,$stopwords));  // pagename may contain specials (_ or .)
-    $docs    = array_keys(ft_resultCombine(array_values($matches)));
-    $docs    = array_filter($docs,'isVisiblePage'); // discard hidden pages
-    if(!count($docs)) return $result;
-
-    // check metadata for matching links
-    foreach($docs as $match){
-        // metadata relation reference links are already resolved
-        $links = p_get_metadata($match,'relation references',false);
-        if (isset($links[$id])) $result[] = $match;
-    }
+    $result = idx_get_indexer()->lookupKey('relation_references', $id);
 
     if(!count($result)) return $result;
 
@@ -168,17 +153,14 @@ function ft_backlinks($id){
  * Aborts after $max found results
  */
 function ft_mediause($id,$max){
-    global $conf;
-    $swfile   = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt';
-    $stopwords = @file_exists($swfile) ? file($swfile) : array();
-
     if(!$max) $max = 1; // need to find at least one
 
     $result = array();
 
     // quick lookup of the mediafile
+    // FIXME use metadata key lookup
     $media   = noNS($id);
-    $matches = idx_lookup(idx_tokenizer($media,$stopwords));
+    $matches = idx_lookup(idx_tokenizer($media));
     $docs    = array_keys(ft_resultCombine(array_values($matches)));
     if(!count($docs)) return $result;
 
@@ -229,7 +211,6 @@ function ft_pageLookup($id, $in_ns=false, $in_title=false){
 }
 
 function _ft_pageLookup(&$data){
-    global $conf;
     // split out original parameters
     $id = $data['id'];
     if (preg_match('/(?:^| )@(\w+)/', $id, $matches)) {
@@ -239,29 +220,32 @@ function _ft_pageLookup(&$data){
 
     $in_ns    = $data['in_ns'];
     $in_title = $data['in_title'];
+    $cleaned = cleanID($id);
 
-    $pages  = array_map('rtrim', idx_getIndex('page', ''));
-    $titles = array_map('rtrim', idx_getIndex('title', ''));
-    // check for corrupt title index #FS2076
-    if(count($pages) != count($titles)){
-        $titles = array_fill(0,count($pages),'');
-        @unlink($conf['indexdir'].'/title.idx'); // will be rebuilt in inc/init.php
-    }
-    $pages = array_combine($pages, $titles);
+    $Indexer = idx_get_indexer();
+    $page_idx = $Indexer->getPages();
 
-    $cleaned = cleanID($id);
+    $pages = array();
     if ($id !== '' && $cleaned !== '') {
-        foreach ($pages as $p_id => $p_title) {
-            if ((strpos($in_ns ? $p_id : noNSorNS($p_id), $cleaned) === false) &&
-                (!$in_title || (stripos($p_title, $id) === false)) ) {
-                unset($pages[$p_id]);
+        foreach ($page_idx as $p_id) {
+            if ((strpos($in_ns ? $p_id : noNSorNS($p_id), $cleaned) !== false)) {
+                if (!isset($pages[$p_id]))
+                    $pages[$p_id] = p_get_first_heading($p_id, false);
+            }
+        }
+        if ($in_title) {
+            $wildcard_id = "*$id*";
+            foreach ($Indexer->lookupKey('title', $wildcard_id) as $p_id) {
+                if (!isset($pages[$p_id]))
+                    $pages[$p_id] = p_get_first_heading($p_id, false);
             }
         }
     }
     if (isset($ns)) {
-        foreach (array_keys($pages) as $p_id) {
-            if (strpos($p_id, $ns) !== 0) {
-                unset($pages[$p_id]);
+        foreach ($page_idx as $p_id) {
+            if (strpos($p_id, $ns) === 0) {
+                if (!isset($pages[$p_id]))
+                    $pages[$p_id] = p_get_first_heading($p_id, false);
             }
         }
     }
@@ -500,11 +484,7 @@ function ft_resultComplement($args) {
  * @author Andreas Gohr <andi@splitbrain.org>
  * @author Kazutaka Miyasaka <kazmiya@gmail.com>
  */
-function ft_queryParser($query){
-    global $conf;
-    $swfile    = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt';
-    $stopwords = @file_exists($swfile) ? file($swfile) : array();
-
+function ft_queryParser($Indexer, $query){
     /**
      * parse a search query and transform it into intermediate representation
      *
@@ -550,7 +530,7 @@ function ft_queryParser($query){
         if (preg_match('/^(-?)"(.+)"$/u', $term, $matches)) {
             // phrase-include and phrase-exclude
             $not = $matches[1] ? 'NOT' : '';
-            $parsed = $not.ft_termParser($matches[2], $stopwords, false, true);
+            $parsed = $not.ft_termParser($Indexer, $matches[2], false, true);
         } else {
             // fix incomplete phrase
             $term = str_replace('"', ' ', $term);
@@ -597,10 +577,10 @@ function ft_queryParser($query){
                     $parsed .= '(N+:'.$matches[1].')';
                 } elseif (preg_match('/^-(.+)$/', $token, $matches)) {
                     // word-exclude
-                    $parsed .= 'NOT('.ft_termParser($matches[1], $stopwords).')';
+                    $parsed .= 'NOT('.ft_termParser($Indexer, $matches[1]).')';
                 } else {
                     // word-include
-                    $parsed .= ft_termParser($token, $stopwords);
+                    $parsed .= ft_termParser($Indexer, $token);
                 }
             }
         }
@@ -734,18 +714,18 @@ function ft_queryParser($query){
  *
  * @author Kazutaka Miyasaka <kazmiya@gmail.com>
  */
-function ft_termParser($term, &$stopwords, $consider_asian = true, $phrase_mode = false) {
+function ft_termParser($Indexer, $term, $consider_asian = true, $phrase_mode = false) {
     $parsed = '';
     if ($consider_asian) {
         // successive asian characters need to be searched as a phrase
         $words = preg_split('/('.IDX_ASIAN.'+)/u', $term, -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY);
         foreach ($words as $word) {
             $phrase_mode = $phrase_mode ? true : preg_match('/'.IDX_ASIAN.'/u', $word);
-            $parsed .= ft_termParser($word, $stopwords, false, $phrase_mode);
+            $parsed .= ft_termParser($Indexer, $word, false, $phrase_mode);
         }
     } else {
         $term_noparen = str_replace(array('(', ')'), ' ', $term);
-        $words = idx_tokenizer($term_noparen, $stopwords, true);
+        $words = $Indexer->tokenizer($term_noparen, true);
 
         // W_: no need to highlight
         if (empty($words)) {