Skip to content

Commit

Permalink
Merge branch 'indexer_rewrite'
Browse files Browse the repository at this point in the history
  • Loading branch information
michitux committed Mar 7, 2011
2 parents dea1115 + ad79cb7 commit d8e733e
Show file tree
Hide file tree
Showing 11 changed files with 1,229 additions and 644 deletions.
25 changes: 11 additions & 14 deletions bin/indexer.php
Expand Up @@ -13,10 +13,6 @@
require_once(DOKU_INC.'inc/cliopts.php');
session_write_close();

// Version tag used to force rebuild on upgrade
// Need to keep in sync with lib/exe/indexer.php
if(!defined('INDEXER_VERSION')) define('INDEXER_VERSION', 2);

// handle options
$short_opts = 'hcuq';
$long_opts = array('help', 'clear', 'update', 'quiet');
Expand All @@ -28,6 +24,7 @@
}
$CLEAR = false;
$QUIET = false;
$INDEXER = null;
foreach ($OPTS->options as $key => $val) {
switch ($key) {
case 'h':
Expand Down Expand Up @@ -70,6 +67,9 @@ function _usage() {

function _update(){
global $conf;
global $INDEXER;

$INDEXER = idx_get_indexer();

$data = array();
_quietecho("Searching pages... ");
Expand All @@ -82,25 +82,21 @@ function _update(){
}

function _index($id){
global $INDEXER;
global $CLEAR;
global $QUIET;

// if not cleared only update changed and new files; when clearing, remove the page's .indexed tag
if(!$CLEAR){
if($CLEAR){
$idxtag = metaFN($id,'.indexed');
if(@file_exists($idxtag)){
if(io_readFile($idxtag) >= INDEXER_VERSION){
$last = @filemtime(metaFN($id,'.indexed'));
if($last > @filemtime(wikiFN($id))) return;
}
@unlink($idxtag);
}
}

_lock();
_quietecho("$id... ");
idx_addPage($id);
io_saveFile(metaFN($id,'.indexed'),INDEXER_VERSION);
idx_addPage($id, !$QUIET);
_quietecho("done.\n");
_unlock();
}

/**
Expand Down Expand Up @@ -145,7 +141,7 @@ function _clearindex(){
_lock();
_quietecho("Clearing index... ");
io_saveFile($conf['indexdir'].'/page.idx','');
io_saveFile($conf['indexdir'].'/title.idx','');
//io_saveFile($conf['indexdir'].'/title.idx','');
$dir = @opendir($conf['indexdir']);
if($dir!==false){
while(($f = readdir($dir)) !== false){
Expand All @@ -154,6 +150,7 @@ function _clearindex(){
@unlink($conf['indexdir']."/$f");
}
}
@unlink($conf['indexdir'].'/lengths.idx');
_quietecho("done.\n");
_unlock();
}
Expand Down
2 changes: 2 additions & 0 deletions conf/dokuwiki.php
Expand Up @@ -134,6 +134,8 @@
$conf['xsendfile'] = 0; //Use X-Sendfile (1 = lighttpd, 2 = standard)
$conf['renderer_xhtml'] = 'xhtml'; //renderer to use for main page generation
$conf['rememberme'] = 1; //Enable/disable remember me on login
$conf['external_tokenizer'] = 0; //Use an external program to split pages into words for indexing
$conf['tokenizer_cmd'] = '/usr/bin/mecab -O wakati';

//Set target to use when creating links - leave empty for same window
$conf['target']['wiki'] = '';
Expand Down
2 changes: 1 addition & 1 deletion inc/Sitemapper.php
Expand Up @@ -45,7 +45,7 @@ public function generate(){

dbglog("Sitemapper::generate(): using $sitemap"); // FIXME: Only in debug mode

$pages = idx_getIndex('page', '');
$pages = idx_get_indexer()->getPages();
dbglog('Sitemapper::generate(): creating sitemap using '.count($pages).' pages');
$items = array();

Expand Down
92 changes: 36 additions & 56 deletions inc/fulltext.php
Expand Up @@ -36,19 +36,21 @@ function ft_pageSearch($query,&$highlight){
* @author Kazutaka Miyasaka <kazmiya@gmail.com>
*/
function _ft_pageSearch(&$data) {
$Indexer = idx_get_indexer();

// parse the given query
$q = ft_queryParser($data['query']);
$q = ft_queryParser($Indexer, $data['query']);
$data['highlight'] = $q['highlight'];

if (empty($q['parsed_ary'])) return array();

// lookup all words found in the query
$lookup = idx_lookup($q['words']);
$lookup = $Indexer->lookup($q['words']);

// get all pages in this dokuwiki site (!: includes nonexistent pages)
$pages_all = array();
foreach (idx_getIndex('page', '') as $id) {
$pages_all[trim($id)] = 0; // base: 0 hit
foreach ($Indexer->getPages() as $id) {
$pages_all[$id] = 0; // base: 0 hit
}

// process the query
Expand Down Expand Up @@ -122,29 +124,12 @@ function _ft_pageSearch(&$data) {
/**
* Returns the backlinks for a given page
*
* Does a quick lookup with the fulltext index, then
* evaluates the instructions of the found pages
* Uses the metadata index.
*/
function ft_backlinks($id){
global $conf;
$swfile = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt';
$stopwords = @file_exists($swfile) ? file($swfile) : array();

$result = array();

// quick lookup of the pagename
$page = noNS($id);
$matches = idx_lookup(idx_tokenizer($page,$stopwords)); // pagename may contain specials (_ or .)
$docs = array_keys(ft_resultCombine(array_values($matches)));
$docs = array_filter($docs,'isVisiblePage'); // discard hidden pages
if(!count($docs)) return $result;

// check metadata for matching links
foreach($docs as $match){
// metadata relation reference links are already resolved
$links = p_get_metadata($match,'relation references',false);
if (isset($links[$id])) $result[] = $match;
}
$result = idx_get_indexer()->lookupKey('relation_references', $id);

if(!count($result)) return $result;

Expand All @@ -168,17 +153,14 @@ function ft_backlinks($id){
* Aborts after $max found results
*/
function ft_mediause($id,$max){
global $conf;
$swfile = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt';
$stopwords = @file_exists($swfile) ? file($swfile) : array();

if(!$max) $max = 1; // need to find at least one

$result = array();

// quick lookup of the mediafile
// FIXME use metadata key lookup
$media = noNS($id);
$matches = idx_lookup(idx_tokenizer($media,$stopwords));
$matches = idx_lookup(idx_tokenizer($media));
$docs = array_keys(ft_resultCombine(array_values($matches)));
if(!count($docs)) return $result;

Expand Down Expand Up @@ -229,7 +211,6 @@ function ft_pageLookup($id, $in_ns=false, $in_title=false){
}

function _ft_pageLookup(&$data){
global $conf;
// split out original parameters
$id = $data['id'];
if (preg_match('/(?:^| )@(\w+)/', $id, $matches)) {
Expand All @@ -239,29 +220,32 @@ function _ft_pageLookup(&$data){

$in_ns = $data['in_ns'];
$in_title = $data['in_title'];
$cleaned = cleanID($id);

$pages = array_map('rtrim', idx_getIndex('page', ''));
$titles = array_map('rtrim', idx_getIndex('title', ''));
// check for corrupt title index #FS2076
if(count($pages) != count($titles)){
$titles = array_fill(0,count($pages),'');
@unlink($conf['indexdir'].'/title.idx'); // will be rebuilt in inc/init.php
}
$pages = array_combine($pages, $titles);
$Indexer = idx_get_indexer();
$page_idx = $Indexer->getPages();

$cleaned = cleanID($id);
$pages = array();
if ($id !== '' && $cleaned !== '') {
foreach ($pages as $p_id => $p_title) {
if ((strpos($in_ns ? $p_id : noNSorNS($p_id), $cleaned) === false) &&
(!$in_title || (stripos($p_title, $id) === false)) ) {
unset($pages[$p_id]);
foreach ($page_idx as $p_id) {
if ((strpos($in_ns ? $p_id : noNSorNS($p_id), $cleaned) !== false)) {
if (!isset($pages[$p_id]))
$pages[$p_id] = p_get_first_heading($p_id, false);
}
}
if ($in_title) {
$wildcard_id = "*$id*";
foreach ($Indexer->lookupKey('title', $wildcard_id) as $p_id) {
if (!isset($pages[$p_id]))
$pages[$p_id] = p_get_first_heading($p_id, false);
}
}
}
if (isset($ns)) {
foreach (array_keys($pages) as $p_id) {
if (strpos($p_id, $ns) !== 0) {
unset($pages[$p_id]);
foreach ($page_idx as $p_id) {
if (strpos($p_id, $ns) === 0) {
if (!isset($pages[$p_id]))
$pages[$p_id] = p_get_first_heading($p_id, false);
}
}
}
Expand Down Expand Up @@ -500,11 +484,7 @@ function ft_resultComplement($args) {
* @author Andreas Gohr <andi@splitbrain.org>
* @author Kazutaka Miyasaka <kazmiya@gmail.com>
*/
function ft_queryParser($query){
global $conf;
$swfile = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt';
$stopwords = @file_exists($swfile) ? file($swfile) : array();

function ft_queryParser($Indexer, $query){
/**
* parse a search query and transform it into intermediate representation
*
Expand Down Expand Up @@ -550,7 +530,7 @@ function ft_queryParser($query){
if (preg_match('/^(-?)"(.+)"$/u', $term, $matches)) {
// phrase-include and phrase-exclude
$not = $matches[1] ? 'NOT' : '';
$parsed = $not.ft_termParser($matches[2], $stopwords, false, true);
$parsed = $not.ft_termParser($Indexer, $matches[2], false, true);
} else {
// fix incomplete phrase
$term = str_replace('"', ' ', $term);
Expand Down Expand Up @@ -597,10 +577,10 @@ function ft_queryParser($query){
$parsed .= '(N+:'.$matches[1].')';
} elseif (preg_match('/^-(.+)$/', $token, $matches)) {
// word-exclude
$parsed .= 'NOT('.ft_termParser($matches[1], $stopwords).')';
$parsed .= 'NOT('.ft_termParser($Indexer, $matches[1]).')';
} else {
// word-include
$parsed .= ft_termParser($token, $stopwords);
$parsed .= ft_termParser($Indexer, $token);
}
}
}
Expand Down Expand Up @@ -734,18 +714,18 @@ function ft_queryParser($query){
*
* @author Kazutaka Miyasaka <kazmiya@gmail.com>
*/
function ft_termParser($term, &$stopwords, $consider_asian = true, $phrase_mode = false) {
function ft_termParser($Indexer, $term, $consider_asian = true, $phrase_mode = false) {
$parsed = '';
if ($consider_asian) {
// successive asian characters need to be searched as a phrase
$words = preg_split('/('.IDX_ASIAN.'+)/u', $term, -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY);
foreach ($words as $word) {
$phrase_mode = $phrase_mode ? true : preg_match('/'.IDX_ASIAN.'/u', $word);
$parsed .= ft_termParser($word, $stopwords, false, $phrase_mode);
$parsed .= ft_termParser($Indexer, $word, false, $phrase_mode);
}
} else {
$term_noparen = str_replace(array('(', ')'), ' ', $term);
$words = idx_tokenizer($term_noparen, $stopwords, true);
$words = $Indexer->tokenizer($term_noparen, true);

// W_: no need to highlight
if (empty($words)) {
Expand Down

0 comments on commit d8e733e

Please sign in to comment.