Skip to content

Commit

Permalink
initial commit
Browse files Browse the repository at this point in the history
  • Loading branch information
AJ ONeal committed Jul 21, 2012
0 parents commit 84b7eff
Show file tree
Hide file tree
Showing 6 changed files with 196 additions and 0 deletions.
3 changes: 3 additions & 0 deletions .gitignore
@@ -0,0 +1,3 @@
*.sqlext
*.sqlite3
*.sw*
13 changes: 13 additions & 0 deletions Makefile
@@ -0,0 +1,13 @@
all: linux
#Linux

linux:
gcc -shared -fPIC -Isqlite3 -o fts4-rank.sqlext fts4-rank.c
#Using GCC on Mac OSX

darwin:
gcc -bundle -fPIC -Isqlite3 -o fts4-rank.sqlext fts4-rank.c

win32:
#Microsoft Tools on Windows
cl /Gd fts4-rank.c /I sqlite3 /DDLL /LD /link /export:sqlite3_extension_init /out:fts4-rank.sqlext
20 changes: 20 additions & 0 deletions README.md
@@ -0,0 +1,20 @@
Loadable Extensions in SQLite
===

There is a wonderful [example of a rank extension](http://www.sqlite.org/fts3.html#appendix_a) provided in the sqlite3 documentation for full-text search.
However, the example requires building an extension which is not explicitly detailed in the example.
This repository is for the sole purpose of building the un-adulterated example from the docs.

git clone http://github.com/coolaj86/sqlite3-fts4-rank.git
cd sqlite3-fts4-rank
make linux # or darwin (OSX) or win32 (Visual Studio)

sqlite3 --init test-fts4.sql ':memory:'

See Also
===

* <http://www.sqlite.org/cvstrac/wiki?p=LoadableExtensions>
* <http://www.sqlite.org/c3ref/create_function.html>
* <http://www.sqlite.org/fts3.html#appendix_a>
* <http://www.sqlite.org/c3ref/load_extension.html>
103 changes: 103 additions & 0 deletions fts4-rank.c
@@ -0,0 +1,103 @@
#include <sqlite3ext.h>
#include <assert.h>
SQLITE_EXTENSION_INIT1

/*
** SQLite user defined function to use with matchinfo() to calculate the
** relevancy of an FTS match. The value returned is the relevancy score
** (a real value greater than or equal to zero). A larger value indicates
** a more relevant document.
**
** The overall relevancy returned is the sum of the relevancies of each
** column value in the FTS table. The relevancy of a column value is the
** sum of the following for each reportable phrase in the FTS query:
**
** (<hit count> / <global hit count>) * <column weight>
**
** where <hit count> is the number of instances of the phrase in the
** column value of the current row and <global hit count> is the number
** of instances of the phrase in the same column of all rows in the FTS
** table. The <column weight> is a weighting factor assigned to each
** column by the caller (see below).
**
** The first argument to this function must be the return value of the FTS
** matchinfo() function. Following this must be one argument for each column
** of the FTS table containing a numeric weight factor for the corresponding
** column. Example:
**
** CREATE VIRTUAL TABLE documents USING fts3(title, content)
**
** The following query returns the docids of documents that match the full-text
** query <query> sorted from most to least relevant. When calculating
** relevance, query term instances in the 'title' column are given twice the
** weighting of those in the 'content' column.
**
** SELECT docid FROM documents
** WHERE documents MATCH <query>
** ORDER BY rank(matchinfo(documents), 1.0, 0.5) DESC
*/
static void rankfunc(sqlite3_context *pCtx, int nVal, sqlite3_value **apVal){
int *aMatchinfo; /* Return value of matchinfo() */
int nCol; /* Number of columns in the table */
int nPhrase; /* Number of phrases in the query */
int iPhrase; /* Current phrase */
double score = 0.0; /* Value to return */

assert( sizeof(int)==4 );

/* Check that the number of arguments passed to this function is correct.
** If not, jump to wrong_number_args. Set aMatchinfo to point to the array
** of unsigned integer values returned by FTS function matchinfo. Set
** nPhrase to contain the number of reportable phrases in the users full-text
** query, and nCol to the number of columns in the table.
*/
if( nVal<1 ) goto wrong_number_args;
aMatchinfo = (unsigned int *)sqlite3_value_blob(apVal[0]);
nPhrase = aMatchinfo[0];
nCol = aMatchinfo[1];
if( nVal!=(1+nCol) ) goto wrong_number_args;

/* Iterate through each phrase in the users query. */
for(iPhrase=0; iPhrase<nPhrase; iPhrase++){
int iCol; /* Current column */

/* Now iterate through each column in the users query. For each column,
** increment the relevancy score by:
**
** (<hit count> / <global hit count>) * <column weight>
**
** aPhraseinfo[] points to the start of the data for phrase iPhrase. So
** the hit count and global hit counts for each column are found in
** aPhraseinfo[iCol*3] and aPhraseinfo[iCol*3+1], respectively.
*/
int *aPhraseinfo = &aMatchinfo[2 + iPhrase*nCol*3];
for(iCol=0; iCol<nCol; iCol++){
int nHitCount = aPhraseinfo[3*iCol];
int nGlobalHitCount = aPhraseinfo[3*iCol+1];
double weight = sqlite3_value_double(apVal[iCol+1]);
if( nHitCount>0 ){
score += ((double)nHitCount / (double)nGlobalHitCount) * weight;
}
}
}

sqlite3_result_double(pCtx, score);
return;

/* Jump here if the wrong number of arguments are passed to this function */
wrong_number_args:
sqlite3_result_error(pCtx, "wrong number of arguments to function rank()", -1);
}

/* SQLite invokes this routine once when it loads the extension.
** Create new functions, collating sequences, and virtual table
** modules here. This is usually the only exported symbol in
** the shared library.
*/
int sqlite3_extension_init(sqlite3 *db, char **pzErrMsg, const sqlite3_api_routines *pApi) {
SQLITE_EXTENSION_INIT2(pApi)

// rank call
sqlite3_create_function(db, "rank", 3, SQLITE_ANY, 0, rankfunc, 0, 0);
return 0;
}
18 changes: 18 additions & 0 deletions test-fts4.sql
@@ -0,0 +1,18 @@
.load fts4-rank.sqlext

-- This example (and all others in this section) assumes the following schema
CREATE VIRTUAL TABLE documents USING fts4(title, content);

INSERT INTO mail VALUES('hello world', 'This message is a hello world message.');
INSERT INTO mail VALUES('urgent: serious', 'This mail is seen as a more serious mail');

SELECT title FROM documents WHERE documents MATCH 'effect* known';

SELECT title FROM documents JOIN (
SELECT docid, rank(matchinfo(documents)) AS rank
FROM documents
WHERE documents MATCH 'effect* known once'
ORDER BY rank DESC
LIMIT 10 OFFSET 0
) AS ranktable USING(docid)
ORDER BY ranktable.rank DESC;
39 changes: 39 additions & 0 deletions test-matchinfo.sql
@@ -0,0 +1,39 @@
-- Create and populate an FTS4 table with two columns:
CREATE VIRTUAL TABLE t1 USING fts4(a, b);
INSERT INTO t1 VALUES('transaction default models default', 'Non transaction reads');
INSERT INTO t1 VALUES('the default transaction', 'these semantics present');
INSERT INTO t1 VALUES('single request', 'default data');

-- In the following query, no format string is specified and so it defaults
-- to "pcx". It therefore returns a single row consisting of a single blob
-- value 80 bytes in size (20 32-bit integers - 1 for "p", 1 for "c" and
-- 3*2*3 for "x"). If each block of 4 bytes in the blob is interpreted
-- as an unsigned integer in machine byte-order, the values will be:
--
-- 3 2 1 3 2 0 1 1 1 2 2 0 1 1 0 0 0 1 1 1
--
-- The row returned corresponds to the second entry inserted into table t1.
-- The first two integers in the blob show that the query contained three
-- phrases and the table being queried has two columns. The next block of
-- three integers describes column 0 (in this case column "a") and phrase
-- 0 (in this case "default"). The current row contains 1 hit for "default"
-- in column 0, of a total of 3 hits for "default" that occur in column
-- 0 of any table row. The 3 hits are spread across 2 different rows.
--
-- The next set of three integers (0 1 1) pertain to the hits for "default"
-- in column 1 of the table (0 in this row, 1 in all rows, spread across
-- 1 rows).
--
SELECT matchinfo(t1) FROM t1 WHERE t1 MATCH 'default transaction "these semantics"';

-- The format string for this query is "ns". The output array will therefore
-- contain 3 integer values - 1 for "n" and 2 for "s". The query returns
-- two rows (the first two rows in the table match). The values returned are:
--
-- 3 1 1
-- 3 2 0
--
-- The first value in the matchinfo array returned for both rows is 3 (the
-- number of rows in the table). The following two values are the lengths
-- of the longest common subsequence of phrase matches in each column.
SELECT matchinfo(t1, 'ns') FROM t1 WHERE t1 MATCH 'default transaction';

0 comments on commit 84b7eff

Please sign in to comment.