Permalink
Browse files

initial commit

  • Loading branch information...
0 parents commit 84b7eff4c94dd952b44ebb0e69c54d25f125cda4 @coolaj86 committed Jul 21, 2012
Showing with 196 additions and 0 deletions.
  1. +3 −0 .gitignore
  2. +13 −0 Makefile
  3. +20 −0 README.md
  4. +103 −0 fts4-rank.c
  5. +18 −0 test-fts4.sql
  6. +39 −0 test-matchinfo.sql
@@ -0,0 +1,3 @@
+*.sqlext
+*.sqlite3
+*.sw*
@@ -0,0 +1,13 @@
+all: linux
+ #Linux
+
+linux:
+ gcc -shared -fPIC -Isqlite3 -o fts4-rank.sqlext fts4-rank.c
+ #Using GCC on Mac OSX
+
+darwin:
+ gcc -bundle -fPIC -Isqlite3 -o fts4-rank.sqlext fts4-rank.c
+
+win32:
+ #Microsoft Tools on Windows
+ cl /Gd fts4-rank.c /I sqlite3 /DDLL /LD /link /export:sqlite3_extension_init /out:fts4-rank.sqlext
@@ -0,0 +1,20 @@
+Loadable Extensions in SQLite
+===
+
+There is a wonderful [example of a rank extension](http://www.sqlite.org/fts3.html#appendix_a) provided in the sqlite3 documentation for full-text search.
+However, the example requires building an extension which is not explicitly detailed in the example.
+This repository is for the sole purpose of building the un-adulterated example from the docs.
+
+ git clone http://github.com/coolaj86/sqlite3-fts4-rank.git
+ cd sqlite3-fts4-rank
+ make linux # or darwin (OSX) or win32 (Visual Studio)
+
+ sqlite3 --init test-fts4.sql ':memory:'
+
+See Also
+===
+
+ * <http://www.sqlite.org/cvstrac/wiki?p=LoadableExtensions>
+ * <http://www.sqlite.org/c3ref/create_function.html>
+ * <http://www.sqlite.org/fts3.html#appendix_a>
+ * <http://www.sqlite.org/c3ref/load_extension.html>
@@ -0,0 +1,103 @@
+#include <sqlite3ext.h>
+#include <assert.h>
+SQLITE_EXTENSION_INIT1
+
+/*
+** SQLite user defined function to use with matchinfo() to calculate the
+** relevancy of an FTS match. The value returned is the relevancy score
+** (a real value greater than or equal to zero). A larger value indicates
+** a more relevant document.
+**
+** The overall relevancy returned is the sum of the relevancies of each
+** column value in the FTS table. The relevancy of a column value is the
+** sum of the following for each reportable phrase in the FTS query:
+**
+** (<hit count> / <global hit count>) * <column weight>
+**
+** where <hit count> is the number of instances of the phrase in the
+** column value of the current row and <global hit count> is the number
+** of instances of the phrase in the same column of all rows in the FTS
+** table. The <column weight> is a weighting factor assigned to each
+** column by the caller (see below).
+**
+** The first argument to this function must be the return value of the FTS
+** matchinfo() function. Following this must be one argument for each column
+** of the FTS table containing a numeric weight factor for the corresponding
+** column. Example:
+**
+** CREATE VIRTUAL TABLE documents USING fts3(title, content)
+**
+** The following query returns the docids of documents that match the full-text
+** query <query> sorted from most to least relevant. When calculating
+** relevance, query term instances in the 'title' column are given twice the
+** weighting of those in the 'content' column.
+**
+** SELECT docid FROM documents
+** WHERE documents MATCH <query>
+** ORDER BY rank(matchinfo(documents), 1.0, 0.5) DESC
+*/
+static void rankfunc(sqlite3_context *pCtx, int nVal, sqlite3_value **apVal){
+ int *aMatchinfo; /* Return value of matchinfo() */
+ int nCol; /* Number of columns in the table */
+ int nPhrase; /* Number of phrases in the query */
+ int iPhrase; /* Current phrase */
+ double score = 0.0; /* Value to return */
+
+ assert( sizeof(int)==4 );
+
+ /* Check that the number of arguments passed to this function is correct.
+ ** If not, jump to wrong_number_args. Set aMatchinfo to point to the array
+ ** of unsigned integer values returned by FTS function matchinfo. Set
+ ** nPhrase to contain the number of reportable phrases in the users full-text
+ ** query, and nCol to the number of columns in the table.
+ */
+ if( nVal<1 ) goto wrong_number_args;
+ aMatchinfo = (unsigned int *)sqlite3_value_blob(apVal[0]);
+ nPhrase = aMatchinfo[0];
+ nCol = aMatchinfo[1];
+ if( nVal!=(1+nCol) ) goto wrong_number_args;
+
+ /* Iterate through each phrase in the users query. */
+ for(iPhrase=0; iPhrase<nPhrase; iPhrase++){
+ int iCol; /* Current column */
+
+ /* Now iterate through each column in the users query. For each column,
+ ** increment the relevancy score by:
+ **
+ ** (<hit count> / <global hit count>) * <column weight>
+ **
+ ** aPhraseinfo[] points to the start of the data for phrase iPhrase. So
+ ** the hit count and global hit counts for each column are found in
+ ** aPhraseinfo[iCol*3] and aPhraseinfo[iCol*3+1], respectively.
+ */
+ int *aPhraseinfo = &aMatchinfo[2 + iPhrase*nCol*3];
+ for(iCol=0; iCol<nCol; iCol++){
+ int nHitCount = aPhraseinfo[3*iCol];
+ int nGlobalHitCount = aPhraseinfo[3*iCol+1];
+ double weight = sqlite3_value_double(apVal[iCol+1]);
+ if( nHitCount>0 ){
+ score += ((double)nHitCount / (double)nGlobalHitCount) * weight;
+ }
+ }
+ }
+
+ sqlite3_result_double(pCtx, score);
+ return;
+
+ /* Jump here if the wrong number of arguments are passed to this function */
+wrong_number_args:
+ sqlite3_result_error(pCtx, "wrong number of arguments to function rank()", -1);
+}
+
+/* SQLite invokes this routine once when it loads the extension.
+** Create new functions, collating sequences, and virtual table
+** modules here. This is usually the only exported symbol in
+** the shared library.
+*/
+int sqlite3_extension_init(sqlite3 *db, char **pzErrMsg, const sqlite3_api_routines *pApi) {
+ SQLITE_EXTENSION_INIT2(pApi)
+
+ // rank call
+ sqlite3_create_function(db, "rank", 3, SQLITE_ANY, 0, rankfunc, 0, 0);
+ return 0;
+}
@@ -0,0 +1,18 @@
+.load fts4-rank.sqlext
+
+-- This example (and all others in this section) assumes the following schema
+CREATE VIRTUAL TABLE documents USING fts4(title, content);
+
+INSERT INTO mail VALUES('hello world', 'This message is a hello world message.');
+INSERT INTO mail VALUES('urgent: serious', 'This mail is seen as a more serious mail');
+
+SELECT title FROM documents WHERE documents MATCH 'effect* known';
+
+SELECT title FROM documents JOIN (
+ SELECT docid, rank(matchinfo(documents)) AS rank
+ FROM documents
+ WHERE documents MATCH 'effect* known once'
+ ORDER BY rank DESC
+ LIMIT 10 OFFSET 0
+) AS ranktable USING(docid)
+ORDER BY ranktable.rank DESC;
@@ -0,0 +1,39 @@
+-- Create and populate an FTS4 table with two columns:
+CREATE VIRTUAL TABLE t1 USING fts4(a, b);
+INSERT INTO t1 VALUES('transaction default models default', 'Non transaction reads');
+INSERT INTO t1 VALUES('the default transaction', 'these semantics present');
+INSERT INTO t1 VALUES('single request', 'default data');
+
+-- In the following query, no format string is specified and so it defaults
+-- to "pcx". It therefore returns a single row consisting of a single blob
+-- value 80 bytes in size (20 32-bit integers - 1 for "p", 1 for "c" and
+-- 3*2*3 for "x"). If each block of 4 bytes in the blob is interpreted
+-- as an unsigned integer in machine byte-order, the values will be:
+--
+-- 3 2 1 3 2 0 1 1 1 2 2 0 1 1 0 0 0 1 1 1
+--
+-- The row returned corresponds to the second entry inserted into table t1.
+-- The first two integers in the blob show that the query contained three
+-- phrases and the table being queried has two columns. The next block of
+-- three integers describes column 0 (in this case column "a") and phrase
+-- 0 (in this case "default"). The current row contains 1 hit for "default"
+-- in column 0, of a total of 3 hits for "default" that occur in column
+-- 0 of any table row. The 3 hits are spread across 2 different rows.
+--
+-- The next set of three integers (0 1 1) pertain to the hits for "default"
+-- in column 1 of the table (0 in this row, 1 in all rows, spread across
+-- 1 rows).
+--
+SELECT matchinfo(t1) FROM t1 WHERE t1 MATCH 'default transaction "these semantics"';
+
+-- The format string for this query is "ns". The output array will therefore
+-- contain 3 integer values - 1 for "n" and 2 for "s". The query returns
+-- two rows (the first two rows in the table match). The values returned are:
+--
+-- 3 1 1
+-- 3 2 0
+--
+-- The first value in the matchinfo array returned for both rows is 3 (the
+-- number of rows in the table). The following two values are the lengths
+-- of the longest common subsequence of phrase matches in each column.
+SELECT matchinfo(t1, 'ns') FROM t1 WHERE t1 MATCH 'default transaction';

0 comments on commit 84b7eff

Please sign in to comment.