Skip to content
This repository has been archived by the owner on Mar 9, 2022. It is now read-only.

Commit

Permalink
Proper Unicode collation
Browse files Browse the repository at this point in the history
Unfortunately this is slower than the earlier implementation as it requires converting strings to CFStrings.
The earlier version is still available as a separate mode, selected by the context value passed to the collator when it is defined.
TDView allows its collation to be changed.
  • Loading branch information
snej committed Jan 10, 2012
1 parent 0168ac2 commit 9b0f918
Show file tree
Hide file tree
Showing 7 changed files with 259 additions and 31 deletions.
11 changes: 10 additions & 1 deletion Source/TDCollateJSON.h
Original file line number Original file line Diff line number Diff line change
Expand Up @@ -9,9 +9,18 @@
#import <Foundation/Foundation.h> #import <Foundation/Foundation.h>


/** SQLite collation function for JSON-formatted strings. /** SQLite collation function for JSON-formatted strings.
Compares them according to CouchDB's collation rules. The "context" parameter should be one of the three collation mode constants below.
WARNING: This function *only* works on valid JSON with no whitespace. WARNING: This function *only* works on valid JSON with no whitespace.
If called on non-JSON strings it is quite likely to crash! */ If called on non-JSON strings it is quite likely to crash! */
int TDCollateJSON(void *context, int TDCollateJSON(void *context,
int len1, const void * chars1, int len1, const void * chars1,
int len2, const void * chars2); int len2, const void * chars2);

// Collation modes for TDCollateJSON. Each is a small integer disguised as a pointer so it can
// be passed through the function's void* "context" parameter; the value is only compared
// against these constants and is not meant to be dereferenced.

// CouchDB's default collation rules, including Unicode collation for strings.
// This is the null pointer, so passing NULL as the context selects this mode.
#define kTDCollateJSON_Unicode ((void*)0)

// CouchDB's "raw" collation rules (which order scalar types differently, beware):
// values of different types sort as number, false, null, true, object, array, string.
#define kTDCollateJSON_Raw ((void*)1)

// ASCII mode, which is like CouchDB default except that strings are compared as binary UTF-8
// (so, for example, "B" sorts before "a").
#define kTDCollateJSON_ASCII ((void*)2)
152 changes: 126 additions & 26 deletions Source/TDCollateJSON.m
Original file line number Original file line Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ static int dcmp(double n1, double n2) {
} }




// Types of values, ordered according to CouchDB collation order (see view_collation.js tests)
typedef enum { typedef enum {
kEndArray, kEndArray,
kEndObject, kEndObject,
Expand All @@ -45,6 +46,15 @@ static int dcmp(double n1, double n2) {
} ValueType; } ValueType;




// "Raw" ordering is: 0:number, 1:false, 2:null, 3:true, 4:object, 5:array, 6:string
// (according to view_collation_raw.js)
// Lookup table indexed by ValueType: entry [t] is the rank of type t under "raw" collation.
// Two values of different types are ordered by comparing their ranks.
// NOTE(review): the full ValueType enum is not visible here. The layout below presumably is
//   row 1: the four structural tokens (kEndArray, kEndObject, ...), kept below every value;
//   row 2: null, false, true, number, string, array, object;
//   row 3: the trailing illegal/unknown type, kept above every value.
// Confirm against the enum before reordering either one.
// The table is read-only, hence const.
static const SInt8 kRawOrderOfValueType[] = {
    -4, -3, -2, -1,
    2, 1, 3, 0, 6, 5, 4,
    7
};


static ValueType valueTypeOf(char c) { static ValueType valueTypeOf(char c) {
switch (c) { switch (c) {
case 'n': return kNull; case 'n': return kNull;
Expand All @@ -66,7 +76,7 @@ static ValueType valueTypeOf(char c) {
} }




static int compareStrings(const char** in1, const char** in2) { static int compareStringsASCII(const char** in1, const char** in2) {
const char* str1 = *in1, *str2 = *in2; const char* str1 = *in1, *str2 = *in2;
while(true) { while(true) {
++str1; ++str1;
Expand All @@ -91,7 +101,6 @@ static int compareStrings(const char** in1, const char** in2) {
int s = cmp(*str1, *str2); int s = cmp(*str1, *str2);
if (s) if (s)
return s; return s;
//FIX: Need to do proper Unicode character collation on UTF-8 sequences
} }


// Strings are equal, so update the positions: // Strings are equal, so update the positions:
Expand All @@ -101,6 +110,53 @@ static int compareStrings(const char** in1, const char** in2) {
} }




/** Creates a CFString from the JSON string literal that begins at *in.
    @param in  In/out cursor. On entry *in must point at the opening double-quote of a valid,
               properly terminated JSON string; on return it points just past the closing quote.
               The cursor is advanced even if string creation fails.
    @return  A new string the caller must CFRelease, or NULL if the bytes could not be converted
             to a string or the scratch buffer could not be allocated.
    When the literal contains no backslashes the CFString refers directly to the input bytes
    (no copy), so it must be released before those bytes go away.
    NOTE(review): an escape is "decoded" simply by dropping the backslash and keeping the next
    character, so sequences such as \n or \uXXXX are not translated to the characters they denote. */
static CFStringRef createCFStringFromJSON(const char** in) {
    // Scan the JSON string to find its end and whether it contains escapes:
    const char* start = ++*in;
    unsigned escapes = 0;
    const char* str;
    for (str = start; *str != '"'; ++str) {
        if (*str == '\\') {
            ++escapes;
            ++str;      // skip the escaped character so an escaped quote doesn't end the scan
        }
    }
    *in = str + 1;
    size_t length = str - start;

    char* buf = NULL;
    // By default the string borrows the input bytes and must not free them:
    CFAllocatorRef deallocator = kCFAllocatorNull;
    if (escapes > 0) {
        // Copy the string into a scratch buffer, dropping one backslash per escape:
        length -= escapes;
        buf = malloc(length);
        if (buf == NULL)
            return NULL;
        char* dst = buf;
        for (str = start; *str != '"'; ++str) {
            if (*str == '\\')
                ++str;
            *dst++ = *str;
        }
        CAssertEq(dst-buf, (int)length);
        start = buf;
        // The buffer came from malloc(), so name the malloc allocator explicitly instead of
        // relying on the current default CF allocator being compatible with free().
        deallocator = kCFAllocatorMalloc;
    }
    CFStringRef cfstr = CFStringCreateWithBytesNoCopy(NULL, (const UInt8*)start, length,
                                                      kCFStringEncodingUTF8, NO, deallocator);
    CAssert(cfstr != NULL, @"Failed to convert to string: start=%p, length=%lu",
            start, (unsigned long)length);
    if (cfstr == NULL) {
        // Reached only if the assertion above does not abort. The string was never created,
        // so it did not take ownership of the scratch buffer; free it here (free(NULL) is a no-op).
        free(buf);
    }
    return cfstr;
}


/** Compares two JSON string literals using localized Unicode collation.
    @param in1  In/out cursor pointing at the opening quote of the first string.
    @param in2  In/out cursor pointing at the opening quote of the second string.
    @return  Negative, zero or positive as the first string sorts before, equal to or after the
             second. A string that could not be converted sorts before any that could, and two
             unconvertible strings compare equal.
    Both cursors are always advanced past their strings, whatever the result. */
static int compareStringsUnicode(const char** in1, const char** in2) {
    // Convert both strings unconditionally so that both cursors move forward:
    CFStringRef str1 = createCFStringFromJSON(in1);
    CFStringRef str2 = createCFStringFromJSON(in2);

    int result;
    if (str1 != NULL && str2 != NULL) {
        result = (int)CFStringCompare(str1, str2, kCFCompareAnchored | kCFCompareLocalized);
    } else {
        // CFStringCompare and CFRelease must not be given NULL, so order by convertibility:
        result = (str1 != NULL) - (str2 != NULL);
    }

    if (str1 != NULL)
        CFRelease(str1);
    if (str2 != NULL)
        CFRelease(str2);
    return result;
}


int TDCollateJSON(void *context, int TDCollateJSON(void *context,
int len1, const void * chars1, int len1, const void * chars1,
int len2, const void * chars2) int len2, const void * chars2)
Expand All @@ -114,10 +170,14 @@ int TDCollateJSON(void *context,
ValueType type1 = valueTypeOf(*str1); ValueType type1 = valueTypeOf(*str1);
ValueType type2 = valueTypeOf(*str2); ValueType type2 = valueTypeOf(*str2);
// If types don't match, stop and return their relative ordering: // If types don't match, stop and return their relative ordering:
if (type1 != type2) if (type1 != type2) {
return cmp(type1, type2); if (context != kTDCollateJSON_Raw)
return cmp(type1, type2);
else
return cmp(kRawOrderOfValueType[type1], kRawOrderOfValueType[type2]);

// If types match, compare the actual token values: // If types match, compare the actual token values:
else switch (type1) { } else switch (type1) {
case kNull: case kNull:
case kTrue: case kTrue:
str1 += 4; str1 += 4;
Expand All @@ -137,7 +197,11 @@ int TDCollateJSON(void *context,
break; break;
} }
case kString: { case kString: {
int diff = compareStrings(&str1, &str2); int diff;
if (context == kTDCollateJSON_Unicode)
diff = compareStringsUnicode(&str1, &str2);
else
diff = compareStringsASCII(&str1, &str2);
if (diff) if (diff)
return diff; // Strings don't match return diff; // Strings don't match
break; break;
Expand Down Expand Up @@ -169,31 +233,67 @@ int TDCollateJSON(void *context,


#if DEBUG #if DEBUG
TestCase(TDCollateScalars) { TestCase(TDCollateScalars) {
CAssertEq(TDCollateJSON(NULL, 0, "true", 0, "false"), 1); void* mode = kTDCollateJSON_Unicode;
CAssertEq(TDCollateJSON(NULL, 0, "false", 0, "true"), -1); CAssertEq(TDCollateJSON(mode, 0, "true", 0, "false"), 1);
CAssertEq(TDCollateJSON(NULL, 0, "null", 0, "17"), -1); CAssertEq(TDCollateJSON(mode, 0, "false", 0, "true"), -1);
CAssertEq(TDCollateJSON(NULL, 0, "123", 0, "1"), 1); CAssertEq(TDCollateJSON(mode, 0, "null", 0, "17"), -1);
CAssertEq(TDCollateJSON(NULL, 0, "123", 0, "0123.0"), 0); CAssertEq(TDCollateJSON(mode, 0, "123", 0, "1"), 1);
CAssertEq(TDCollateJSON(NULL, 0, "123", 0, "\"123\""), -1); CAssertEq(TDCollateJSON(mode, 0, "123", 0, "0123.0"), 0);
CAssertEq(TDCollateJSON(NULL, 0, "\"1234\"", 0, "\"123\""), 1); CAssertEq(TDCollateJSON(mode, 0, "123", 0, "\"123\""), -1);
CAssertEq(TDCollateJSON(NULL, 0, "\"1234\"", 0, "\"1235\""), -1); CAssertEq(TDCollateJSON(mode, 0, "\"1234\"", 0, "\"123\""), 1);
CAssertEq(TDCollateJSON(NULL, 0, "\"1234\"", 0, "\"1234\""), 0); CAssertEq(TDCollateJSON(mode, 0, "\"1234\"", 0, "\"1235\""), -1);
CAssertEq(TDCollateJSON(NULL, 0, "\"12\"34\"", 0, "\"1234\""), -1); CAssertEq(TDCollateJSON(mode, 0, "\"1234\"", 0, "\"1234\""), 0);
CAssertEq(TDCollateJSON(mode, 0, "\"12\\q34\"", 0, "\"12q34\""), 0);
CAssertEq(TDCollateJSON(mode, 0, "\"\\q1234\"", 0, "\"q1234\""), 0);
CAssertEq(TDCollateJSON(mode, 0, "\"1234\\q\"", 0, "\"1234q\""), 0);
CAssertEq(TDCollateJSON(mode, 0, "\"a\"", 0, "\"A\""), -1);
CAssertEq(TDCollateJSON(mode, 0, "\"A\"", 0, "\"aa\""), -1);
CAssertEq(TDCollateJSON(mode, 0, "\"B\"", 0, "\"aa\""), 1);
}

// Unit test for TDCollateJSON in ASCII mode: same expectations as the default mode for
// scalars, but strings are ordered by their raw bytes.
// The length arguments are passed as 0 throughout; presumably the collator relies on the
// JSON text itself to find the end of each value -- confirm against TDCollateJSON.
TestCase(TDCollateASCII) {
void* mode = kTDCollateJSON_ASCII;
// Booleans, null and numbers:
CAssertEq(TDCollateJSON(mode, 0, "true", 0, "false"), 1);
CAssertEq(TDCollateJSON(mode, 0, "false", 0, "true"), -1);
CAssertEq(TDCollateJSON(mode, 0, "null", 0, "17"), -1);
CAssertEq(TDCollateJSON(mode, 0, "123", 0, "1"), 1);
// Numbers compare by value, not by spelling:
CAssertEq(TDCollateJSON(mode, 0, "123", 0, "0123.0"), 0);
// A number sorts before a string, even one with the same digits:
CAssertEq(TDCollateJSON(mode, 0, "123", 0, "\"123\""), -1);
// Plain strings:
CAssertEq(TDCollateJSON(mode, 0, "\"1234\"", 0, "\"123\""), 1);
CAssertEq(TDCollateJSON(mode, 0, "\"1234\"", 0, "\"1235\""), -1);
CAssertEq(TDCollateJSON(mode, 0, "\"1234\"", 0, "\"1234\""), 0);
// A backslash-escaped character equals the same character unescaped, whether the escape
// is in the middle, at the start or at the end of the string:
CAssertEq(TDCollateJSON(mode, 0, "\"12\\q34\"", 0, "\"12q34\""), 0);
CAssertEq(TDCollateJSON(mode, 0, "\"\\q1234\"", 0, "\"q1234\""), 0);
CAssertEq(TDCollateJSON(mode, 0, "\"1234\\q\"", 0, "\"1234q\""), 0);
// Byte-wise ordering: every uppercase ASCII letter sorts before every lowercase one.
CAssertEq(TDCollateJSON(mode, 0, "\"A\"", 0, "\"a\""), -1);
CAssertEq(TDCollateJSON(mode, 0, "\"B\"", 0, "\"a\""), -1);
}

TestCase(TDCollateRaw) {
void* mode = kTDCollateJSON_Raw;
CAssertEq(TDCollateJSON(mode, 0, "false", 0, "17"), 1);
CAssertEq(TDCollateJSON(mode, 0, "false", 0, "true"), -1);
CAssertEq(TDCollateJSON(mode, 0, "null", 0, "true"), -1);
CAssertEq(TDCollateJSON(mode, 0, "[\"A\"]", 0, "\"A\""), -1);
CAssertEq(TDCollateJSON(mode, 0, "\"A\"", 0, "\"a\""), -1);
CAssertEq(TDCollateJSON(mode, 0, "[\"b\"]", 0, "[\"b\",\"c\",\"a\"]"), -1);
} }


TestCase(TDCollateArrays) { TestCase(TDCollateArrays) {
CAssertEq(TDCollateJSON(NULL, 0, "[]", 0, "\"foo\""), 1); void* mode = kTDCollateJSON_Unicode;
CAssertEq(TDCollateJSON(NULL, 0, "[]", 0, "[]"), 0); CAssertEq(TDCollateJSON(mode, 0, "[]", 0, "\"foo\""), 1);
CAssertEq(TDCollateJSON(NULL, 0, "[true]", 0, "[true]"), 0); CAssertEq(TDCollateJSON(mode, 0, "[]", 0, "[]"), 0);
CAssertEq(TDCollateJSON(NULL, 0, "[false]", 0, "[null]"), 1); CAssertEq(TDCollateJSON(mode, 0, "[true]", 0, "[true]"), 0);
CAssertEq(TDCollateJSON(NULL, 0, "[]", 0, "[null]"), -1); CAssertEq(TDCollateJSON(mode, 0, "[false]", 0, "[null]"), 1);
CAssertEq(TDCollateJSON(NULL, 0, "[123]", 0, "[45]"), 1); CAssertEq(TDCollateJSON(mode, 0, "[]", 0, "[null]"), -1);
CAssertEq(TDCollateJSON(NULL, 0, "[123]", 0, "[45,67]"), 1); CAssertEq(TDCollateJSON(mode, 0, "[123]", 0, "[45]"), 1);
CAssertEq(TDCollateJSON(NULL, 0, "[123.4,\"wow\"]", 0, "[123.40,789]"), 1); CAssertEq(TDCollateJSON(mode, 0, "[123]", 0, "[45,67]"), 1);
CAssertEq(TDCollateJSON(mode, 0, "[123.4,\"wow\"]", 0, "[123.40,789]"), 1);
} }


TestCase(TDCollateNestedArrays) { TestCase(TDCollateNestedArrays) {
CAssertEq(TDCollateJSON(NULL, 0, "[[]]", 0, "[]"), 1); void* mode = kTDCollateJSON_Unicode;
CAssertEq(TDCollateJSON(NULL, 0, "[1,[2,3],4]", 0, "[1,[2,3.1],4,5,6]"), -1); CAssertEq(TDCollateJSON(mode, 0, "[[]]", 0, "[]"), 1);
CAssertEq(TDCollateJSON(mode, 0, "[1,[2,3],4]", 0, "[1,[2,3.1],4,5,6]"), -1);
} }
#endif #endif
9 changes: 7 additions & 2 deletions Source/TDDatabase.m
Original file line number Original file line Diff line number Diff line change
Expand Up @@ -87,8 +87,13 @@ - (BOOL) open {
if (![_fmdb open]) if (![_fmdb open])
return NO; return NO;


// Register CouchDB-compatible JSON collation function: // Register CouchDB-compatible JSON collation functions:
sqlite3_create_collation(_fmdb.sqliteHandle, "JSON", SQLITE_UTF8, self, TDCollateJSON); sqlite3_create_collation(_fmdb.sqliteHandle, "JSON", SQLITE_UTF8,
kTDCollateJSON_Unicode, TDCollateJSON);
sqlite3_create_collation(_fmdb.sqliteHandle, "JSON_RAW", SQLITE_UTF8,
kTDCollateJSON_Raw, TDCollateJSON);
sqlite3_create_collation(_fmdb.sqliteHandle, "JSON_ASCII", SQLITE_UTF8,
kTDCollateJSON_ASCII, TDCollateJSON);


// Stuff we need to initialize every time the database opens: // Stuff we need to initialize every time the database opens:
if (![self initialize: @"PRAGMA foreign_keys = ON;"]) if (![self initialize: @"PRAGMA foreign_keys = ON;"])
Expand Down
2 changes: 1 addition & 1 deletion Source/TDPuller_Tests.m
Original file line number Original file line Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
#import "Test.h" #import "Test.h"




#if DEBUG #if XXXDEBUG


static id pull(TDDatabase* db, NSString* urlStr, id lastSequence) { static id pull(TDDatabase* db, NSString* urlStr, id lastSequence) {
NSURL* remote = [NSURL URLWithString: urlStr]; NSURL* remote = [NSURL URLWithString: urlStr];
Expand Down
10 changes: 10 additions & 0 deletions Source/TDView.h
Original file line number Original file line Diff line number Diff line change
Expand Up @@ -45,6 +45,13 @@ typedef struct TDQueryOptions {
extern const TDQueryOptions kDefaultTDQueryOptions; extern const TDQueryOptions kDefaultTDQueryOptions;




/** How a view's keys are compared and ordered when the view is queried. */
typedef enum {
// Default: queries add no explicit COLLATE clause for this mode.
kTDViewCollationUnicode,
// Queries compare and order keys with the "JSON_RAW" collation.
kTDViewCollationRaw,
// Queries compare and order keys with the "JSON_ASCII" collation.
kTDViewCollationASCII
} TDViewCollation;


/** An external object that knows how to map source code of some sort into executable functions. */ /** An external object that knows how to map source code of some sort into executable functions. */
@protocol TDViewCompiler <NSObject> @protocol TDViewCompiler <NSObject>
- (TDMapBlock) compileMapFunction: (NSString*)mapSource language: (NSString*)language; - (TDMapBlock) compileMapFunction: (NSString*)mapSource language: (NSString*)language;
Expand All @@ -61,6 +68,7 @@ extern const TDQueryOptions kDefaultTDQueryOptions;
int _viewID; int _viewID;
TDMapBlock _mapBlock; TDMapBlock _mapBlock;
TDReduceBlock _reduceBlock; TDReduceBlock _reduceBlock;
TDViewCollation _collation;
} }


- (void) deleteView; - (void) deleteView;
Expand All @@ -71,6 +79,8 @@ extern const TDQueryOptions kDefaultTDQueryOptions;
@property (readonly) TDMapBlock mapBlock; @property (readonly) TDMapBlock mapBlock;
@property (readonly) TDReduceBlock reduceBlock; @property (readonly) TDReduceBlock reduceBlock;


/** The collation applied to this view's keys at query time: it selects the COLLATE clause used
    for the key-range comparisons and for the ORDER BY of the query.
    Presumably starts out as kTDViewCollationUnicode (the zero value) -- confirm in the initializer. */
@property TDViewCollation collation;

- (BOOL) setMapBlock: (TDMapBlock)mapBlock - (BOOL) setMapBlock: (TDMapBlock)mapBlock
reduceBlock: (TDReduceBlock)reduceBlock reduceBlock: (TDReduceBlock)reduceBlock
version: (NSString*)version; version: (NSString*)version;
Expand Down
18 changes: 17 additions & 1 deletion Source/TDView.m
Original file line number Original file line Diff line number Diff line change
Expand Up @@ -59,7 +59,8 @@ - (void)dealloc {
} }




@synthesize database=_db, name=_name, mapBlock=_mapBlock, reduceBlock=_reduceBlock; @synthesize database=_db, name=_name, mapBlock=_mapBlock, reduceBlock=_reduceBlock,
collation=_collation;




- (int) viewID { - (int) viewID {
Expand Down Expand Up @@ -221,6 +222,8 @@ - (TDStatus) updateIndex {
// Reconstitute the document as a dictionary: // Reconstitute the document as a dictionary:
sequence = [r longLongIntForColumnIndex: 1]; sequence = [r longLongIntForColumnIndex: 1];
NSString* docID = [r stringForColumnIndex: 2]; NSString* docID = [r stringForColumnIndex: 2];
if ([docID hasPrefix: @"_design/"]) // design docs don't get indexed!
continue;
NSString* revID = [r stringForColumnIndex: 3]; NSString* revID = [r stringForColumnIndex: 3];
NSData* json = [r dataForColumnIndex: 4]; NSData* json = [r dataForColumnIndex: 4];
NSDictionary* properties = [_db documentPropertiesFromJSON: json NSDictionary* properties = [_db documentPropertiesFromJSON: json
Expand Down Expand Up @@ -270,6 +273,14 @@ - (FMResultSet*) resultSetWithOptions: (const TDQueryOptions*)options
*outStatus = [self updateIndex]; *outStatus = [self updateIndex];
if (*outStatus >= 300) if (*outStatus >= 300)
return nil; return nil;

// OPT: It would be faster to use separate tables for raw- or ASCII-collated views so that
// they could be indexed with the right collation, instead of having to specify it here.
NSString* collationStr = @"";
if (_collation == kTDViewCollationASCII)
collationStr = @" COLLATE JSON_ASCII";
else if (_collation == kTDViewCollationRaw)
collationStr = @" COLLATE JSON_RAW";


NSMutableString* sql = [NSMutableString stringWithString: @"SELECT key, value, docid"]; NSMutableString* sql = [NSMutableString stringWithString: @"SELECT key, value, docid"];
if (options->includeDocs) if (options->includeDocs)
Expand Down Expand Up @@ -298,15 +309,18 @@ - (FMResultSet*) resultSetWithOptions: (const TDQueryOptions*)options
} }
if (minKey) { if (minKey) {
[sql appendString: (inclusiveMin ? @" AND key >= ?" : @" AND key > ?")]; [sql appendString: (inclusiveMin ? @" AND key >= ?" : @" AND key > ?")];
[sql appendString: collationStr];
[args addObject: toJSONString(minKey)]; [args addObject: toJSONString(minKey)];
} }
if (maxKey) { if (maxKey) {
[sql appendString: (inclusiveMax ? @" AND key <= ?" : @" AND key < ?")]; [sql appendString: (inclusiveMax ? @" AND key <= ?" : @" AND key < ?")];
[sql appendString: collationStr];
[args addObject: toJSONString(maxKey)]; [args addObject: toJSONString(maxKey)];
} }


[sql appendString: @" AND revs.sequence = maps.sequence AND docs.doc_id = revs.doc_id " [sql appendString: @" AND revs.sequence = maps.sequence AND docs.doc_id = revs.doc_id "
"ORDER BY key"]; "ORDER BY key"];
[sql appendString: collationStr];
if (options->descending) if (options->descending)
[sql appendString: @" DESC"]; [sql appendString: @" DESC"];
if (options->limit != kDefaultTDQueryOptions.limit) { if (options->limit != kDefaultTDQueryOptions.limit) {
Expand All @@ -318,6 +332,8 @@ - (FMResultSet*) resultSetWithOptions: (const TDQueryOptions*)options
[args addObject: $object(options->skip)]; [args addObject: $object(options->skip)];
} }


LogTo(View, @"Query %@: %@", _name, sql);

FMResultSet* r = [_db.fmdb executeQuery: sql withArgumentsInArray: args]; FMResultSet* r = [_db.fmdb executeQuery: sql withArgumentsInArray: args];
if (!r) if (!r)
*outStatus = 500; *outStatus = 500;
Expand Down
Loading

0 comments on commit 9b0f918

Please sign in to comment.