Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse files

Proper Unicode collation

Unfortunately this is slower than the earlier implementation as it requires converting strings to CFStrings.
The earlier version is still available as a different mode (selected by passing a context value to the collator when defining it).
TDView allows its collation to be changed.
  • Loading branch information...
commit 9b0f9186cca6a78e049d4a90550c79d9fd988e60 1 parent 0168ac2
@snej snej authored
View
11 Source/TDCollateJSON.h
@@ -9,9 +9,18 @@
#import <Foundation/Foundation.h>
/** SQLite collation function for JSON-formatted strings.
- Compares them according to CouchDB's collation rules.
+ The "context" parameter should be one of the three collation mode constants below.
WARNING: This function *only* works on valid JSON with no whitespace.
If called on non-JSON strings it is quite likely to crash! */
int TDCollateJSON(void *context,
int len1, const void * chars1,
int len2, const void * chars2);
+
+// CouchDB's default collation rules, including Unicode collation for strings
+#define kTDCollateJSON_Unicode ((void*)0)
+
+// CouchDB's "raw" collation rules (which order scalar types differently, beware)
+#define kTDCollateJSON_Raw ((void*)1)
+
+// ASCII mode, which is like CouchDB default except that strings are compared as binary UTF-8
+#define kTDCollateJSON_ASCII ((void*)2)
View
152 Source/TDCollateJSON.m
@@ -29,6 +29,7 @@ static int dcmp(double n1, double n2) {
}
+// Types of values, ordered according to CouchDB collation order (see view_collation.js tests)
typedef enum {
kEndArray,
kEndObject,
@@ -45,6 +46,15 @@ static int dcmp(double n1, double n2) {
} ValueType;
+// Lookup table giving the rank of each ValueType (used as the array index) under CouchDB's
+// "raw" collation, for comparing two values whose types differ.
+// "Raw" ordering is: 0:number, 1:false, 2:null, 3:true, 4:object, 5:array, 6:string
+// (according to view_collation_raw.js)
+// NOTE(review): the four leading negative entries presumably belong to the structural tokens
+// at the start of the ValueType enum (kEndArray, kEndObject, ...) -- confirm against the enum.
+// The table is only ever read, so it is declared const.
+static const SInt8 kRawOrderOfValueType[] = {
+    -4, -3, -2, -1,
+    2, 1, 3, 0, 6, 5, 4,
+    7
+};
+
+
static ValueType valueTypeOf(char c) {
switch (c) {
case 'n': return kNull;
@@ -66,7 +76,7 @@ static ValueType valueTypeOf(char c) {
}
-static int compareStrings(const char** in1, const char** in2) {
+static int compareStringsASCII(const char** in1, const char** in2) {
const char* str1 = *in1, *str2 = *in2;
while(true) {
++str1;
@@ -91,7 +101,6 @@ static int compareStrings(const char** in1, const char** in2) {
int s = cmp(*str1, *str2);
if (s)
return s;
- //FIX: Need to do proper Unicode character collation on UTF-8 sequences
}
// Strings are equal, so update the positions:
@@ -101,6 +110,53 @@ static int compareStrings(const char** in1, const char** in2) {
}
+/** Creates a CFString from the JSON string literal that starts at *in.
+    On entry *in must point at the opening '"'; on return it points just past the closing '"'.
+    Escapes are handled by dropping each backslash and copying the byte after it verbatim.
+    NOTE(review): this means sequences such as \n or \uXXXX are not decoded to the characters
+    they stand for -- confirm whether that matters for collation.
+    The caller owns the returned string and must CFRelease it. Returns NULL (after the
+    assertion has fired) if the bytes could not be converted. */
+static CFStringRef createCFStringFromJSON(const char** in) {
+    // Scan the JSON string to find its end and whether it contains escapes:
+    const char* start = ++*in;
+    unsigned escapes = 0;
+    const char* str;
+    for (str = start; *str != '"'; ++str) {
+        if (*str == '\\') {
+            ++escapes;
+            ++str;      // the escaped byte can never terminate the string
+        }
+    }
+    *in = str + 1;
+    size_t length = str - start;
+
+    CFStringRef cfstr = NULL;
+    if (escapes == 0) {
+        // No escapes: wrap the JSON bytes in place. kCFAllocatorNull tells CF never to free them.
+        cfstr = CFStringCreateWithBytesNoCopy(NULL, (const UInt8*)start, length,
+                                              kCFStringEncodingUTF8, NO, kCFAllocatorNull);
+    } else {
+        // Every escape shrinks the result by one byte (the backslash).
+        length -= escapes;
+        char* buf = malloc(length);
+        if (buf != NULL) {
+            char* dst = buf;
+            for (str = start; *str != '"'; ++str) {
+                if (*str == '\\')
+                    ++str;
+                *dst++ = *str;
+            }
+            CAssertEq(dst-buf, (int)length);
+            // kCFAllocatorMalloc releases the buffer with free(), matching the malloc() above.
+            cfstr = CFStringCreateWithBytesNoCopy(NULL, (const UInt8*)buf, length,
+                                                  kCFStringEncodingUTF8, NO, kCFAllocatorMalloc);
+            if (cfstr == NULL)
+                free(buf);  // CF does not take ownership of the buffer when creation fails
+        }
+    }
+    // length is a size_t, so it is cast to match the %lu specifier on every architecture.
+    CAssert(cfstr != NULL, @"Failed to convert to string: start=%p, length=%lu",
+            start, (unsigned long)length);
+    return cfstr;
+}
+
+
+/** Compares the JSON strings at *in1 and *in2 using localized Unicode collation.
+    Both strings are converted to CFStrings first, which advances *in1 and *in2 past the
+    strings' closing quotes. Returns the CFStringCompare result converted to int. */
+static int compareStringsUnicode(const char** in1, const char** in2) {
+    CFStringRef lhs = createCFStringFromJSON(in1);
+    CFStringRef rhs = createCFStringFromJSON(in2);
+    const CFComparisonResult order = CFStringCompare(lhs, rhs,
+                                                     kCFCompareAnchored | kCFCompareLocalized);
+    // The temporary strings are only needed for the comparison itself.
+    CFRelease(lhs);
+    CFRelease(rhs);
+    return (int)order;
+}
+
+
int TDCollateJSON(void *context,
int len1, const void * chars1,
int len2, const void * chars2)
@@ -114,10 +170,14 @@ int TDCollateJSON(void *context,
ValueType type1 = valueTypeOf(*str1);
ValueType type2 = valueTypeOf(*str2);
// If types don't match, stop and return their relative ordering:
- if (type1 != type2)
- return cmp(type1, type2);
+ if (type1 != type2) {
+ if (context != kTDCollateJSON_Raw)
+ return cmp(type1, type2);
+ else
+ return cmp(kRawOrderOfValueType[type1], kRawOrderOfValueType[type2]);
+
// If types match, compare the actual token values:
- else switch (type1) {
+ } else switch (type1) {
case kNull:
case kTrue:
str1 += 4;
@@ -137,7 +197,11 @@ int TDCollateJSON(void *context,
break;
}
case kString: {
- int diff = compareStrings(&str1, &str2);
+ int diff;
+ if (context == kTDCollateJSON_Unicode)
+ diff = compareStringsUnicode(&str1, &str2);
+ else
+ diff = compareStringsASCII(&str1, &str2);
if (diff)
return diff; // Strings don't match
break;
@@ -169,31 +233,67 @@ int TDCollateJSON(void *context,
#if DEBUG
TestCase(TDCollateScalars) {
- CAssertEq(TDCollateJSON(NULL, 0, "true", 0, "false"), 1);
- CAssertEq(TDCollateJSON(NULL, 0, "false", 0, "true"), -1);
- CAssertEq(TDCollateJSON(NULL, 0, "null", 0, "17"), -1);
- CAssertEq(TDCollateJSON(NULL, 0, "123", 0, "1"), 1);
- CAssertEq(TDCollateJSON(NULL, 0, "123", 0, "0123.0"), 0);
- CAssertEq(TDCollateJSON(NULL, 0, "123", 0, "\"123\""), -1);
- CAssertEq(TDCollateJSON(NULL, 0, "\"1234\"", 0, "\"123\""), 1);
- CAssertEq(TDCollateJSON(NULL, 0, "\"1234\"", 0, "\"1235\""), -1);
- CAssertEq(TDCollateJSON(NULL, 0, "\"1234\"", 0, "\"1234\""), 0);
- CAssertEq(TDCollateJSON(NULL, 0, "\"12\"34\"", 0, "\"1234\""), -1);
+ void* mode = kTDCollateJSON_Unicode;
+ CAssertEq(TDCollateJSON(mode, 0, "true", 0, "false"), 1);
+ CAssertEq(TDCollateJSON(mode, 0, "false", 0, "true"), -1);
+ CAssertEq(TDCollateJSON(mode, 0, "null", 0, "17"), -1);
+ CAssertEq(TDCollateJSON(mode, 0, "123", 0, "1"), 1);
+ CAssertEq(TDCollateJSON(mode, 0, "123", 0, "0123.0"), 0);
+ CAssertEq(TDCollateJSON(mode, 0, "123", 0, "\"123\""), -1);
+ CAssertEq(TDCollateJSON(mode, 0, "\"1234\"", 0, "\"123\""), 1);
+ CAssertEq(TDCollateJSON(mode, 0, "\"1234\"", 0, "\"1235\""), -1);
+ CAssertEq(TDCollateJSON(mode, 0, "\"1234\"", 0, "\"1234\""), 0);
+ CAssertEq(TDCollateJSON(mode, 0, "\"12\\q34\"", 0, "\"12q34\""), 0);
+ CAssertEq(TDCollateJSON(mode, 0, "\"\\q1234\"", 0, "\"q1234\""), 0);
+ CAssertEq(TDCollateJSON(mode, 0, "\"1234\\q\"", 0, "\"1234q\""), 0);
+ CAssertEq(TDCollateJSON(mode, 0, "\"a\"", 0, "\"A\""), -1);
+ CAssertEq(TDCollateJSON(mode, 0, "\"A\"", 0, "\"aa\""), -1);
+ CAssertEq(TDCollateJSON(mode, 0, "\"B\"", 0, "\"aa\""), 1);
+}
+
+// Exercises TDCollateJSON in kTDCollateJSON_ASCII mode.
+TestCase(TDCollateASCII) {
+ void* mode = kTDCollateJSON_ASCII;
+ // Scalars of different types, and numbers compared by value rather than by text:
+ CAssertEq(TDCollateJSON(mode, 0, "true", 0, "false"), 1);
+ CAssertEq(TDCollateJSON(mode, 0, "false", 0, "true"), -1);
+ CAssertEq(TDCollateJSON(mode, 0, "null", 0, "17"), -1);
+ CAssertEq(TDCollateJSON(mode, 0, "123", 0, "1"), 1);
+ CAssertEq(TDCollateJSON(mode, 0, "123", 0, "0123.0"), 0);
+ CAssertEq(TDCollateJSON(mode, 0, "123", 0, "\"123\""), -1);
+ // Strings: a longer string sorts after its own prefix.
+ CAssertEq(TDCollateJSON(mode, 0, "\"1234\"", 0, "\"123\""), 1);
+ CAssertEq(TDCollateJSON(mode, 0, "\"1234\"", 0, "\"1235\""), -1);
+ CAssertEq(TDCollateJSON(mode, 0, "\"1234\"", 0, "\"1234\""), 0);
+ // A backslash-escaped character must compare equal to the bare character,
+ // whether it is in the middle, at the start or at the end of the string.
+ CAssertEq(TDCollateJSON(mode, 0, "\"12\\q34\"", 0, "\"12q34\""), 0);
+ CAssertEq(TDCollateJSON(mode, 0, "\"\\q1234\"", 0, "\"q1234\""), 0);
+ CAssertEq(TDCollateJSON(mode, 0, "\"1234\\q\"", 0, "\"1234q\""), 0);
+ // Upper-case letters are expected to sort before lower-case ones in this mode.
+ CAssertEq(TDCollateJSON(mode, 0, "\"A\"", 0, "\"a\""), -1);
+ CAssertEq(TDCollateJSON(mode, 0, "\"B\"", 0, "\"a\""), -1);
+}
+
+// Exercises TDCollateJSON in kTDCollateJSON_Raw mode, where values of different
+// types are ranked through kRawOrderOfValueType.
+TestCase(TDCollateRaw) {
+ void* mode = kTDCollateJSON_Raw;
+ // Numbers are expected to sort before false, and false before null before true:
+ CAssertEq(TDCollateJSON(mode, 0, "false", 0, "17"), 1);
+ CAssertEq(TDCollateJSON(mode, 0, "false", 0, "true"), -1);
+ CAssertEq(TDCollateJSON(mode, 0, "null", 0, "true"), -1);
+ // An array is expected to sort before a string:
+ CAssertEq(TDCollateJSON(mode, 0, "[\"A\"]", 0, "\"A\""), -1);
+ // Same-type comparisons: upper case before lower case, shorter array before longer.
+ CAssertEq(TDCollateJSON(mode, 0, "\"A\"", 0, "\"a\""), -1);
+ CAssertEq(TDCollateJSON(mode, 0, "[\"b\"]", 0, "[\"b\",\"c\",\"a\"]"), -1);
}
TestCase(TDCollateArrays) {
- CAssertEq(TDCollateJSON(NULL, 0, "[]", 0, "\"foo\""), 1);
- CAssertEq(TDCollateJSON(NULL, 0, "[]", 0, "[]"), 0);
- CAssertEq(TDCollateJSON(NULL, 0, "[true]", 0, "[true]"), 0);
- CAssertEq(TDCollateJSON(NULL, 0, "[false]", 0, "[null]"), 1);
- CAssertEq(TDCollateJSON(NULL, 0, "[]", 0, "[null]"), -1);
- CAssertEq(TDCollateJSON(NULL, 0, "[123]", 0, "[45]"), 1);
- CAssertEq(TDCollateJSON(NULL, 0, "[123]", 0, "[45,67]"), 1);
- CAssertEq(TDCollateJSON(NULL, 0, "[123.4,\"wow\"]", 0, "[123.40,789]"), 1);
+ void* mode = kTDCollateJSON_Unicode;
+ CAssertEq(TDCollateJSON(mode, 0, "[]", 0, "\"foo\""), 1);
+ CAssertEq(TDCollateJSON(mode, 0, "[]", 0, "[]"), 0);
+ CAssertEq(TDCollateJSON(mode, 0, "[true]", 0, "[true]"), 0);
+ CAssertEq(TDCollateJSON(mode, 0, "[false]", 0, "[null]"), 1);
+ CAssertEq(TDCollateJSON(mode, 0, "[]", 0, "[null]"), -1);
+ CAssertEq(TDCollateJSON(mode, 0, "[123]", 0, "[45]"), 1);
+ CAssertEq(TDCollateJSON(mode, 0, "[123]", 0, "[45,67]"), 1);
+ CAssertEq(TDCollateJSON(mode, 0, "[123.4,\"wow\"]", 0, "[123.40,789]"), 1);
}
TestCase(TDCollateNestedArrays) {
- CAssertEq(TDCollateJSON(NULL, 0, "[[]]", 0, "[]"), 1);
- CAssertEq(TDCollateJSON(NULL, 0, "[1,[2,3],4]", 0, "[1,[2,3.1],4,5,6]"), -1);
+ void* mode = kTDCollateJSON_Unicode;
+ CAssertEq(TDCollateJSON(mode, 0, "[[]]", 0, "[]"), 1);
+ CAssertEq(TDCollateJSON(mode, 0, "[1,[2,3],4]", 0, "[1,[2,3.1],4,5,6]"), -1);
}
#endif
View
9 Source/TDDatabase.m
@@ -87,8 +87,13 @@ - (BOOL) open {
if (![_fmdb open])
return NO;
- // Register CouchDB-compatible JSON collation function:
- sqlite3_create_collation(_fmdb.sqliteHandle, "JSON", SQLITE_UTF8, self, TDCollateJSON);
+ // Register CouchDB-compatible JSON collation functions:
+ sqlite3_create_collation(_fmdb.sqliteHandle, "JSON", SQLITE_UTF8,
+ kTDCollateJSON_Unicode, TDCollateJSON);
+ sqlite3_create_collation(_fmdb.sqliteHandle, "JSON_RAW", SQLITE_UTF8,
+ kTDCollateJSON_Raw, TDCollateJSON);
+ sqlite3_create_collation(_fmdb.sqliteHandle, "JSON_ASCII", SQLITE_UTF8,
+ kTDCollateJSON_ASCII, TDCollateJSON);
// Stuff we need to initialize every time the database opens:
if (![self initialize: @"PRAGMA foreign_keys = ON;"])
View
2  Source/TDPuller_Tests.m
@@ -20,7 +20,7 @@
#import "Test.h"
-#if DEBUG
+#if XXXDEBUG
static id pull(TDDatabase* db, NSString* urlStr, id lastSequence) {
NSURL* remote = [NSURL URLWithString: urlStr];
View
10 Source/TDView.h
@@ -45,6 +45,13 @@ typedef struct TDQueryOptions {
extern const TDQueryOptions kDefaultTDQueryOptions;
+/** Key-ordering modes a TDView can apply when it is queried. */
+typedef enum {
+ // Default: the query gets no explicit COLLATE clause.
+ kTDViewCollationUnicode,
+ // The query compares and orders keys with " COLLATE JSON_RAW".
+ kTDViewCollationRaw,
+ // The query compares and orders keys with " COLLATE JSON_ASCII".
+ kTDViewCollationASCII
+} TDViewCollation;
+
+
/** An external object that knows how to map source code of some sort into executable functions. */
@protocol TDViewCompiler <NSObject>
- (TDMapBlock) compileMapFunction: (NSString*)mapSource language: (NSString*)language;
@@ -61,6 +68,7 @@ extern const TDQueryOptions kDefaultTDQueryOptions;
int _viewID;
TDMapBlock _mapBlock;
TDReduceBlock _reduceBlock;
+ TDViewCollation _collation;
}
- (void) deleteView;
@@ -71,6 +79,8 @@ extern const TDQueryOptions kDefaultTDQueryOptions;
@property (readonly) TDMapBlock mapBlock;
@property (readonly) TDReduceBlock reduceBlock;
+@property TDViewCollation collation;
+
- (BOOL) setMapBlock: (TDMapBlock)mapBlock
reduceBlock: (TDReduceBlock)reduceBlock
version: (NSString*)version;
View
18 Source/TDView.m
@@ -59,7 +59,8 @@ - (void)dealloc {
}
-@synthesize database=_db, name=_name, mapBlock=_mapBlock, reduceBlock=_reduceBlock;
+@synthesize database=_db, name=_name, mapBlock=_mapBlock, reduceBlock=_reduceBlock,
+ collation=_collation;
- (int) viewID {
@@ -221,6 +222,8 @@ - (TDStatus) updateIndex {
// Reconstitute the document as a dictionary:
sequence = [r longLongIntForColumnIndex: 1];
NSString* docID = [r stringForColumnIndex: 2];
+ if ([docID hasPrefix: @"_design/"]) // design docs don't get indexed!
+ continue;
NSString* revID = [r stringForColumnIndex: 3];
NSData* json = [r dataForColumnIndex: 4];
NSDictionary* properties = [_db documentPropertiesFromJSON: json
@@ -270,6 +273,14 @@ - (FMResultSet*) resultSetWithOptions: (const TDQueryOptions*)options
*outStatus = [self updateIndex];
if (*outStatus >= 300)
return nil;
+
+ // OPT: It would be faster to use separate tables for raw- or ASCII-collated views so that
+ // they could be indexed with the right collation, instead of having to specify it here.
+ NSString* collationStr = @"";
+ if (_collation == kTDViewCollationASCII)
+ collationStr = @" COLLATE JSON_ASCII";
+ else if (_collation == kTDViewCollationRaw)
+ collationStr = @" COLLATE JSON_RAW";
NSMutableString* sql = [NSMutableString stringWithString: @"SELECT key, value, docid"];
if (options->includeDocs)
@@ -298,15 +309,18 @@ - (FMResultSet*) resultSetWithOptions: (const TDQueryOptions*)options
}
if (minKey) {
[sql appendString: (inclusiveMin ? @" AND key >= ?" : @" AND key > ?")];
+ [sql appendString: collationStr];
[args addObject: toJSONString(minKey)];
}
if (maxKey) {
[sql appendString: (inclusiveMax ? @" AND key <= ?" : @" AND key < ?")];
+ [sql appendString: collationStr];
[args addObject: toJSONString(maxKey)];
}
[sql appendString: @" AND revs.sequence = maps.sequence AND docs.doc_id = revs.doc_id "
"ORDER BY key"];
+ [sql appendString: collationStr];
if (options->descending)
[sql appendString: @" DESC"];
if (options->limit != kDefaultTDQueryOptions.limit) {
@@ -318,6 +332,8 @@ - (FMResultSet*) resultSetWithOptions: (const TDQueryOptions*)options
[args addObject: $object(options->skip)];
}
+ LogTo(View, @"Query %@: %@", _name, sql);
+
FMResultSet* r = [_db.fmdb executeQuery: sql withArgumentsInArray: args];
if (!r)
*outStatus = 500;
View
88 Source/TDView_Tests.m
@@ -388,4 +388,92 @@
}
+// Stores one document per key (in the expected default-collation order), indexes them by that
+// key, and checks that an unrestricted query returns the keys in the same order.
+TestCase(TDView_Collation) {
+    // Based on CouchDB's "view_collation.js" test
+    NSArray* testKeys = [NSArray arrayWithObjects: $null,
+                                                   $false,
+                                                   $true,
+                                                   $object(0),
+                                                   $object(2.5),
+                                                   $object(10),
+                                                   @" ", @"_", @"~",
+                                                   @"a",
+                                                   @"A",
+                                                   @"aa",
+                                                   @"b",
+                                                   @"B",
+                                                   @"ba",
+                                                   @"bb",
+                                                   $array(@"a"),
+                                                   $array(@"b"),
+                                                   $array(@"b", @"c"),
+                                                   $array(@"b", @"c", @"a"),
+                                                   $array(@"b", @"d"),
+                                                   $array(@"b", @"d", @"e"), nil];
+    RequireTestCase(TDView_Query);
+    TDDatabase *db = [TDDatabase createEmptyDBAtPath: @"/tmp/TouchDB_ViewTest.touchdb"];
+    int i = 0;
+    for (id key in testKeys)
+        putDoc(db, $dict({@"_id", $sprintf(@"%d", i++)}, {@"name", key}));
+
+    TDView* view = [db viewNamed: @"default/names"];
+    [view setMapBlock: ^(NSDictionary* doc, TDMapEmitBlock emit) {
+        emit([doc objectForKey: @"name"], nil);
+    } reduceBlock: NULL version:@"1.0"];
+
+    TDQueryOptions options = kDefaultTDQueryOptions;
+    TDStatus status;
+    NSArray* rows = [view queryWithOptions: &options status: &status];
+    CAssertEq(status, 200);
+    // Without this check an empty or truncated result would pass vacuously, and an
+    // over-long one would index past the end of testKeys in the loop below.
+    CAssertEq([rows count], [testKeys count]);
+    i = 0;
+    for (NSDictionary* row in rows)
+        CAssertEqual([row objectForKey: @"key"], [testKeys objectAtIndex: i++]);
+}
+
+
+// Same as the default-collation view test, but with the view switched to raw collation;
+// testKeys lists the keys in the order a raw-collated query is expected to return them.
+TestCase(TDView_CollationRaw) {
+    NSArray* testKeys = [NSArray arrayWithObjects: $object(0),
+                                                   $object(2.5),
+                                                   $object(10),
+                                                   $false,
+                                                   $null,
+                                                   $true,
+                                                   $array(@"a"),
+                                                   $array(@"b"),
+                                                   $array(@"b", @"c"),
+                                                   $array(@"b", @"c", @"a"),
+                                                   $array(@"b", @"d"),
+                                                   $array(@"b", @"d", @"e"),
+                                                   @" ",
+                                                   @"A",
+                                                   @"B",
+                                                   @"_",
+                                                   @"a",
+                                                   @"aa",
+                                                   @"b",
+                                                   @"ba",
+                                                   @"bb",
+                                                   @"~", nil];
+    RequireTestCase(TDView_Query);
+    TDDatabase *db = [TDDatabase createEmptyDBAtPath: @"/tmp/TouchDB_ViewTest.touchdb"];
+    int i = 0;
+    for (id key in testKeys)
+        putDoc(db, $dict({@"_id", $sprintf(@"%d", i++)}, {@"name", key}));
+
+    TDView* view = [db viewNamed: @"default/names"];
+    [view setMapBlock: ^(NSDictionary* doc, TDMapEmitBlock emit) {
+        emit([doc objectForKey: @"name"], nil);
+    } reduceBlock: NULL version:@"1.0"];
+    view.collation = kTDViewCollationRaw;
+
+    TDQueryOptions options = kDefaultTDQueryOptions;
+    TDStatus status;
+    NSArray* rows = [view queryWithOptions: &options status: &status];
+    CAssertEq(status, 200);
+    // Without this check an empty or truncated result would pass vacuously, and an
+    // over-long one would index past the end of testKeys in the loop below.
+    CAssertEq([rows count], [testKeys count]);
+    i = 0;
+    for (NSDictionary* row in rows)
+        CAssertEqual([row objectForKey: @"key"], [testKeys objectAtIndex: i++]);
+}
+
+
#endif
Please sign in to comment.
Something went wrong with that request. Please try again.