Browse files

Parsing performance

  • Loading branch information...
1 parent 9e3acd0 commit c01a2b34eaa8e4b337a4472d535ff3e255211ba1 @davedelong committed Mar 31, 2011
Showing with 1,000,053 additions and 24 deletions.
  1. +1 −0 CHCSVParser.h
  2. +12 −7 CHCSVParser.m
  3. +1,000,000 −0 giant.csv
  4. +40 −17 main.m
View
1 CHCSVParser.h
@@ -37,6 +37,7 @@
BOOL hasStarted;
NSString * delimiter;
+ unichar delimiterCharacter;
NSMutableData * currentChunk;
NSMutableString * currentChunkString;
View
19 CHCSVParser.m
@@ -24,10 +24,13 @@ of this software and associated documentation files (the "Software"), to deal
**/
#import "CHCSVParser.h"
-#define CHUNK_SIZE 32
+#define CHUNK_SIZE 2048
#define STRING_QUOTE @"\""
#define STRING_BACKSLASH @"\\"
+#define UNICHAR_QUOTE '"'
+#define UNICHAR_BACKSLASH '\\'
+
enum {
CHCSVParserStateInsideFile = 0,
CHCSVParserStateInsideLine = 1,
@@ -246,6 +249,7 @@ - (void) setDelimiter:(NSString *)newDelimiter {
if (newDelimiter != delimiter) {
[delimiter release];
delimiter = [newDelimiter copy];
+ // characterAtIndex: raises NSRangeException on an empty string, so only cache the first character when one exists (0 otherwise, same as messaging a nil delimiter).
+ delimiterCharacter = ([delimiter length] > 0) ? [delimiter characterAtIndex:0] : 0;
}
}
@@ -328,7 +332,7 @@ - (void) runParseLoop {
NSString * previousPreviousCharacter = nil;
NSAutoreleasePool * pool = [[NSAutoreleasePool alloc] init];
- unsigned short counter = 0;
+ unsigned char counter = 0;
while (error == nil &&
(currentCharacter = [self nextCharacter]) &&
@@ -341,7 +345,7 @@ - (void) runParseLoop {
previousCharacter = currentCharacter;
counter++;
- if (counter == 0) { //this happens every 65,536 (2**16) iterations when the unsigned short overflows
+ if (counter == 0) { //this happens every 256 (2**8) iterations when the unsigned char overflows
[currentCharacter retain];
[previousCharacter retain];
[previousPreviousCharacter retain];
@@ -378,8 +382,9 @@ - (void) processComposedCharacter:(NSString *)currentCharacter previousCharacter
unichar currentUnichar = [currentCharacter characterAtIndex:0];
unichar previousUnichar = [previousCharacter characterAtIndex:0];
+ unichar previousPreviousUnichar = [previousPreviousCharacter characterAtIndex:0];
- if ([currentCharacter isEqual:STRING_QUOTE]) {
+ if (currentUnichar == UNICHAR_QUOTE) {
if (state == CHCSVParserStateInsideLine) {
//beginning a quoted field
[self beginCurrentField];
@@ -391,7 +396,7 @@ - (void) processComposedCharacter:(NSString *)currentCharacter previousCharacter
balancedQuotes = !balancedQuotes;
}
}
- } else if ([currentCharacter isEqual:delimiter]) {
+ } else if (currentUnichar == delimiterCharacter) {
if (state == CHCSVParserStateInsideLine) {
[self beginCurrentField];
[self finishCurrentField];
@@ -402,7 +407,7 @@ - (void) processComposedCharacter:(NSString *)currentCharacter previousCharacter
[self finishCurrentField];
}
}
- } else if ([currentCharacter isEqual:STRING_BACKSLASH]) {
+ } else if (currentUnichar == UNICHAR_BACKSLASH) {
if (state == CHCSVParserStateInsideField) {
balancedEscapes = !balancedEscapes;
} else if (state == CHCSVParserStateInsideLine) {
@@ -419,7 +424,7 @@ - (void) processComposedCharacter:(NSString *)currentCharacter previousCharacter
}
}
} else {
- if ([previousCharacter isEqual:STRING_QUOTE] && [previousPreviousCharacter isEqual:STRING_BACKSLASH] == NO && balancedQuotes == YES && balancedEscapes == YES) {
+ // A quote followed by an ordinary character is malformed unless that quote was preceded by a backslash; this keeps the rule of the isEqual:STRING_BACKSLASH check being replaced.
+ if (previousUnichar == UNICHAR_QUOTE && previousPreviousUnichar != UNICHAR_BACKSLASH && balancedQuotes == YES && balancedEscapes == YES) {
NSString * reason = [NSString stringWithFormat:@"Invalid CSV format on line #%lu immediately after \"%@\"", currentLine, currentField];
error = [[NSError alloc] initWithDomain:@"com.davedelong.csv" code:0 userInfo:[NSDictionary dictionaryWithObject:reason forKey:NSLocalizedDescriptionKey]];
return;
View
1,000,000 giant.csv
1,000,000 additions, 0 deletions not shown because the diff is too large. Please use a local Git client to view these changes.
View
57 main.m
@@ -6,19 +6,19 @@ @interface Delegate : NSObject <CHCSVParserDelegate>
@implementation Delegate
- (void) parser:(CHCSVParser *)parser didStartDocument:(NSString *)csvFile {
- NSLog(@"parser started: %@", csvFile);
+// NSLog(@"parser started: %@", csvFile);
}
- (void) parser:(CHCSVParser *)parser didStartLine:(NSUInteger)lineNumber {
- NSLog(@"Starting line: %lu", lineNumber);
+// NSLog(@"Starting line: %lu", lineNumber);
}
- (void) parser:(CHCSVParser *)parser didReadField:(NSString *)field {
- NSLog(@" field: %@", field);
+// NSLog(@" field: %@", field);
}
- (void) parser:(CHCSVParser *)parser didEndLine:(NSUInteger)lineNumber {
- NSLog(@"Ending line: %lu", lineNumber);
+// NSLog(@"Ending line: %lu", lineNumber);
}
- (void) parser:(CHCSVParser *)parser didEndDocument:(NSString *)csvFile {
- NSLog(@"parser ended: %@", csvFile);
+// NSLog(@"parser ended: %@", csvFile);
}
- (void) parser:(CHCSVParser *)parser didFailWithError:(NSError *)error {
NSLog(@"ERROR: %@", error);
@@ -29,20 +29,23 @@ - (void) parser:(CHCSVParser *)parser didFailWithError:(NSError *)error {
int main (int argc, const char * argv[]) {
NSAutoreleasePool * pool = [[NSAutoreleasePool alloc] init];
- NSString * file = @"/Users/dave/Developer/Open Source/Git Projects/CHCSVParser/test.tsv";
- NSStringEncoding encoding = 0;
- CHCSVParser * p = [[CHCSVParser alloc] initWithContentsOfCSVFile:file usedEncoding:&encoding error:nil];
- [p setDelimiter:@"\t"];
-
- NSLog(@"encoding: %@", CFStringGetNameOfEncoding(CFStringConvertNSStringEncodingToEncoding(encoding)));
-
- Delegate * d = [[Delegate alloc] init];
- [p setParserDelegate:d];
+ NSString * file = @"/Users/dave/Developer/Open Source/Git Projects/CHCSVParser/giant.csv";
- [p parse];
+ /**
+ CHCSVWriter *big = [[CHCSVWriter alloc] initWithCSVFile:file atomic:NO];
+ for (int i = 0; i < 1000000; ++i) {
+ NSAutoreleasePool *inner = [[NSAutoreleasePool alloc] init];
+ for (int j = 0; j < 10; ++j) {
+ [big writeField:[NSString stringWithFormat:@"%d-%d", i, j]];
+ }
+ [big writeLine];
+ [inner drain];
+ }
+ [big closeFile];
+ [big release];
+ **/
- [d release];
- [p release];
+ /**
NSError * error = nil;
NSArray * rows = [[NSArray alloc] initWithContentsOfCSVFile:file usedEncoding:&encoding delimiter:@"\t" error:&error];
@@ -63,6 +66,26 @@ int main (int argc, const char * argv[]) {
[w release];
[rows release];
+ **/
+
+ NSLog(@"Beginning...");
+ NSStringEncoding encoding = 0;
+ CHCSVParser * p = [[CHCSVParser alloc] initWithContentsOfCSVFile:file usedEncoding:&encoding error:nil];
+
+ NSLog(@"encoding: %@", CFStringGetNameOfEncoding(CFStringConvertNSStringEncodingToEncoding(encoding)));
+
+ Delegate * d = [[Delegate alloc] init];
+ [p setParserDelegate:d];
+
+ NSTimeInterval start = [NSDate timeIntervalSinceReferenceDate];
+ [p parse];
+ NSTimeInterval end = [NSDate timeIntervalSinceReferenceDate];
+
+ NSLog(@"raw difference: %f", (end-start));
+
+ [d release];
+ [p release];
+
[pool drain];
return 0;
}

0 comments on commit c01a2b3

Please sign in to comment.