Permalink
Browse files

Fixed encoding issue with some sites which say they are UTF-8 but are actually Mac OS Roman; also fixed issue with relative image paths and links
  • Loading branch information...
1 parent 8d124f6 commit 6a12500bee91c08f766f8b30f52a679d2bb4ebd1 @curthard89 committed Sep 2, 2011
Showing with 81 additions and 5 deletions.
  1. +1 −1 GGReadability/GGReadability.h
  2. +80 −4 GGReadability/GGReadability.m
@@ -56,7 +56,7 @@
// Block signatures used when init'ing a GGReadability instance with blocks.
// Completion handler: handed a single string (named parsedString).
typedef void (^GGReadabilityCompletionHandler)(NSString * parsedString);
// Error handler: handed the NSError for the failure.
typedef void (^GGReadabilityErrorHandler)(NSError * error);
// URL handler: called with the downloaded page string and the address of an
// NSError. The caller copies the returned string and uses it in place of the
// downloaded string, unless the handler set *error to a non-nil value, in which
// case the returned string is discarded and the downloaded string is kept.
-typedef NSString * (^GGReadabilityURLHandler)(NSString * parsedString);
+typedef NSString * (^GGReadabilityURLHandler)(NSString * parsedString, NSError ** error);
@interface GGReadability : NSObject <NSURLConnectionDelegate> {
@@ -51,6 +51,10 @@ - (void)parseString:(NSString *)string
type:(NSXMLDocumentContentKind)type
useWordCount:(BOOL)flag;
+// fixes relative urls to absolute
+- (void)fixRelativeURLForElement:(NSXMLElement *)element
+ attribute:(NSString *)attribute;
+
// cleans up the given element
- (void)cleanElement:(NSXMLElement *)element;
@@ -105,6 +109,10 @@ @implementation GGReadability
// document format option sets handed to -parseString:type:useWordCount:
#define DOC_FORMAT_XML NSXMLDocumentXMLKind|NSXMLDocumentTidyXML
#define DOC_FORMAT_HTML NSXMLDocumentHTMLKind|NSXMLDocumentTidyHTML
#define DOC_FORMAT_XHTML NSXMLDocumentXHTMLKind|NSXMLDocumentTidyHTML
+#define DOC_FORMAT_NONE 0
+
+// URL path delimiter
+#define URL_DELIMINATOR @"/"
// error domain: "com.geekygoodness." followed by the class name.
// The format string previously had no %@ specifier, so the [self class]
// argument was silently ignored and the domain was just "com.geekygoodness.".
#define ERROR_DOMAIN [NSString stringWithFormat:@"com.geekygoodness.%@",[self class]]
@@ -279,18 +287,36 @@ - (void)connectionDidFinishLoading:(NSURLConnection *)connection
{
dispatch_queue_t queue = dispatch_queue_create( "com.geekygoodness.ggreadability", NULL );
dispatch_async( queue, ^(void){
-
+
NSString * str = [[NSString alloc] initWithData:responseData
encoding:NSUTF8StringEncoding];
+ if( str == NULL || str == nil )
+ {
+ // try mac roman
+ str = [[NSString alloc] initWithData:responseData
+ encoding:NSMacOSRomanStringEncoding];
+ }
+
// is there a handler?
GGReadabilityURLHandler handler = nil;
if( ( handler = [self URLHandlerForURL:[response URL]] ) != nil )
{
// call the handler and set the contents
+
+ NSError * error = nil;
+ NSString * tempStr = [handler( str, &error ) copy];
- str = [handler( [str autorelease] ) copy];
+ // if error, just use standard parsed string
+
+ if( error != nil )
+ {
+ [tempStr release];
+ } else {
+ [str release], str = nil;
+ str = tempStr;
+ }
}
// due to the html might not be valid, we try just standard XML to begin with
@@ -323,22 +349,31 @@ - (void)parseTryUsingWordCount:(BOOL)flag
[self parseString:str
type:DOC_FORMAT_XML
useWordCount:flag];
- if( [self contents] == NULL )
+ if( [self contents] == NULL || [[self contents] length] == 0 )
{
// then if the xml is null we try standard html
[self parseString:str
type:DOC_FORMAT_HTML
useWordCount:flag];
- if( [self contents] == NULL )
+ if( [self contents] == NULL || [[self contents] length] == 0 )
{
// and if the html is null we try xhtml
[self parseString:str
type:DOC_FORMAT_XHTML
useWordCount:flag];
+
+ if( [self contents] == NULL || [[self contents] length] == 0 )
+ {
+ // now no options
+ [self parseString:str
+ type:DOC_FORMAT_NONE
+ useWordCount:flag];
+ }
+
}
}
}
@@ -689,6 +724,47 @@ - (void)cleanElement:(NSXMLElement *)element
}
[element removeChildAtIndex:[div index]];
}
+
+ // sort out image tags url's
+ NSArray * images = [element nodesForXPath:@"//img[not(contains(@src,'http'))]"
+ error:&error];
+ for( NSXMLElement * img in images )
+ {
+ [self fixRelativeURLForElement:img
+ attribute:@"src"];
+ }
+
+ // sort out link urls
+ NSArray * links = [element nodesForXPath:@"//a[not(contains(@href,'http'))]"
+ error:&error];
+ for( NSXMLElement * link in links )
+ {
+ [self fixRelativeURLForElement:link
+ attribute:@"href"];
+ }
+
+}
+
+/**
+ * Rewrites the value of `attribute` on `element` into an absolute URL,
+ * resolved against the URL of the received response.
+ *
+ * Resolution is delegated to NSURL rather than done by string concatenation,
+ * so host-relative ("/a.png"), document-relative ("a.png", "../a.png"),
+ * protocol-relative ("//cdn/a.png"), fragment ("#top") and already-absolute
+ * values (including non-http schemes such as "mailto:") are all handled, and
+ * the port of the page URL is preserved.
+ *
+ * The attribute is left untouched when it is missing or empty, when the
+ * response has no URL, or when the value cannot be parsed as a URL.
+ *
+ * @param element   the element whose attribute should be fixed
+ * @param attribute the name of the attribute holding the URL (e.g. @"src")
+ */
+- (void)fixRelativeURLForElement:(NSXMLElement *)element
+                       attribute:(NSString *)attribute
+{
+    NSXMLNode * attributeNode = [element attributeForName:attribute];
+
+    // surrounding whitespace is not part of the URL and would make NSURL reject it
+    NSString * src = [[attributeNode stringValue] stringByTrimmingCharactersInSet:[NSCharacterSet whitespaceAndNewlineCharacterSet]];
+    if( [src length] == 0 )
+    {
+        return;
+    }
+
+    // without a base URL there is nothing to resolve against
+    NSURL * baseURL = [response URL];
+    if( baseURL == nil )
+    {
+        return;
+    }
+
+    // resolve relative to the page we loaded
+    NSURL * resolvedURL = [NSURL URLWithString:src
+                                 relativeToURL:baseURL];
+    NSString * newSRC = [resolvedURL absoluteString];
+    if( [newSRC length] == 0 )
+    {
+        // malformed value, keep what the page gave us rather than writing garbage
+        return;
+    }
+
+    [attributeNode setStringValue:newSRC];
+}
- (void)replaceElementsForXPath:(NSString *)path

0 comments on commit 6a12500

Please sign in to comment.