Skip to content

Commit

Permalink
fixed encoding issue with some sites which say they are UTF8 and actu…
Browse files Browse the repository at this point in the history
…ally Mac OS Roman, also fixed issue with relative image paths and links
  • Loading branch information
curthard89 committed Sep 2, 2011
1 parent 8d124f6 commit 6a12500
Show file tree
Hide file tree
Showing 2 changed files with 81 additions and 5 deletions.
2 changes: 1 addition & 1 deletion GGReadability/GGReadability.h
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@
// use these for init'ing with blocks
typedef void (^GGReadabilityCompletionHandler)(NSString * parsedString);
typedef void (^GGReadabilityErrorHandler)(NSError * error);
typedef NSString * (^GGReadabilityURLHandler)(NSString * parsedString);
typedef NSString * (^GGReadabilityURLHandler)(NSString * parsedString, NSError ** error);

@interface GGReadability : NSObject <NSURLConnectionDelegate> {

Expand Down
84 changes: 80 additions & 4 deletions GGReadability/GGReadability.m
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,10 @@ - (void)parseString:(NSString *)string
type:(NSXMLDocumentContentKind)type
useWordCount:(BOOL)flag;

// fixes relative urls to absolute
- (void)fixRelativeURLForElement:(NSXMLElement *)element
attribute:(NSString *)attribute;

// cleans up the given element
- (void)cleanElement:(NSXMLElement *)element;

Expand Down Expand Up @@ -105,6 +109,10 @@ @implementation GGReadability
// Document format flags handed to -parseString:type:useWordCount:.
// Each expansion is parenthesized so the bitwise OR cannot be re-associated
// by operators surrounding the macro at its use site.
#define DOC_FORMAT_XML (NSXMLDocumentXMLKind|NSXMLDocumentTidyXML)
#define DOC_FORMAT_HTML (NSXMLDocumentHTMLKind|NSXMLDocumentTidyHTML)
#define DOC_FORMAT_XHTML (NSXMLDocumentXHTMLKind|NSXMLDocumentTidyHTML)
#define DOC_FORMAT_NONE 0

// url delimiter
#define URL_DELIMINATOR @"/"

// error domain
// The class name is appended through an explicit %@ specifier; without the
// specifier the extra argument was ignored and the domain ended in a bare ".".
// Must be expanded inside a method, since it references `self`.
#define ERROR_DOMAIN [NSString stringWithFormat:@"com.geekygoodness.%@",NSStringFromClass([self class])]
Expand Down Expand Up @@ -279,18 +287,36 @@ - (void)connectionDidFinishLoading:(NSURLConnection *)connection
{
dispatch_queue_t queue = dispatch_queue_create( "com.geekygoodness.ggreadability", NULL );
dispatch_async( queue, ^(void){

NSString * str = [[NSString alloc] initWithData:responseData
encoding:NSUTF8StringEncoding];

if( str == NULL || str == nil )
{
// try mac roman
str = [[NSString alloc] initWithData:responseData
encoding:NSMacOSRomanStringEncoding];
}

// is there a handler?

GGReadabilityURLHandler handler = nil;
if( ( handler = [self URLHandlerForURL:[response URL]] ) != nil )
{
// call the handler and set the contents

NSError * error = nil;
NSString * tempStr = [handler( str, &error ) copy];

str = [handler( [str autorelease] ) copy];
// if error, just use standard parsed string

if( error != nil )
{
[tempStr release];
} else {
[str release], str = nil;
str = tempStr;
}
}

// due to the html might not be valid, we try just standard XML to begin with
Expand Down Expand Up @@ -323,22 +349,31 @@ - (void)parseTryUsingWordCount:(BOOL)flag
[self parseString:str
type:DOC_FORMAT_XML
useWordCount:flag];
if( [self contents] == NULL )
if( [self contents] == NULL || [[self contents] length] == 0 )
{

// then if the xml is null we try standard html

[self parseString:str
type:DOC_FORMAT_HTML
useWordCount:flag];
if( [self contents] == NULL )
if( [self contents] == NULL || [[self contents] length] == 0 )
{

// and if the html is null we try xhtml

[self parseString:str
type:DOC_FORMAT_XHTML
useWordCount:flag];

if( [self contents] == NULL || [[self contents] length] == 0 )
{
// now no options
[self parseString:str
type:DOC_FORMAT_NONE
useWordCount:flag];
}

}
}
}
Expand Down Expand Up @@ -689,6 +724,47 @@ - (void)cleanElement:(NSXMLElement *)element
}
[element removeChildAtIndex:[div index]];
}

// sort out image tags url's
NSArray * images = [element nodesForXPath:@"//img[not(contains(@src,'http'))]"
error:&error];
for( NSXMLElement * img in images )
{
[self fixRelativeURLForElement:img
attribute:@"src"];
}

// sort out link urls
NSArray * links = [element nodesForXPath:@"//a[not(contains(@href,'http'))]"
error:&error];
for( NSXMLElement * link in links )
{
[self fixRelativeURLForElement:link
attribute:@"href"];
}

}

// Rewrites the URL stored in `attribute` of `element` so that it is absolute,
// resolving it against the URL of the loaded response ([response URL]).
//
// element   - the node whose attribute is rewritten in place.
// attribute - the attribute name holding the URL (e.g. @"src" or @"href").
//
// Resolution is delegated to +[NSURL URLWithString:relativeToURL:] instead of
// string concatenation, so that:
//   - "/path" is resolved against the scheme, host and port of the page,
//   - "path" is resolved against the page's directory, not appended to the
//     full page URL (file name and query included),
//   - "//host/path" keeps its own host and only inherits the scheme,
//   - values that already carry a scheme (mailto:, data:, https:, ...) are
//     left equivalent to what they were.
//
// The attribute is left untouched when it is missing or blank, when there is
// no response URL to resolve against, or when the value cannot be parsed as a
// URL at all.
- (void)fixRelativeURLForElement:(NSXMLElement *)element
                       attribute:(NSString *)attribute
{
    NSXMLNode * attributeNode = [element attributeForName:attribute];

    // surrounding whitespace is common in scraped markup and makes NSURL
    // reject an otherwise valid value
    NSString * src = [[attributeNode stringValue] stringByTrimmingCharactersInSet:
                      [NSCharacterSet whitespaceAndNewlineCharacterSet]];
    if( [src length] == 0 )
    {
        return;
    }

    // without a base there is nothing meaningful to resolve against
    NSURL * baseURL = [response URL];
    if( baseURL == nil )
    {
        return;
    }

    NSURL * resolvedURL = [NSURL URLWithString:src
                                 relativeToURL:baseURL];
    NSString * newSRC = [resolvedURL absoluteString];

    // messaging nil yields nil, so an unparsable value ends up here and the
    // original attribute value is kept as-is
    if( [newSRC length] == 0 )
    {
        return;
    }

    [attributeNode setStringValue:newSRC];
}

- (void)replaceElementsForXPath:(NSString *)path
Expand Down

0 comments on commit 6a12500

Please sign in to comment.