Browse files

New API: Get Post Data by ID

  • Loading branch information...
1 parent f161b67 commit d68e42904868dc0a3138e5091cf3ee7a23fe76a0 @glebpopoff committed Jan 25, 2013
Showing with 247 additions and 2 deletions.
  1. +18 −0 APIContent.py
  2. +173 −0 APIUtils.py
  3. +1 −1 AppConfig.py
  4. +52 −0 GetHNPostHandler.py
  5. +3 −1 main.py
View
18 APIContent.py
@@ -25,6 +25,24 @@ def getCache(pageId, format):
logging.error('getCache: unable to get/retrieve cache')
return None
#get post by id
def getHackerNewsPost(articleId, format='json', url='', referer='', remote_addr=''):
    """Return the formatted data for a single Hacker News item.

    The cache is consulted first under the key '/post/<articleId>'. On a
    miss the item page is parsed through APIUtils.parsePostContent and a
    non-empty result is stored via DataCache.putData before being returned.

    articleId   -- id of the item to look up
    format      -- output format handed to the cache and the parser
    url, referer, remote_addr -- passed through unchanged to DataCache.putData

    Returns the cached value, the freshly parsed value, or '' when the
    parser produced nothing.
    """
    primaryURL = "%s/item?id=%s" % (AppConfig.hackerNewsURL, articleId)
    backupURL = "%s/item?id=%s" % (AppConfig.hackerNewsURLBackup, articleId)
    cacheKey = '/post/%s' % (articleId)

    #serve from cache when we can
    cached = getCache(cacheKey, format)
    if (cached):
        return cached

    postData = APIUtils.parsePostContent(primaryURL, backupURL, '/post', None, format)
    if (not postData):
        logging.warning('getHackerNewsPost: unable to retrieve data for id %s' % cacheKey)
        return ''

    #the cached copy is stripped of non-ascii characters; the caller gets the parsed data as-is
    logging.debug('getHackerNewsPost: storing cached value for id %s' % cacheKey)
    DataCache.putData(cacheKey, format, APIUtils.removeNonAscii(postData), url, referer, remote_addr)
    return postData
#parse HN's submissions by user
def getHackerNewsSubmittedContent(user, format='json', url='', referer='', remote_addr=''):
View
173 APIUtils.py
@@ -56,6 +56,179 @@ def getRemoteData(urlStr, backupUrl):
return None
return None
#undo string escapes in a scraped value; missing values become ''
def _unescapePostText(value):
    if (not value):
        return ''
    return value.decode("string-escape")

#collect [href, text] for every <td class="title"> cell that holds a link
def _postTitleLinks(soup, limit):
    links = []
    for cell in soup('td', {'class' : 'title'}):
        if (cell.a):
            links.append([cell.a['href'], cell.a.string])
            if (limit > 0 and len(links) == limit):
                break
    return links

#collect [score, user, comments, timeAgo, itemId, rawText] for every <td class="subtext"> cell
def _postSubtextRows(soup, limit):
    scorePattern = re.compile('^score.*')
    userPattern = re.compile('^user.*')
    itemPattern = re.compile('^item.*')
    rows = []
    for cell in soup('td', {'class' : 'subtext'}):
        if (not cell):
            continue
        #parsing this
        #<td class="subtext"><span id="score_3002117">110 points</span> by <a href="user?id=JoelSutherland">JoelSutherland</a> 3 hours ago | <a href="item?id=3002117">36 comments</a></td>
        cellText = removeHtmlTags(str(cell))
        scoreNode = cell.first('span', {'id' : scorePattern})
        userNode = cell.first('a', {'href' : userPattern})
        itemNode = cell.first('a', {'href' : itemPattern})

        #any of the three elements may be absent; treat a missing one as "no value" instead of failing
        score = None
        if (scoreNode is not None):
            score = scoreNode.string
        user = None
        if (userNode is not None):
            user = userNode.string
        itemId = None
        comments = None
        if (itemNode is not None):
            itemId = itemNode["href"]
            comments = itemNode.string

        #since 'XX hours ago' string isn't part of any element we strip the known pieces to get it
        timeAgo = cellText
        if (score is not None):
            timeAgo = timeAgo.replace(str(score), '')
        if (user is not None):
            timeAgo = timeAgo.replace('by %s' % str(user), '')
        if (comments is not None):
            timeAgo = timeAgo.replace(str(comments), '')
        timeAgo = timeAgo.replace('|', '')

        rows.append([score, user, comments, timeAgo.strip(), itemId, cellText])
        if (limit > 0 and len(rows) == limit):
            break
    return rows

#render one record (title link plus optional subtext row) in the requested format
def _postRecord(link, row, apiURL, format):
    url = link[0]
    title = _unescapePostText(link[1])
    score = user = comments = timeAgo = itemId = itemInfo = ''
    if (row):
        score = _unescapePostText(row[0])
        user = _unescapePostText(row[1])
        comments = _unescapePostText(row[2])
        timeAgo = _unescapePostText(row[3])
        itemId = _unescapePostText(row[4])
        itemInfo = _unescapePostText(row[5])
    if (not itemInfo):
        #need this for formatting: the record end tag is emitted together with the description
        itemInfo = 'n/a '

    #last record (either news2 or x?fnid)
    if (title.lower() == 'more' or '/x?fnid' in url):
        title = 'NextId'
        if ('/x?fnid' in url):
            url = '%s/format/%s/page/%s' % (apiURL, format, url.replace('/x?fnid=', ''))
        else:
            url = '/news2'
        itemInfo = 'hn next id %s ' % link[0]

    if (format == 'json'):
        startTag = '{'
        endTag = '},'
        #cleanup: drop line breaks and backslash-escape double quotes
        title = title.replace("\n", "").replace("\"", "\\\"")
        itemInfo = itemInfo.replace("\"", "\\\"").replace("\n", "")
        itemInfo = itemInfo.replace("\t", " ").replace("\r", "")
        if (len(itemInfo) > 0):
            #drop the last character of the formatted field; endTag follows it directly
            itemInfo = Formatter.data(format, 'description', escape(itemInfo))[:-1]
    else:
        startTag = '<record>'
        endTag = '</record>'
        if (len(title) > 0):
            title = escape(removeNonAscii(title))
        if (len(url) > 0):
            url = escape(url)
        if (len(user) > 0):
            user = escape(user)
        if (len(itemInfo) > 0):
            itemInfo = Formatter.data(format, 'description', escape(itemInfo))

    parts = []
    if (len(title) > 0):
        parts.append(startTag + Formatter.data(format, 'title', title))
    if (len(url) > 0):
        parts.append(Formatter.data(format, 'url', url))
    if (len(score) > 0):
        parts.append(Formatter.data(format, 'score', score))
    if (len(user) > 0):
        parts.append(Formatter.data(format, 'user', user))
    if (len(comments) > 0):
        parts.append(Formatter.data(format, 'comments', comments))
    if (len(timeAgo) > 0):
        parts.append(Formatter.data(format, 'time', timeAgo))
    if (len(itemId) > 0):
        #cleanup: keep only the numeric part of 'item?id=NNN'
        parts.append(Formatter.data(format, 'item_id', itemId.replace('item?id=', '')))
    if (len(itemInfo) > 0):
        parts.append(itemInfo + endTag)
    return ''.join(parts)

#parse post data
def parsePostContent(hnAPIUrl,hnBackupAPIUrl, apiURL, page='',format='json',limit=0):
    """Fetch a Hacker News item page and return its records as a formatted string.

    hnAPIUrl       -- URL to fetch
    hnBackupAPIUrl -- fallback URL, handed to getRemoteData together with hnAPIUrl
    apiURL         -- API path used when building a 'NextId' paging link
    page           -- unused; next-page ('x?fnid') fetching is not allowed by
                      HN's robots.txt
    format         -- 'json' selects JSON records, anything else XML records
    limit          -- maximum number of title links / subtext rows to read
                      (0 means no limit)

    Returns None when nothing could be fetched, the fetched text unchanged
    when it is already output of the PHP parser, and otherwise the
    concatenated records ('' when the page has no title links).
    """
    logging.debug('HN URL: %s' % hnAPIUrl)

    #call HN website to get data (getRemoteData takes the primary and the backup URL)
    htmlData = getRemoteData(hnAPIUrl, hnBackupAPIUrl)
    if (not htmlData):
        return None

    #php parser (primary API) already returns formatted data
    if ('{"title":"' in htmlData and 'HNDroidAPI PHP Parser' in htmlData):
        return htmlData

    #classic API fallback: scrape the HTML
    soup = BeautifulSoup(htmlData)
    links = _postTitleLinks(soup, limit)
    rows = _postSubtextRows(soup, limit)

    #title links and subtext rows are paired by position
    records = []
    for index, link in enumerate(links):
        row = None
        if (index < len(rows)):
            row = rows[index]
        records.append(_postRecord(link, row, apiURL, format))
    return ''.join(records)
+
#parse content using Beautiful Soup
def parsePageContent(hnAPIUrl,hnBackupAPIUrl, apiURL, page='',format='json',limit=0):
returnData = MutableString()
View
2 AppConfig.py
@@ -21,6 +21,6 @@
hackerNewsRSSFeed = 'http://tinyurl.com/6y37ehb' # http://news.ycombinator.com/rss
hackerNewsBgroundBrightness = 0xf6 # HN pages' background color "R" value (out of "RGB")
-googleAnalyticsKey = 'UA-24252811-3' # UA-24252811-3
+googleAnalyticsKey = 'XYZ' # your GA code
dataExpirationPolicy = '180' # in seconds
appDomain = 'hndroidapi.appspot.com' # hndroidapi.appspot.com
View
52 GetHNPostHandler.py
@@ -0,0 +1,52 @@
+#!/usr/bin/env python
+#
+# Hacker News Droid API: returns post data by ID
+#
+
+import os
+import re
+import logging
+import datetime
+import time
+from UserString import MutableString
+from google.appengine.api import urlfetch
+from google.appengine.ext import webapp
+from google.appengine.ext import db
+from google.appengine.ext.webapp import util
+import Formatter
+import AppConfig
+import GAHelper
+from xml.sax.saxutils import escape
+import APIContent
+import GAHelper
+from BeautifulSoup import BeautifulSoup
+
class HackerNewsPostHandler(webapp.RequestHandler):
    """Request handler that writes out the data of one Hacker News post."""

    #controller main entry
    def get(self,format,id):
        """Handle GET for a post lookup.

        format -- output format captured from the URL; used for the
                  Content-Type header, the lookup and the response wrapper
        id     -- post id captured from the URL
        """
        request = self.request
        response = self.response

        #set content-type
        response.headers['Content-Type'] = Formatter.contentType(format)

        #consumer/client app id; 'app' takes precedence over 'appid' when both are present
        appid = 'Unknown'
        params = request.GET
        if (params):
            for name in ('appid', 'app'):
                if (name in params):
                    appid = params[name]

        referer = os.environ.get('HTTP_REFERER', '')

        postData = APIContent.getHackerNewsPost(id, format, request.url, referer, request.remote_addr)

        #track this request
        GAHelper.trackGARequests('/post/%s' % (id), appid, referer)

        #output to the browser; an empty lookup result is written as ''
        response.out.write(Formatter.dataWrapper(format, postData or '', request.get('callback')))
View
4 main.py
@@ -18,6 +18,7 @@
from GetHNNestedCommentsHandler import HackerNewsNestedCommentsHandler
from GetHNLatestHandler import HackerNewsLatestPageHandler
from SandboxController import HackerNewsSandboxHandler
+from GetHNPostHandler import HackerNewsPostHandler
class MainHandler(webapp.RequestHandler):
def get(self):
@@ -51,7 +52,8 @@ def main():
(r'/ask/format/(json|xml)/page/(.*)', HackerNewsAskHandler),
(r'/submitted/format/(json|xml)/user/(.*)', HackerNewsSubmittedHandler),
(r'/comments/format/(json|xml)/id/(.*)', HackerNewsCommentsHandler),
- (r'/nestedcomments/format/(json|xml)/id/(.*)', HackerNewsNestedCommentsHandler)
+ (r'/nestedcomments/format/(json|xml)/id/(.*)', HackerNewsNestedCommentsHandler),
+ (r'/post/format/(json|xml)/id/(.*)', HackerNewsPostHandler)
],
debug=True)
util.run_wsgi_app(application)

0 comments on commit d68e429

Please sign in to comment.