Permalink
Browse files

removed dependency on pyRXP in OPML processing #30

  • Loading branch information...
Fazal Majid
Fazal Majid committed Jul 6, 2013
1 parent 7f2ab8e commit be45a93ca1c549d6ea03157277a1455f037a4037
Showing with 65 additions and 48 deletions.
  1. +59 −48 opml.py
  2. +1 −0 test/category.opml
  3. +1 −0 test/directory.opml
  4. +1 −0 test/placesLived.opml
  5. +1 −0 test/simpleScript.opml
  6. +1 −0 test/states.opml
  7. +1 −0 test/subscriptionList.opml
View
107 opml.py
@@ -1,50 +1,52 @@
import sys, os, re, pyRXP, singleton
import sys, os, re, xml.etree.ElementTree, singleton
sqlite = singleton.sqlite
def _opml_feed_record(attrs, xml_key, html_key):
    """Build one feed record from the attribute dict of an <outline> element.

    xml_key and html_key name the attributes holding the feed URL and the
    site URL; their capitalization differs between OPML producers.
    Single quotes in every value are doubled so the strings can be embedded
    in a quoted SQL literal by callers that build statements by
    interpolation.
    """
    def sql_quote(value):
        return value.replace('\'', '\'\'')

    # Drop markup from the description, then re-escape the few characters
    # that must be stored as HTML entities.
    desc = re.sub('<[^>]*>', '', attrs.get('description', ''))
    desc = desc.replace('"', '&quot;')
    desc = desc.replace('& ', '&amp; ')
    desc = desc.replace('\xa9', '&copy;')
    return {
        'xmlUrl': sql_quote(attrs[xml_key]),
        # htmlUrl and title are optional; fall back to an empty string
        # instead of raising KeyError on sparse outlines.
        'htmlUrl': sql_quote(attrs.get(html_key, '')),
        'title': sql_quote(attrs.get('title', '')),
        'desc': sql_quote(desc),
    }


def opml_process(tree, level=0, out=None):
    """Collect feed subscriptions from a tuple-based OPML parse tree.

    tree is a node of the form (tag, attrs, children, spare), the tuple
    layout produced by pyRXP. Anything that is not a tuple (e.g. text
    between elements) is ignored. The <head> subtree is skipped entirely.

    level is the nesting depth of the current node; it is reset to 0 at
    <body> and incremented for each level of recursion.

    out is the list that feed dicts (keys: xmlUrl, htmlUrl, title, desc)
    are appended to. A fresh list is created when it is omitted.

    Returns the list the records were appended to.
    """
    if out is None:
        # A literal [] default would be shared by every call.
        out = []
    if not isinstance(tree, tuple):
        return out
    tag, attrs, children, spare = tree
    if tag == 'head':
        return out
    if tag == 'body':
        level = 0
    # Elements without attributes carry None rather than an empty dict.
    attrs = attrs or {}
    if tag == 'outline':
        if 'xmlUrl' in attrs:
            # Spec-conformant capitalization (e.g. Sharpreader)
            out.append(_opml_feed_record(attrs, 'xmlUrl', 'htmlUrl'))
        elif 'xmlurl' in attrs:
            # All-lowercase variant (e.g. FeedOnFeeds)
            out.append(_opml_feed_record(attrs, 'xmlurl', 'htmlurl'))
    for child in children or ():
        opml_process(child, level + 1, out)
    return out
def _parse_opml_tree(path):
    """Parse the XML file at path, retrying with explicit encodings.

    The first attempt lets the parser pick the encoding from the document.
    If that fails, the parse is retried forcing UTF-8 and then ISO8859-1.
    The ParseError from the last attempt is raised if all of them fail.
    """
    last_error = None
    for encoding in (None, 'UTF-8', 'ISO8859-1'):
        parser = None
        if encoding is not None:
            parser = xml.etree.ElementTree.XMLParser(encoding=encoding)
        try:
            return xml.etree.ElementTree.parse(path, parser)
        except xml.etree.ElementTree.ParseError as e:
            last_error = e
    raise last_error


def _feed_from_outline(node, xml_key, html_key):
    """Build one feed record from an <outline> element.

    xml_key and html_key name the attributes holding the feed URL and the
    site URL. Missing attributes yield empty strings.
    """
    attr = node.get
    # Strip markup from the description, then re-escape the few characters
    # that are stored as HTML entities.
    desc = re.sub('<[^>]*>', '', attr('description', ''))
    desc = desc.replace('"', '&quot;')
    desc = desc.replace('& ', '&amp; ')
    # The u prefix is required: without it Python 2 treats \u00a9 as six
    # literal characters and the copyright sign is never matched.
    desc = desc.replace(u'\u00a9', '&copy;')
    return {
        'xmlUrl': attr(xml_key, ''),
        'htmlUrl': attr(html_key, ''),
        'title': attr('title', ''),
        'desc': desc,
    }


def parse_opml(opml_file):
    """Return the feed subscriptions listed in an OPML file.

    opml_file is a path; a leading ~ is expanded. The result is a list of
    dicts with the keys xmlUrl, htmlUrl, title and desc, one per <outline>
    element that carries a feed URL. Outlines using the spec's xmlUrl
    attribute come first, followed by those using the lowercase xmlurl
    variant.

    Raises xml.etree.ElementTree.ParseError if the file cannot be parsed
    with any of the attempted encodings.
    """
    root = _parse_opml_tree(os.path.expanduser(opml_file)).getroot()
    tree = []
    # XML is case-sensitive. xmlUrl is what is officially in the OPML spec;
    # the all-lowercase form is an invalid variant, e.g. as used by
    # FeedOnFeeds.
    for xml_key, html_key in (('xmlUrl', 'htmlUrl'), ('xmlurl', 'htmlurl')):
        for node in root.findall('.//outline[@%s]' % xml_key):
            tree.append(_feed_from_outline(node, xml_key, html_key))
    return tree
def import_opml(opml_file):
@@ -58,8 +60,7 @@ def import_opml(opml_file):
try:
c.execute("""insert into fm_feeds
(feed_xml, feed_etag, feed_html, feed_title, feed_desc) values
('%(xmlUrl)s', '%(feed_etag)s', '%(htmlUrl)s', '%(title)s',
'%(desc)s')""" % feed)
(:xmlUrl, :feed_etag, :htmlUrl, :title, :desc)""", feed)
ok += 1
except sqlite.IntegrityError, e:
if 'feed_xml' not in str(e):
@@ -69,7 +70,17 @@ def import_opml(opml_file):
print ok, 'feeds imported,', dup, 'rejected as duplicates'
if __name__ == '__main__':
    # Smoke test: parse each sample file and print the feeds found in it.
    # Single-argument print(...) behaves the same on Python 2 and 3.
    for opml_path in [
        # FeedOnFeeds
        'fof.opml',
        # these tests are from http://dev.opml.org/spec2.html
        'test/subscriptionList.opml',
        'test/states.opml',
        'test/simpleScript.opml',
        'test/placesLived.opml',
        'test/directory.opml',
        'test/category.opml',
    ]:
        print(opml_path)
        print(parse_opml(opml_path))
        print('-' * 72)
View
@@ -0,0 +1 @@
<?xml version="1.0" encoding="ISO-8859-1"?><opml version="2.0"> <head> <title>Illustrating the category attribute</title> <dateCreated>Mon, 31 Oct 2005 19:23:00 GMT</dateCreated> </head> <body> <outline text="The Mets are the best team in baseball." category="/Philosophy/Baseball/Mets,/Tourism/New York" created="Mon, 31 Oct 2005 18:21:33 GMT"/> </body> </opml>
View
@@ -0,0 +1 @@
<?xml version="1.0" encoding="ISO-8859-1"?><opml version="2.0"> <head> <title>scriptingNewsDirectory.opml</title> <dateCreated>Thu, 13 Oct 2005 15:34:07 GMT</dateCreated> <dateModified>Tue, 25 Oct 2005 21:33:57 GMT</dateModified> <ownerName>Dave Winer</ownerName> <ownerEmail>dwiner@yahoo.com</ownerEmail> <expansionState></expansionState> <vertScrollState>1</vertScrollState> <windowTop>105</windowTop> <windowLeft>466</windowLeft> <windowBottom>386</windowBottom> <windowRight>964</windowRight> </head> <body> <outline text="Scripting News sites" created="Sun, 16 Oct 2005 05:56:10 GMT" type="link" url="http://hosting.opml.org/dave/mySites.opml"/> <outline text="News.Com top 100 OPML" created="Tue, 25 Oct 2005 21:33:28 GMT" type="link" url="http://news.com.com/html/ne/blogs/CNETNewsBlog100.opml"/> <outline text="BloggerCon III Blogroll" created="Mon, 24 Oct 2005 05:23:52 GMT" type="link" url="http://static.bloggercon.org/iii/blogroll.opml"/> <outline text="TechCrunch reviews" type="link" url="http://hosting.opml.org/techcrunch.opml.org/TechCrunch.opml"/> <outline text="Tod Maffin's directory of Public Radio podcasts" type="link" url="http://todmaffin.com/radio.opml"/> <outline text="Adam Curry's iPodder.org directory" type="link" url="http://homepage.mac.com/dailysourcecode/DSC/ipodderDirectory.opml"/> <outline text="Memeorandum" created="Thu, 13 Oct 2005 15:19:05 GMT" type="link" url="http://tech.memeorandum.com/index.opml"/> <outline text="DaveNet archive" created="Wed, 12 Oct 2005 01:39:56 GMT" type="link" url="http://davenet.opml.org/index.opml"/> </body> </opml>
View
@@ -0,0 +1 @@
<?xml version="1.0" encoding="ISO-8859-1"?><opml version="2.0"> <head> <title>placesLived.opml</title> <dateCreated>Mon, 27 Feb 2006 12:09:48 GMT</dateCreated> <dateModified>Mon, 27 Feb 2006 12:11:44 GMT</dateModified> <ownerName>Dave Winer</ownerName> <ownerId>http://www.opml.org/profiles/sendMail?usernum=1</ownerId> <expansionState>1, 2, 5, 10, 13, 15</expansionState> <vertScrollState>1</vertScrollState> <windowTop>242</windowTop> <windowLeft>329</windowLeft> <windowBottom>665</windowBottom> <windowRight>547</windowRight> </head> <body> <outline text="Places I've lived"> <outline text="Boston"> <outline text="Cambridge"/> <outline text="West Newton"/> </outline> <outline text="Bay Area"> <outline text="Mountain View"/> <outline text="Los Gatos"/> <outline text="Palo Alto"/> <outline text="Woodside"/> </outline> <outline text="New Orleans"> <outline text="Uptown"/> <outline text="Metairie"/> </outline> <outline text="Wisconsin"> <outline text="Madison"/> </outline> <outline text="Florida" type="include" url="http://hosting.opml.org/dave/florida.opml"/> <outline text="New York"> <outline text="Jackson Heights"/> <outline text="Flushing"/> <outline text="The Bronx"/> </outline> </outline> </body> </opml>
View
@@ -0,0 +1 @@
<?xml version="1.0" encoding="ISO-8859-1"?><opml version="2.0"> <head> <title>workspace.userlandsamples.doSomeUpstreaming</title> <dateCreated>Mon, 11 Feb 2002 22:48:02 GMT</dateCreated> <dateModified>Sun, 30 Oct 2005 03:30:17 GMT</dateModified> <ownerName>Dave Winer</ownerName> <ownerEmail>dwiner@yahoo.com</ownerEmail> <expansionState>1, 2, 4</expansionState> <vertScrollState>1</vertScrollState> <windowTop>74</windowTop> <windowLeft>41</windowLeft> <windowBottom>314</windowBottom> <windowRight>475</windowRight> </head> <body> <outline text="Changes" isComment="true"> <outline text="1/3/02; 4:54:25 PM by DW"> <outline text="Change &quot;playlist&quot; to &quot;radio&quot;."/> </outline> <outline text="2/12/01; 1:49:33 PM by DW" isComment="true"> <outline text="Test upstreaming by sprinkling a few files in a nice new test folder."/> </outline> </outline> <outline text="on writetestfile (f, size)"> <outline text="file.surefilepath (f)" isBreakpoint="true"/> <outline text="file.writewholefile (f, string.filledstring (&quot;x&quot;, size))"/> </outline> <outline text="local (folder = user.radio.prefs.wwwfolder + &quot;test\\largefiles\\&quot;)"/> <outline text="for ch = 'a' to 'z'"> <outline text="writetestfile (folder + ch + &quot;.html&quot;, random (1000, 16000))"/> </outline> </body> </opml>
View
@@ -0,0 +1 @@
<?xml version="1.0" encoding="ISO-8859-1"?><opml version="2.0"> <head> <title>states.opml</title> <dateCreated>Tue, 15 Mar 2005 16:35:45 GMT</dateCreated> <dateModified>Thu, 14 Jul 2005 23:41:05 GMT</dateModified> <ownerName>Dave Winer</ownerName> <ownerEmail>dave@scripting.com</ownerEmail> <expansionState>1, 6, 13, 16, 18, 20</expansionState> <vertScrollState>1</vertScrollState> <windowTop>106</windowTop> <windowLeft>106</windowLeft> <windowBottom>558</windowBottom> <windowRight>479</windowRight> </head> <body> <outline text="United States"> <outline text="Far West"> <outline text="Alaska"/> <outline text="California"/> <outline text="Hawaii"/> <outline text="Nevada"> <outline text="Reno" created="Tue, 12 Jul 2005 23:56:35 GMT"/> <outline text="Las Vegas" created="Tue, 12 Jul 2005 23:56:37 GMT"/> <outline text="Ely" created="Tue, 12 Jul 2005 23:56:39 GMT"/> <outline text="Gerlach" created="Tue, 12 Jul 2005 23:56:47 GMT"/> </outline> <outline text="Oregon"/> <outline text="Washington"/> </outline> <outline text="Great Plains"> <outline text="Kansas"/> <outline text="Nebraska"/> <outline text="North Dakota"/> <outline text="Oklahoma"/> <outline text="South Dakota"/> </outline> <outline text="Mid-Atlantic"> <outline text="Delaware"/> <outline text="Maryland"/> <outline text="New Jersey"/> <outline text="New York"/> <outline text="Pennsylvania"/> </outline> <outline text="Midwest"> <outline text="Illinois"/> <outline text="Indiana"/> <outline text="Iowa"/> <outline text="Kentucky"/> <outline text="Michigan"/> <outline text="Minnesota"/> <outline text="Missouri"/> <outline text="Ohio"/> <outline text="West Virginia"/> <outline text="Wisconsin"/> </outline> <outline text="Mountains"> <outline text="Colorado"/> <outline text="Idaho"/> <outline text="Montana"/> <outline text="Utah"/> <outline text="Wyoming"/> </outline> <outline text="New England"> <outline text="Connecticut"/> <outline text="Maine"/> <outline text="Massachusetts"/> <outline text="New Hampshire"/> 
<outline text="Rhode Island"/> <outline text="Vermont"/> </outline> <outline text="South"> <outline text="Alabama"/> <outline text="Arkansas"/> <outline text="Florida"/> <outline text="Georgia"/> <outline text="Louisiana"/> <outline text="Mississippi"/> <outline text="North Carolina"/> <outline text="South Carolina"/> <outline text="Tennessee"/> <outline text="Virginia"/> </outline> <outline text="Southwest"> <outline text="Arizona"/> <outline text="New Mexico"/> <outline text="Texas"/> </outline> </outline> </body> </opml>
@@ -0,0 +1 @@
<?xml version="1.0" encoding="ISO-8859-1"?><opml version="2.0"> <head> <title>mySubscriptions.opml</title> <dateCreated>Sat, 18 Jun 2005 12:11:52 GMT</dateCreated> <dateModified>Tue, 02 Aug 2005 21:42:48 GMT</dateModified> <ownerName>Dave Winer</ownerName> <ownerEmail>dave@scripting.com</ownerEmail> <expansionState></expansionState> <vertScrollState>1</vertScrollState> <windowTop>61</windowTop> <windowLeft>304</windowLeft> <windowBottom>562</windowBottom> <windowRight>842</windowRight> </head> <body> <outline text="CNET News.com" description="Tech news and business reports by CNET News.com. Focused on information technology, core topics include computers, hardware, software, networking, and Internet media." htmlUrl="http://news.com.com/" language="unknown" title="CNET News.com" type="rss" version="RSS2" xmlUrl="http://news.com.com/2547-1_3-0-5.xml"/> <outline text="washingtonpost.com - Politics" description="Politics" htmlUrl="http://www.washingtonpost.com/wp-dyn/politics?nav=rss_politics" language="unknown" title="washingtonpost.com - Politics" type="rss" version="RSS2" xmlUrl="http://www.washingtonpost.com/wp-srv/politics/rssheadlines.xml"/> <outline text="Scobleizer: Microsoft Geek Blogger" description="Robert Scoble's look at geek and Microsoft life." htmlUrl="http://radio.weblogs.com/0001011/" language="unknown" title="Scobleizer: Microsoft Geek Blogger" type="rss" version="RSS2" xmlUrl="http://radio.weblogs.com/0001011/rss.xml"/> <outline text="Yahoo! News: Technology" description="Technology" htmlUrl="http://news.yahoo.com/news?tmpl=index&amp;cid=738" language="unknown" title="Yahoo! 
News: Technology" type="rss" version="RSS2" xmlUrl="http://rss.news.yahoo.com/rss/tech"/> <outline text="Workbench" description="Programming and publishing news and comment" htmlUrl="http://www.cadenhead.org/workbench/" language="unknown" title="Workbench" type="rss" version="RSS2" xmlUrl="http://www.cadenhead.org/workbench/rss.xml"/> <outline text="Christian Science Monitor | Top Stories" description="Read the front page stories of csmonitor.com." htmlUrl="http://csmonitor.com" language="unknown" title="Christian Science Monitor | Top Stories" type="rss" version="RSS" xmlUrl="http://www.csmonitor.com/rss/top.rss"/> <outline text="Dictionary.com Word of the Day" description="A new word is presented every day with its definition and example sentences from actual published works." htmlUrl="http://dictionary.reference.com/wordoftheday/" language="unknown" title="Dictionary.com Word of the Day" type="rss" version="RSS" xmlUrl="http://www.dictionary.com/wordoftheday/wotd.rss"/> <outline text="The Motley Fool" description="To Educate, Amuse, and Enrich" htmlUrl="http://www.fool.com" language="unknown" title="The Motley Fool" type="rss" version="RSS" xmlUrl="http://www.fool.com/xml/foolnews_rss091.xml"/> <outline text="InfoWorld: Top News" description="The latest on Top News from InfoWorld" htmlUrl="http://www.infoworld.com/news/index.html" language="unknown" title="InfoWorld: Top News" type="rss" version="RSS2" xmlUrl="http://www.infoworld.com/rss/news.xml"/> <outline text="NYT &gt; Business" description="Find breaking news &amp; business news on Wall Street, media &amp; advertising, international business, banking, interest rates, the stock market, currencies &amp; funds." 
htmlUrl="http://www.nytimes.com/pages/business/index.html?partner=rssnyt" language="unknown" title="NYT &gt; Business" type="rss" version="RSS2" xmlUrl="http://www.nytimes.com/services/xml/rss/nyt/Business.xml"/> <outline text="NYT &gt; Technology" description="" htmlUrl="http://www.nytimes.com/pages/technology/index.html?partner=rssnyt" language="unknown" title="NYT &gt; Technology" type="rss" version="RSS2" xmlUrl="http://www.nytimes.com/services/xml/rss/nyt/Technology.xml"/> <outline text="Scripting News" description="It's even worse than it appears." htmlUrl="http://www.scripting.com/" language="unknown" title="Scripting News" type="rss" version="RSS2" xmlUrl="http://www.scripting.com/rss.xml"/> <outline text="Wired News" description="Technology, and the way we do business, is changing the world we know. Wired News is a technology - and business-oriented news service feeding an intelligent, discerning audience. What role does technology play in the day-to-day living of your life? Wired News tells you. How has evolving technology changed the face of the international business world? Wired News puts you in the picture." htmlUrl="http://www.wired.com/" language="unknown" title="Wired News" type="rss" version="RSS" xmlUrl="http://www.wired.com/news_drop/netcenter/netcenter.rdf"/> </body> </opml>

0 comments on commit be45a93

Please sign in to comment.