add contrib/exportOPMLWithTags.py

dennisschagt · Oct 24, 2020 · d38b718 · d38b718
1 parent fba9376
commit d38b718
Showing 1 changed file with 104 additions and 0 deletions.
diff --git a/contrib/exportOPMLWithTags.py b/contrib/exportOPMLWithTags.py
@@ -0,0 +1,104 @@
+#!/usr/bin/env python3
+
+# This script exports the urls file to OPML, including tags. For that to work, every feed must have exactly one tag.
+
+#usage: ./exportOPMLWithTags.py urls > urls.opml
+
+#requirements (just to get the title from an rss feed if it isn't cached in newsboat):
+# pip install feedparser
+
+#input-output example:
+#
+# $ cat urls
+# https://xkcd.com/rss.xml "must"
+# http://www.commitstrip.com/en/feed/? "must"
+# https://jartigag.xyz/feed.xml "personal"
+#
+# $ cat urls.opml
+# <?xml version="1.0" ?>
+# <opml version="2.0">
+#   <head/>
+#   <body>
+#     <outline title="must" type="rss">
+#       <outline title="xkcd.com" type="rss" xmlUrl="https://xkcd.com/rss.xml"/>
+#       <outline title="CommitStrip" type="rss" xmlUrl="http://www.commitstrip.com/en/feed/?"/>
+#     </outline>
+#     <outline title="personal" type="rss">
+#       <outline title="jartigag" type="rss" xmlUrl="https://jartigag.xyz/feed.xml"/>
+#     </outline>
+#   </body>
+# </opml>
+
+from xml.etree import ElementTree as ET
+from xml.dom import minidom
+import sys, os
+import csv
+import sqlite3
+
+try:
+    # validate the command line: one existing urls file is required
+    if len(sys.argv)<2: raise Exception("input file needed")
+    inputfilename = sys.argv[1]
+    if not os.path.isfile(inputfilename): raise Exception(f"{inputfilename} not found")
+
+    # each row is one urls line split on spaces, e.g. [url, tag]
+    with open(inputfilename) as f:
+        lines = list(csv.reader(f,delimiter=" "))
+
+    root = ET.Element('opml', version='2.0')
+    ET.SubElement(root, 'head')
+    body = ET.SubElement(root, 'body')
+
+    # feed url -> title cached in ~/.newsboat/cache.db; stays empty if unreadable
+    cached_titles = {}
+    try:
+        conn = sqlite3.connect(os.path.expanduser("~/.newsboat/cache.db"))
+        try:
+            for rssurl, title in conn.execute("select rssurl,title from rss_feed"):
+                cached_titles[rssurl] = title
+        finally:
+            conn.close()
+    except sqlite3.Error:
+        # best effort: without a usable cache every title is fetched from its feed
+        pass
+
+    # tag name -> its <outline> group under <body>, created on first use
+    tag_outlines = {}
+
+    for line in lines:
+        if len(line)<2:
+            # lines must be `url "tag"`, so ignore this line
+            print(f"ignoring this line:\n{' '.join(line)}", file=sys.stderr)
+            continue
+
+        url, tag_name = line[0], line[1]
+        tag = tag_outlines.get(tag_name)
+        if tag is None:
+            # that is, this tag doesn't exist yet
+            tag = ET.SubElement(body, 'outline', type='rss', title=tag_name)
+            tag_outlines[tag_name] = tag
+
+        feed = ET.SubElement(tag, 'outline', type='rss', xmlUrl=url)
+        if url in cached_titles:
+            feed.set('title', cached_titles[url])
+            continue
+
+        # that is, this feed's title isn't in ~/.newsboat/cache.db
+        try:
+            import feedparser
+            print(f"getting title from {url}", file=sys.stderr)
+            feed.set('title', feedparser.parse(url)['feed']['title'])
+        except (ModuleNotFoundError, KeyError):
+            # can't get the title from cache.db nor from the feed itself,
+            # so leave the title blank
+            feed.set('title', "")
+
+    print(
+        minidom.parseString( ET.tostring(root) )
+        .toprettyxml(indent="  ")
+    )
+
+except Exception as e:
+    # errors go to stderr with a non-zero exit so they never end up in urls.opml
+    print(e, file=sys.stderr)
+    sys.exit(1)