From 605352b082439153e31841044e65274faaea97b6 Mon Sep 17 00:00:00 2001 From: Sebastian Gehaxelt Date: Fri, 18 Nov 2016 14:52:13 +0100 Subject: [PATCH 1/2] Fixes #28 The feed update code was moved into a separate class and can be executed separately from the bot. A new boolean configuration option `update_before_connecting` was added. If set to true, the bot will fetch all updates once before connecting to the IRC channel. --- bot.py | 69 +++++++----------------------- config.py.sample | 1 + feedupdater.py | 107 +++++++++++++++++++++++++++++++++++++++++++++++ main.py | 2 + 4 files changed, 124 insertions(+), 55 deletions(-) create mode 100644 feedupdater.py diff --git a/bot.py b/bot.py index f98e71b..38e5f9b 100644 --- a/bot.py +++ b/bot.py @@ -15,6 +15,7 @@ from colour import Colours from db import FeedDB from config import Config +from feedupdater import FeedUpdater class IRCBot(irc.bot.SingleServerIRCBot): def __init__(self, config, db, on_connect_cb): @@ -152,69 +153,27 @@ class Bot(object): def __init__(self): self.__config = Config() self.__db = FeedDB(self.__config) + self.__feedupdater = FeedUpdater(self.__config, self.__db) self.__irc = IRCBot(self.__config, self.__db, self.on_started) - self.__threads = [] self.__connected = False def start(self): """Starts the IRC bot""" threading.Thread(target=self.__irc.start).start() + def initial_feed_update(self): + def print_feed_update(feed_title, news_title, news_url, news_date): + print("[+]: {}||{}||{}||{}".format(feed_title, news_title, news_url, news_date)) + + if self.__config.update_before_connecting: + print "Started pre-connection updates!" + self.__feedupdater.update_feeds(print_feed_update, False) + print "DONE!" + def on_started(self): """Gets executed after the IRC thread has successfully established a connection.""" if not self.__connected: print "Connected!" 
- - # Start one fetcher thread per feed - for feed in self.__db.get_feeds(): - t = threading.Thread(target=self.__fetch_feed, args=(feed,)) - t.start() - self.__threads.append(t) - print "Started fetcher threads!" - self.__connected = True - - def __fetch_feed(self, feed_info): - """Fetches a RSS feed, parses it and updates the database and/or announces new news.""" - while 1: - try: - # Parse a feed's url - news = feedparser.parse( feed_info[2] ) - - # Reverse the ordering. Oldest first. - for newsitem in news.entries[::-1]: - newstitle = newsitem.title - if self.__config.shorturls: - newsurl = tinyurl.create_one(newsitem.link) # Create a short link - if newsurl == "Error": #If that fails, use the long version - newsurl = newsitem.link - else: - newsurl = newsitem.link - - # Try to get the published or updated date. Otherwise set it to 'no date' - try: - # Get date and parse it - newsdate = dateutil.parser.parse(newsitem.published) - # Format date based on 'dateformat' in config.py - newsdate = newsdate.strftime(self.__config.dateformat) - - except Exception as e: - try: - # Get date and parse it - newsdate = dateutil.parser.parse(newsitem.updated) - # Format date based on 'dateformat' in config.py - newsdate = newsdate.strftime(self.__config.dateformat) - - except Exception as e: - newsdate = "no date" - - # Update the database. If it's a new issue, post it to the channel - is_new = self.__db.insert_news(feed_info[0], newstitle, newsitem.link, newsdate) - if is_new: - self.__irc.post_news(feed_info[1], newstitle, newsurl, newsdate) - print "Updated: " + feed_info[1] - except Exception as e: - print e - print "Failed: " + feed_info[1] - - # sleep frequency minutes - time.sleep(int(feed_info[3])*60) + self.__feedupdater.update_feeds(self.__irc.post_news, True) + print "Started feed updates!" 
+ self.__connected = True \ No newline at end of file diff --git a/config.py.sample b/config.py.sample index fb023b1..960e378 100644 --- a/config.py.sample +++ b/config.py.sample @@ -21,3 +21,4 @@ class Config(object): self.shorturls = False self.dateformat = '%Y-%m-%d %H:%M:%S %z' self.feedlimit = 10 + self.update_before_connecting = True #Update all feeds before connecting to the IRC server diff --git a/feedupdater.py b/feedupdater.py new file mode 100644 index 0000000..d319e9a --- /dev/null +++ b/feedupdater.py @@ -0,0 +1,107 @@ +#!/usr/bin/python2.7 + +import feedparser +import datetime +import dateutil.parser +import signal +import time +import tinyurl +import threading +import os +from db import FeedDB +from config import Config + +class FeedUpdater(object): + + def __init__(self, config, db): + self.__config = config + self.__db = db + self.__threads = [] + + def update_feeds(self, callback=None, forever=False): + for feed in self.__db.get_feeds(): + t = threading.Thread(target=self.__fetch_feed, + args=({ + 'id': feed[0], + 'title': feed[1], + 'url': feed[2], + 'published': feed[3] + }, + callback, + forever, + ) + ) + t.start() + self.__threads.append(t) + + if not forever: + for thread in self.__threads: + thread.join() + self.__threads.remove(thread) + + def __fetch_feed(self, feed_info, callback, forever): + """Fetches a RSS feed, parses it and updates the database and/or announces new news.""" + while 1: + try: + # Parse a feed's url + news = feedparser.parse( feed_info['url'] ) + + # Reverse the ordering. Oldest first. + for newsitem in news.entries[::-1]: + newstitle = newsitem.title + if self.__config.shorturls: + newsurl = tinyurl.create_one(newsitem.link) # Create a short link + if newsurl == "Error": #If that fails, use the long version + newsurl = newsitem.link + else: + newsurl = newsitem.link + + # Try to get the published or updated date. 
Otherwise set it to 'no date' + try: + # Get date and parse it + newsdate = dateutil.parser.parse(newsitem.published) + # Format date based on 'dateformat' in config.py + newsdate = newsdate.strftime(self.__config.dateformat) + + except Exception as e: + try: + # Get date and parse it + newsdate = dateutil.parser.parse(newsitem.updated) + # Format date based on 'dateformat' in config.py + newsdate = newsdate.strftime(self.__config.dateformat) + + except Exception as e: + newsdate = "no date" + + # Update the database. If it's a new issue, post it to the channel + is_new = self.__db.insert_news(feed_info['id'], newstitle, newsitem.link, newsdate) + if is_new and callback is not None: + callback(feed_info['title'], newstitle, newsurl, newsdate) + print "Updated: " + feed_info['title'] + except Exception as e: + print e + print "Failed: " + feed_info['title'] + + if not forever: + break + + # sleep frequency minutes + time.sleep(int(feed_info['published'])*60) + +if __name__ == "__main__": + def print_line(feed_title, news_title, news_url, news_date): + print("[+]: {}||{}||{}||{}".format(feed_title, news_title, news_url, news_date)) + + def main(): + config = Config() + db = FeedDB(config) + updater = FeedUpdater(config, db) + + updater.update_feeds(print_line, False) + + def signal_handler(signal, frame): + print "Caught SIGINT, terminating." 
+ os._exit(0) + + signal.signal(signal.SIGINT, signal_handler) + main() \ No newline at end of file diff --git a/main.py b/main.py index 0ed7325..294fb23 100644 --- a/main.py +++ b/main.py @@ -1,6 +1,7 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- from bot import Bot +from feedupdater import FeedUpdater import os import signal @@ -11,6 +12,7 @@ def signal_handler(signal, frame): if __name__ == "__main__": bot = Bot() bot._Bot__irc.connection.buffer_class.errors = 'replace' # prevent utf-8 error in jaraco.stream + bot.initial_feed_update() bot.start() signal.signal(signal.SIGINT, signal_handler) while True: From 926d68e2152adca2b97686a17c757ca0b74dc0ff Mon Sep 17 00:00:00 2001 From: Sebastian Gehaxelt Date: Fri, 18 Nov 2016 15:01:26 +0100 Subject: [PATCH 2/2] Updated the README: - typos - `update_before_connecting` option explained --- README.md | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 8be4fa9..7bf7658 100644 --- a/README.md +++ b/README.md @@ -58,7 +58,13 @@ cp config.py.sample config.py cp feeds.sql.sample feeds.sql ``` -Edit `configs.py` to fit your needs and IRC settings. All feeds from `feeds.sql` will be imported one the first start. +Edit `config.py` to fit your needs and IRC settings. All feeds from `feeds.sql` will be imported on the first start. + +You might want to update all feeds before connecting to the IRC server to prevent spamming the channel (and possibly getting banned from your IRC server). Either set `update_before_connecting = True` in `config.py` or run the update script before starting the bot: + +``` +python2 feedupdater.py +``` To start the bot, run: