Permalink
Browse files

fix manifesto parsing

  • Loading branch information...
1 parent a0ed2bf commit b280111896d90c87d5eb84473f65720879e5516b @filippog committed May 30, 2009
Showing with 32 additions and 37 deletions.
  1. +32 −37 manifesto.py
View
@@ -8,59 +8,54 @@
from datetime import date
-class ManifestoPrima(ScrapeNFeed.ScrapedFeed):
+class ManifestoPrima(ScrapeNFeed.ScrapedFeed):
+    """Scraped feed that publishes the front page image of il Manifesto."""
+
     def HTML2RSS(self, headers, body):
-        import copy
-        #massage = copy.copy(BeautifulSoup.BeautifulStoneSoup.MARKUP_MASSAGE)
-        #script_massage = [(re.compile("sc'\+'ript"), lambda m: "script")]
-        #massage.extend(script_massage)
-        #soup = BeautifulSoup.BeautifulSoup(body, markupMassage=massage)
-
+        """Push an RSS item for the front page linked from the fetched page.
+
+        Looks for an anchor whose href ends in ``pagina/IMMAGINE/``, takes the
+        digits after ``numero/`` in that href and builds the image URL from
+        them. Returns without pushing anything when the anchor is missing or
+        its href carries no such number.
+        """
         soup = BeautifulSoup.BeautifulSoup(body)
-        #rellink = soup('img', src=re.compile("fileadmin/archivi/in_edicola/\d+prima.gif"))[0]['src']
-        #link = "http://www.ilmanifesto.it/" + re.sub('prima.gif', 'primapagina.gif', rellink)
+        # find() returns None when nothing matches, so check the tag itself
+        # before subscripting it.
+        anchor = soup.find('a', attrs = {'href': re.compile(r'pagina/IMMAGINE/$')})
+        if anchor is None:
+            return
-        rellink = soup('img', src=re.compile("fileadmin/archivi/in_edicola/\d+primapagina.gif"))[0]['src']
-        link = "http://www.ilmanifesto.it/" + rellink
+        # re.search() also returns None on failure; bail out instead of
+        # calling .group() on it.
+        issue = re.search(r"numero/(\d+)/pagina/", anchor['href'])
+        if issue is None:
+            return
+        today = issue.group(1)
+        link = "http://www.ilmanifesto.it/fileadmin/archivi/in_edicola/%sprimapagina.gif" % today
         if not self.hasSeen(link):
-            i = RSSItem(title=date.today().strftime("il Manifesto prima pagina - %d/%m/%Y"),
+            i = RSSItem(title = "il Manifesto prima pagina - %s" % today,
                         description = '<img src="%s"/>' % link,
-                        #description = "&lt;img src=&quot;%s&quot;/&gt;" % link,
                         link=link)
             self.pushRSSItem(i)
+# load() is given the in-edicola index URL instead of a URL built from
+# today's date; HTML2RSS takes the issue number from the hrefs on that page.
ManifestoPrima.load("il Manifesto",
- "http://www.ilmanifesto.it/il-manifesto/in-edicola/numero/%s/pagina/IMMAGINE/" % date.today().strftime("%Y%m%d"),
+ "http://www.ilmanifesto.it/il-manifesto/in-edicola/",
"il Manifesto - prima pagina",
"manifesto_prima.xml",
"state/manifesto_prima.pickle")
-#class ManifestoVignetta(ScrapeNFeed.ScrapedFeed):
-# def HTML2RSS(self, headers, body):
-# import copy
-#
-# massage = copy.copy(BeautifulSoup.BeautifulStoneSoup.MARKUP_MASSAGE)
-# script_massage = [(re.compile("sc'\+'ript"), lambda m: "script")]
-# massage.extend(script_massage)
-#
-# soup = BeautifulSoup.BeautifulSoup(body, markupMassage=massage)
-#
-# rellink = soup('img', src=re.compile("fileadmin/archivi/in_edicola/\d+prima.gif"))[0]['src']
-# link = "http://www.ilmanifesto.it/" + re.sub('prima.gif', 'primapagina.gif', rellink)
-#
-# if not self.hasSeen(link):
-# i = RSSItem(title=date.today().strftime("il Manifesto prima pagina - %d/%m/%Y"),
-# description = "&lt;img src=&quot;%s&quot;/&gt;" % link,
-# link=link)
-# self.pushRSSItem(i)
-#
-#ManifestoVignetta.load("il Manifesto",
-# "http://www.ilmanifesto.it/il-manifesto/in-edicola/numero/%s/pagina/IMMAGINE/" % date.today().strftime("%Y%m%d"),
-# "il Manifesto - vignetta",
-# "manifesto_vignetta.xml",
-# "manifesto_vignetta.pickle")
+class ManifestoVignetta(ScrapeNFeed.ScrapedFeed):
+    """Scraped feed that publishes the cartoon (vignetta) image of il Manifesto."""
+
+    def HTML2RSS(self, headers, body):
+        """Push an RSS item for the cartoon linked from the fetched page.
+
+        Looks for an anchor whose href ends in ``pagina/VIGNETTA/``, takes the
+        digits after ``numero/`` in that href and builds the image URL from
+        them. Returns without pushing anything when the anchor is missing or
+        its href carries no such number.
+        """
+        soup = BeautifulSoup.BeautifulSoup(body)
+
+        # find() returns None when nothing matches, so check the tag itself
+        # before subscripting it.
+        anchor = soup.find('a', attrs = {'href': re.compile(r'pagina/VIGNETTA/$')})
+        if anchor is None:
+            return
+
+        # re.search() also returns None on failure; bail out instead of
+        # calling .group() on it.
+        issue = re.search(r"numero/(\d+)/pagina/", anchor['href'])
+        if issue is None:
+            return
+
+        today = issue.group(1)
+        link = "http://www.ilmanifesto.it/fileadmin/archivi/in_edicola/%svignetta.gif" % today
+
+        if not self.hasSeen(link):
+            i = RSSItem(title = "il Manifesto vignetta - %s" % today,
+                        description = '<img src="%s"/>' % link,
+                        link=link)
+            self.pushRSSItem(i)
+
+# Uses the same in-edicola index URL as ManifestoPrima.load; the last two
+# arguments name this feed's own XML and pickle files (presumably the output
+# feed and the saved state -- confirm against ScrapeNFeed).
+ManifestoVignetta.load("il Manifesto",
+ "http://www.ilmanifesto.it/il-manifesto/in-edicola/",
+ "il Manifesto - vignetta",
+ "manifesto_vignetta.xml",
+ "state/manifesto_vignetta.pickle")
# vim:et

0 comments on commit b280111

Please sign in to comment.