Permalink
Browse files

first commit

  • Loading branch information...
davorg committed Oct 27, 2011
0 parents commit e283d9917be3fb688ebc934c2ae287bce9c60981
Showing with 106 additions and 0 deletions.
  1. +1 −0 README
  2. +105 −0 mailcc
1 README
@@ -0,0 +1 @@
+An attempt to build a web feed for the stories in the Daily Mail (and Mail on Sunday) Corrections and Clarifications column.
105 mailcc
@@ -0,0 +1,105 @@
+#!/usr/bin/env perl
+
+use strict;
+use warnings;
+
+use Data::Dumper;
+
+use URI;
+use Web::Scraper;
+use XML::Feed;
+use DateTime;
+use DateTime::Format::Strptime;
+use Encode;
+
+# binmode STDOUT, ':utf8';
+
+my $search_url = 'http://www.dailymail.co.uk/home/search.html' .
+ '?sel=site&searchPhrase=corrections+and+clarifications' .
+ '&orderBy=dateDesc';
+
+my $search = scraper {
+ process 'div.sch-result', 'articles[]' => scraper {
+ process 'h3.sch-res-title', title => 'TEXT';
+ process 'h4.sch-res-info', date => 'TEXT';
+ process 'a', link => '@href';
+ };
+};
+
+my $entries = scraper {
+ process '//p/font | //div[@class="pageBreak"]', 'paras[]' => 'HTML';
+ result 'paras';
+};
+
+my $dt_parser = DateTime::Format::Strptime->new(
+ pattern => '%d/%m/%Y %H:%M:%S',
+ time_zone => 'Europe/London',
+);
+
+my $res = $search->scrape( URI->new($search_url) );
+
+my $feed = XML::Feed->new('Atom');
+$feed->title('Mail Corrections and Clarifications');
+$feed->modified(DateTime->now);
+$feed->id('http://fellowtravellers.org.uk/mailcc/');
+# $feed->id($feed->self_link);
+
+foreach my $a (@{$res->{articles}}) {
+ next unless $a->{title} =~ /^Corrections/;
+
+ my $i = 1;
+
+ my $dt = $dt_parser->parse_datetime($a->{date});
+
+ my @paras;
+
+ foreach (@{ $entries->scrape($a->{link}) }) {
+ # warn "$_\n";
+ next if /corrections\@dailymail\.co\.uk/;
+
+ if (length == 1) {
+ my $link = $a->{link} . '#corr-' . $i++;
+ $feed->add_entry(new_entry($link, $dt, @paras));
+ @paras = ();
+ } else {
+ # Kludge
+ s/\x{2018}/'/g;
+ s/\x{2019}/'/g;
+ # Encode::from_to($_, 'iso-8859-1', 'utf8');
+ push @paras, decode('iso-8859-1', $_);
+ }
+# print ord, "\n" if length == 1;
+# print "$_\n";
+ }
+
+ if (@paras) {
+ my $link = $a->{link} . '#corr-' . $i++;
+ $feed->add_entry(new_entry($link, $dt, @paras));
+ @paras = ();
+ }
+
+# print $dt->strftime('%z') . "\n";
+}
+
+print $feed->as_xml;
+
+sub new_entry {
+ my ($link, $dt, @paras) = @_;
+
+ my $e = XML::Feed::Entry->new;
+ my $title;
+ if (index($paras[0], '.') > -1) {
+ $title = substr($paras[0], 0, index($paras[0], '.'));
+ } else {
+ $title = $paras[0];
+ }
+ $e->title($title);
+ $e->link($link);
+ $e->content(join "\n", @paras);
+ $e->author('Mail Corrections');
+ $e->issued($dt);
+ $e->modified($dt);
+ $e->id($link);
+
+ return $e;
+}

0 comments on commit e283d99

Please sign in to comment.