Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with
or
.
Download ZIP
Tree: 6d1cfce994
Fetching contributors…

Cannot retrieve contributors at this time

executable file 80 lines (68 sloc) 3.529 kB
#!/usr/bin/perl
#
# generate_stats.pl
#
# generate some unchanging stats (blog rankings, journal paper shares etc.)
# there's no point in working these out dynamically as they only change when there's been a pipeline update
#
use lib (".");
use strict;
use DBI;
use config qw(%config log log_error urldecode $DEBUG parse_post_xml url_breakdown);
use helper qw(download_url);
use Digest::MD5 qw(md5_hex);
my $connection_string = sprintf("dbi:mysql:%s:%s", $config{"db_name"}, $config{"db_host"});
my $db = DBI->connect($connection_string, $config{"db_user"}, $config{"db_password"}) or log_error("Couldn't connect to the database.\n");
# let's only summarize links from the past x days.
my $age_limit = 30;
# we have two "summary" tables, for links and posts
my $sql = $db->prepare("DELETE FROM links_summary");
$sql->execute();
# repopulate the tables with data
my $sql = $db->prepare("SET group_concat_max_len = 4096");
$sql->execute();
my $sql = $db->prepare("
INSERT INTO links_summary
(categories, url_hash, post_titles, linked_by, url, domain, titles, page_title, last_linked_on)
SELECT
GROUP_CONCAT(DISTINCT t2.tag) AS categories,
links.url_hash, GROUP_CONCAT(DISTINCT CONCAT(p2.title,'===',p2.url,'===',p2.post_id,'===',blogs.title,'===',blogs.url,'===',blogs.blog_id) SEPARATOR '|||') AS post_titles, COUNT(DISTINCT links.blog_id) AS linked_by, links.url, domain, GROUP_CONCAT(DISTINCT TRIM(links.title) SEPARATOR ', ') AS title, links.page_title AS page_title, MAX(p2.pubdate) AS last_linked_on FROM links LEFT JOIN posts ON posts.url_hash = links.url_hash, posts AS p2, blogs, tags AS t2 WHERE blogs.blog_id = p2.blog_id AND p2.post_id = links.post_id AND ISNULL(posts.post_id) AND links.type = 'link' AND ISNULL(links.paper_id) AND length(trim(links.title)) >= 3 AND t2.blog_id = p2.blog_id AND DATEDIFF(CURRENT_TIMESTAMP(), p2.pubdate) <= ? GROUP BY links.url_hash
");
$sql->execute($age_limit);
print "Done repopulation links...\n";
my $sql = $db->prepare("DELETE FROM posts_summary");
$sql->execute();
my $sql = $db->prepare("
INSERT INTO posts_summary
(post_id, blog_id, title, url, url_hash, summary, filename, author, pubdate, added_on, blog_name, blog_image, linked_by)
SELECT posts.post_id, posts.blog_id, posts.title, posts.url, posts.url_hash, posts.summary, posts.filename, posts.author, posts.pubdate, posts.added_on, blogs.title AS blog_name, blogs.image AS blog_image, COUNT(DISTINCT IF(ISNULL(links.blog_id),NULL,IF(links.blog_id = posts.blog_id,NULL,links.blog_id))) AS linked_by FROM blogs, posts LEFT JOIN links ON links.url_hash = posts.url_hash WHERE blogs.blog_id = posts.blog_id GROUP BY posts.url_hash
");
$sql->execute();
print "Done repopulation posts...\n";
my $sql = $db->prepare("DELETE FROM papers_summary");
$sql->execute();
my $sql = $db->prepare("
INSERT INTO papers_summary
(paper_id, isbn_id, image, doi_id, pubmed_id, arxiv_id, pii_id, journal, title, abstract, authors, pubdate, added_on, url, cited_by, reviewed, blog_ids)
SELECT
papers.paper_id,
papers.isbn_id,
papers.image,
papers.doi_id,
papers.pubmed_id,
papers.arxiv_id,
papers.pii_id,
papers.journal,
papers.title,
papers.abstract,
papers.authors,
papers.pubdate,
papers.added_on,
links.url AS url,
COUNT(DISTINCT links.blog_id) AS cited_by,
SUM(IF(links.type = 'review', 1, 0)) AS reviewed,
GROUP_CONCAT(DISTINCT links.blog_id SEPARATOR ',') AS blog_ids
FROM papers LEFT JOIN links ON links.paper_id = papers.paper_id GROUP BY papers.paper_id;
");
$sql->execute();
print "Done repopulation papers...\n";
Jump to Line
Something went wrong with that request. Please try again.