Permalink
Browse files

Tweaks to Cb in the past 3-4 years I never commited, and will not spl…

…it up now
  • Loading branch information...
1 parent 1b52c74 commit 6d1cfce994ebf37e62b587405d1b1119374dad3a @egonw committed Aug 17, 2011
Showing with 195 additions and 6 deletions.
  1. +1 −0 get_inchis.pl
  2. +2 −2 get_links.pl
  3. +5 −1 handle_acs.pl
  4. +1 −1 parse_feed.py
  5. +178 −0 parse_wp_links_offline.pl
  6. +3 −1 pipeline.pl
  7. +5 −1 update_posts.pl
View
@@ -18,6 +18,7 @@
my %posts;
my %blogs;
my $sql = $db->prepare("SELECT post_id, blog_id, filename FROM posts WHERE active=1");
+#my $sql = $db->prepare("SELECT post_id, blog_id, filename FROM posts");
#my $sql = $db->prepare("SELECT post_id, blog_id, filename FROM posts WHERE blog_id = 52");
$sql->execute();
while (my $row = $sql->fetchrow_hashref()) {
View
@@ -17,8 +17,8 @@
# get active posts
my %posts;
my %blogs;
-my $sql = $db->prepare("SELECT post_id, blog_id, filename FROM posts WHERE active=1");
-#my $sql = $db->prepare("SELECT post_id, blog_id, filename FROM posts");
+#my $sql = $db->prepare("SELECT post_id, blog_id, filename FROM posts WHERE active=1");
+my $sql = $db->prepare("SELECT post_id, blog_id, filename FROM posts");
$sql->execute();
while (my $row = $sql->fetchrow_hashref()) {
$posts{$row->{"post_id"}} = $row->{"filename"};
View
@@ -49,7 +49,7 @@
while (my $row = $sql->fetchrow_hashref()) {
my $doi = $row->{"doi_id"};
- my $url = "http://pubs3.acs.org/acs/journals/doilookup?in_doi=".$doi;
+ my $url = "http://pubs.acs.org/doi/abs/".$doi;
print STDERR "URL: $url\n";
my $page = download_url($url);
@@ -69,6 +69,10 @@
print STDERR "Got title $1\n";
my $update = $db->prepare("UPDATE papers SET title=? WHERE doi_id=?");
$update->execute($1, $doi);
+ } elsif ($page =~ /name="dc.title"\s+content="(.*?)"/i) {
+ print STDERR "Got title $1\n";
+ my $update = $db->prepare("UPDATE papers SET title=? WHERE doi_id=?");
+ $update->execute($1, $doi);
} else {
# couldn't get a title for the page, set the title to !NULL, anyway, so that this script won't look at it again.
print STDERR "No title found\n";
View
@@ -25,7 +25,7 @@
import time
from xml.sax import saxutils
-ENCODING = 'ascii'
+ENCODING = 'us-ascii'
ENCODING_ERRORS = 'ignore'
FEED_DIR = "feeds/"
POSTS_DIR = "posts/"
View
@@ -0,0 +1,178 @@
+#!/usr/bin/perl
+#
+# get links from posts and put them in the database
+#
+
+use lib (".");
+use strict;
+use DBI;
+use config qw(%config log log_error urldecode $DEBUG parse_post_xml url_breakdown trim);
+use helper qw(download_url non_html);
+use Digest::MD5 qw(md5_hex);
+use HTML::TreeBuilder;
+use Encode qw(encode);
+use HTML::Entities;
+use pubchem;
+
+my $connection_string = sprintf("dbi:mysql:%s:%s", $config{"db_name"}, $config{"db_host"});
+my $db = DBI->connect($connection_string, $config{"db_user"}, $config{"db_password"}) or log_error("Couldn't connect to the database.\n");
+
+# do a brute force update of the link page titles, using the cache?
+my $shoehorn = 0;
+
+my $offline = 0;
+foreach my $arg (@ARGV) {
+ $offline = 1 if ($arg eq "--offline");
+}
+
+# get existing names
+my %titles;
+my $sql;
+if ($offline) {
+ $sql = $db->prepare("SELECT links.url, posts.post_id, posts.blog_id FROM links, posts WHERE posts.post_id = links.post_id AND id_inchi_hash IS NULL");
+} else {
+ $sql = $db->prepare("SELECT links.url, posts.post_id, posts.blog_id FROM links, posts WHERE posts.post_id = links.post_id AND id_inchi_hash IS NULL AND active = 1");
+}
+$sql->execute();
+my $box;
+while (my $row = $sql->fetchrow_hashref()) {
+ my $url = $row->{"url"};
+ my $post_id = $row->{"post_id"};
+ my $blog_id = $row->{"blog_id"};
+ if ($url =~ m/wikipedia.org\/wiki/ && !($url =~ m/google.com/) &&
+ !($url =~ m/wiki\/InChI/i) && !($url =~ m/wiki\/Simplified/) &&
+ !($url =~ m/wiki\/Template/i) && !($url =~ m/wiki\/PubChem/) &&
+ !($url =~ m/wiki\/SMILES/i)) {
+ # figure out name
+ my $name = "";
+ if ($url =~ m#/wiki/(.*)#) {
+ $name = $1;
+ $name =~ s/\_/ /g;
+ $name = lc $name;
+ }
+
+ my $likelyChemical = 0;
+ my $inchi = "";
+ my $cid = "";
+ next if (!($url =~ m/Pyrene/i));
+
+ print "WP URL: $url";
+ # remove URL # part
+ if ($url =~ m/(.*)#.*/) {
+ $url = $1;
+ }
+ `wget -q -O wp.html "$url"`;
+ my @content = `cat wp.html`;
+ $box = "";
+ my $readingInChI = 0;
+ foreach my $line (@content) {
+ if ($line =~ m#(InChI=1/[^\s]*)#) {
+ $readingInChI = 1;
+ }
+ if ($line =~ m#/wiki/Simplified_molecular_input_line_entry_specification#) {
+ $likelyChemical = 1;
+ }
+ if ($line =~ m#http://pubchem.ncbi.nlm.nih.gov/summary/summary.cgi\?cid=(\d*)#) {
+ $likelyChemical = 1;
+ $cid = $1;
+ }
+ if ($line =~ m#href="/wiki/Chemical_formula"#) {
+ $likelyChemical = 1;
+ }
+ if ($line =~ m/id="drugInfoBox"/) {
+ $box = "DrugBox";
+ }
+ if ($readingInChI == 1) {
+ #print "line: $line";
+ $inchi .= $line;
+ if ($line =~ m#</#) {
+ $inchi =~ s/\s//g;
+ $inchi =~ s#<br[^/]*/>##g;
+ $inchi =~ s#</?td>##g;
+ if ($inchi =~ /href/) {
+ # OK, this is not good.
+ $inchi = "";
+ }
+ $readingInChI = 0;
+ }
+ }
+ }
+
+ # do we know anything about this compound
+ my $compoundKnown = 0;
+ if ($inchi) {
+ my $query2 = "SELECT inchi, cid FROM compounds WHERE inchi = '$inchi'";
+ print "Q: $query2\n";
+ my $sql2 = $db->prepare($query2);
+ $sql2->execute();
+ while (my $row = $sql2->fetchrow_hashref()) {
+ $compoundKnown = 1;
+ if (!$cid && $row->{"cid"}) {
+ print "Retrieved CID from cb db: $cid\n";
+ $cid = $row->{"cid"};
+ }
+ }
+ } elsif ($cid) {
+ my $query2 = "SELECT inchi, cid FROM compounds WHERE cid = '$cid'";
+ my $sql2 = $db->prepare($query2);
+ $sql2->execute();
+ while (my $row = $sql2->fetchrow_hashref()) {
+ $compoundKnown = 1;
+ if (!$inchi && $row->{"inchi"}) {
+ print "Retrieved InChI from cb db: $inchi\n";
+ $inchi = $row->{"inchi"};
+ }
+ }
+ }
+
+ if ($likelyChemical) {
+ print " -> Chemical?: $url -> ";
+ if ($inchi) {
+ print "$inchi";
+ if (!$cid) {
+ $cid = getCID($inchi);
+ }
+ print " -> CID:$cid";
+ } elsif ($cid) {
+ print "CID:$cid";
+ if (!$inchi) {
+ $inchi = getInChI($cid);
+ }
+ print " -> $inchi" if ($inchi);
+ } else {
+ print "but no InChI/CID";
+ }
+ if ($inchi) {
+ my $id_inchi_hash = md5_hex($post_id.$inchi);
+
+ print "name: $name\n";
+ print "post: $post_id\n";
+ print "blog: $blog_id\n";
+ print " adding inchi: $inchi\n";
+
+ my $query = "UPDATE links SET id_inchi_hash = '$id_inchi_hash' WHERE url = '$url' AND blog_id = '$blog_id' AND post_id = '$post_id'";
+ print "Q: $query\n";
+ #my $insert = $db->prepare($query);
+ #$insert->execute();
+ $query = "INSERT INTO inchis (id_inchi_hash, blog_id, post_id, inchi, added_on) VALUES (?, ?, ?, ?,CURRENT_TIMESTAMP())";
+ print "Q: $query\n";
+ #$insert = $db->prepare($query);
+ #$insert->execute($id_inchi_hash, $blog_id, $post_id, $inchi);
+
+ print "known: $compoundKnown\n";
+ if ($compoundKnown != 0) {
+ print " Already know this compound; not adding into compounds\n";
+ } else {
+ print " Adding to compounds table...\n";
+ $query = "INSERT INTO compounds (cid, inchi, name, added_on) VALUES (?, ?, ?, CURRENT_TIMESTAMP())";
+ print " query: $query\n" if ($offline);
+ # my $insert = $db->prepare($query);
+ # $insert->execute($cid, $inchi, $name);
+ }
+ }
+ }
+ print "\n";
+ }
+}
+
+log("script complete");
View
@@ -15,6 +15,8 @@
system("perl update_posts.pl 51 100");
system("perl update_posts.pl 101 150");
system("perl update_posts.pl 151 200");
+system("perl update_posts.pl 201 250");
+system("perl update_posts.pl 251 300");
system("perl update_feeds.pl"); # update feed names, descriptions etc.
system("perl get_links.pl"); # get all URLs from posts
system("perl get_inchis.pl"); # get all InChIs from posts
@@ -43,7 +45,7 @@
#system("perl get_connotea_tags.pl"); # match tags and comments to items in our database
system("perl generate_summaries.pl"); # generate summary tables to speed up front-end
system("perl get_bursts.pl"); # get wordbursts
-system("perl geolocate_terms.pl"); # geolocate terms associated with conference posts
+#system("perl geolocate_terms.pl"); # geolocate terms associated with conference posts
system("perl generate_xml.pl"); # generate flatfiles for papers in the database
system("perl wipe_cache.pl"); # wipe cache of interface
View
@@ -63,8 +63,12 @@
print STDERR " -> hash: $hash\n";
my @posts = glob($posts_dir."/post_*");
+ # my $postCounter = 0;
foreach my $post (@posts) {
-
+ # do at most 25 blog posts
+ #last if ($postCounter == 25);
+ #$postCounter++;
+
if (!$config{"allow_post_edits"}) {
if ($exists{$post}) {
print STDERR "-";

0 comments on commit 6d1cfce

Please sign in to comment.