Skip to content

Commit

Permalink
Tweaks to Cb in the past 3-4 years I never committed, and will not spl…
Browse files Browse the repository at this point in the history
…it up now
  • Loading branch information
egonw committed Aug 17, 2011
1 parent 1b52c74 commit 6d1cfce
Show file tree
Hide file tree
Showing 7 changed files with 195 additions and 6 deletions.
1 change: 1 addition & 0 deletions get_inchis.pl
Expand Up @@ -18,6 +18,7 @@
my %posts;
my %blogs;
my $sql = $db->prepare("SELECT post_id, blog_id, filename FROM posts WHERE active=1");
#my $sql = $db->prepare("SELECT post_id, blog_id, filename FROM posts");
#my $sql = $db->prepare("SELECT post_id, blog_id, filename FROM posts WHERE blog_id = 52");
$sql->execute();
while (my $row = $sql->fetchrow_hashref()) {
Expand Down
4 changes: 2 additions & 2 deletions get_links.pl
Expand Up @@ -17,8 +17,8 @@
# get active posts
my %posts;
my %blogs;
my $sql = $db->prepare("SELECT post_id, blog_id, filename FROM posts WHERE active=1");
#my $sql = $db->prepare("SELECT post_id, blog_id, filename FROM posts");
#my $sql = $db->prepare("SELECT post_id, blog_id, filename FROM posts WHERE active=1");
my $sql = $db->prepare("SELECT post_id, blog_id, filename FROM posts");
$sql->execute();
while (my $row = $sql->fetchrow_hashref()) {
$posts{$row->{"post_id"}} = $row->{"filename"};
Expand Down
6 changes: 5 additions & 1 deletion handle_acs.pl
Expand Up @@ -49,7 +49,7 @@

while (my $row = $sql->fetchrow_hashref()) {
my $doi = $row->{"doi_id"};
my $url = "http://pubs3.acs.org/acs/journals/doilookup?in_doi=".$doi;
my $url = "http://pubs.acs.org/doi/abs/".$doi;
print STDERR "URL: $url\n";
my $page = download_url($url);

Expand All @@ -69,6 +69,10 @@
print STDERR "Got title $1\n";
my $update = $db->prepare("UPDATE papers SET title=? WHERE doi_id=?");
$update->execute($1, $doi);
} elsif ($page =~ /name="dc.title"\s+content="(.*?)"/i) {
print STDERR "Got title $1\n";
my $update = $db->prepare("UPDATE papers SET title=? WHERE doi_id=?");
$update->execute($1, $doi);
} else {
# couldn't get a title for the page, set the title to !NULL, anyway, so that this script won't look at it again.
print STDERR "No title found\n";
Expand Down
2 changes: 1 addition & 1 deletion parse_feed.py
Expand Up @@ -25,7 +25,7 @@
import time
from xml.sax import saxutils

ENCODING = 'ascii'
ENCODING = 'us-ascii'
ENCODING_ERRORS = 'ignore'
FEED_DIR = "feeds/"
POSTS_DIR = "posts/"
Expand Down
178 changes: 178 additions & 0 deletions parse_wp_links_offline.pl
@@ -0,0 +1,178 @@
#!/usr/bin/perl
#
# get links from posts and put them in the database
#

use lib (".");
use strict;
use DBI;
use config qw(%config log log_error urldecode $DEBUG parse_post_xml url_breakdown trim);
use helper qw(download_url non_html);
use Digest::MD5 qw(md5_hex);
use HTML::TreeBuilder;
use Encode qw(encode);
use HTML::Entities;
use pubchem;

# Scan Wikipedia links stored in the `links` table, download each article,
# and try to extract an InChI and/or PubChem CID from the page so the link
# can be tied to a compound.
#
# NOTE(review): every UPDATE/INSERT below is commented out, so in its current
# state the script only prints what it would write.

my $connection_string = sprintf("dbi:mysql:%s:%s", $config{"db_name"}, $config{"db_host"});
# NOTE(review): whether log_error() aborts the script is not visible here; if
# it returns, the prepare() calls below will fail on an undefined handle.
my $db = DBI->connect($connection_string, $config{"db_user"}, $config{"db_password"}) or log_error("Couldn't connect to the database.\n");

# --offline drops the "active = 1" restriction from the link selection and
# additionally prints the compounds INSERT statement.
my $offline = 0;
foreach my $arg (@ARGV) {
    $offline = 1 if ($arg eq "--offline");
}

# select the links that have no InChI hash attached yet
my $sql;
if ($offline) {
    $sql = $db->prepare("SELECT links.url, posts.post_id, posts.blog_id FROM links, posts WHERE posts.post_id = links.post_id AND id_inchi_hash IS NULL");
} else {
    $sql = $db->prepare("SELECT links.url, posts.post_id, posts.blog_id FROM links, posts WHERE posts.post_id = links.post_id AND id_inchi_hash IS NULL AND active = 1");
}
$sql->execute();

# Compound lookups are prepared once and use placeholders: the InChI and CID
# are scraped from downloaded HTML and must never be spliced into SQL text.
my $inchi_lookup_query = "SELECT inchi, cid FROM compounds WHERE inchi = ?";
my $cid_lookup_query   = "SELECT inchi, cid FROM compounds WHERE cid = ?";
my $inchi_lookup = $db->prepare($inchi_lookup_query);
my $cid_lookup   = $db->prepare($cid_lookup_query);

while (my $row = $sql->fetchrow_hashref()) {
    my $url = $row->{"url"};
    my $post_id = $row->{"post_id"};
    my $blog_id = $row->{"blog_id"};

    # only Wikipedia article links, minus a few pages about identifiers
    # themselves (InChI, SMILES, PubChem, templates) and Google redirects
    next unless ($url =~ m/wikipedia.org\/wiki/ && !($url =~ m/google.com/) &&
        !($url =~ m/wiki\/InChI/i) && !($url =~ m/wiki\/Simplified/) &&
        !($url =~ m/wiki\/Template/i) && !($url =~ m/wiki\/PubChem/) &&
        !($url =~ m/wiki\/SMILES/i));

    # figure out name: article title, underscores to spaces, lower-cased
    my $name = "";
    if ($url =~ m#/wiki/(.*)#) {
        $name = $1;
        $name =~ s/\_/ /g;
        $name = lc $name;
    }

    my $likelyChemical = 0;
    my $inchi = "";
    my $cid = "";

    # NOTE(review): hard-coded filter that restricts the run to URLs
    # containing "Pyrene"; looks like a debugging leftover - confirm before
    # using this script for a full run.
    next if (!($url =~ m/Pyrene/i));

    print "WP URL: $url";

    # Keep the URL exactly as stored: it is the key for the links UPDATE.
    my $link_url = $url;

    # remove URL # part before downloading
    if ($url =~ m/(.*)#.*/) {
        $url = $1;
    }

    # List-form system(): the URL comes from blog posts and must not be
    # interpreted by a shell.
    system("wget", "-q", "-O", "wp.html", $url);

    my @content;
    if (open(my $page_fh, "<", "wp.html")) {
        @content = <$page_fh>;
        close($page_fh);
    } else {
        print STDERR "Could not read wp.html: $!\n";
    }

    # $box records the infobox type seen on the page; it is not used further.
    my $box = "";
    my $readingInChI = 0;
    foreach my $line (@content) {
        # an InChI may span several lines; start collecting here
        if ($line =~ m#(InChI=1/[^\s]*)#) {
            $readingInChI = 1;
        }
        if ($line =~ m#/wiki/Simplified_molecular_input_line_entry_specification#) {
            $likelyChemical = 1;
        }
        if ($line =~ m#http://pubchem.ncbi.nlm.nih.gov/summary/summary.cgi\?cid=(\d*)#) {
            $likelyChemical = 1;
            $cid = $1;
        }
        if ($line =~ m#href="/wiki/Chemical_formula"#) {
            $likelyChemical = 1;
        }
        if ($line =~ m/id="drugInfoBox"/) {
            $box = "DrugBox";
        }
        if ($readingInChI == 1) {
            $inchi .= $line;
            # a closing tag ends the InChI; strip whitespace and table markup
            if ($line =~ m#</#) {
                $inchi =~ s/\s//g;
                $inchi =~ s#<br[^/]*/>##g;
                $inchi =~ s#</?td>##g;
                if ($inchi =~ /href/) {
                    # leftover markup means the extraction failed; discard it
                    $inchi = "";
                }
                $readingInChI = 0;
            }
        }
    }

    # do we know anything about this compound
    my $compoundKnown = 0;
    if ($inchi) {
        print "Q: $inchi_lookup_query [$inchi]\n";
        $inchi_lookup->execute($inchi);
        while (my $compound = $inchi_lookup->fetchrow_hashref()) {
            $compoundKnown = 1;
            if (!$cid && $compound->{"cid"}) {
                # assign first so the message reports the retrieved value
                $cid = $compound->{"cid"};
                print "Retrieved CID from cb db: $cid\n";
            }
        }
    } elsif ($cid) {
        $cid_lookup->execute($cid);
        while (my $compound = $cid_lookup->fetchrow_hashref()) {
            $compoundKnown = 1;
            if (!$inchi && $compound->{"inchi"}) {
                # assign first so the message reports the retrieved value
                $inchi = $compound->{"inchi"};
                print "Retrieved InChI from cb db: $inchi\n";
            }
        }
    }

    if ($likelyChemical) {
        print " -> Chemical?: $url -> ";
        # fill in whichever identifier is still missing via the pubchem helpers
        if ($inchi) {
            print "$inchi";
            if (!$cid) {
                $cid = getCID($inchi);
            }
            print " -> CID:$cid";
        } elsif ($cid) {
            print "CID:$cid";
            if (!$inchi) {
                $inchi = getInChI($cid);
            }
            print " -> $inchi" if ($inchi);
        } else {
            print "but no InChI/CID";
        }
        if ($inchi) {
            my $id_inchi_hash = md5_hex($post_id.$inchi);

            print "name: $name\n";
            print "post: $post_id\n";
            print "blog: $blog_id\n";
            print " adding inchi: $inchi\n";

            # Placeholders here too, and the stored (un-stripped) URL, so the
            # statement is safe and matches the row once it is re-enabled.
            my $query = "UPDATE links SET id_inchi_hash = ? WHERE url = ? AND blog_id = ? AND post_id = ?";
            print "Q: $query [$id_inchi_hash, $link_url, $blog_id, $post_id]\n";
            #my $insert = $db->prepare($query);
            #$insert->execute($id_inchi_hash, $link_url, $blog_id, $post_id);
            $query = "INSERT INTO inchis (id_inchi_hash, blog_id, post_id, inchi, added_on) VALUES (?, ?, ?, ?,CURRENT_TIMESTAMP())";
            print "Q: $query\n";
            #$insert = $db->prepare($query);
            #$insert->execute($id_inchi_hash, $blog_id, $post_id, $inchi);

            print "known: $compoundKnown\n";
            if ($compoundKnown != 0) {
                print " Already know this compound; not adding into compounds\n";
            } else {
                print " Adding to compounds table...\n";
                $query = "INSERT INTO compounds (cid, inchi, name, added_on) VALUES (?, ?, ?, CURRENT_TIMESTAMP())";
                print " query: $query\n" if ($offline);
                # my $insert = $db->prepare($query);
                # $insert->execute($cid, $inchi, $name);
            }
        }
    }
    print "\n";
}

log("script complete");
4 changes: 3 additions & 1 deletion pipeline.pl
Expand Up @@ -15,6 +15,8 @@
system("perl update_posts.pl 51 100");
system("perl update_posts.pl 101 150");
system("perl update_posts.pl 151 200");
system("perl update_posts.pl 201 250");
system("perl update_posts.pl 251 300");
system("perl update_feeds.pl"); # update feed names, descriptions etc.
system("perl get_links.pl"); # get all URLs from posts
system("perl get_inchis.pl"); # get all InChIs from posts
Expand Down Expand Up @@ -43,7 +45,7 @@
#system("perl get_connotea_tags.pl"); # match tags and comments to items in our database
system("perl generate_summaries.pl"); # generate summary tables to speed up front-end
system("perl get_bursts.pl"); # get wordbursts
system("perl geolocate_terms.pl"); # geolocate terms associated with conference posts
#system("perl geolocate_terms.pl"); # geolocate terms associated with conference posts
system("perl generate_xml.pl"); # generate flatfiles for papers in the database
system("perl wipe_cache.pl"); # wipe cache of interface

Expand Down
6 changes: 5 additions & 1 deletion update_posts.pl
Expand Up @@ -63,8 +63,12 @@
print STDERR " -> hash: $hash\n";

my @posts = glob($posts_dir."/post_*");
# my $postCounter = 0;
foreach my $post (@posts) {

# do at most 25 blog posts
#last if ($postCounter == 25);
#$postCounter++;

if (!$config{"allow_post_edits"}) {
if ($exists{$post}) {
print STDERR "-";
Expand Down

0 comments on commit 6d1cfce

Please sign in to comment.