diff --git a/get_inchis.pl b/get_inchis.pl index f54d963..242a047 100755 --- a/get_inchis.pl +++ b/get_inchis.pl @@ -18,6 +18,7 @@ my %posts; my %blogs; my $sql = $db->prepare("SELECT post_id, blog_id, filename FROM posts WHERE active=1"); +#my $sql = $db->prepare("SELECT post_id, blog_id, filename FROM posts"); #my $sql = $db->prepare("SELECT post_id, blog_id, filename FROM posts WHERE blog_id = 52"); $sql->execute(); while (my $row = $sql->fetchrow_hashref()) { diff --git a/get_links.pl b/get_links.pl index 6f8b3db..ee48e50 100755 --- a/get_links.pl +++ b/get_links.pl @@ -17,8 +17,8 @@ # get active posts my %posts; my %blogs; -my $sql = $db->prepare("SELECT post_id, blog_id, filename FROM posts WHERE active=1"); -#my $sql = $db->prepare("SELECT post_id, blog_id, filename FROM posts"); +#my $sql = $db->prepare("SELECT post_id, blog_id, filename FROM posts WHERE active=1"); +my $sql = $db->prepare("SELECT post_id, blog_id, filename FROM posts"); $sql->execute(); while (my $row = $sql->fetchrow_hashref()) { $posts{$row->{"post_id"}} = $row->{"filename"}; diff --git a/handle_acs.pl b/handle_acs.pl index 2d1e95c..fb09d62 100755 --- a/handle_acs.pl +++ b/handle_acs.pl @@ -49,7 +49,7 @@ while (my $row = $sql->fetchrow_hashref()) { my $doi = $row->{"doi_id"}; - my $url = "http://pubs3.acs.org/acs/journals/doilookup?in_doi=".$doi; + my $url = "http://pubs.acs.org/doi/abs/".$doi; print STDERR "URL: $url\n"; my $page = download_url($url); @@ -69,6 +69,10 @@ print STDERR "Got title $1\n"; my $update = $db->prepare("UPDATE papers SET title=? WHERE doi_id=?"); $update->execute($1, $doi); + } elsif ($page =~ /name="dc.title"\s+content="(.*?)"/i) { + print STDERR "Got title $1\n"; + my $update = $db->prepare("UPDATE papers SET title=? WHERE doi_id=?"); + $update->execute($1, $doi); } else { # couldn't get a title for the page, set the title to !NULL, anyway, so that this script won't look at it again. print STDERR "No title found\n"; diff --git a/parse_feed.py b/parse_feed.py index d3c4b47..3ce68ec 100755 --- a/parse_feed.py +++ b/parse_feed.py @@ -25,7 +25,7 @@ import time from xml.sax import saxutils -ENCODING = 'ascii' +ENCODING = 'us-ascii' ENCODING_ERRORS = 'ignore' FEED_DIR = "feeds/" POSTS_DIR = "posts/" diff --git a/parse_wp_links_offline.pl b/parse_wp_links_offline.pl new file mode 100644 index 0000000..164e70f --- /dev/null +++ b/parse_wp_links_offline.pl @@ -0,0 +1,178 @@ +#!/usr/bin/perl +# +# get links from posts and put them in the database +# + +use lib ("."); +use strict; +use DBI; +use config qw(%config log log_error urldecode $DEBUG parse_post_xml url_breakdown trim); +use helper qw(download_url non_html); +use Digest::MD5 qw(md5_hex); +use HTML::TreeBuilder; +use Encode qw(encode); +use HTML::Entities; +use pubchem; + +my $connection_string = sprintf("dbi:mysql:%s:%s", $config{"db_name"}, $config{"db_host"}); +my $db = DBI->connect($connection_string, $config{"db_user"}, $config{"db_password"}) or log_error("Couldn't connect to the database.\n"); + +# do a brute force update of the link page titles, using the cache? +my $shoehorn = 0; + +my $offline = 0; +foreach my $arg (@ARGV) { + $offline = 1 if ($arg eq "--offline"); +} + +# get existing names +my %titles; +my $sql; +if ($offline) { + $sql = $db->prepare("SELECT links.url, posts.post_id, posts.blog_id FROM links, posts WHERE posts.post_id = links.post_id AND id_inchi_hash IS NULL"); +} else { + $sql = $db->prepare("SELECT links.url, posts.post_id, posts.blog_id FROM links, posts WHERE posts.post_id = links.post_id AND id_inchi_hash IS NULL AND active = 1"); +} +$sql->execute(); +my $box; +while (my $row = $sql->fetchrow_hashref()) { + my $url = $row->{"url"}; + my $post_id = $row->{"post_id"}; + my $blog_id = $row->{"blog_id"}; + if ($url =~ m/wikipedia.org\/wiki/ && !($url =~ m/google.com/) && + !($url =~ m/wiki\/InChI/i) && !($url =~ m/wiki\/Simplified/) && + !($url =~ m/wiki\/Template/i) && !($url =~ m/wiki\/PubChem/) && + !($url =~ m/wiki\/SMILES/i)) { + # figure out name + my $name = ""; + if ($url =~ m#/wiki/(.*)#) { + $name = $1; + $name =~ s/\_/ /g; + $name = lc $name; + } + + my $likelyChemical = 0; + my $inchi = ""; + my $cid = ""; + next if (!($url =~ m/Pyrene/i)); + + print "WP URL: $url"; + # remove URL # part + if ($url =~ m/(.*)#.*/) { + $url = $1; + } + `wget -q -O wp.html "$url"`; + my @content = `cat wp.html`; + $box = ""; + my $readingInChI = 0; + foreach my $line (@content) { + if ($line =~ m#(InChI=1/[^\s]*)#) { + $readingInChI = 1; + } + if ($line =~ m#/wiki/Simplified_molecular_input_line_entry_specification#) { + $likelyChemical = 1; + } + if ($line =~ m#http://pubchem.ncbi.nlm.nih.gov/summary/summary.cgi\?cid=(\d*)#) { + $likelyChemical = 1; + $cid = $1; + } + if ($line =~ m#href="/wiki/Chemical_formula"#) { + $likelyChemical = 1; + } + if ($line =~ m/id="drugInfoBox"/) { + $box = "DrugBox"; + } + if ($readingInChI == 1) { + #print "line: $line"; + $inchi .= $line; + if ($line =~ m###g; + $inchi =~ s###g; + if ($inchi =~ /href/) { + # OK, this is not good. + $inchi = ""; + } + $readingInChI = 0; + } + } + } + + # do we know anything about this compound + my $compoundKnown = 0; + if ($inchi) { + my $query2 = "SELECT inchi, cid FROM compounds WHERE inchi = '$inchi'"; + print "Q: $query2\n"; + my $sql2 = $db->prepare($query2); + $sql2->execute(); + while (my $row = $sql2->fetchrow_hashref()) { + $compoundKnown = 1; + if (!$cid && $row->{"cid"}) { + print "Retrieved CID from cb db: $cid\n"; + $cid = $row->{"cid"}; + } + } + } elsif ($cid) { + my $query2 = "SELECT inchi, cid FROM compounds WHERE cid = '$cid'"; + my $sql2 = $db->prepare($query2); + $sql2->execute(); + while (my $row = $sql2->fetchrow_hashref()) { + $compoundKnown = 1; + if (!$inchi && $row->{"inchi"}) { + print "Retrieved InChI from cb db: $inchi\n"; + $inchi = $row->{"inchi"}; + } + } + } + + if ($likelyChemical) { + print " -> Chemical?: $url -> "; + if ($inchi) { + print "$inchi"; + if (!$cid) { + $cid = getCID($inchi); + } + print " -> CID:$cid"; + } elsif ($cid) { + print "CID:$cid"; + if (!$inchi) { + $inchi = getInChI($cid); + } + print " -> $inchi" if ($inchi); + } else { + print "but no InChI/CID"; + } + if ($inchi) { + my $id_inchi_hash = md5_hex($post_id.$inchi); + + print "name: $name\n"; + print "post: $post_id\n"; + print "blog: $blog_id\n"; + print " adding inchi: $inchi\n"; + + my $query = "UPDATE links SET id_inchi_hash = '$id_inchi_hash' WHERE url = '$url' AND blog_id = '$blog_id' AND post_id = '$post_id'"; + print "Q: $query\n"; + #my $insert = $db->prepare($query); + #$insert->execute(); + $query = "INSERT INTO inchis (id_inchi_hash, blog_id, post_id, inchi, added_on) VALUES (?, ?, ?, ?,CURRENT_TIMESTAMP())"; + print "Q: $query\n"; + #$insert = $db->prepare($query); + #$insert->execute($id_inchi_hash, $blog_id, $post_id, $inchi); + + print "known: $compoundKnown\n"; + if ($compoundKnown != 0) { + print " Already know this compound; not adding into compounds\n"; + } else { + print " Adding to compounds table...\n"; + $query = "INSERT INTO compounds (cid, inchi, name, added_on) VALUES (?, ?, ?, CURRENT_TIMESTAMP())"; + print " query: $query\n" if ($offline); + # my $insert = $db->prepare($query); + # $insert->execute($cid, $inchi, $name); + } + } + } + print "\n"; + } +} + +log("script complete"); diff --git a/pipeline.pl b/pipeline.pl index 8e9d67f..dcaa079 100755 --- a/pipeline.pl +++ b/pipeline.pl @@ -15,6 +15,8 @@ system("perl update_posts.pl 51 100"); system("perl update_posts.pl 101 150"); system("perl update_posts.pl 151 200"); +system("perl update_posts.pl 201 250"); +system("perl update_posts.pl 251 300"); system("perl update_feeds.pl"); # update feed names, descriptions etc. system("perl get_links.pl"); # get all URLs from posts system("perl get_inchis.pl"); # get all InChIs from posts @@ -43,7 +45,7 @@ #system("perl get_connotea_tags.pl"); # match tags and comments to items in our database system("perl generate_summaries.pl"); # generate summary tables to speed up front-end system("perl get_bursts.pl"); # get wordbursts -system("perl geolocate_terms.pl"); # geolocate terms associated with conference posts +#system("perl geolocate_terms.pl"); # geolocate terms associated with conference posts system("perl generate_xml.pl"); # generate flatfiles for papers in the database system("perl wipe_cache.pl"); # wipe cache of interface diff --git a/update_posts.pl b/update_posts.pl index 46e6790..2481e41 100755 --- a/update_posts.pl +++ b/update_posts.pl @@ -63,8 +63,12 @@ print STDERR " -> hash: $hash\n"; my @posts = glob($posts_dir."/post_*"); + # my $postCounter = 0; foreach my $post (@posts) { - + # do at most 25 blog posts + #last if ($postCounter == 25); + #$postCounter++; + if (!$config{"allow_post_edits"}) { if ($exists{$post}) { print STDERR "-";