Permalink
Cannot retrieve contributors at this time
474 lines (390 sloc)
7.57 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| ###################################################################### | |
| # | |
| # EPrints::Apache::LogHandler | |
| # | |
| ###################################################################### | |
| # | |
| # | |
| ###################################################################### | |
| =pod | |
| =head1 NAME | |
| EPrints::Apache::LogHandler - Main handler for Apache log events | |
| =head1 CONFIGURATION | |
| To enable the Apache::LogHandler add to your ArchiveConfig: | |
| $c->{loghandler}->{enable} = 1; | |
| =head1 DATA FORMAT | |
| =over 4 | |
| =item requester | |
| The requester is stored using their IP in URN format: C<urn:ip:x.x.x.x>. | |
| =item serviceType | |
| ServiceType is in format L<info:ofi/fmt:kev:mtx:sch_svc|http://alcme.oclc.org/openurl/servlet/OAIHandler?verb=GetRecord&metadataPrefix=oai_dc&identifier=info:ofi/fmt:kev:mtx:sch_svc>. | |
| The value is encoded as C<?name=yes> (where C<name> is one of the services defined). | |
| =item referent, referringEntity | |
| These are stored in URN format: C<info:oai:repositoryid:eprintid>. | |
| =item referent_docid | |
| The document id as a fragment of the referent: C<#docid>. | |
| =back | |
| =head1 METHODS | |
| =over 4 | |
| =cut | |
| package EPrints::Apache::LogHandler; | |
| use EPrints; | |
| use strict; | |
| use EPrints::Apache::AnApache; | |
| use Apache2::Connection; | |
# Compile User-Agent pattern lines into case-insensitive regexps.
#
# All whitespace (including the trailing newline) is removed from each line
# before it is compiled. Lines that are empty once stripped are skipped: an
# empty pattern matches every User-Agent, which would classify all traffic
# as robot traffic.
#
# Takes the list of pattern lines and returns the list of compiled regexps,
# in the same order.
sub _compile_robot_patterns
{
	my( @lines ) = @_;

	my @patterns;
	for my $line (@lines)
	{
		( my $pattern = $line ) =~ s/\s+//g;
		next if $pattern eq '';
		push @patterns, qr/$pattern/i;
	}

	return @patterns;
}

# one regexp per line of the __DATA__ section at the end of this file
our @USERAGENT_ROBOTS = _compile_robot_patterns( <DATA> );
our %ROBOTS_CACHE; # key=IP, value=time (or -time if not a robot)
our $TIMEOUT = 3600; # 1 hour

sub handler {} # deprecated
# Decide whether a request was made by a robot.
#
# $r is the Apache request object. Returns 1 if the request's User-Agent
# header matches one of @USERAGENT_ROBOTS, 0 otherwise (including when no
# User-Agent header was sent).
#
# The verdict is cached per remote IP in %ROBOTS_CACHE for $TIMEOUT seconds:
# the magnitude of the cached value is the expiry time and its sign is the
# verdict (positive = robot, negative = not a robot). While an entry is
# live, the cached verdict is returned without looking at the User-Agent.
sub is_robot
{
	my( $r ) = @_;

	my $c = $r->connection;
	# Apache 2.4 renamed the connection's remote_ip to client_ip; use
	# whichever accessor this build provides
	my $ip = $c->can( "client_ip" ) ? $c->client_ip : $c->remote_ip;
	my $time_t = time();

	# cleanup then check the cache
	for my $cached_ip (keys %ROBOTS_CACHE)
	{
		if( abs( $ROBOTS_CACHE{$cached_ip} ) < $time_t )
		{
			delete $ROBOTS_CACHE{$cached_ip};
		}
	}

	if( exists $ROBOTS_CACHE{$ip} )
	{
		return $ROBOTS_CACHE{$ip} > 0 ? 1 : 0;
	}

	my $is_robot = 0;

	my $ua = $r->headers_in->{ "User-Agent" };
	if( $ua )
	{
		for my $pattern (@USERAGENT_ROBOTS)
		{
			if( $ua =~ $pattern )
			{
				$is_robot = 1;
				last;
			}
		}
	}

	# store the expiry time, negated when the requester is not a robot
	my $expires = $time_t + $TIMEOUT;
	$ROBOTS_CACHE{$ip} = $is_robot ? $expires : -$expires;

	return $is_robot;
}
| =item $handler->document( $r ) | |
| A request on a document. | |
| =cut | |
# Log a full-text download of a document.
#
# $r is the Apache request object; the document object and the requested
# filename are read from its pnotes ("document" and "filename").
#
# Returns DECLINED without logging when:
#  - the response status is neither 200 nor 304,
#  - the requester is a robot (see is_robot),
#  - the pnotes are missing,
#  - the requested file is not the document's main file, or
#  - the document has a relation whose type ends in is...ThumbnailVersionOf.
# Otherwise returns whatever _create_access returns.
sub document
{
	my( $r ) = @_;

	# COUNTER compliance specifies 200 and 304
	my $status = $r->status;
	if( $status != 200 && $status != 304 )
	{
		return DECLINED;
	}

	return DECLINED if is_robot( $r );

	my $doc = $r->pnotes( "document" );
	my $filename = $r->pnotes( "filename" );
	if( !defined $doc || !defined $filename )
	{
		# nothing to attribute the hit to
		return DECLINED;
	}

	# only count hits to the main file
	my $main = $doc->get_main;
	if( !defined $main || $filename ne $main )
	{
		return DECLINED;
	}

	# ignore volatile version downloads (e.g. thumbnails)
	my $relations = $doc->get_value( "relation" );
	$relations = [] unless( defined $relations );
	foreach my $relation (@$relations)
	{
		next unless defined $relation->{type};
		return DECLINED if( $relation->{type} =~ /is\w+ThumbnailVersionOf$/ );
	}

	my $epdata = _generic( $r, { _parent => $doc } );

	$epdata->{service_type_id} = "?fulltext=yes";
	$epdata->{referent_id} = $doc->value( "eprintid" );
	$epdata->{referent_docid} = $doc->id;

	return _create_access( $r, $epdata );
}
| =item $handler->eprint( $r ) | |
| A request on an eprint abstract page. | |
| =cut | |
# Log a view of an eprint abstract page.
#
# $r is the Apache request object; the eprint object is read from its
# "eprint" pnote.
#
# Returns DECLINED without logging when:
#  - the response status is not 200,
#  - the served file is not index.html,
#  - the requester is a robot (see is_robot), or
#  - the pnote is missing.
# Otherwise returns whatever _create_access returns.
sub eprint
{
	my( $r ) = @_;

	# e.g. ignore 304 NOT MODIFIED
	return DECLINED if $r->status != 200;

	# only track hits on the full abstract page
	return DECLINED if $r->filename !~ /\bindex\.html$/;

	return DECLINED if is_robot( $r );

	my $eprint = $r->pnotes( "eprint" );
	# nothing to attribute the hit to
	return DECLINED if !defined $eprint;

	my $epdata = _generic( $r, { _parent => $eprint } );

	$epdata->{service_type_id} = "?abstract=yes";
	$epdata->{referent_id} = $eprint->id;

	return _create_access( $r, $epdata );
}
# Fill in the access-record fields common to every kind of logged request.
#
# $r is the Apache request object and $epdata a hash reference that is
# updated in place and also returned. Sets:
#  datestamp            - EPrints::Time::get_iso_timestamp of the request time
#  requester_id         - IP address of the remote end of the connection
#  referring_entity_id  - the Referer header if it starts with http: or
#                         https:, otherwise the empty string
#  requester_user_agent - the User-Agent header (may be undef)
sub _generic
{
	my( $r, $epdata ) = @_;

	my $c = $r->connection;
	# Apache 2.4 renamed the connection's remote_ip to client_ip; use
	# whichever accessor this build provides
	my $ip = $c->can( "client_ip" ) ? $c->client_ip : $c->remote_ip;

	my $headers = $r->headers_in;

	# Sanity check referring URL (don't store non-HTTP referrals)
	my $referrer = $headers->{ "Referer" };
	if( !$referrer || $referrer !~ /^https?:/ )
	{
		$referrer = '';
	}

	$epdata->{datestamp} = EPrints::Time::get_iso_timestamp( $r->request_time );
	$epdata->{requester_id} = $ip;
	$epdata->{referring_entity_id} = $referrer;
	$epdata->{requester_user_agent} = $headers->{ "User-Agent" };

	return $epdata;
}
# Store an access record.
#
# $r is the Apache request object (not used here) and $epdata the hash
# reference of field values for the new record. The record is created in
# the "access" dataset of the repository returned by
# $EPrints::HANDLE->current_repository.
#
# Returns DECLINED if there is no current repository, OK otherwise.
sub _create_access
{
	my( $r, $epdata ) = @_;

	my $repo = $EPrints::HANDLE->current_repository;
	return DECLINED unless defined $repo;

	my $access_dataset = $repo->dataset( "access" );
	$access_dataset->create_dataobj( $epdata );

	return OK;
}
| 1; | |
| =back | |
| =head1 SEE ALSO | |
| L<EPrints::DataObj::Access> | |
| =head1 COPYRIGHT | |
| =for COPYRIGHT BEGIN | |
| Copyright 2000-2011 University of Southampton. | |
| =for COPYRIGHT END | |
| =for LICENSE BEGIN | |
| This file is part of EPrints L<http://www.eprints.org/>. | |
| EPrints is free software: you can redistribute it and/or modify it | |
| under the terms of the GNU Lesser General Public License as published | |
| by the Free Software Foundation, either version 3 of the License, or | |
| (at your option) any later version. | |
| EPrints is distributed in the hope that it will be useful, but WITHOUT | |
| ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | |
| FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public | |
| License for more details. | |
| You should have received a copy of the GNU Lesser General Public | |
| License along with EPrints. If not, see L<http://www.gnu.org/licenses/>. | |
| =for LICENSE END | |
| =cut | |
| # http://www.projectcounter.org/documents/COUNTER_robot_txt_list_Jan_2011.txt | |
| __DATA__ | |
| Alexandria(\s|\+)prototype(\s|\+)project | |
| AllenTrack | |
| Arachmo | |
| Brutus\/AET | |
| China\sLocal\sBrowse\s2\.6 | |
| Code\sSample\sWeb\sClient | |
| ContentSmartz | |
| DSurf | |
| DataCha0s\/2\.0 | |
| Demo\sBot | |
| EmailSiphon | |
| EmailWolf | |
| FDM(\s|\+)1 | |
| Fetch(\s|\+)API(\s|\+)Request | |
| GetRight | |
| Goldfire(\s|\+)Server | |
| Googlebot | |
| HTTrack | |
| LOCKSS | |
| LWP\:\:Simple | |
| MSNBot | |
| Microsoft(\s|\+)URL(\s|\+)Control | |
| Milbot | |
| MuscatFerre | |
| NABOT | |
| NaverBot | |
| Offline(\s|\+)Navigator | |
| OurBrowser | |
| Python\-urllib | |
| Readpaper | |
| Strider | |
| T\-H\-U\-N\-D\-E\-R\-S\-T\-O\-N\-E | |
| Teleport(\s|\+)Pro | |
| Teoma | |
| Wanadoo | |
| Web(\s|\+)Downloader | |
| WebCloner | |
| WebCopier | |
| WebReaper | |
| WebStripper | |
| WebZIP | |
| Webinator | |
| Webmetrics | |
| Wget | |
| Xenu(\s|\+)Link(\s|\+)Sleuth | |
| [+:,\.\;\/\\-]bot | |
| [^a]fish | |
| ^voyager\/ | |
| acme\.spider | |
| alexa | |
| almaden | |
| appie | |
| architext | |
| archive\.org_bot | |
| arks | |
| asterias | |
| atomz | |
| autoemailspider | |
| awbot | |
| baiduspider | |
| bbot | |
| biadu | |
| biglotron | |
| bjaaland | |
| blaiz\-bee | |
| bloglines | |
| blogpulse | |
| boitho\.com\-dc | |
| bookmark\-manager | |
| bot | |
| bot[+:,\.\;\/\\-] | |
| bspider | |
| bwh3_user_agent | |
| celestial | |
| cfnetwork|checkbot | |
| combine | |
| commons\-httpclient | |
| contentmatch | |
| core | |
| crawl | |
| crawler | |
| cursor | |
| custo | |
| daumoa | |
| docomo | |
| dtSearchSpider | |
| dumbot | |
| easydl | |
| exabot | |
| fast-webcrawler | |
| favorg | |
| feedburner | |
| feedfetcher\-google | |
| ferret | |
| findlinks | |
| gaisbot | |
| geturl | |
| gigabot | |
| girafabot | |
| gnodspider | |
| grub | |
| gulliver | |
| harvest | |
| heritrix | |
| hl_ftien_spider | |
| holmes | |
| htdig | |
| htmlparser | |
| httpget\-5\.2\.2 | |
| httpget\?5\.2\.2 | |
| httrack | |
| iSiloX | |
| ia_archiver | |
| ichiro | |
| iktomi | |
| ilse | |
| internetseer | |
| intute | |
| java | |
| java\/ | |
| jeeves | |
| jobo | |
| kyluka | |
| larbin | |
| libwww | |
| libwww\-perl | |
| lilina | |
| linkbot | |
| linkcheck | |
| linkchecker | |
| linkscan | |
| linkwalker | |
| livejournal\.com | |
| lmspider | |
| lwp | |
| lwp\-request | |
| lwp\-tivial | |
| lwp\-trivial | |
| lycos[_+] | |
| mail.ru | |
| mediapartners\-google | |
| megite | |
| milbot | |
| mimas | |
| mj12bot | |
| mnogosearch | |
| moget | |
| mojeekbot | |
| momspider | |
| motor | |
| msiecrawler | |
| msnbot | |
| myweb | |
| nagios | |
| netcraft | |
| netluchs | |
| ng\/2\. | |
| no_user_agent | |
| nomad | |
| nutch | |
| ocelli | |
| onetszukaj | |
| perman | |
| pioneer | |
| playmusic\.com | |
| playstarmusic\.com | |
| powermarks | |
| psbot | |
| python | |
| qihoobot | |
| rambler | |
| redalert|robozilla | |
| robot | |
| robots | |
| rss | |
| scan4mail | |
| scientificcommons | |
| scirus | |
| scooter | |
| seekbot | |
| seznambot | |
| shoutcast | |
| slurp | |
| sogou | |
| speedy | |
| spider | |
| spiderman | |
| spiderview | |
| sunrise | |
| superbot | |
| surveybot | |
| tailrank | |
| technoratibot | |
| titan | |
| turnitinbot | |
| twiceler | |
| ucsd | |
| ultraseek | |
| urlaliasbuilder | |
| urllib | |
| virus[_+]detector | |
| voila | |
| w3c\-checklink | |
| webcollage | |
| weblayers | |
| webmirror | |
| webreaper | |
| wordpress | |
| worm | |
| xenu | |
| y!j | |
| yacy | |
| yahoo | |
| yahoo\-mmcrawler | |
| yahoofeedseeker | |
| yahooseeker | |
| yandex | |
| yodaobot | |
| zealbot | |
| zeus | |
| zyborg |