| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,275 @@ | ||
| package Koha::SearchEngine::Elasticsearch::Document; | ||
|
|
||
| use Moo; | ||
| use Modern::Perl; | ||
| use MIME::Base64; | ||
| use Encode qw(encode); | ||
|
|
||
| =head1 NAME | ||
| Koha::SearchEngine::Elasticsearch::Document - This class creates a 'document' data | ||
| structure that can be sent to ES for indexing. | ||
| =head1 ATTRIBUTES | ||
| =head2 indexx | ||
| ES index in which the document will be stored | ||
| =cut | ||
| has indexx => (is => 'rw'); | ||
|
|
||
| =head2 doc | ||
| The ES document being built. | ||
| =cut | ||
| has doc => (is => 'rw', default => sub {{}}); | ||
|
|
||
| =head2 record | ||
| The biblio/authority record for which the ES document is built. | ||
| =cut | ||
| has record => (is => 'rw'); | ||
|
|
||
| =head2 fields | ||
| Array of all MARC record fields | ||
| =cut | ||
| has fields => (is => 'rw'); | ||
|
|
||
| =head2 fpt | ||
| Fields per tag for quick access. For example: | ||
| if (my $fpt = $doc->fpt->{880}) { | ||
| for my $field (@$fpt) { | ||
| # Here $field contains a MARC::Field | ||
| } | ||
| } | ||
| =cut | ||
| has fpt => (is => 'rw'); | ||
|
|
||
| =head2 mapfull | ||
| A data structure aggregating MARC record content based on ES fields as defined | ||
| in Koha configuration. It avoids accessing the same MARC fields again and | ||
| again when they are sent to various ES fields. | ||
| =cut | ||
| has mapfull => (is => 'rw'); | ||
|
|
||
|
|
||
|
|
||
| =head1 METHODS | ||
| =cut | ||
# Moo constructor hook: pre-computes the per-record lookup structures.
#
# Sets three attributes from $self->record:
#   fields  - array ref of all the fields of the record
#   fpt     - hash ref "tag => [fields]" (fields per tag)
#   mapfull - hash ref "map key => [terms]", with one entry per key of the
#             reverse mapping $self->indexx->mapr that matches the record
#             ("leader_/6", "001", "100a_/9-12", "200a", ...)
#
# A map key suffix is made of optional subfield codes followed by an optional
# "_/FROM-TO" range. Positions are 0-based and inclusive; "_/N" selects one
# character and "_/N-" runs to the end of the value.
sub BUILD {
    my $self = shift;

    my $record = $self->record;
    my @fields = $record->fields();
    my $fpt;
    push @{ $fpt->{ $_->tag } }, $_ for @fields;
    $self->fields(\@fields);
    $self->fpt($fpt);

    my $mapr = $self->indexx->mapr;
    my $mapfull;

    # Returns the portion of $term designated by the "_/FROM-TO" part of
    # $spec, or nothing when there is no usable range.
    my $getrange = sub {
        my ($term, $spec) = @_;
        return unless defined $term && $spec && $spec =~ m{_/([0-9\-]+)};

        # LIMIT -1 keeps the trailing empty field of an open range ("9-"),
        # which a plain split would silently drop.
        my ($from, $to) = split /-/, $1, -1;
        $from = 0 unless length $from;
        if    ( !defined $to ) { $to = $from; }
        elsif ( $to eq '' )    { $to = length $term; }
        return if $to < $from || $from > length $term;
        return substr( $term, $from, $to - $from + 1 );
    };

    # Tests on terms use length() rather than truth, so that a legitimate
    # "0" value (a one-character coded position for instance) is kept.
    my $leader = $record->leader;
    if ( my $leader_map = $mapr->{leader} ) {
        for my $letters ( keys %$leader_map ) {
            my $term = $getrange->( $leader, $letters );
            push @{ $mapfull->{"leader$letters"} }, $term
                if defined $term && length $term;
        }
    }

    for my $field (@fields) {
        my $tag = $field->tag;

        # Look the tag up without autovivifying entries in the shared map
        my $by_letters = $mapr->{$tag} or next;

        if ( $tag lt '010' ) {
            my $data = $field->data;
            next unless defined $data && length $data;
            for my $letters ( keys %$by_letters ) {
                if ( $letters eq '' ) {
                    push @{ $mapfull->{$tag} }, $data;
                    next;
                }
                my $term = $getrange->( $data, $letters );
                push @{ $mapfull->{"$tag$letters"} }, $term
                    if defined $term && length $term;
            }
            next;
        }

        my @subfields = $field->subfields;
        for my $letters ( keys %$by_letters ) {
            # Separate the subfield codes from the range, otherwise the
            # digits of the range would be matched as subfield codes.
            my $has_range = $letters =~ m{_/[0-9\-]+};
            ( my $codes = $letters ) =~ s{_/.*}{}s;

            my @values =
                map  { $_->[1] }
                grep { !length($codes) || index( $codes, $_->[0] ) != -1 }
                @subfields;
            next unless @values;

            my $term = join( ' ', @values );
            $term = $getrange->( $term, $letters ) if $has_range;
            push @{ $mapfull->{"$tag$letters"} }, $term
                if defined $term && length $term;
        }
    }
    $self->mapfull($mapfull);
    return;
}
|
|
||
|
|
||
=head2 get_terms_range($value, $letters)

Take data C<$value> extracted from a field, and get from this value all the
portions specified by C<$letters>. The list of all those portions is returned.

C<$letters> contains a comma-separated portion specification such as
C<12-14,15,20-25>. As implemented here, positions are 1-based and inclusive:
C<15> selects a single character and C<20-> runs to the end of the value.
Portions starting after the end of the value, and malformed portions, are
skipped. When C<$letters> contains no specification, C<$value> is returned
as the only term. An empty or undefined C<$value> gives an empty list.

=cut

sub get_terms_range {
    my ($self, $value, $letters) = @_;

    my @terms;

    # Test the length, not the truth: "0" is a legitimate value
    return @terms unless defined $value && length $value;

    unless ( $letters && $letters =~ /([0-9\-,]+)/ ) {
        push @terms, $value;
        return @terms;
    }
    my $spec = $1;

    my $len = length($value);
    for my $range ( split /,/, $spec ) {
        # LIMIT -1 keeps the trailing empty field of an open range ("20-"),
        # which a plain split would silently drop.
        my ($from, $to) = split /-/, $range, -1;
        next unless defined $from && $from =~ /\A[0-9]+\z/ && $from >= 1;
        if    ( !defined $to ) { $to = $from; }
        elsif ( $to eq '' )    { $to = $len; }
        next unless $to =~ /\A[0-9]+\z/ && $to >= $from;
        next if $from > $len;
        push @terms, substr( $value, $from - 1, $to - $from + 1 );
    }
    return @terms;
}
|
|
||
|
|
||
# Extract from a MARC field the terms designated by $letters, the part of a
# map that follows the 3-character tag.
#
# - Control field (tag lower than '010'): the field data is handed to
#   get_terms_range together with $letters.
# - Other fields: $letters is "<subfield codes>[_/<range>]". The values of
#   the matching subfields (all of them when no code is given) are joined
#   with a space, then handed to get_terms_range with the range part only,
#   so that a numeric subfield code such as "9" is never taken for a
#   character range.
#
# Returns an array ref of terms, or nothing when $field is false or when no
# subfield matches.
sub extract_terms_from_field {
    my ($self, $field, $letters) = @_;

    return unless $field;

    $letters //= '';
    if ( $field->tag lt '010' ) {
        return [ $self->get_terms_range( $field->data, $letters ) ];
    }

    my ($codes, $range) = split m{_/}, $letters, 2;
    $codes //= '';

    my @values =
        map  { $_->[1] }
        grep { !length($codes) || index( $codes, $_->[0] ) != -1 }
        $field->subfields;
    return unless @values;

    return [ $self->get_terms_range( join( ' ', @values ), $range ) ];
}
|
|
||
|
|
||
=head2 append_terms($terms, $name, $subname)

The array of terms C<$terms> is appended to the doc field C<$name>, for each
of its sub-indexes C<$subname>, for example C<["search","facet"]>. Terms of
the C<search> sub-index go to the field C<$name> itself, terms of any other
sub-index go to the field C<$name__subindex>. If C<$subname> is not provided,
C<["search"]> is used. A single sub-index name is also accepted in place of
the array.

Terms are stored as hash keys, so a term appended several times is kept only
once. Undefined terms are ignored. Nothing is done when C<$terms> is false.

=cut

sub append_terms {
    my ($self, $terms, $name, $subname) = @_;

    return unless $terms;
    $subname ||= ['search'];
    $subname = [$subname] if ref($subname) ne 'ARRAY';

    # An undefined term would end up as the '' key of the field
    my @defined_terms = grep { defined } @$terms;

    my $doc = $self->doc;
    for my $sub (@$subname) {
        my $fieldname = $sub eq 'search' ? $name : "${name}__$sub";
        $doc->{$fieldname}->{$_} = undef for @defined_terms;
    }
    return;
}
|
|
||
|
|
||
=head2 add($field, $letters, $name, $subname)

Add the content of a C<MARC::Field> to the document: the terms selected by
C<$letters> are extracted from C<$field> with C<extract_terms_from_field>,
then handed to C<append_terms> together with C<$name> and C<$subname>.

=cut

sub add {
    my ($self, $field, $letters, $name, $subname) = @_;

    # scalar() matters: with no term the extraction returns an empty list,
    # which would otherwise shift the following arguments.
    return $self->append_terms(
        scalar( $self->extract_terms_from_field( $field, $letters ) ),
        $name,
        $subname,
    );
}
|
|
||
|
|
||
|
|
||
# Turn the document being built into the structure returned for indexing.
#
# - Each doc field built by append_terms (a hash whose keys are the terms) is
#   replaced by the array of its terms. For "__suggestible" fields each term
#   is wrapped as { input => term }.
# - The MARC record itself is added, in a format recorded in 'marc_format':
#     ARRAY          when the ElasticsearchMARCFormat preference is 'ARRAY'
#     base64ISO2709  base64-encoded ISO2709, the default
#     MARCXML        fallback when the ISO2709 serialisation emitted warnings
#
# Returns the doc hash ref, which is modified in place.
sub getnormalized {
    my $self = shift;

    my $doc    = $self->doc;
    my $record = $self->record;

    # Hash transformation into array + fix of the __suggestible index.
    # Values which are not hashes (already normalized) are left untouched,
    # so that calling this method twice does not die.
    for my $name ( keys %$doc ) {
        next unless ref( $doc->{$name} ) eq 'HASH';
        my @terms = keys %{ $doc->{$name} };
        @terms = map { +{ input => $_ } } @terms if $name =~ /__suggestible$/;
        $doc->{$name} = \@terms;
    }

    $record->encoding('UTF-8');
    my $marcflavour = lc( C4::Context->preference('marcflavour') // '' );
    my $marc_format = C4::Context->preference('ElasticsearchMARCFormat') // '';
    if ( $marc_format eq 'ARRAY' ) {
        # FIXME: to do
        # NOTE(review): _marc_to_array is not defined in the visible part of
        # this package - confirm it exists before enabling this format.
        $doc->{marc_data_array} = $self->_marc_to_array($record);
        $doc->{marc_format}     = 'ARRAY';
        return $doc;
    }

    my @warnings;
    {
        # Temporarily intercept all warn signals (MARC::Record carps when
        # record length > 99999)
        local $SIG{__WARN__} = sub {
            push @warnings, $_[0];
        };
        $doc->{marc_data} = encode_base64( encode( 'UTF-8', $record->as_usmarc() ) );
    }

    if (@warnings) {
        # Suppress warnings if record length exceeded
        unless ( substr( $record->leader(), 0, 5 ) eq '99999' ) {
            # Carp is not imported by this package: load it here and use the
            # fully qualified name.
            require Carp;
            Carp::carp($_) for @warnings;
        }
        $doc->{marc_data}   = $record->as_xml_record($marcflavour);
        $doc->{marc_format} = 'MARCXML';
    }
    else {
        $doc->{marc_format} = 'base64ISO2709';
    }
    return $doc;
}
|
|
||
| 1; |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,28 @@ | ||
| package Koha::SearchEngine::Elasticsearch::Plugin; | ||
|
|
||
| use Moo; | ||
| use Modern::Perl; | ||
|
|
||
| has indexx => (is => 'rw'); | ||
|
|
||
|
|
||
=head2 add_field($doc, $name, $param)

Add to C<$doc> the field C<$name>, depending on C<$param>.
C<< $param->{map} >> specifies where to find the data: one map, or an array
of maps, each of them being a key of C<< $doc->mapfull >>.
C<< $param->{index} >> specifies in which ES specialized indexes the data
is put (__sort, __facet, ...).

Nothing is done when C<$param> or its map is missing, or when the document
has no C<mapfull> structure.

=cut

sub add_field {
    my ($self, $doc, $name, $param) = @_;

    return unless $param;
    my $maps = $param->{map};
    return unless $maps;

    # A single map may be given as a plain string
    $maps = [$maps] if ref($maps) ne 'ARRAY';

    # mapfull stays undefined when nothing in the record matched the mapping
    my $mapfull = $doc->mapfull or return;

    $doc->append_terms( $mapfull->{$_}, $name, $param->{index} ) for @$maps;
    return;
}
|
|
||
| 1; |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,46 @@ | ||
| package Koha::SearchEngine::Elasticsearch::Plugin::AGR; | ||
|
|
||
| use Moo; | ||
| extends 'Koha::SearchEngine::Elasticsearch::Plugin'; | ||
| use Modern::Perl; | ||
| use C4::AuthoritiesMarc; | ||
| use YAML; | ||
| use JSON; | ||
|
|
||
|
|
||
# Index the content of 880 fields under the ES fields configured for the tag
# they are linked to.
#
# The linked tag of an 880 field is taken from the first 3 characters of its
# subfield $6. $param is one hash, or an array of hashes, each with a
# 'source' entry (one hash or an array of hashes) holding:
#   map    - one map or an array of maps "<tag><letters>": the tag selects
#            the 880 fields linked to it, the letters are passed to
#            $doc->add
#   target - one name or an array of names of the doc fields to fill
#   index  - sub-indexes, passed as is to $doc->add
#
# Every 880 field linked to a tag is indexed, not only the last one.
sub add_field {
    my ($self, $doc, $esname, $param) = @_;

    $param = [$param] if ref($param) ne 'ARRAY';

    # Get 880 fields and give access to them by targeted tag
    my $fpt = $doc->fpt or return;
    my $fields = $fpt->{'880'};
    return unless $fields;
    my %fields_for;
    for my $field (@$fields) {
        my $linkage = $field->subfield('6');
        next unless $linkage;
        push @{ $fields_for{ substr( $linkage, 0, 3 ) } }, $field;
    }

    for my $p (@$param) {
        my $sources = $p->{source};
        $sources = [$sources] if ref($sources) ne 'ARRAY';
        for my $source (@$sources) {
            next unless ref($source) eq 'HASH';

            my $maps = $source->{map};
            $maps = [$maps] if ref($maps) ne 'ARRAY';

            my $targets = $source->{target};
            $targets = [$targets] if ref($targets) ne 'ARRAY';
            my @targets = grep { defined } @$targets;

            for my $map (@$maps) {
                next if !defined $map || length($map) < 3;
                my ($tag, $letters) = ( substr( $map, 0, 3 ), substr( $map, 3 ) );
                my $linked = $fields_for{$tag} or next;
                for my $field (@$linked) {
                    $doc->add( $field, $letters, $_, $source->{index} ) for @targets;
                }
            }
        }
    }
    return;
}
|
|
||
| 1; |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,54 @@ | ||
| package Koha::SearchEngine::Elasticsearch::Plugin::Authority; | ||
|
|
||
| use Moo; | ||
| extends 'Koha::SearchEngine::Elasticsearch::Plugin'; | ||
| use Modern::Perl; | ||
| use C4::AuthoritiesMarc; | ||
| use YAML; | ||
|
|
||
|
|
||
# Index biblio fields through the authority record they are linked to.
#
# $param is one hash, or an array of hashes, with the entries:
#   map     - one map or an array of maps "<tag><letters>" selecting the
#             biblio fields
#   heading - "<tag><letters>" of the authority field to index in place of
#             the biblio field
#   index   - sub-indexes, passed as is to $doc->add
#   see     - optional hash, or array of hashes, listing other authority
#             fields to index: 'map' (one or several "<tag><letters>"),
#             'index' (defaults to the 'index' above) and 'target', the doc
#             field to fill, used only when it is a field of the current
#             index configuration (otherwise $esname is used)
#
# For each selected biblio field with a subfield $9 resolving to an
# authority record, the authority heading and its 'see' fields are indexed.
# Otherwise the biblio field itself is indexed.
sub add_field {
    my ($self, $doc, $esname, $param) = @_;

    $param = [$param] if ref($param) ne 'ARRAY';
    my $fpt = $doc->fpt or return;

    for my $p (@$param) {
        my $maps = $p->{map};
        next unless defined $maps;
        $maps = [$maps] if ref($maps) ne 'ARRAY';

        for my $map (@$maps) {
            next if !defined $map || length($map) < 3;
            my ($tag, $letters) = ( substr( $map, 0, 3 ), substr( $map, 3 ) );
            my $fields = $fpt->{$tag};
            next unless $fields;    # No field $tag

            for my $field (@$fields) {
                my $authid = $field->tag ge '010' && $field->subfield('9');
                my $auth = $authid
                    && $self->indexx->get_record_type( 'authorities', $authid );

                unless ($auth) {
                    # No authority found => take biblio info
                    $doc->add( $field, $letters, $esname, $p->{index} );
                    next;
                }

                # Authority heading: first field with the heading tag
                my $heading = $p->{heading} // '';
                if ( length($heading) >= 3 ) {
                    my $heading_field = $auth->field( substr( $heading, 0, 3 ) );
                    $doc->add( $heading_field, substr( $heading, 3 ),
                        $esname, $p->{index} );
                }

                my $sees = $p->{see} or next;
                $sees = [$sees] if ref($sees) ne 'ARRAY';
                for my $see (@$sees) {
                    my $index_name = $esname;
                    my $subname = $see->{index} || $p->{index};
                    if ( my $target = $see->{target} ) {
                        my $index = $self->indexx->es->c->{indexes}->{ $self->indexx->name };
                        $index_name = $target if $index->{$target};
                    }

                    my $see_maps = $see->{map};
                    next unless defined $see_maps;
                    $see_maps = [$see_maps] if ref($see_maps) ne 'ARRAY';
                    for my $see_map (@$see_maps) {
                        next if !defined $see_map || length($see_map) < 3;
                        my $see_letters = substr( $see_map, 3 );
                        for my $see_field ( $auth->field( substr( $see_map, 0, 3 ) ) ) {
                            $doc->add( $see_field, $see_letters, $index_name, $subname );
                        }
                    }
                }
            }
        }
    }
    return;
}
|
|
||
| 1; |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,46 @@ | ||
| package Koha::SearchEngine::Elasticsearch::Plugin::ISBN; | ||
|
|
||
| use Moo; | ||
| extends 'Koha::SearchEngine::Elasticsearch::Plugin'; | ||
| use Business::ISBN; | ||
|
|
||
|
|
||
# Index ISBNs under the doc field $name.
#
# $param->{map} is one map or an array of maps "<tag><letters>". Each value
# of the selected subfields (all subfields when no letter is given) which is
# a valid ISBN is indexed in its ISBN-13 form, and in its ISBN-10 form when
# there is one, both with and without hyphens. Any other value is indexed
# as is.
sub add_field {
    my ($self, $doc, $name, $param) = @_;

    my $maps = $param->{map};
    return unless $maps;
    $maps = [$maps] if ref($maps) ne 'ARRAY';

    my @isbns;
    for my $map (@$maps) {
        my $tag     = substr( $map, 0, 3 );
        my $letters = substr( $map, 3 );
        my $fields  = $doc->fpt->{$tag} or next;    # No field $tag

        for my $subfield ( map { $_->subfields } @$fields ) {
            my ($letter, $value) = @$subfield;
            next if $letters && index( $letters, $letter ) == -1;

            my $isbn = Business::ISBN->new($value);
            unless ( defined $isbn && $isbn->is_valid ) {
                push @isbns, $value;
                next;
            }

            my $isbn13 = $isbn->as_isbn13->as_string;
            ( my $isbn13_plain = $isbn13 ) =~ s/\-//g;
            push @isbns, $isbn13, $isbn13_plain;

            if ( my $isbn10 = $isbn->as_isbn10 ) {
                my $hyphenated = $isbn10->as_string;
                ( my $plain = $hyphenated ) =~ s/\-//g;
                push @isbns, $hyphenated, $plain;
            }
        }
    }
    return $doc->append_terms( \@isbns, $name );
}
|
|
||
| 1; |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,20 @@ | ||
| package Koha::SearchEngine::Elasticsearch::Plugin::MatchHeading; | ||
|
|
||
| use Moo; | ||
| extends 'Koha::SearchEngine::Elasticsearch::Plugin'; | ||
| use C4::AuthoritiesMarc; | ||
|
|
||
|
|
||
# Placeholder plugin: the computation of the heading is commented out, so
# $heading is never set, nothing is added to $doc and 0 is always returned.
sub add_field {
    my ($self, $doc, $name, $param) = @_;

    #my $heading = C4::Heading->new_from_field($field, undef, 1 );
    my $heading;
    return 0 unless $heading;

    # Not reachable as long as the line above stays commented out
    $doc->{'match-heading'}->{$heading->{search_form}} = undef;
    return 1;
}
|
|
||
| 1; |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,33 @@ | ||
| package Koha::SearchEngine::Elasticsearch::Plugin::NonFillChar; | ||
|
|
||
| use Moo; | ||
| extends 'Koha::SearchEngine::Elasticsearch::Plugin'; | ||
|
|
||
|
|
||
# Index fields without their leading non-filing characters.
#
# $param is one hash, or an array of hashes, with the entries:
#   indicator - number of the field indicator giving the count of characters
#               to skip at the beginning of each term
#   map       - one map or an array of maps "<tag><letters>"
#   index     - sub-indexes, passed as is to $doc->append_terms
#
# A term shorter than the count is kept unchanged. An indicator which is not
# a number (blank for example) means that nothing is skipped.
sub add_field {
    my ($self, $doc, $esname, $param) = @_;

    $param = [$param] if ref($param) ne 'ARRAY';
    my $fpt = $doc->fpt or return;

    for my $p (@$param) {
        # NOTE(review): the default of 0 is kept from the original code;
        # confirm that the field class accepts 0 as an indicator number.
        my $indicator = $p->{indicator} || 0;

        my $maps = $p->{map};
        next unless defined $maps;
        $maps = [$maps] if ref($maps) ne 'ARRAY';

        for my $map (@$maps) {
            next if !defined $map || length($map) < 3;
            my ($tag, $letters) = ( substr( $map, 0, 3 ), substr( $map, 3 ) );
            my $fields = $fpt->{$tag};
            next unless $fields;

            for my $field (@$fields) {
                my $count = $field->indicator($indicator);
                $count = 0 unless defined $count && $count =~ /\A[0-9]+\z/;

                # Here we should get just 1 term; nothing when no subfield
                # matches
                my $terms = $doc->extract_terms_from_field( $field, $letters );
                next unless $terms;

                $terms = [
                    map { length($_) < $count ? $_ : substr( $_, $count ) }
                        @$terms
                ];
                $doc->append_terms( $terms, $esname, $p->{index} );
            }
        }
    }
    return;
}
|
|
||
| 1; |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,172 @@ | ||
| { | ||
| "server":{ | ||
| "index_name":"koha", | ||
| "server":[ | ||
| "localhost:9200" | ||
| ], | ||
| "cxn_pool":"Static", | ||
| "client":"5_0::Direct" | ||
| }, | ||
| "fields":{ | ||
| "general":{ | ||
| "properties":{ | ||
| "marc_data_array":{ | ||
| "dynamic":"true", | ||
| "type":"object" | ||
| }, | ||
| "marc_format":{ | ||
| "index":"false", | ||
| "analyzer":"keyword", | ||
| "store":"true", | ||
| "type":"text" | ||
| }, | ||
| "marc_data":{ | ||
| "analyzer":"keyword", | ||
| "index":"false", | ||
| "store":"true", | ||
| "type":"text" | ||
| } | ||
| } | ||
| }, | ||
| "search":{ | ||
| "integer":{ | ||
| "null_value":"0", | ||
| "type":"integer" | ||
| }, | ||
| "default":{ | ||
| "type":"text", | ||
| "search_analyzer":"analyzer_standard", | ||
| "analyzer":"analyzer_standard", | ||
| "fields":{ | ||
| "phrase":{ | ||
| "type":"text", | ||
| "search_analyzer":"analyzer_phrase", | ||
| "analyzer":"analyzer_phrase" | ||
| }, | ||
| "raw":{ | ||
| "type":"keyword", | ||
| "normalizer":"nfkc_cf_normalizer" | ||
| }, | ||
| "ci_raw":{ | ||
| "normalizer":"icu_folding_normalizer", | ||
| "type":"keyword" | ||
| } | ||
| } | ||
| }, | ||
| "boolean":{ | ||
| "null_value":"false", | ||
| "type":"boolean" | ||
| }, | ||
| "stdno":{ | ||
| "type":"text", | ||
| "search_analyzer":"analyzer_stdno", | ||
| "analyzer":"analyzer_stdno", | ||
| "fields":{ | ||
| "phrase":{ | ||
| "search_analyzer":"analyzer_phrase", | ||
| "type":"text", | ||
| "analyzer":"analyzer_phrase" | ||
| }, | ||
| "raw":{ | ||
| "type":"keyword" | ||
| } | ||
| } | ||
| } | ||
| }, | ||
| "sort":{ | ||
| "default":{ | ||
| "type":"icu_collation_keyword", | ||
| "numeric":"true", | ||
| "index":"false" | ||
| } | ||
| }, | ||
| "facet":{ | ||
| "default":{ | ||
| "type":"keyword" | ||
| } | ||
| }, | ||
| "suggestible":{ | ||
| "default":{ | ||
| "type":"completion", | ||
| "search_analyzer":"simple", | ||
| "analyzer":"simple" | ||
| } | ||
| } | ||
| }, | ||
| "index":{ | ||
| "number_of_shards": 5, | ||
| "analysis":{ | ||
| "analyzer":{ | ||
| "analyzer_standard":{ | ||
| "filter":[ | ||
| "elision", | ||
| "icu_folding" | ||
| ], | ||
| "tokenizer":"icu_tokenizer" | ||
| }, | ||
| "analyzer_phrase":{ | ||
| "filter":[ | ||
| "elision", | ||
| "icu_folding", | ||
| "apostrophe" | ||
| ], | ||
| "char_filter":[ | ||
| "punctuation" | ||
| ], | ||
| "tokenizer":"keyword" | ||
| }, | ||
| "analyzer_stdno":{ | ||
| "tokenizer":"whitespace", | ||
| "char_filter":[ | ||
| "punctuation" | ||
| ], | ||
| "filter":[ | ||
| "icu_folding" | ||
| ] | ||
| } | ||
| }, | ||
| "filter":{ | ||
| "elision":{ | ||
| "articles":[ | ||
| "c", | ||
| "d", | ||
| "j", | ||
| "l", | ||
| "m", | ||
| "n", | ||
| "qu", | ||
| "s", | ||
| "t" | ||
| ], | ||
| "articles_case":"true", | ||
| "type":"elision" | ||
| }, | ||
| "apostrophe":{ | ||
| "pattern":"'", | ||
| "type":"pattern_replace", | ||
| "replacement":"" | ||
| } | ||
| }, | ||
| "normalizer":{ | ||
| "icu_folding_normalizer":{ | ||
| "filter":[ | ||
| "elision", | ||
| "icu_folding" | ||
| ], | ||
| "type":"custom" | ||
| }, | ||
| "nfkc_cf_normalizer":{ | ||
| "char_filter":"icu_normalizer", | ||
| "type":"custom" | ||
| } | ||
| }, | ||
| "char_filter":{ | ||
| "punctuation":{ | ||
| "pattern":"([\\x00-\\x1F,\\x21-\\x26,\\x28-\\x2F,\\x3A-\\x40,\\x5B-\\x60,\\x7B-\\x89,\\x8B,\\x8D,\\x8F,\\x90-\\x99,\\x9B,\\x9D,\\xA0-\\xBF,\\xD7,\\xF7])", | ||
| "replacement":"", | ||
| "type":"pattern_replace" | ||
| } | ||
| } | ||
| } | ||
| } | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,248 @@ | ||
| { | ||
| "indexes" : { | ||
| "biblios" : { | ||
| "author" : { | ||
| "source" : [ | ||
| { | ||
| "map" : ["200f","200g","700a"], | ||
| "index" : ["search","facet","suggestible"] | ||
| }, | ||
| { | ||
| "map" : "701" | ||
| } | ||
| ] | ||
| }, | ||
| "bio" : { "source" : { "map" : "105a_/12" } }, | ||
| "ccode" : { | ||
| "source" : { | ||
| "index" : ["search","facet"], | ||
| "map" : "9958" | ||
| } | ||
| }, | ||
| "ctype" : { "source" : { "map" : "105a_/4-7" } }, | ||
| "date-entered-on-file" : { "source" : { "map" : "099c" } }, | ||
| "date-of-acquisition" : { | ||
| "type" : "date", | ||
| "source" : { | ||
| "map" : "9955", | ||
| "index" : ["search","sort"] | ||
| } | ||
| }, | ||
| "date-of-publication" : { | ||
| "source" : { | ||
| "index" : ["search","sort"], | ||
| "map" : "100a_/9-12" | ||
| } | ||
| }, | ||
| "date-time-last-modified" : { "source" : { "map" : "099d" } }, | ||
| "ff8-29" : { "source" : { "map" : "105a_/8" } }, | ||
| "holdingbranch" : { "source" : { "map" : "995c" } }, | ||
| "homebranch" : { | ||
| "source" : { | ||
| "index" : ["search","facet"], | ||
| "map" : "995b" | ||
| } | ||
| }, | ||
| "host-item-number" : { | ||
| "type" : "number", | ||
| "source" : { "map" : "4619" } | ||
| }, | ||
| "local-number" : { "source" : { "map" : "001" } }, | ||
| "identifier-standard" : { | ||
| "type" : "stdno", | ||
| "source" : { "map" : ["010a","010z","011a","011y","011z"] } | ||
| }, | ||
| "isbn" : { | ||
| "type" : "isbn", | ||
| "source" : { | ||
| "plugin" : { | ||
| "ISBN" : { | ||
| "map" : "010az" | ||
| } | ||
| } | ||
| } | ||
| }, | ||
| "issn" : { | ||
| "type" : "stdno", | ||
| "source" : { | ||
| "map" : ["011a","011y","011z"] | ||
| } | ||
| }, | ||
| "itemnumber" : { | ||
| "type" : "number", | ||
| "source" : { "map" : "9959" } | ||
| }, | ||
| "itype" : { | ||
| "source" : { | ||
| "index" : ["search","facet"], | ||
| "map" : ["200b","995r"] | ||
| } | ||
| }, | ||
| "lc-card-number" : { "source" : { "map" : "995j" } }, | ||
| "lf" : { "source" : { "map" : "105a_/11" } }, | ||
| "local-classification" : { | ||
| "source" : [ | ||
| { | ||
| "index" : ["search","suggestible"], | ||
| "map" : "995k" | ||
| }, | ||
| { | ||
| "map" : "686" | ||
| } | ||
| ] | ||
| }, | ||
| "location" : { | ||
| "source" : { | ||
| "index" : ["search","facet"], | ||
| "map" : "995e" | ||
| } | ||
| }, | ||
| "ln" : { | ||
| "source" : { | ||
| "index" : ["search","facet"], | ||
| "map" : "101a" | ||
| } | ||
| }, | ||
| "notforloan" : { "source" : { "map" : "995o" } }, | ||
| "onloan" : { | ||
| "type" : "boolean", | ||
| "source" : { "map" : "995n" } | ||
| }, | ||
| "publisher" : { | ||
| "source" : { | ||
| "index" : ["search","facet"], | ||
| "map" : "210c" | ||
| } | ||
| }, | ||
| "su-geo" : { | ||
| "source" : { | ||
| "index" : ["search","facet"], | ||
| "map" : "607a" | ||
| } | ||
| }, | ||
| "subject" : { | ||
| "source" : { | ||
| "index" : ["search","facet","suggestible"], | ||
| "map" : ["600","600a","601","602","604","605","606","607","608","610"] | ||
| } | ||
| }, | ||
| "suppress" : { | ||
| "type" : "boolean", | ||
| "source" : { "map" : "955n" } | ||
| }, | ||
| "ta" : { "source" : { "map" : "100a_/17" } }, | ||
| "title" : { | ||
| "type" : "string", | ||
| "source" : [ | ||
| { | ||
| "index" : ["search","suggestible"], | ||
| "map" : ["200acdehi","205","304a"] | ||
| }, | ||
| { | ||
| "index" : ["search","sort","suggestible"], | ||
| "map" : "200a" | ||
| }, | ||
| { | ||
| "map" : [ | ||
| "327abcdefghi","328t", | ||
| "410t","411t","412t","413t", | ||
| "421t","422t","423t","424t","425t", | ||
| "430t","431t","432t","433t","434t","435t","436t","437t", | ||
| "440t","441t","442t","443t","444t","445t","446t","447t","448t", | ||
| "451t","452t","453t","454t","455t","456t", | ||
| "461t","462t","463t","464t", | ||
| "470t", | ||
| "481t","482t","488t" | ||
| ] | ||
| } | ||
| ] | ||
| }, | ||
| "title-series" : { | ||
| "source" : { | ||
| "index" : ["search","facet"], | ||
| "map" : "225a" | ||
| } | ||
| } | ||
| }, | ||
| "authorities" : { | ||
| "authtype" : { "source" : { "map" : "152b" } }, | ||
| "Heading" : { | ||
| "source" : { | ||
| "map" : [ | ||
| "200abcdfgjxyz", | ||
| "210abcdefghjxyz", | ||
| "215ajxyz", | ||
| "216afcjxyz", | ||
| "220afjxyz", | ||
| "230abhijklmnqrsuwxyz", | ||
| "235abejkmrsuwxyz", | ||
| "240atjxyz", | ||
| "250ajxyz", | ||
| "260abcdjxyz", | ||
| "280ajxyz" | ||
| ] | ||
| } | ||
| }, | ||
| "Heading-Main" : { | ||
| "source" : { | ||
| "index" : ["search","sort"], | ||
| "map" : ["200a","210a","215a","216a","220a","230a","235a","240a","250a","260a","280a"] | ||
| } | ||
| }, | ||
| "local-number" : { "source" : { "map" : "001" } }, | ||
| "Match" : { | ||
| "source" : { | ||
| "map" : [ | ||
| "2003478abcdfgjxyz", | ||
| "2103478abcdefghxyz", | ||
| "21578ajxyz", | ||
| "216378acfjxyz", | ||
| "220378afjxyz", | ||
| "2308abhijklmqrsuwxyz", | ||
| "235378abeijkmratsuwxyz", | ||
| "2408abcfjtxyz", | ||
| "245ajtxyz", | ||
| "2508abcdjvxyz", | ||
| "260378abcd", | ||
| "280378ajxyz", | ||
| "4000234568abcdfgjxyz", | ||
| "41002345678abcdefghjxyz", | ||
| "4150235678ajxyz", | ||
| "416acfjxyz", | ||
| "420afjxyz", | ||
| "4300235678abhijklmnqrsuwxyz", | ||
| "4350235678abejkmrsuwxyz", | ||
| "4400235678abcdfjtxyz", | ||
| "445ajtxyz", | ||
| "4500235678ajxyz", | ||
| "460abcd", | ||
| "480ajxyz", | ||
| "500023456789abcdfgjxyz", | ||
| "51002345678abcdefghjxyz", | ||
| "5150356789ajxyz", | ||
| "5169acfjxyz", | ||
| "5209afjxyz", | ||
| "53002356789abhijklmnqrsuwxyz", | ||
| "53502356789abjkmqrsuwxyz", | ||
| "5400235678ajtxyz", | ||
| "5459ajtxyz", | ||
| "55002356789abjxyz", | ||
| "56059abcd", | ||
| "58059ajxyz", | ||
| "70023478abcdfgjxyz", | ||
| "71023478abcdefghjxyz", | ||
| "7152378ajxyz", | ||
| "716acfjxyz", | ||
| "720afjxyz", | ||
| "7302378abhijklmnqrsuwxyz", | ||
| "7402378ajtxyz", | ||
| "745378ajtxyz", | ||
| "75023789abjxyz", | ||
| "760abcd", | ||
| "780ajxyz" | ||
| ] | ||
| } | ||
| } | ||
| } | ||
| } | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,288 @@ | ||
| #!/usr/bin/perl | ||
|
|
||
| # Copyright 2020 Tamil s.a.r.l. | ||
| # | ||
| # This file is part of Koha. | ||
| # | ||
| # Koha is free software; you can redistribute it and/or modify it | ||
| # under the terms of the GNU General Public License as published by | ||
| # the Free Software Foundation; either version 3 of the License, or | ||
| # (at your option) any later version. | ||
| # | ||
| # Koha is distributed in the hope that it will be useful, but | ||
| # WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
| # GNU General Public License for more details. | ||
| # | ||
| # You should have received a copy of the GNU General Public License | ||
| # along with Koha; if not, see <http://www.gnu.org/licenses>. | ||
|
|
||
| =head1 NAME | ||
| koha-es.pl - Manipulates biblio/authority Elasticsearch indexes | ||
| =head1 USAGE | ||
| Manipulates Koha ElasticSearch indexes. There is one index for biblio records: | ||
| B<biblios>, and another one for authority records: B<authorities>. The command | ||
| syntax is: C<koha-es.pl command param1 param2 param3 ...>. Here are the | ||
| available commands: | ||
| =head2 index | ||
| The B<index> command is used to populate a specific index. Syntax has this | ||
| form: C<koha-es.pl index biblios|authorities param1 param2 ...>. For example: | ||
| C<koha-es.pl index biblios reset all> | ||
| =over | ||
| =item B<biblios all> | ||
| Index all biblio records. | ||
| =item B<authorities all> | ||
| Index all authority records. | ||
| =item B<biblios all noverbose> | ||
| Index silently all biblio records. | ||
| =item B<biblios reset all> | ||
| Reset ES 'biblios' index: delete, then recreate with the current mapping. | ||
| Index all biblio records. | ||
| =item B<biblios reset all childs 4> | ||
| Reset ES biblios index: delete, then recreate with the current mapping. Index | ||
| all biblio records. Create 4 childs (processes) in order to speed up the | ||
| processing. | ||
| =item B<biblios all commit 10000> | ||
| Index all biblio records. Commit records to ES per batch of 10000 records. By | ||
| default 5000. | ||
| =item B<biblios 1-100,2000-3000> | ||
| Index biblio records with biblionumber in interval [1-100] and [2000-3000]. | ||
| =back | ||
| =head2 document | ||
| Convert Koha biblio/authority records into the document sent to ElasticSearch | ||
| for indexing. This way it's possible to 'see' the effect of modifying Koha ES | ||
| configuration. | ||
| C<koha-es.pl document biblios 100-200,1000-1020> | ||
| =head2 config | ||
| C<koha-es.pl config> manipulates Koha ES configuration. | ||
| =over | ||
| =item B<config show> | ||
| Display the ES configuration, ie the content of ESConfig system preference. | ||
| =item B<config check> | ||
| Check the JSON ES configuration. | ||
| =item B<config mapping biblios|authorities> | ||
| Show the ES mapping derived from Koha configuration. It may help to diagnose | ||
| ES malfunctions. | ||
| =item B<config fromlegacy> | ||
| Generate ES configuration from legacy Koha ElasticSearch configuration which | ||
| was split into 3 yaml configuration files and 3 tables (search_marc_to_field, | ||
| search_marc_map, search_field). | ||
| =item B<config fromlegacy save> | ||
| Generate ES configuration from legacy Koha ElasticSearch configuration and | ||
| save it in ESConfig system preference. | ||
| =back | ||
| =cut | ||
|
|
||
| use Modern::Perl; | ||
| use Koha::SearchEngine::Elasticsearch; | ||
| use Pod::Usage; | ||
| use Time::HiRes qw/gettimeofday time/; | ||
| use JSON; | ||
|
|
||
|
|
||
| binmode(STDOUT, 'encoding(utf8)'); | ||
| binmode(STDERR, 'encoding(utf8)'); | ||
|
|
||
|
|
||
# Print the documentation of this script (its POD, at verbosity level 2),
# then end the program.
sub usage {
    pod2usage( -verbose => 2 );
    exit;
}
# Report a fatal error: print the message on STDERR and end the program with
# a non-zero exit status, so that callers (shell scripts, cron jobs) can
# detect the failure.
sub error {
    my $message = shift;
    say STDERR $message;
    exit 1;
}
|
|
||
# Expand a comma-separated list of ids and id intervals into the list of all
# the ids, in the order given. For example "1-3,7" gives (1, 2, 3, 7).
#
# Only "N" and "N-M" tokens (digits only) are accepted. Any other token
# (open interval such as "5-", non-numeric text, empty token) is ignored, and
# an interval whose end is lower than its start gives nothing.
sub range {
    my $value = shift;

    my @ids;
    return @ids unless defined $value;

    for my $token ( split /,/, $value ) {
        if ( $token =~ /\A([0-9]+)-([0-9]+)\z/ ) {
            # "+ 0" forces a numeric interval whatever the leading zeros
            push @ids, ( $1 + 0 ) .. ( $2 + 0 );
        }
        elsif ( $token =~ /\A[0-9]+\z/ ) {
            push @ids, $token;
        }
    }
    return @ids;
}
|
|
||
|
|
||
# Take the next command line argument and translate it into an index name:
# 'biblios' when it contains "biblio", 'authorities' when it contains
# "author" (case-insensitive). Anything else, or no argument at all, is
# reported through error().
sub getindex {
    my $arg = shift @ARGV || '';

    my $index;
    if    ( $arg =~ /biblio/i ) { $index = 'biblios'; }
    elsif ( $arg =~ /author/i ) { $index = 'authorities'; }

    error("Specify an index name: biblios|authorities") unless $index;
    return $index;
}
|
|
||
|
|
||
# Command "index": populate an ES index.
#
# The first remaining argument is the index name (see getindex). The next
# ones are read in any order:
#   reset      - reset the index before indexing
#   noverbose  - no progress messages
#   all        - index all the records
#   N-M,P,...  - index the records whose id is in the given ranges
#   commit N   - size of the batches of records sent to ES (default 5000)
#   childs N   - number of processes (default 1)
# Any other argument is ignored. Either 'all' or a range is required.
sub index {
    my ($verbose, $reset, $commit, $childs, $all, $range) = (1, 0, 5000, 1, 0, undef);

    my $index = getindex();

    while (@ARGV) {
        my $arg = shift @ARGV;
        if    ( $arg =~ /reset/ )       { $reset   = 1; }
        elsif ( $arg =~ /noverbose/i )  { $verbose = 0; }
        elsif ( $arg =~ /all/ )         { $all     = 1; }
        elsif ( $arg =~ /^[0-9\-,]+$/ ) { $range   = $arg; }
        elsif ( $arg =~ /commit|child/i ) {
            usage() unless @ARGV;
            my $value = shift @ARGV;
            error("commit|childs requires a numeric parameter")
                if $value !~ /^[0-9]+$/;

            # Same case-insensitivity as the test above, otherwise "COMMIT"
            # would set the number of childs
            if ( $arg =~ /commit/i ) { $commit = $value; }
            else                     { $childs = $value; }
        }
    }

    error("Choose 'range' indexing or 'all' indexing") if $range && $all;
    error("Nothing to index") unless $all || $range;

    my $es = Koha::SearchEngine::Elasticsearch->new();
    my $name = $index;
    $index = $es->indexes->{$name};    # Koha::SearchEngine::Elasticsearch::Index
    error("Index '$name' is not configured") unless $index;

    my $p = {
        queue_size => $commit,
        childs     => $childs,
        reset      => $reset,
        range      => $range,
    };
    if ($verbose) {
        $p->{cb} = {
            add => sub {
                my $self = shift;    # ES::Indexer Object
                return if $self->count % 1000;
                my $total = $self->total;
                my $pcent = $total ? $self->count * 100 / $total : 0;
                say $self->count, sprintf(" (%.2f%%)", $pcent);
            },
            begin => sub {
                my $self = shift;
                say $all
                    ? "Full indexing"
                    : "Indexing", ": ", $self->total, " records";
            },
            end => sub {
                my $self = shift;
                say "Terminated: ", $self->total, " records indexed";
            },
        };
    }
    $index->indexing($p);
}
|
|
||
|
|
||
# Render a MARC record as text, one line per field:
#   control fields (tag lower than '010'):  "TAG DATA"
#   other fields:  "TAG I1I2 $x value $y value ..."  (I1, I2 = indicators)
sub marc_to_text {
    my $record = shift;

    my @lines;
    for my $field ( $record->fields ) {
        my $tag = $field->tag;
        if ( $tag lt '010' ) {
            push @lines, $tag . ' ' . $field->data;
            next;
        }
        my $indicators = $field->indicator(1) . $field->indicator(2);
        my @subfields = map { '$' . $_->[0] . ' ' . $_->[1] } $field->subfields;
        push @lines, join( ' ', $tag, $indicators, join( ' ', @subfields ) );
    }
    return join( "\n", @lines );
}
|
|
||
|
|
||
# Command "document": for each requested record, print the MARC record as
# text followed by the ES document built from it (without its marc_format
# and marc_data entries), as pretty-printed JSON.
#
# The first remaining argument is the index name (see getindex), the next
# ones are ids or id ranges (see range). Records which are not found are
# skipped.
sub document {
    my $index = getindex();

    my @ids = map { range($_) } @ARGV;
    error("Specify biblio/authority records ids") unless @ids;

    my $es = Koha::SearchEngine::Elasticsearch->new();
    $index = $es->indexes->{$index};
    for my $id (@ids) {
        my $record = $index->get_record($id) or next;
        my $doc = $index->to_doc($record);
        delete @{$doc}{qw/marc_format marc_data/};
        say marc_to_text($record),
            "\n\nElasticsearch document to index:\n",
            to_json( $doc, { pretty => 1 } );
    }
}
|
|
||
|
|
||
# Command "config": show, check or generate the Koha ES configuration.
#
# The next command line argument is the action:
#   show             - print the ESConfig system preference
#   check            - print the errors found in the configuration, or
#                      "Configuration OK"
#   mapping INDEX    - print the ES mapping of the index as JSON
#   fromlegacy       - print a configuration built from the legacy one
#   fromlegacy save  - same, but store it in the ESConfig preference
# Any other action, or no action, displays the usage.
sub config {
    # '//' guards against applying lc to undef when the action is missing
    my $action = lc( shift(@ARGV) // '' );

    if ( $action eq 'show' ) {
        say C4::Context->preference('ESConfig');
    }
    elsif ( $action eq 'check' ) {
        my $errors = Koha::SearchEngine::Elasticsearch::check_config();
        if (@$errors) {
            say $_ for @$errors;
        }
        else {
            say "Configuration OK";
        }
    }
    elsif ( $action eq 'mapping' ) {
        usage() unless @ARGV;
        my $es = Koha::SearchEngine::Elasticsearch->new();
        my $mapping = $es->indexes->{ getindex() }->mapping;
        say to_json( $mapping, { pretty => 1 } );
    }
    elsif ( $action eq 'fromlegacy' ) {
        my $c = Koha::SearchEngine::Elasticsearch::conf_from_legacy();
        my $save = lc( shift(@ARGV) // '' );
        if ( $save eq 'save' ) {
            C4::Context->set_preference( 'ESConfig', $c );
            say "ESConfig updated with an ES configuration built from legacy configuration";
        }
        else {
            say $c;
        }
    }
    else {
        usage();
    }
}
|
|
||
|
|
||
# Entry point: the first command line argument is the command, which is
# dispatched to the sub implementing it. The usage is displayed when there is
# no argument, an error is reported when the command is unknown.
sub main {
    usage() unless @ARGV;

    # Explicit dispatch table: only the implemented commands can be called,
    # whatever is typed on the command line. The 'mapping' feature is
    # reached through "config mapping".
    my %handler_for = (
        index    => \&index,
        document => \&document,
        config   => \&config,
    );

    my $command = shift @ARGV;
    my $handler = $handler_for{$command};
    error("Unknown command: $command") unless $handler;
    $handler->();
}
|
|
||
| main(); |