Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 8 additions & 8 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
# Search API Elasticsearch Attachments
[![CircleCI](https://circleci.com/gh/dakkusingh/search_api_elasticsearch_attachments.svg?style=svg)](https://circleci.com/gh/dakkusingh/search_api_elasticsearch_attachments)

Elasticsearch is generally used to index data of types like string,
number, date, etc.
However, what if you wanted to index a file like a .pdf or a .doc
Elasticsearch is generally used to index data of types like string,
number, date, etc.
However, what if you wanted to index a file like a .pdf or a .doc
directly and make it searchable?

This module allows Drupal to index files (attachments) to Elasticsearch by
This module allows Drupal to index files (attachments) to Elasticsearch by
making use of Elasticsearch data type "attachment".

![Search_API_Elasticsearch_Attachments](https://www.drupal.org/files/search_api_elasticsearch_attachments.jpg)
Expand All @@ -20,9 +20,9 @@ This module requires:
* Elasticsearch `ingest-attachment` plugin

## Elasticsearch Plugin Installation
The first step is to install the Elasticsearch plugin: `ingest-attachment`,
which enables ES to recognise the "attachment" data type. In turn, it uses
Apache Tika for content extraction and supports several file types such as
The first step is to install the Elasticsearch plugin: `ingest-attachment`,
which enables ES to recognise the "attachment" data type. In turn, it uses
Apache Tika for content extraction and supports several file types such as
.pdf, .doc, .xls, .rtf, .html, .odt, etc.

```
Expand All @@ -41,7 +41,7 @@ You have to choose the correct versions of the module depending on your
Elasticsearch server setup. Please see the table below for
compatibility.

If you are using Elasticsearch Connector 8.x-5.x,
If you are using Elasticsearch Connector 8.x-5.x,
please use 8.x-5.x of the
*search_api_elasticsearch_attachments* module.

Expand Down
21 changes: 14 additions & 7 deletions composer.json
Original file line number Diff line number Diff line change
@@ -1,9 +1,16 @@
{
"name": "drupal/search_api_elasticsearch_attachments",
"description": "Search API Elasticsearch Attachments",
"type": "drupal-module",
"minimum-stability": "dev",
"require": {
"drupal/elasticsearch_connector": "5.0-alpha3"
}
"name": "drupal/search_api_elasticsearch_attachments",
"description": "Search API Elasticsearch Attachments",
"type": "drupal-module",
"minimum-stability": "dev",
"require": {
"drupal/elasticsearch_connector": "5.0-alpha3"
},
"extra": {
"patches": {
"drupal/elasticsearch_connector": {
"Issue #2918138": "https://www.drupal.org/files/issues/2018-12-14/elasticsearch_connector-alter_params-2918138-5.patch"
}
}
}
}
17 changes: 5 additions & 12 deletions search_api_elasticsearch_attachments.services.yml
Original file line number Diff line number Diff line change
@@ -1,16 +1,4 @@
services:
search_api_elasticsearch_attachments.prepare_index:
class: Drupal\search_api_elasticsearch_attachments\EventSubscriber\PrepareIndex
tags:
- { name: event_subscriber }
search_api_elasticsearch_attachments.prepare_query:
class: Drupal\search_api_elasticsearch_attachments\EventSubscriber\PrepareQuery
tags:
- { name: event_subscriber }
search_api_elasticsearch_attachments.prepare_mapping:
class: Drupal\search_api_elasticsearch_attachments\EventSubscriber\PrepareMapping
tags:
- { name: event_subscriber }
search_api_elasticsearch_attachments.build_query:
class: Drupal\search_api_elasticsearch_attachments\EventSubscriber\BuildSearchParams
tags:
Expand All @@ -19,3 +7,8 @@ services:
class: Drupal\search_api_elasticsearch_attachments\EventSubscriber\PrepareIndexMapping
tags:
- { name: event_subscriber }
# Event subscriber (BuildIndexParams) constructed with the Elasticsearch
# Connector client manager and the entity type manager.
search_api_elasticsearch_attachments.build_index_params:
class: Drupal\search_api_elasticsearch_attachments\EventSubscriber\BuildIndexParams
arguments: ['@elasticsearch_connector.client_manager', '@entity_type.manager']
tags:
- { name: event_subscriber }
188 changes: 188 additions & 0 deletions src/EventSubscriber/BuildIndexParams.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,188 @@
<?php

namespace Drupal\search_api_elasticsearch_attachments\EventSubscriber;

use Drupal\elasticsearch_connector\Event\BuildIndexParamsEvent;
use Symfony\Component\EventDispatcher\EventSubscriberInterface;
use Drupal\search_api\Entity\Index;
use Drupal\search_api_elasticsearch_attachments\Helpers;
use Drupal\elasticsearch_connector\ElasticSearch\ClientManagerInterface;
use Drupal\Core\Entity\EntityTypeManagerInterface;
use Drupal\search_api\IndexInterface;
use Elasticsearch\Common\Exceptions\Missing404Exception;

/**
 * Routes indexed documents through the attachment ingest pipeline.
 *
 * Reacts to the Elasticsearch Connector BuildIndexParamsEvent: it makes sure
 * the ingest pipeline exists (or is removed) depending on whether the
 * "elasticsearch_attachments" processor is enabled on the index, and adds the
 * "pipeline" parameter to the index request when it is.
 */
class BuildIndexParams implements EventSubscriberInterface {

  /**
   * ID of the ingest pipeline registered in Elasticsearch.
   *
   * @var string
   */
  protected $pipelineName = 'es_attachment';

  /**
   * Document field the pipeline's foreach processor iterates over.
   *
   * @var string
   */
  protected $targetFieldId = 'es_attachment';

  /**
   * The Elasticsearch Connector client manager.
   *
   * @var \Drupal\elasticsearch_connector\ElasticSearch\ClientManagerInterface
   */
  protected $clientManager;

  /**
   * The entity type manager.
   *
   * @var \Drupal\Core\Entity\EntityTypeManagerInterface
   */
  protected $entityTypeManager;

  /**
   * Client for the cluster of the current index.
   *
   * Only set after initializeClient() has been called.
   *
   * @var object|null
   */
  protected $client;

  /**
   * Constructs a BuildIndexParams event subscriber.
   *
   * @param \Drupal\elasticsearch_connector\ElasticSearch\ClientManagerInterface $client_manager
   *   The Elasticsearch Connector client manager.
   * @param \Drupal\Core\Entity\EntityTypeManagerInterface $entity_type_manager
   *   The entity type manager.
   */
  public function __construct(ClientManagerInterface $client_manager, EntityTypeManagerInterface $entity_type_manager) {
    $this->clientManager = $client_manager;
    $this->entityTypeManager = $entity_type_manager;
  }

  /**
   * {@inheritdoc}
   */
  public static function getSubscribedEvents() {
    $events = [];
    // Listeners with a higher priority are called first, so the pipeline is
    // validated (101) before the "pipeline" param is added (100).
    $events[BuildIndexParamsEvent::BUILD_PARAMS][] = ['indexParams', 100];
    $events[BuildIndexParamsEvent::BUILD_PARAMS][] = ['pipelineProcessing', 101];
    return $events;
  }

  /**
   * Adds the "pipeline" param to the index request params.
   *
   * Does nothing unless the "elasticsearch_attachments" processor is enabled
   * on the index the event refers to.
   *
   * @param \Drupal\elasticsearch_connector\Event\BuildIndexParamsEvent $event
   *   The BuildIndexParamsEvent event.
   */
  public function indexParams(BuildIndexParamsEvent $event) {
    // We need to react only on our processor.
    $indexName = $this->getIndexName($event);
    $processors = $this->getIndexProcessors($indexName);
    if (empty($processors['elasticsearch_attachments'])) {
      return;
    }

    // Add pipeline param for attachment processing and store the result back
    // on the event.
    $params = $event->getElasticIndexParams();
    $params['pipeline'] = $this->pipelineName;
    $event->setElasticIndexParams($params);
  }

  /**
   * Validates the pipeline: creates it when needed, deletes it otherwise.
   *
   * When the "elasticsearch_attachments" processor is enabled the pipeline is
   * registered if it does not exist yet; when the processor is disabled an
   * existing pipeline is deleted.
   *
   * @param \Drupal\elasticsearch_connector\Event\BuildIndexParamsEvent $event
   *   The BuildIndexParamsEvent event.
   */
  public function pipelineProcessing(BuildIndexParamsEvent $event) {
    // Get index name and list of available processors.
    $indexName = $this->getIndexName($event);
    $processors = $this->getIndexProcessors($indexName);
    $index = $this->getIndex($indexName);
    // Without a loadable index there is no cluster to talk to.
    if (!$index instanceof IndexInterface) {
      return;
    }
    // Initialize client to work with.
    $this->initializeClient($index);

    if (!empty($processors['elasticsearch_attachments'])) {
      // If there is no pipeline yet, Elastic will return Missing404Exception.
      // There is no other way to check if pipeline exist.
      try {
        $this->getPipeline();
      }
      catch (Missing404Exception $e) {
        $this->putPipeline();
      }
      return;
    }

    // NOTE(review): the pipeline ID is fixed for this class, so deleting it
    // here presumably also affects any other index on the same cluster that
    // still relies on it - confirm this is intended.
    try {
      // Throws Missing404Exception when the pipeline does not exist, in which
      // case there is nothing to delete.
      $this->getPipeline();
      $this->deletePipeline();
    }
    catch (Missing404Exception $e) {
      // Nothing to do here.
    }
  }

  /**
   * Gets the index name for the event.
   *
   * @param \Drupal\elasticsearch_connector\Event\BuildIndexParamsEvent $event
   *   The BuildIndexParamsEvent event.
   *
   * @return string
   *   The event's index name after being passed through
   *   Helpers::getIndexName().
   */
  public function getIndexName(BuildIndexParamsEvent $event) {
    return Helpers::getIndexName($event->getIndexName());
  }

  /**
   * Gets the processors of an index.
   *
   * @param string $indexName
   *   Name of index.
   *
   * @return array
   *   The value of the index's getProcessors(), or an empty array when the
   *   index cannot be loaded.
   */
  public function getIndexProcessors($indexName) {
    $index = $this->getIndex($indexName);
    return $index ? $index->getProcessors() : [];
  }

  /**
   * Loads an index by name.
   *
   * @param string $indexName
   *   Name of index.
   *
   * @return \Drupal\search_api\IndexInterface|null
   *   Index object, or NULL if no index with that name exists.
   */
  public function getIndex($indexName) {
    return Index::load($indexName);
  }

  /**
   * ElasticSearch client initialization.
   *
   * Resolves the cluster configured on the index's server backend and stores
   * a client for it on this object.
   *
   * @param \Drupal\search_api\IndexInterface $index
   *   The index scheduled for indexing.
   */
  public function initializeClient(IndexInterface $index) {
    $cluster_name = $index->getServerInstance()->getBackend()->getCluster();
    $cluster = $this->entityTypeManager->getStorage('elasticsearch_cluster')->load($cluster_name);
    $this->client = $this->clientManager->getClientForCluster($cluster);
  }

  /**
   * Helper to register new pipeline.
   *
   * The pipeline runs a "foreach" processor over the target field; for every
   * item an "attachment" processor reads the item's "data" key and writes its
   * result to the item's "attachment" key.
   */
  public function putPipeline() {
    $params = [];
    $params['id'] = $this->pipelineName;
    $params['body'] = [
      'description' => 'Extract attachment information from arrays',
      'processors' => [
        [
          'foreach' => [
            'field' => $this->targetFieldId,
            'ignore_failure' => TRUE,
            'processor' => [
              'attachment' => [
                'target_field' => '_ingest._value.attachment',
                'field' => '_ingest._value.data',
              ],
            ],
          ],
        ],
      ],
    ];
    $this->client->ingest()->putPipeline($params);
  }

  /**
   * Helper to delete existing pipeline.
   */
  public function deletePipeline() {
    $this->client->ingest()->deletePipeline(['id' => $this->pipelineName]);
  }

  /**
   * Helper to get existing pipeline.
   *
   * @return mixed
   *   Whatever the client's ingest()->getPipeline() call returns.
   */
  public function getPipeline() {
    return $this->client->ingest()->getPipeline(['id' => $this->pipelineName]);
  }

}
59 changes: 46 additions & 13 deletions src/EventSubscriber/BuildSearchParams.php
Original file line number Diff line number Diff line change
Expand Up @@ -28,26 +28,59 @@ public static function getSubscribedEvents() {
*/
public function searchParams(BuildSearchParamsEvent $event) {
$params = $event->getElasticSearchParams();

// Default Prefix and Suffix.
$prefix = '<strong>';
$suffix = '</strong>';

// We need to get the Prefix and Suffix from processor.
// Set default boost.
$boost = 1.0;
// We need to get the processor.
$indexName = Helpers::getIndexName($event->getIndexName());
$processors = Index::load($indexName)->getProcessors();

// Try to load boost value from config form.
if (!empty($processors['elasticsearch_attachments'])) {
$boost = $processors['elasticsearch_attachments']->getConfiguration()['boost'];
// Get original query.
$originalBoolQuery = $params['body']['query']['bool']['must'];
// Get query string.
if (isset($originalBoolQuery['query_string'])) {
$queryString = $originalBoolQuery['query_string']['query'];
// Build nestedQuery.
// @see https://www.elastic.co/guide/en/elasticsearch/guide/current/nested-query.html.
$nestedQuery = [
'nested' => [
'path' => 'es_attachment',
'query' => [
'bool' => [
'must' => [
'query_string' => [
'query' => $queryString,
'fields' => [
'es_attachment.attachment.content^' . $boost,
],
],
],
],
],
],
];
// We need to change the bool query from must to should.
// This is required to add support for nested and string queries.
$params['body']['query']['bool']['should'] = [];
unset($params['body']['query']['bool']['must']);
$params['body']['query']['bool']['should'][] = $originalBoolQuery;
$params['body']['query']['bool']['should'][] = $nestedQuery;
// Add min match param.
$params['body']['query']['bool']['minimum_should_match'] = 1;
}
}
// Add highlight if enabled.
if (!empty($processors['elasticsearch_attachments_highlight'])) {
$processorConf = $processors['elasticsearch_attachments_highlight']->getConfiguration();
$prefix = $processorConf['prefix'];
$suffix = $processorConf['suffix'];
// See: https://github.com/elastic/elasticsearch-php/issues/394
$params['body']['highlight']['fields']['es_attachment.attachment.content'] = (object) [];
$params['body']['highlight']['pre_tags'] = [$prefix];
$params['body']['highlight']['post_tags'] = [$suffix];
}

// See: https://github.com/elastic/elasticsearch-php/issues/394
$params['body']['highlight']['fields']['es_attachment.content'] = (object) [];
$params['body']['highlight']['pre_tags'] = [$prefix];
$params['body']['highlight']['post_tags'] = [$suffix];

// Set updated params array.
$event->setElasticSearchParams($params);
}

Expand Down
30 changes: 0 additions & 30 deletions src/EventSubscriber/PrepareIndex.php

This file was deleted.

Loading