Skip to content

Commit

Permalink
Similarity Implementation (#22)
Browse files Browse the repository at this point in the history
* MINOR: Removed error log

* ENHANCEMENT: Experimentation with extraction of terms for similar searching

* WIP: Added stopwords, but this needs to be configurable

* WIP: A very ad hoc similarity text extraction
  • Loading branch information
gordonbanderson committed Sep 17, 2020
1 parent 0f35aa6 commit 8a28a1d
Show file tree
Hide file tree
Showing 2 changed files with 63 additions and 10 deletions.
10 changes: 1 addition & 9 deletions src/Service/IndexCreator.php
Original file line number Diff line number Diff line change
Expand Up @@ -36,9 +36,6 @@ public function createIndex(string $indexName): void
$specsHelper = new SpecsHelper();
$specs = $specsHelper->getFieldSpecs($indexName);

\error_log('SPECS');
\print_r($specs);

$columns = [];
foreach ($fields as $field) {
/*
Expand Down Expand Up @@ -126,6 +123,7 @@ public function createIndex(string $indexName): void
'min_infix_len' => 2,
'html_strip' => 1,
'bigram_index' => 'all',
'stopwords' => 'en',
];

$manticoreTokenizer = null;
Expand Down Expand Up @@ -162,16 +160,10 @@ public function createIndex(string $indexName): void
$settings['morphology'] = $this->getMorphology($manticoreTokenizer, $manticoreLanguage);
}


// drop the index first, as updating an existing one does not effect change
$manticoreClient->indices()->drop(['index' => $indexName, 'body'=>['silent'=>true]]);


$manticoreIndex = new \Manticoresearch\Index($manticoreClient, $indexName);

\error_log('----- payload -----');
\error_log(\print_r($columns, true));

$manticoreIndex->create(
$columns,
$settings,
Expand Down
63 changes: 62 additions & 1 deletion src/Service/Searcher.php
Original file line number Diff line number Diff line change
Expand Up @@ -159,9 +159,70 @@ public function searchForSimilar(DataObject $dataObject): SearchResults
foreach (\array_keys($textForCurrentIndex) as $fieldName) {
$amalgamatedText .= $textForCurrentIndex[$fieldName] . ' ';
}

$this->searchType = SearchParamTypes::OR;
$text = $this->getLeastCommonTerms($amalgamatedText, 10);

return $this->search($text);
}


/**
 * Find terms suitable for similarity searching
 *
 * Sends the text to the search connection's keywords() call for the current
 * index, orders the returned keyword entries by their 'docs' value (ascending),
 * and returns up to $number distinct tokens joined by single spaces.
 *
 * NOTE(review): 'docs' is presumably the number of indexed documents containing
 * the keyword, as reported when the 'stats' option is on - confirm against the
 * Manticore keywords documentation.
 *
 * @todo Rename this method, or separate into a helper?
 * @param string $text text of a document being searched for
 * @param int $number maximum number of terms to return
 * @return string space separated terms, lowest 'docs' value first
 */
private function getLeastCommonTerms(string $text, int $number = 20): string
{
    $client = new Client();
    $connection = $client->getConnection();
    $params = [
        'index' => $this->indexName,
        'body' => [
            'query' => $text,
            'options' => [
                'stats' => 1,
                'fold_lemmas' => 1,
            ],
        ],
    ];

    $keywords = $connection->keywords($params);

    // The comparator must RETURN the ordering; the previous version evaluated a
    // ternary and discarded it, so the keywords were never actually sorted.
    /* @phpstan-ignore-next-line */
    \usort(
        $keywords,
        static function ($a, $b): int {
            return $a['docs'] <=> $b['docs'];
        }
    );

    // Keyed by token so that a repeated token is only emitted once, while the
    // insertion order (rarest first) is preserved.
    $candidateTerms = [];
    foreach ($keywords as $entry) {
        // @todo this or normalized?
        $word = $entry['tokenized'];

        // if a word is unique to the source document, it is useless for finding other similar documents
        if ($entry['docs'] > 1) {
            $candidateTerms[$word] = true;
        }
    }

    $toGlue = \array_slice(\array_keys($candidateTerms), 0, $number);

    return \implode(' ', $toGlue);
}


Expand Down

0 comments on commit 8a28a1d

Please sign in to comment.