In [3]:

# Toy corpus used throughout the notebook: eight short English sentences that
# are embedded, ranked against a query and then reranked.
# NOTE(review): the first sentence contains a grammatical slip ("which
# containing"); it is left untouched because it is model input and the
# recorded outputs below depend on the exact text.
documents = [
    "This is a list which containing sample documents.",
    "Keywords are important for keyword-based search.",
    "Document analysis involves extracting keywords.",
    "Keyword-based search relies on sparse embeddings.",
    "Understanding document structure aids in keyword extraction.",
    "Efficient keyword extraction enhances search accuracy.",
    "Semantic similarity improves document retrieval performance.",
    "Machine learning algorithms can optimize keyword extraction methods."
]

In [4]:
# One-time setup; uncomment to install into the kernel's environment:
# %pip install sentence_transformers

In [5]:
from sentence_transformers import SentenceTransformer

# Load the pre-trained Sentence Transformer model identified by `model_name`.
# `model` is the bi-encoder used below to embed both the documents and the query.
# NOTE(review): the first run presumably downloads the weights — confirm the
# environment has network access or a warm cache.
model_name = 'sentence-transformers/paraphrase-xlm-r-multilingual-v1'
model = SentenceTransformer(model_name)
     

  from tqdm.autonotebook import tqdm, trange
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [7]:
# Show the model's module stack (transformer followed by pooling) as the cell output.
model

SentenceTransformer(
  (0): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: XLMRobertaModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
)

In [6]:
# Embed every document with a single encode() call, then print each vector
# next to its 1-based document number.
document_embeddings = model.encode(documents)
for doc_number, vector in enumerate(document_embeddings, start=1):
    print(f"Document {doc_number} embedding: {vector}")

Document 1 embedding: [ 0.10894679  0.07812059  0.11626562 -0.31912503  0.46890226  0.43514383
  0.0145374   0.44238758  0.29716396 -0.18982685  0.07389066 -0.278649
  0.2133816  -0.12077004  0.17891712 -0.007899    0.04754866 -0.18204558
  0.34227106 -0.06994262 -0.14288737  0.57141256 -0.11153238 -0.178954
  0.01523135  0.2610571  -0.20555824  0.05203116 -0.02810767  0.23873235
  0.01206979  0.04404926  0.0224232  -0.13895181 -0.74100375  0.2560101
  0.08149689  0.18820493 -0.41237676  0.11368624  0.28121156  0.05860884
 -0.17318773  0.3354913   0.21803683 -0.05090716 -0.05457793 -0.8738478
 -0.24082273  0.32006967  0.44761655  0.06347829  0.5357484   0.16607259
 -0.33196998  0.33393645  0.28615907 -0.5419567  -0.271324    0.24881148
 -0.23919372 -0.469263    0.13836567  0.37842894 -0.01304427  0.01990607
  0.3236508   0.45857537  0.07600269  0.25299588 -0.4293894   0.10051924
 -0.33042607 -0.69876456  0.01035899  0.05666558  0.14731233 -0.47082356
  0.0806399   0.33870447 -0.2727814

In [8]:

# The search query. It is embedded with the same model as the documents so
# that the query vector and the document vectors can be compared directly.
query = "Natural language processing techniques enhance keyword extraction efficiency."
query_embedding = model.encode(query)
print("Query embedding:", query_embedding)

Query embedding: [ 3.42821121e-01  3.26170415e-01  7.73246121e-03  1.60758961e-02
  2.38517355e-02 -2.52880782e-01  8.15468654e-02  1.37177303e-01
  3.33291799e-01  3.65743823e-02 -9.28224847e-02 -3.43725860e-01
  7.12015927e-02  9.48833823e-01 -9.41289589e-02 -7.96414137e-01
 -3.20592225e-01 -2.67564505e-01  2.08664209e-01  7.99599960e-02
  7.93335494e-03 -5.22296652e-02 -1.00512795e-01 -2.04229817e-01
  2.19562605e-01  7.67738968e-02  1.25881493e-01  2.90843755e-01
 -4.99191076e-01  2.27709681e-01 -1.94742605e-01  7.21394867e-02
 -4.56506852e-03  1.21945605e-01 -1.46225780e-01 -9.64745954e-02
  1.02258220e-01 -2.04421103e-01 -3.95657659e-01  7.32251927e-02
  1.10104166e-01  4.84611303e-01  1.04519352e-01  3.86131644e-01
 -1.68697327e-01 -2.11251169e-01 -6.38036370e-01 -4.32641685e-01
  2.74459243e-01  9.64053795e-02 -1.30053505e-01  5.76567911e-02
  8.07963088e-02  6.48147836e-02 -2.14181561e-03  2.29773909e-01
 -6.66054562e-02  3.07675391e-01  2.11779624e-01 -1.57844439e-01
  9.7886

In [10]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# cosine_similarity expects 2-D inputs, so the single query vector is viewed as
# a one-row matrix. The result therefore has one row (the query) holding one
# score per document.
query_matrix = query_embedding.reshape(1, -1)
similarities = cosine_similarity(query_matrix, document_embeddings)
similarities

array([[0.16948144, 0.45802277, 0.56756926, 0.44123283, 0.6316115 ,
        0.75214124, 0.550352  , 0.7448169 ]], dtype=float32)

In [14]:
# Find the single best match first, then report everything in one go.
# `similarities` has exactly one row, so the flat argmax is also the column
# (i.e. document) index.
most_similar_index = similarities.argmax()
most_similar_document = documents[most_similar_index]
similarity_score = similarities[0, most_similar_index]

print("query: ",query)
print("Documents: ", documents)
print("most_similar_index: ",most_similar_index)
print("most_similar_document: ", most_similar_document)
print("similarity_score: ",similarity_score)


query:  Natural language processing techniques enhance keyword extraction efficiency.
Documents:  ['This is a list which containing sample documents.', 'Keywords are important for keyword-based search.', 'Document analysis involves extracting keywords.', 'Keyword-based search relies on sparse embeddings.', 'Understanding document structure aids in keyword extraction.', 'Efficient keyword extraction enhances search accuracy.', 'Semantic similarity improves document retrieval performance.', 'Machine learning algorithms can optimize keyword extraction methods.']
most_similar_index:  5
most_similar_document:  Efficient keyword extraction enhances search accuracy.
similarity_score:  0.75214124


In [19]:
# Rank every document by its cosine similarity to the query, best match first.
# `similarities` has a single row (one query), so row 0 holds one score per
# entry of `documents`; zipping keeps each text next to its score.
# Defining the ranking here (instead of relying on a cell that is no longer in
# the notebook) lets the notebook survive Restart Kernel -> Run All.
ranked_documents = sorted(
    zip(documents, similarities[0]),
    key=lambda pair: pair[1],
    reverse=True,
)

print("Ranked Documents:")
for rank, (document, similarity) in enumerate(ranked_documents, start=1):
    print(f"Rank {rank}: Document - '{document}', Similarity Score - {similarity}")

Ranked Documents:
Rank 1: Document - 'Efficient keyword extraction enhances search accuracy.', Similarity Score - 0.7521412372589111
Rank 2: Document - 'Machine learning algorithms can optimize keyword extraction methods.', Similarity Score - 0.7448168992996216
Rank 3: Document - 'Understanding document structure aids in keyword extraction.', Similarity Score - 0.6316115260124207
Rank 4: Document - 'Document analysis involves extracting keywords.', Similarity Score - 0.5675692558288574
Rank 5: Document - 'Semantic similarity improves document retrieval performance.', Similarity Score - 0.5503519773483276
Rank 6: Document - 'Keywords are important for keyword-based search.', Similarity Score - 0.4580227732658386
Rank 7: Document - 'Keyword-based search relies on sparse embeddings.', Similarity Score - 0.441232830286026
Rank 8: Document - 'This is a list which containing sample documents.', Similarity Score - 0.16948144137859344


In [21]:
# Report only the four best embedding matches; ranks are 1-based.
print("Top 4 Ranked Documents:")
for rank, entry in enumerate(ranked_documents[:4], start=1):
    document, similarity = entry
    print(f"Rank {rank}: Document - '{document}', Similarity Score - {similarity}")

Top 4 Ranked Documents:
Rank 1: Document - 'Efficient keyword extraction enhances search accuracy.', Similarity Score - 0.7521412372589111
Rank 2: Document - 'Machine learning algorithms can optimize keyword extraction methods.', Similarity Score - 0.7448168992996216
Rank 3: Document - 'Understanding document structure aids in keyword extraction.', Similarity Score - 0.6316115260124207
Rank 4: Document - 'Document analysis involves extracting keywords.', Similarity Score - 0.5675692558288574


## Reranking

In [23]:
from rank_bm25 import BM25Okapi

# Keep only the text of the four best embedding matches; their similarity
# scores are dropped because each reranker below assigns its own.
top_4_documents = [text for text, _ in ranked_documents[:4]]

In [25]:

# Whitespace tokenisation only: case and trailing punctuation are kept, so
# e.g. 'extraction.' and 'extraction' end up as different tokens.
tokenized_top_4_documents = list(map(str.split, top_4_documents))
tokenized_top_4_documents

[['Efficient', 'keyword', 'extraction', 'enhances', 'search', 'accuracy.'],
 ['Machine',
  'learning',
  'algorithms',
  'can',
  'optimize',
  'keyword',
  'extraction',
  'methods.'],
 ['Understanding',
  'document',
  'structure',
  'aids',
  'in',
  'keyword',
  'extraction.'],
 ['Document', 'analysis', 'involves', 'extracting', 'keywords.']]

In [26]:
# Tokenise the query the same way as the documents (plain whitespace split).
# Case and punctuation are preserved, so the last token is 'efficiency.' with
# the trailing period.
tokenized_query = query.split()
tokenized_query

['Natural',
 'language',
 'processing',
 'techniques',
 'enhance',
 'keyword',
 'extraction',
 'efficiency.']

In [28]:
# Build a BM25 index over the four candidates only and score the query
# against each of them.
bm25 = BM25Okapi(tokenized_top_4_documents)
bm25_scores = bm25.get_scores(tokenized_query)

# argsort is ascending; flipping it yields candidate indices from highest to
# lowest BM25 score.
sorted_indices2 = np.flip(np.argsort(bm25_scores))
bm25_scores

array([0.1907998 , 0.16686672, 0.17803252, 0.        ])

In [29]:
# Pair each candidate text with its BM25 score, following the descending
# order computed in `sorted_indices2`.
reranked_documents = [
    (top_4_documents[position], bm25_scores[position])
    for position in sorted_indices2
]
reranked_documents

[('Efficient keyword extraction enhances search accuracy.',
  0.19079979534096053),
 ('Understanding document structure aids in keyword extraction.',
  0.1780325227902643),
 ('Machine learning algorithms can optimize keyword extraction methods.',
  0.1668667199671815),
 ('Document analysis involves extracting keywords.', 0.0)]

In [30]:
# Print the BM25 order. Note that the label in the output says
# "Similarity Score", but the number shown is the BM25 score.
print("Rerank of top 4 Documents:")
for rank, (document, bm25_score) in enumerate(reranked_documents, start=1):
    print(f"Rank {rank}: Document - '{document}', Similarity Score - {bm25_score}")

Rerank of top 4 Documents:
Rank 1: Document - 'Efficient keyword extraction enhances search accuracy.', Similarity Score - 0.19079979534096053
Rank 2: Document - 'Understanding document structure aids in keyword extraction.', Similarity Score - 0.1780325227902643
Rank 3: Document - 'Machine learning algorithms can optimize keyword extraction methods.', Similarity Score - 0.1668667199671815
Rank 4: Document - 'Document analysis involves extracting keywords.', Similarity Score - 0.0


In [32]:
# Embedding-based top 4 shown again, for comparison with the BM25 order above.
ranked_documents[:4]

[('Efficient keyword extraction enhances search accuracy.', 0.75214124),
 ('Machine learning algorithms can optimize keyword extraction methods.',
  0.7448169),
 ('Understanding document structure aids in keyword extraction.', 0.6316115),
 ('Document analysis involves extracting keywords.', 0.56756926)]

### Alternative reranker: cross-encoders

In [33]:
from sentence_transformers import CrossEncoder
# Load a pre-trained cross-encoder; it is used below to score
# (query, document) pairs directly instead of comparing separate embeddings.
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [34]:
# Candidates that will be handed to the cross-encoder.
top_4_documents

['Efficient keyword extraction enhances search accuracy.',
 'Machine learning algorithms can optimize keyword extraction methods.',
 'Understanding document structure aids in keyword extraction.',
 'Document analysis involves extracting keywords.']

In [35]:

# Query that each candidate is paired with.
query

'Natural language processing techniques enhance keyword extraction efficiency.'

In [37]:

# One [query, document] pair per candidate, in the same order as
# `top_4_documents`.
pairs = [[query, candidate] for candidate in top_4_documents]
pairs

[['Natural language processing techniques enhance keyword extraction efficiency.',
  'Efficient keyword extraction enhances search accuracy.'],
 ['Natural language processing techniques enhance keyword extraction efficiency.',
  'Machine learning algorithms can optimize keyword extraction methods.'],
 ['Natural language processing techniques enhance keyword extraction efficiency.',
  'Understanding document structure aids in keyword extraction.'],
 ['Natural language processing techniques enhance keyword extraction efficiency.',
  'Document analysis involves extracting keywords.']]

In [38]:
# Score every [query, document] pair with the cross-encoder.
# The next cell zips these scores with `top_4_documents`, i.e. it relies on
# the i-th score belonging to the i-th pair.
# NOTE(review): the values look like unbounded raw scores (some are negative
# in the recorded output), not probabilities — confirm before thresholding.
scores = cross_encoder.predict(pairs)
scores

array([ 3.1378736 ,  0.84216446, -2.9193006 , -2.878191  ], dtype=float32)

In [40]:

# Attach each cross-encoder score to its document and order the
# (score, document) tuples from highest to lowest score. Tuples are compared
# as a whole, so equal scores fall back to comparing the document text.
scored_docs = zip(scores, top_4_documents)
reranked_document_cross_encoder = list(scored_docs)
reranked_document_cross_encoder.sort(reverse=True)
reranked_document_cross_encoder

[(3.1378736, 'Efficient keyword extraction enhances search accuracy.'),
 (0.84216446,
  'Machine learning algorithms can optimize keyword extraction methods.'),
 (-2.878191, 'Document analysis involves extracting keywords.'),
 (-2.9193006, 'Understanding document structure aids in keyword extraction.')]

Comparison with the BM25 reranking:

In [41]:
# BM25 order from earlier, shown again next to the cross-encoder order.
reranked_documents

[('Efficient keyword extraction enhances search accuracy.',
  0.19079979534096053),
 ('Understanding document structure aids in keyword extraction.',
  0.1780325227902643),
 ('Machine learning algorithms can optimize keyword extraction methods.',
  0.1668667199671815),
 ('Document analysis involves extracting keywords.', 0.0)]

### Cohere Rerank API

In [42]:
import os

import cohere

# The API key is read from the environment so that it never ends up in the
# notebook file or its version history. Set CO_API_KEY before starting the
# kernel; a missing variable raises KeyError here instead of failing later
# inside an API call.
# NOTE(review): the key that used to be hardcoded in this cell should be
# treated as leaked and revoked in the Cohere dashboard.
co = cohere.Client(os.environ["CO_API_KEY"])

In [44]:

# Candidates that will be sent to the Cohere reranker.
top_4_documents

['Efficient keyword extraction enhances search accuracy.',
 'Machine learning algorithms can optimize keyword extraction methods.',
 'Understanding document structure aids in keyword extraction.',
 'Document analysis involves extracting keywords.']

In [45]:

# Query the candidates are reranked against.
query

'Natural language processing techniques enhance keyword extraction efficiency.'

In [46]:

# Ask Cohere's hosted reranker to score the four candidates against the query.
# The `query` variable is passed (instead of repeating its text as a literal)
# so this call cannot drift from the query used by the embedding, BM25 and
# cross-encoder steps. return_documents=True is needed because the next cell
# reads each result's document text.
response = co.rerank(
    model="rerank-english-v3.0",
    query=query,
    documents=top_4_documents,
    return_documents=True,
)

In [48]:
# Iterate over whatever the API returned rather than assuming exactly four
# results, so a shorter candidate list cannot raise IndexError.
for result in response.results:
    print(f'text: {result.document.text} score: {result.relevance_score}')

text: Efficient keyword extraction enhances search accuracy. score: 0.99411184
text: Machine learning algorithms can optimize keyword extraction methods. score: 0.9129032
text: Understanding document structure aids in keyword extraction. score: 0.32885265
text: Document analysis involves extracting keywords. score: 0.02865267
