In [1]:
!pip install sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-2.7.0-py3-none-any.whl.metadata (11 kB)
Collecting scikit-learn (from sentence-transformers)
  Downloading scikit_learn-1.5.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Collecting joblib>=1.2.0 (from scikit-learn->sentence-transformers)
  Downloading joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn->sentence-transformers)
  Downloading threadpoolctl-3.5.0-py3-none-any.whl.metadata (13 kB)
Downloading sentence_transformers-2.7.0-py3-none-any.whl (171 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m171.5/171.5 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading scikit_learn-1.5.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.1/13.1 MB[0m [31m18.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading joblib-1.

In [2]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import logging

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Load pre-trained SentenceTransformer model
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')


INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: paraphrase-MiniLM-L6-v2


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.73k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cuda


In [4]:
def preprocess_text(text):
    """
    Tokenizes the input text into sentences.
    """
    sentences = text.split('. ')
    return [sentence.strip() for sentence in sentences if sentence]

def get_sentence_embeddings(sentences):
    """
    Converts sentences to embeddings using SentenceTransformer.
    """
    return model.encode(sentences)

def match_sentences(text1, text2, threshold=0.75):
    """
    Matches sentences from text1 to text2 based on cosine similarity of their embeddings.
    """
    sentences1 = preprocess_text(text1)
    sentences2 = preprocess_text(text2)
    
    logger.info("Sentences from text1: %s", sentences1)
    logger.info("Sentences from text2: %s", sentences2)
    
    embeddings1 = get_sentence_embeddings(sentences1)
    embeddings2 = get_sentence_embeddings(sentences2)
    
    similarity_matrix = cosine_similarity(embeddings1, embeddings2)
    logger.info("Similarity matrix: %s", similarity_matrix)
    
    matches = []
    unmatched1 = set(range(len(sentences1)))
    unmatched2 = set(range(len(sentences2)))
    
    for i, row in enumerate(similarity_matrix):
        for j, similarity in enumerate(row):
            if similarity > threshold:
                matches.append((sentences1[i], sentences2[j], similarity))
                unmatched1.discard(i)
                unmatched2.discard(j)
    
    unmatched_sentences1 = [sentences1[i] for i in unmatched1]
    unmatched_sentences2 = [sentences2[i] for i in unmatched2]
    
    return matches, unmatched_sentences1, unmatched_sentences2

def main():
    text1 = ("As the sun sank below the horizon, casting shadows across Thistledown, "
             "a group of adventurers gathered around a flickering campfire. The air was full of the scent of pine and the distant calls of nocturnal animals.\n\n"
             "\"Are you sure this is the right path, Elaria?\" asked Thorne, his hand on the hilt of his sword. He peered out at the darkening forest warily.\n\n"
             "\"The map led us here,\" said Elaria, a slim elf with piercing green eyes. \"The ancient runes spoke of a temple hidden beyond the Silverstream. We must trust in the old ways.\"\n\n"
             "Brakkar, the burly dwarf, sighed and adjusted his axe. \"Trusting in old runes and forgotten temples... This had better lead to treasure worth all this trouble.\"\n\n"
             "\"Not all treasures are made of gold, Brakkar,\" said Lyra, the group's sorceress. Her eyes reflected the light of the fire. \"Some secrets are far more valuable.\"\n\n"
             "Thorne smiled. \"Secrets or gold... We'll find out soon enough.\"")

    text2 = ("The sun was just setting, and Thistledown was darkening by degrees. A campfire burned at the center of a small group of adventurers—some humans, some elves, and one dwarf. The air held the scent of pine, and the distant call of nocturnal animals echoed through the forest.\n\n"
             "\"Are you sure this is right?\" Thorne asked Elaria, who was pointing at a trail. \"We're supposed to be going toward the Silverstream.\"\n\n"
             "\"The map we found led us here,\" said Elaria, an elf. Her green eyes glinted with a hint of mischief in the firelight. \"It spoke of a temple hidden beyond the river. We'll see if those old runes hold any truth.\"\n\n"
             "\"Trusting in old runes and forgotten temples?\" Brakkar said, adjusting his axe. \"Hope it leads to treasure worth all this trouble.\"\n\n"
             "Lyra smiled at that. \"Not all treasures are made of gold. Some secrets are far more valuable.\"")
    
    matches, unmatched1, unmatched2 = match_sentences(text1, text2)
    
    logger.info("Matched sentences:")
    for match in matches:
        logger.info("Text1: %s\nText2: %s\nSimilarity: %.2f", match[0], match[1], match[2])
    
    logger.info("Unmatched sentences in text1: %s", unmatched1)
    logger.info("Unmatched sentences in text2: %s", unmatched2)

if __name__ == "__main__":
    main()

INFO:__main__:Sentences from text1: ['As the sun sank below the horizon, casting shadows across Thistledown, a group of adventurers gathered around a flickering campfire', 'The air was full of the scent of pine and the distant calls of nocturnal animals.\n\n"Are you sure this is the right path, Elaria?" asked Thorne, his hand on the hilt of his sword', 'He peered out at the darkening forest warily.\n\n"The map led us here," said Elaria, a slim elf with piercing green eyes', '"The ancient runes spoke of a temple hidden beyond the Silverstream', 'We must trust in the old ways."\n\nBrakkar, the burly dwarf, sighed and adjusted his axe', '"Trusting in old runes and forgotten temples..', 'This had better lead to treasure worth all this trouble."\n\n"Not all treasures are made of gold, Brakkar," said Lyra, the group\'s sorceress', 'Her eyes reflected the light of the fire', '"Some secrets are far more valuable."\n\nThorne smiled', '"Secrets or gold..', 'We\'ll find out soon enough."']
INFO:_

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

INFO:__main__:Similarity matrix: [[ 0.47909266  0.7808743   0.36432648  0.19050679  0.40290454  0.18935221
   0.25128692  0.06008013  0.04477419  0.0572561 ]
 [ 0.34853593  0.3029966   0.87715924  0.49618113  0.25114632  0.35833463
   0.3336891   0.27747256  0.12796298  0.18529359]
 [ 0.3810831   0.22356963  0.5719634   0.729069    0.3822515   0.3341831
   0.32547677  0.3643644   0.04114632  0.14702854]
 [ 0.20595044  0.23514651  0.2690522   0.5077661   0.2075455   0.76063037
   0.65164995  0.3201077   0.42035162  0.2961071 ]
 [ 0.17049411  0.196282    0.26436275  0.28011551  0.24448109  0.18634276
   0.6388257   0.25879085  0.1826383   0.21228862]
 [ 0.12994505  0.19518682  0.20609137  0.33249357  0.09088229  0.54658014
   0.8029504   0.29744738  0.3872131   0.28365856]
 [ 0.01581878  0.14806704  0.19834086  0.3171805   0.1086188   0.2519964
   0.3933123   0.6488729   0.7007144   0.52296954]
 [ 0.35829353  0.3785791   0.4154824   0.21939783  0.7132244   0.24633218
   0.13539533  0.245