<a href="https://colab.research.google.com/github/dawit-melka/ICog-Labs/blob/main/semantic_search.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!git clone https://github.com/dawit-melka/ICog-Labs.git

In [2]:
%cd ICog-Labs

/content/ICog-Labs


In [None]:
%cd Semantic Search

In [None]:
!pip install fuzzywuzzy
!pip install sentence_transformers

In [None]:
from sentence_transformers import SentenceTransformer, util
import os
import nltk
import time
import matplotlib.pyplot as plt
import numpy as np
from fuzzywuzzy import fuzz

# import voyageai
# from voyageai import get_embeddings, get_embedding

from data.questions_and_answers import questions, answers
nltk.download('punkt')  # Uncomment if not already downloaded


In [5]:
def load_input_data(file_path):
    """
    Load input data from a file.

    Parameters:
    - file_path (str): Path to the input file.

    Returns:
    - str: Contents of the file.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        data = f.read()
    return data

In [6]:
def convert_to_sentences(text):
    """
    Convert a text into a list of sentences.

    Parameters:
    - text (str): Input text.

    Returns:
    - list: List of sentences.
    """
    return nltk.sent_tokenize(text)

In [7]:
def download_models(model_names):
    """
    Download sentence transformer models.

    Parameters:
    - model_names (list): List of model names to download.
    """
    for model_name in model_names:
        model = SentenceTransformer(model_name)


In [8]:
def compute_similarity_scores(query_emb, doc_emb):
    """
    Compute similarity scores between a query and document embeddings.

    Parameters:
    - query_emb (numpy array): Embedding of the query.
    - doc_emb (numpy array): Embeddings of the documents.

    Returns:
    - list: Similarity scores.
    """
    return util.dot_score(query_emb, doc_emb)[0].cpu().tolist()

In [9]:
def check_similarity(sentence1, sentence2):
    similarity_ratio = fuzz.ratio(sentence1, sentence2)

    threshold = 70

    if similarity_ratio >= threshold:
        return True
    else:
        return False


In [10]:
def save_model_stats_to_txt(model_stats, file_path):
    with open(file_path, 'a') as file:
        for model_name, stats in model_stats.items():
            file.write(f"Model: {model_name}\n")
            file.write(f"Time Elapsed: {stats['time elapsed']} seconds\n")
            file.write(f"Total Error: {stats['total error']}\n")
            file.write(f"Total Found: {stats['total found']}\n")
            file.write(f"First Results: {stats['first results']}\n")
            file.write(f"Second Results: {stats['second results']}\n")
            file.write(f"Third Results: {stats['third results']}\n")
            file.write("\n")

In [11]:
# Load input data
INPUT_FILE_PATH = '/content/ICog-Labs/Semantic Search/data/to_kill_a_mocking_bird.txt'
data = load_input_data(INPUT_FILE_PATH)

# Convert the book into a list of sentences
book_sentences = convert_to_sentences(data)

In [12]:
def run_semantic_search(model_name, model):
    model_stats = {}

    start_time = time.time()
    print(model_name, "model running...")


    # Encode query and documents
    if model_name == "embed-english-v3.0":
      doc_emb = model.embed(book_sentences, input_type="search_document", model=model_name).embeddings
    elif model != None:
      doc_emb = model.encode(book_sentences)
    else:
      doc_emb = [get_embedding(s) for s in book_sentences]
    error = 0
    count_first_result = 0
    count_second_result = 0
    count_third_result = 0
    count_top_10 = 0

    # Implementation of the semantic search logic...
    for i, query in enumerate(questions):
        if model_name == "embed-english-v3.0":
          query_emb = model.embed([query], input_type="search_query", model=model_name).embeddings
        elif model != None:
          query_emb = model.encode(query)
        else:
          query_emb = get_embedding(query)

        # Compute dot score between query and all document embeddings
        scores = compute_similarity_scores(query_emb, doc_emb)

        # Combine docs & scores
        doc_score_pairs = list(zip(book_sentences, scores))

        # Sort by decreasing score
        doc_score_pairs = sorted(doc_score_pairs, key=lambda x: x[1], reverse=True)

        # Get top-k results
        top_k = 10
        found = False

        answer_sentences = convert_to_sentences(answers[i])
        for j in range(1, top_k + 1):
            for a in answer_sentences:
              isSimmilar= check_similarity(doc_score_pairs[j][0], a)
              if isSimmilar:
                  error += (j - 1)
                  found = True
                  count_first_result += 1 if j == 1 else 0
                  count_second_result += 1 if j == 2 else 0
                  count_third_result += 1 if j == 3 else 0
                  count_top_10 += 1
                  break
            if found:
              break

        if not found:
            error += 15
        # print(i + 1, "/", len(questions), " error: ", error, "count found: ", count_top_10)

    end_time = time.time()
    model_stats[model_name] = {"time elapsed": end_time - start_time,
                                "total error": error,
                                "total found": count_top_10,
                                "first results": count_first_result,
                                "second results": count_second_result,
                                "third results": count_third_result}
    print(model_name," model status \n", model_stats[model_name])

    stats_file_path = '/content/ICog-Labs/Semantic Search/data/model_stats.txt'
    save_model_stats_to_txt(model_stats, stats_file_path)

    return model_stats

In [None]:
def run_sbert_models(INPUT_FILE_PATH):
    # Define constants
    # INPUT_FILE_PATH = os.path.join(os.path.dirname(__file__), '.\\data\\to_kill_a_mocking_bird.txt')
    MODELS = ['multi-qa-mpnet-base-dot-v1',
              'all-mpnet-base-v2',
              'gtr-t5-xl',
              'all-roberta-large-v1',
              'multi-qa-distilbert-cos-v1',
              'all-distilroberta-v1',
              'msmarco-bert-base-dot-v5',
              'multi-qa-MiniLM-L6-cos-v1',
              'msmarco-distilbert-dot-v5',
              'all-MiniLM-L12-v2',
              ]

    # Load input data
    data = load_input_data(INPUT_FILE_PATH)

    # Convert the book into a list of sentences
    book_sentences = convert_to_sentences(data)

    # Download all models
    download_models(MODELS)

    model_stats = {}

    for model_name in MODELS:
        start_time = time.time()
        print(model_name, "model running...")

        # Load the model
        model = SentenceTransformer(model_name)

        # Encode query and documents
        doc_emb = model.encode(book_sentences, convert_to_tensor=True)

        error = 0
        count_first_result = 0
        count_second_result = 0
        count_third_result = 0
        count_top_10 = 0

        # Implementation of the semantic search logic...
        for i, query in enumerate(questions):
            query_emb = model.encode(query, convert_to_tensor=True)

            # Compute dot score between query and all document embeddings
            scores = compute_similarity_scores(query_emb, doc_emb)

            # Combine docs & scores
            doc_score_pairs = list(zip(book_sentences, scores))

            # Sort by decreasing score
            doc_score_pairs = sorted(doc_score_pairs, key=lambda x: x[1], reverse=True)

            # Get top-k results
            top_k = 10
            found = False
            print("Query: ", query)
            print("Answer: ", answers[i])
            answer_sentences = convert_to_sentences(answers[i])
            for j in range(1, top_k + 1):
                for a in answer_sentences:
                  isSimmilar= check_similarity(doc_score_pairs[j][0], a)
                  if isSimmilar:
                      error += (j - 1)
                      found = True
                      count_first_result += 1 if j == 1 else 0
                      count_second_result += 1 if j == 2 else 0
                      count_third_result += 1 if j == 3 else 0
                      count_top_10 += 1
                      break
                if found:
                  break

            if not found:
                error += 15
            # print(i + 1, "/", len(questions), " error: ", error, "count found: ", count_top_10)

        end_time = time.time()
        model_stats[model_name] = {"time elapsed": end_time - start_time,
                                   "total error": error,
                                   "total found": count_top_10,
                                   "first results": count_first_result,
                                   "second results": count_second_result,
                                   "third results": count_third_result}
        print(model_name, " model status \n", model_stats[model_name])

    print(model_stats)
    stats_file_path = '/content/ICog-Labs/Semantic Search/data/model_stats.txt'
    save_model_stats_to_txt(model_stats, stats_file_path)




In [None]:
INPUT_FILE_PATH = '/content/ICog-Labs/Semantic Search/data/to_kill_a_mocking_bird.txt'
run_sbert_models(INPUT_FILE_PATH)

In [None]:
sbert_models_stat = {'multi-qa-mpnet-base-dot-v1': {'time elapsed': 87.76673674583435, 'total error': 117, 'total found': 16, 'first results': 6, 'second results': 2, 'third results': 2},
                     'all-mpnet-base-v2': {'time elapsed': 89.32079792022705, 'total error': 191, 'total found': 10, 'first results': 4, 'second results': 1, 'third results': 1},
                     'gtr-t5-xl': {'time elapsed': 1312.3310718536377, 'total error': 188, 'total found': 10, 'first results': 3, 'second results': 2, 'third results': 0},
                     'all-roberta-large-v1': {'time elapsed': 485.50539660453796, 'total error': 234, 'total found': 7, 'first results': 1, 'second results': 1, 'third results': 1},
                     'multi-qa-distilbert-cos-v1': {'time elapsed': 46.21957087516785, 'total error': 172, 'total found': 12, 'first results': 3, 'second results': 2, 'third results': 1},
                     'all-distilroberta-v1': {'time elapsed': 70.17474913597107, 'total error': 206, 'total found': 9, 'first results': 0, 'second results': 4, 'third results': 2},
                     'msmarco-bert-base-dot-v5': {'time elapsed': 96.14858531951904, 'total error': 135, 'total found': 14, 'first results': 4, 'second results': 3, 'third results': 3},
                     'multi-qa-MiniLM-L6-cos-v1': {'time elapsed': 14.608932256698608, 'total error': 183, 'total found': 11, 'first results': 4, 'second results': 0, 'third results': 0},
                     'msmarco-distilbert-dot-v5': {'time elapsed': 45.92693614959717, 'total error': 178, 'total found': 11, 'first results': 2, 'second results': 2, 'third results': 3},
                     'all-MiniLM-L12-v2': {'time elapsed': 27.071072340011597, 'total error': 203, 'total found': 10, 'first results': 2, 'second results': 1, 'third results': 0}}

| Model Name                     | Time Elapsed | Total Error | Total Found | First Results | Second Results | Third Results | Model Size |
|--------------------------------|--------------|-------------|-------------|---------------|----------------|---------------|---------------|
| multi-qa-mpnet-base-dot-v1      | 87.77s       | 117         | 16          | 6             | 2              | 2             | 420 MB |
| msmarco-bert-base-dot-v5        | 96.15s       | 135         | 14          | 4             | 3              | 3             | 420 MB |
| multi-qa-distilbert-cos-v1      | 46.22s       | 172         | 12          | 3             | 2              | 1             | 250 MB |
| msmarco-distilbert-dot-v5       | 45.93s       | 178         | 11          | 2             | 2              | 3             | 250 MB |
| multi-qa-MiniLM-L6-cos-v1       | 14.61s       | 183         | 11          | 4             | 0              | 0             | 80 MB |
| gtr-t5-xl                       | 1312.33s     | 188         | 10          | 3             | 2              | 0             | 2370 MB |
| all-mpnet-base-v2               | 89.32s       | 191         | 10          | 4             | 1              | 1             | 420 MB |
| all-MiniLM-L12-v2               | 27.07s       | 203         | 10          | 2             | 1              | 0             | 120 MB |
| all-distilroberta-v1            | 70.17s       | 206         | 9           | 0             | 4              | 2             | 290 MB |
| all-roberta-large-v1            | 485.51s      | 234         | 7           | 1             | 1              | 1             | 1360 MB |


In [None]:
!python -m pip install -U angle-emb


In [None]:
from angle_emb import AnglE

model = AnglE.from_pretrained('WhereIsAI/UAE-Large-V1', pooling_strategy='cls').cuda()
run_semantic_search("UAE-Large-v1", model)

In [None]:
!pip install voyageai

In [None]:
voyageai.api_key = "VOYAGEAI API KEY HERE"

run_semantic_search("voyage-lite-01-instruct", None)

In [None]:
!pip install llmx
!pip install -U cohere

In [None]:
import cohere
cohere_key = "COHERE API KEY HERE"
model = cohere.Client(cohere_key)

In [None]:
run_semantic_search("embed-english-v3.0", model)

In [None]:
model = SentenceTransformer('BAAI/bge-large-zh-v1.5')
run_semantic_search("BAAI/bge-large-zh-v1.5", model)

In [None]:
model = SentenceTransformer('llmrails/ember-v1')
run_semantic_search("llmrails/ember-v1", model)

In [None]:
model = SentenceTransformer("jamesgpt1/sf_model_e5")
run_semantic_search("jamesgpt1/sf_model_e5", model)

In [None]:
model = SentenceTransformer("thenlper/gte-large")
run_semantic_search("thenlper/gte-large", model)

In [None]:
model = SentenceTransformer("thenlper/gte-large")
run_semantic_search("thenlper/gte-large", model)

In [None]:
model = SentenceTransformer("infgrad/stella-base-en-v2")
run_semantic_search("infgrad/stella-base-en-v2", model)

In [None]:
!pip install InstructorEmbedding

In [None]:
from InstructorEmbedding import INSTRUCTOR
model = INSTRUCTOR('hkunlp/instructor-xl')
run_semantic_search("hkunlp/instructor-xl", model)

In [None]:
model = INSTRUCTOR('hkunlp/instructor-large')
run_semantic_search("hkunlp/instructor-large", model)

In [None]:
model = INSTRUCTOR('intfloat/e5-large-v2')
run_semantic_search("intfloat/e5-large-v2", model)

In [None]:
!pip install transformers
from transformers import AutoModel

In [None]:
model = AutoModel.from_pretrained('jinaai/jina-embeddings-v2-base-en', trust_remote_code=True)
run_semantic_search("jinaai/jina-embeddings-v2-base-en", model)

In [None]:
model = SentenceTransformer("sentence-t5-xl")
run_semantic_search("sentence-t5-xl", model)

  | Model                               | Time Elapsed (s) | Total Error | Total Found (21) | First Results | Second Results | Third Results |
  |-------------------------------------|------------------|-------------|-------------|---------------|----------------|---------------|
  | multi-qa-mpnet-base-dot-v1          | 87.77            | 117         | 16          | 6             | 2              | 2             |
  | msmarco-bert-base-dot-v5            | 96.15            | 135         | 14          | 4             | 3              | 3             |
  | hkunlp/instructor-xl                | 1595.49          | 158         | 14          | 2             | 0              | 3             |
  | voyage-lite-01-instruct             | 355.76           | 166         | 12          | 2             | 5              | 0             |
  | thenlper/gte-large                  | 340.72           | 168         | 12          | 4             | 2              | 1             |
  | multi-qa-distilbert-cos-v1          | 46.22            | 172         | 12          | 3             | 2              | 1             |
  | msmarco-distilbert-dot-v5           | 45.93            | 178         | 11          | 2             | 2              | 3             |
  | multi-qa-MiniLM-L6-cos-v1           | 14.61            | 183         | 11          | 4             | 0              | 0             |
  | hkunlp/instructor-large             | 395.49           | 188         | 11          | 1             | 2              | 2             |
  | gtr-t5-xl                           | 1312.33          | 188         | 10          | 3             | 2              | 0             |
  | all-mpnet-base-v2                   | 89.32            | 191         | 10          | 4             | 1              | 1             |
  | all-MiniLM-L12-v2                   | 27.07            | 203         | 10          | 2             | 1              | 0             |
  | cohere-embed-english-v3.0           | 10.09            | 194         | 9           | 3             | 2              | 2             |
  | all-distilroberta-v1                | 70.17            | 206         | 9           | 0             | 4              | 2             |
  | sentence-t5-xl                      | 1517.48          | 209         | 9           | 2             | 2              | 1             |
  | jamesgpt1/sf_model_e5               | 341.54           | 211         | 9           | 3             | 0              | 1             |
  | UAE-Large-v1                        | 2043.65          | 214         | 8           | 3             | 1              | 2             |
  | llmrails/ember-v1                   | 354.12           | 216         | 8           | 1             | 3              | 0             |
  | all-roberta-large-v1                | 485.51           | 234         | 7           | 1             | 1              | 1             |
  | infgrad/stella-base-en-v2           | 96.30            | 241         | 7           | 1             | 0              | 2             |
  | jinaai/jina-embeddings-v2-base-en   | 132.16           | 246         | 6           | 1             | 1              | 0             |
  | intfloat/e5-large-v2                | 342.50           | 256         | 5           | 1             | 0              | 1             |
  | BAAI/bge-large-zh-v1.5              | 409.35           | 266         | 5           | 0             | 0              | 1             |


