### Imports

In [1]:
import numpy as np
from numpy.linalg import norm
import ollama
import os
import json

link to video reference: https://www.youtube.com/watch?v=V1Mz8gMBDMo

In [13]:
def parse_file(filename):
    with open(filename, encoding='utf-8-sig') as f:
        paragraphs = []
        buffer = []
        for line in f.readlines():
            line = line.strip()
            if line:
                buffer.append(line)
            elif len(buffer):
                paragraphs.append(" ".join(buffer))
                buffer = []
        if len(buffer):
            paragraphs.append(" ".join(buffer))
    return paragraphs

def save_embeddings(filename, embeddings):
    # create directory if not exists
    if not os.path.exists("embeddings"):
        os.makedirs("embeddings")
    # dump embedding to json file
    with open(f"embeddings/{filename}.json", 'w') as f:
        json.dump(embeddings, f)

def load_embeddings(filename):
    # check if file exists
    if not os.path.exists(f"embeddings/{filename}.json"):
        return False
    with open(f"embeddings/{filename}.json", 'r') as f:
        return json.load(f)

def get_embeddings(filename, modelname, chunks):
    # check if embeddings are already computed
    if filename and ((embeddings := load_embeddings(filename)) is not False):
        return embeddings
    # compute embeddings
    embeddings = [
            ollama.embeddings(model=modelname, prompt=chunk)["embedding"] 
            for chunk in chunks
        ]
    # save embeddings
    if filename:
        save_embeddings(filename, embeddings)
    return embeddings

def find_most_similar(needle, haystack, sort=True):
    needle_norm = norm(needle)
    similarity_scores = [
        np.dot(needle, item) / (needle_norm * norm(item))
        for item in haystack
    ]
    if not sort:
        return zip(similarity_scores, range(len(haystack)))
    return sorted(zip(similarity_scores, range(len(haystack))), reverse=True)

def find_most_similar_from_multiple(needles, haystacks, sort=True):
    total_score_chunks = []
    for query_embedding in needles:
        chunks = find_most_similar(query_embedding, haystacks, sort=False)
        total_score_chunks.append(chunks)
    len_query = len(needles)
    # addition of the scores of the chunks
    total_score = [0] * len(haystacks)
    for chunks in total_score_chunks:
        for score, index in chunks:
            total_score[index] += score / len_query
    if not sort:
        return zip(total_score, range(len(haystacks)))
    return sorted(zip(total_score, range(len(haystacks))), reverse=True)

def parse_response_llm(response):
    result = []
    separators = [". ", ": ", ") ", ".", ":", ")"]
    for line_raw in response.split("\n"):
        line = line_raw.strip()
        if line:
            for sep in separators:
                if sep in line:
                    result.append(line.split(sep)[1])
                    break
    return result

def get_similar_chunks(chunks, threshold=0.62):
    return [(score, index) for score, index in chunks if score >= threshold]

### Prompt = query

In [40]:
prompt = "Est-ce que tu pourrais m'en dire plus sur cette entreprise ?"
queries = [prompt]
modelname = "mistral"

In [41]:
queries_embeddings = get_embeddings(None, modelname, queries)

### Query expansion

In [43]:
SYSTEM_PROMPT = """Vous êtes un assistant de modèle linguistique d'IA. Votre tâche consiste à générer cinq versions françaises différentes de la question donnée par l'utilisateur afin d'extraire les documents pertinents d'une base de données vectorielle. 
En générant des perspectives multiples sur la question de l'utilisateur, votre objectif est d'aider l'utilisateur à surmonter certaines limites de la recherche de similarité basée sur la distance.
l'utilisateur à surmonter certaines des limites de la recherche de similarité basée sur la distance. 
Fournissez ces questions alternatives séparées par des nouvelles lignes.
"""

response = ollama.chat(
    model=modelname,
    messages=[
        {
            "role": "system",
            "content": SYSTEM_PROMPT,
        },
        {"role": "user", "content": "Question originale : " + prompt},
    ],
)

queries = [prompt] + parse_response_llm(response["message"]["content"])

print("\n".join(queries))

Est-ce que tu pourrais m'en dire plus sur cette entreprise ?
Pourraient-tu me donner des informations supplémentaires sur cette entreprise ?
Existe-t-il de plus amples détails sur ce qui concernent cette entreprise que tu pourrais m'en partager ?
Serait-il possible de fournir des renseignements complémentaires sur l'entreprise en question ?
Pouvez-vous me donner plus de connaissances détaillées sur cet entrepreneur ou cette organisation ?
Aimerais-je recevoir plus d'informations contextuelles sur cette entreprise ?


## 1. Obtain the potential interesting files

In [35]:
description_filenames = ["pacte_novation_2_short.txt", "jeux_olympiques_paris_short.txt"]

In [32]:
# creating embeddings for each description (entire text)
embeddings = []
paragraphs = []
for filename in description_filenames:
    current_paragraphs = [" ".join(parse_file(f"{filename}"))]
    print(f"current_paragraphs length: {len(current_paragraphs)}")
    embeddings += get_embeddings(filename, modelname, current_paragraphs)
    print(f"Embeddings for {filename} computed, length: {len(embeddings)}")
    paragraphs += current_paragraphs

current_paragraphs length: 1
Embeddings for pacte_novation_2_short.txt computed, length: 1
current_paragraphs length: 1
Embeddings for jeux_olympiques_paris_short.txt computed, length: 2


In [44]:
most_similar_chunks = find_most_similar_from_multiple(queries_embeddings, embeddings)

# print all shapes
print("query embeddings:", len(queries_embeddings))
print("paragraphs:", len(paragraphs))
print("embeddings:", len(embeddings))


print("most similar chunks:", most_similar_chunks)

for score, index in most_similar_chunks:
    print(f"Score: {score}")
    print(f"Description: {description_filenames[index]}")
    print()

query embeddings: 1
paragraphs: 2
embeddings: 2
most similar chunks: [(0.4882849055500085, 0), (0.4861526996652893, 1)]
Score: 0.4882849055500085
Description: pacte_novation_2_short.txt

Score: 0.4861526996652893
Description: jeux_olympiques_paris_short.txt



In [45]:
similar_chunks = get_similar_chunks(most_similar_chunks, threshold=0.5)

for score, index in similar_chunks:
    print(f"Score: {score:.2f}")
    print(paragraphs[index])
    print()

similar_filenames = [description_filenames[index] for _, index in similar_chunks]

## 2. Extract the relevant information from the previous files

In [23]:
filenames = [filename.replace("_short", "") for filename in similar_filenames]
print(filenames)

[]


In [24]:
embeddings = []
paragraphs = []

for filename in filenames:
    paragraphs += parse_file(filename)
    embeddings += get_embeddings(filename, modelname, paragraphs)

### calculate the cosine similarity between the queries and the documents

In [None]:
# sort the chunks by the total score
sorted_chunks = find_most_similar_from_multiple(queries_embeddings, embeddings, sort=True)

for score, index in sorted_chunks:
    print(f"Score: {score:.2f}")
    print(paragraphs[index])
    print()

In [None]:
# when score >= 0.65, we consider the document as relevant
similar_chunks = get_similar_chunks(sorted_chunks)
similar_chunks = similar_chunks[:5]

for score, index in similar_chunks:
    print(f"Score: {score:.2f}")
    print(paragraphs[index])
    print()