In [1]:
from milvus_model.hybrid import BGEM3EmbeddingFunction
from pymilvus import (
    connections,
    utility,
    FieldSchema,
    CollectionSchema,
    DataType,
    Collection,
    AnnSearchRequest,
    WeightedRanker,
)
import pandas as pd

uri="http://localhost:19530/dolphinai_db"
col_name = "hybrid_sap_collection"
limit = 10
output_fields=["document_id","chunk_id","file_name","chunk_name","chunk_text","chunk_token_length"]


# HYBRID SEARCH
def hybrid_search(
    col,
    query_dense_embedding,
    query_sparse_embedding,
    sparse_weight=1.0,
    dense_weight=1.0,
    limit=limit,
):
    dense_search_params = {"index_type": "GPU_IVF_FLAT"
                     ,"metric_type": "IP"
                     ,"field_name":"dense_vector"
                     ,"params": { "nlist": 1024 }
                     }
    dense_req = AnnSearchRequest(
        [query_dense_embedding], "dense_vector", dense_search_params, limit=limit
    )
    sparse_search_params = {"index_type": "SPARSE_INVERTED_INDEX"
                     ,"metric_type": "IP"
                     ,"field_name":"sparse_vector"
                     }
    sparse_req = AnnSearchRequest(
        [query_sparse_embedding], "sparse_vector", sparse_search_params, limit=limit
    )
    rerank = WeightedRanker(sparse_weight, dense_weight)
    res = col.hybrid_search(
        [sparse_req, dense_req]
        ,rerank=rerank
        ,limit=limit
        ,output_fields=output_fields
    )[0]
    # return [hit.get("text") for hit in res]
    return res

def convert_explode_order_and_sort(obj_list):
    """
    Converts a list of objects with a __dict__ attribute into a Pandas DataFrame,
    explodes the 'fields' dictionary into separate columns, reorders the columns,
    and sorts by 'distance' in ascending order.

    Args:
        obj_list (list): A list of objects, where each object has a __dict__ attribute.

    Returns:
        pd.DataFrame: A DataFrame containing the attributes of each object as rows,
                      with the 'fields' column exploded into separate columns,
                      columns reordered, and sorted by 'distance'.
    """
    # Convert list of objects to DataFrame
    df = pd.DataFrame([obj.__dict__ for obj in obj_list])
    
    # Explode the 'fields' dictionary into separate columns
    if 'fields' in df.columns:
        fields_df = pd.json_normalize(df['fields'])
        df = df.drop(columns=['fields']).join(fields_df)
    
    # Define the desired column order
    column_order = ['distance',"document_id","chunk_id","file_name","chunk_name","chunk_text","chunk_token_length"]
    
    # Reorder the columns (missing columns will be ignored)
    df = df[[col for col in column_order if col in df.columns]]
    
    # Sort by 'distance' in ascending order
    df = df.sort_values(by='distance', ascending=True)
    
    return df


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
connections.connect(uri=uri)
col = Collection(col_name, consistency_level="Strong")

ef = BGEM3EmbeddingFunction(use_fp16=False, device="cuda:1")
dense_dim = ef.dim["dense"]

Fetching 30 files: 100%|██████████| 30/30 [00:00<00:00, 140121.51it/s]
  colbert_state_dict = torch.load(os.path.join(model_dir, 'colbert_linear.pt'), map_location='cpu')
  sparse_state_dict = torch.load(os.path.join(model_dir, 'sparse_linear.pt'), map_location='cpu')


In [86]:
# Enter your search query
# query = "Describe how the SAP partner determination in SD works"
# query = "What is Credit Control Area?"
query = "Come funziona in controllo di disponibilità su SAP?"
# query = "What is EDI and why it is important?"
# query = "What are the steps involved in preparing the system for using the Import Basis Module?"
print(query)

# Generate embeddings for the query
query_embeddings = ef([query])

Come funziona in controllo di disponibilità su SAP?


In [87]:
hybrid_results = hybrid_search(
    col,
    query_embeddings["dense"][0],
    query_embeddings["sparse"][[0]],
    sparse_weight=1.0,
    dense_weight=1.0,
)

In [88]:
# Example usage:
df = convert_explode_order_and_sort(hybrid_results)

In [89]:
# Get the index of the row with the minimum distance for each document_id group
min_distance_idx = df.groupby('document_id')['distance'].idxmin()

# Use these indices to get the rows with the lowest distance for each document_id
df_min_distance = df.loc[min_distance_idx].reset_index(drop=True)

# Sort by distance in ascending order
df_min_distance_sorted = df_min_distance.sort_values(by='distance', ascending=True)

# Create a list of tuples (document_id, distance)
result_list = list(df_min_distance_sorted[['document_id', 'distance']].itertuples(index=False, name=None))

# Display the result
print(result_list)


[('57e37cfaede996af1e1b07740cbc4c5a', 0.6848868727684021), ('465fb34556525345f5c23ef40c3fb5cb', 0.6883894205093384)]


In [90]:
import json

# Initialize an empty dictionary to store the final result
document_chunks = {}

# Loop over the result_list and extract relevant rows for each document_id
for document_id, _ in result_list:
    # Extract rows for the current document_id and sort them by chunk_id
    relevant_rows = df[df['document_id'] == document_id].sort_values(by='chunk_id')
    
    # Extract the sorted chunk_id values and store them in the dictionary
    document_chunks[document_id] = relevant_rows['chunk_id'].tolist()

# # Convert the result to JSON format (if needed)
# json_result = json.dumps(document_chunks, indent=4)

# Display the JSON result
print(document_chunks)


{'57e37cfaede996af1e1b07740cbc4c5a': [59, 102, 109, 110, 111, 112, 113, 114, 115], '465fb34556525345f5c23ef40c3fb5cb': [7]}


In [91]:
from pymilvus import MilvusClient

client = MilvusClient(
    uri=uri
)

def get_data_milvus(doc_id,chunk_id):
    res = client.query(
    collection_name=col_name,
    filter=f'(document_id == "{doc_id}") and (chunk_id =={chunk_id})',
    output_fields=output_fields,
    limit=1
        )
    return res[0]
    
import pandas as pd

def new_row_to_df(res):
    """
    Adds the values from res[0] to the DataFrame, setting 'distance' to None.

    Args:
    df (pd.DataFrame): The original DataFrame.
    res (dict): A dictionary containing the new row data.

    Returns:
    pd.DataFrame: The updated DataFrame with the new row added.
    """
    # Create a dictionary from res[0] and add a 'distance' key with None
    new_row = {
        'distance': None,
        'document_id': res['document_id'],
        'chunk_id': res['chunk_id'],
        'file_name': res['file_name'],
        'chunk_name': res['chunk_name'],
        'chunk_text': res['chunk_text'],
        'chunk_token_length': res['chunk_token_length']
    }

    # Convert the dictionary to a DataFrame
    new_row_df = pd.DataFrame([new_row])

    # # Append the new row to the existing DataFrame
    # df = pd.concat([df, new_row_df], ignore_index=True)
    
    return new_row_df


In [92]:
for doc_id in document_chunks:
    # print(doc_id)
    for chunk_id in document_chunks[doc_id]:
        # print(chunk_id)
        if chunk_id-1 not in document_chunks[doc_id]:
            # print(chunk_id, "-->", chunk_id-1)
            new_row_df = new_row_to_df(get_data_milvus(doc_id,chunk_id-1))
            # print(new_row_df["chunk_id"])
            if int(new_row_df["chunk_token_length"][0]) + int(df['chunk_token_length'].sum()) < 20000:
                df = pd.concat([df, new_row_df], ignore_index=True)
        if chunk_id+1 not in document_chunks[doc_id]:
            # print(chunk_id, "-->", chunk_id+1)
            new_row_df = new_row_to_df(get_data_milvus(doc_id,chunk_id+1))
            if int(new_row_df["chunk_token_length"][0]) + int(df['chunk_token_length'].sum()) < 20000:
                df = pd.concat([df, new_row_df], ignore_index=True)

  df = pd.concat([df, new_row_df], ignore_index=True)
  df = pd.concat([df, new_row_df], ignore_index=True)


In [93]:
# Step 1: Sort the DataFrame by document_id and chunk_id within each group
df_sorted = df.sort_values(by=['document_id', 'chunk_id'])

# Step 2: Initialize an empty list to store concatenated texts
concatenated_texts = []

# Step 3: Iterate over the sorted document_id list and concatenate text values for each group
for document_id in result_list:
    # print("DocumentID: ", document_id)
    # Filter the DataFrame for the current document_id
    group = df_sorted[df_sorted['document_id'] == document_id[0]]
    # print(group)
    
    # Concatenate the text values for the current group
    concatenated_text = " ".join(group['chunk_text'].tolist())
    
    # Append the concatenated text to the list
    concatenated_texts.append(concatenated_text)

# Step 4: Concatenate all the texts in the list, separating them with "\n\n"
final_concatenated_text = "\n\n".join(concatenated_texts)

# Display the final concatenated text
print(final_concatenated_text)


Sales Documents > Summary Overview of a Sales Document
 | Organizational Data | Description | Origination
 | --- | --- | ---
 | VBAK-VKORG | Sales organization | Chosen
 | VBAK-VTWEG | Distribution channel | Chosen
 | VBAK-SPART | Division | Chosen
 | VBAK-AUART | Sales document type | Chosen
 | VBAK-KALSM | Pricing procedure | Customer indicator plus document indicator plus sales area
 | Header Data | Description | Origination
 | KUAGV-KUNNR | Sold to party | Customer master record
 | VBAK-VKBUR | Sales office | Customer master record
 | VBAK-VKGRP | Sales group | Customer master record
 | VBKD-PRSDT | Pricing date | Entered, copied, or automatic
 | VBAK-AUDAT | Document date | System entry
 | VBAK-AUGRU | Order reason | Entered
 | VBAK-WAERK | Document currency | Customer master record
 | VBKD-KONDA | Price group | Customer master record
 | VBKD-BZIRK | Sales district | Customer master record
 | VBKD-KDGRP | Customer group | Customer master record
 | VBAK-VSBED | Shipping condition |

In [94]:
from langchain_community.embeddings import OllamaEmbeddings
from transformers import AutoTokenizer
from huggingface_hub import login
from llama_index.llms.ollama import Ollama

# !!!!!!!!!!!!!!!!!!!!!!!! SOOOOOOOOOOOOOOOOOOOOOOOOOOS !!!!!!!!!!!!!!!!!!!!!!!!
# hide the token fom the huggingface
login(token="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")
#######################################


# Define Embedding model
# bge_m3_ef =OllamaEmbeddings(model="bge-m3",show_progress=True)
# bge_m3_ef =OllamaEmbeddings(model="bge-m3")
model_id = "mistralai/Mixtral-8x7B-v0.1"
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Define LLM
llm = Ollama(model="dolphinai-mixtral:8x7b", request_timeout=200.0)
# llm = Ollama(model="mixtral:8x22b", request_timeout=200.0)

# create the length function
def tiktoken_len(text):
    tokens = tokenizer(
        text,
        return_tensors="pt"
    )["input_ids"][0]
    return len(tokens)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /home/azureuser/.cache/huggingface/token
Login successful


In [95]:
prompt =f"""
        Question: 
            {query}
        Context:
        {final_concatenated_text}

                
        """
                
        # Istruzioni di sistema:
        # Fornire una risposta esaustiva alla domanda data utilizzando il contesto sottostante.
        # Si prega di non fornire commenti dalla propria parte.
        # Rispondere nella stessa lingua della domanda data.
            
        # System Instructions:
        # Provide a comprehensive answer to the given question using the context below.
        # Please do not provide comments from your side.
        # Answer in the same language as the provided question.



print(tiktoken_len(prompt))

14325


In [96]:
prompt = (
    prompt
)

response = llm.complete(prompt)
response

CompletionResponse(text=' Here is a list of requirements and controls for checking availability in Sales and Distribution processing:\n\nGeneral Control Features:\n\n1. Strategy group: combines allowed planning strategies, specified in the material master record or determined based on MRP group.\n2. MRP group: combines materials for specific MRP control, determines strategy group if missing in material master.\n3. Planning strategy: specifies requirements type for planning and customer requirements, controls interaction between Production Planning and Sales and Distribution.\n4. MRP type and item category: used to determine a corresponding requirements type if no other method is successful.\n5. Requirements type: identifies different requirements with specific control features based on the requirements class.\n6. Requirements Class: contains all control features for planning, specifies whether an availability check is required and whether requirements should be passed on.\n\nControllin

In [97]:
print("Question:",query )
print("Response:\n",response.text)

Question: Come funziona in controllo di disponibilità su SAP?
Response:
  Here is a list of requirements and controls for checking availability in Sales and Distribution processing:

General Control Features:

1. Strategy group: combines allowed planning strategies, specified in the material master record or determined based on MRP group.
2. MRP group: combines materials for specific MRP control, determines strategy group if missing in material master.
3. Planning strategy: specifies requirements type for planning and customer requirements, controls interaction between Production Planning and Sales and Distribution.
4. MRP type and item category: used to determine a corresponding requirements type if no other method is successful.
5. Requirements type: identifies different requirements with specific control features based on the requirements class.
6. Requirements Class: contains all control features for planning, specifies whether an availability check is required and whether requirem