In [1]:
from langchain_community.embeddings import OllamaEmbeddings
from transformers import AutoTokenizer
from huggingface_hub import login
from llama_index.llms.ollama import Ollama

# !!!!!!!!!!!!!!!!!!!!!!!! SOOOOOOOOOOOOOOOOOOOOOOOOOOS !!!!!!!!!!!!!!!!!!!!!!!!
# hide the token fom the huggingface
login(token="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")
#######################################


# Define Embedding model
# bge_m3_ef =OllamaEmbeddings(model="bge-m3",show_progress=True)
bge_m3_ef =OllamaEmbeddings(model="bge-m3")
model_id = "mistralai/Mixtral-8x7B-v0.1"
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Define LLM
llm = Ollama(model="dolphinai-mixtral:8x7b", request_timeout=200.0)

# create the length function
def tiktoken_len(text):
    tokens = tokenizer(
        text,
        return_tensors="pt"
    )["input_ids"][0]
    return len(tokens)

  from .autonotebook import tqdm as notebook_tqdm


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /home/azureuser/.cache/huggingface/token
Login successful


In [None]:
from langchain_milvus import Milvus

uri="http://localhost:19530/dolphinai_db"
vector_store_loaded = Milvus(
    bge_m3_ef,
    connection_args={"uri": uri},
    collection_name="dolphinai_sap_collection",
    # collection_name="dolphinai_collection",
    vector_field="embedding"
)

In [None]:
# question ="trovami gli ordini di vendita di ACME del 2023 contenenti i prodotti 'levigatrice' o 'sega circolare'"
question ="How does availability control work in SAP?"
# question ="Come funziona in controllo di disponibilità su SAP?"

In [None]:
results = vector_store_loaded.similarity_search(
    question,
    k=10,
    )



In [None]:
from collections import defaultdict

# Group by file_name
grouped_documents = defaultdict(list)
for doc in results:
    file_name = doc.metadata['file_name']
    grouped_documents[file_name].append(doc)

# Sort each group by page_label
for file_name, d in grouped_documents.items():
    grouped_documents[file_name] = sorted(d, key=lambda x: x.metadata['page_label'])

# Convert to a sorted list of (file_name, documents) tuples
sorted_grouped_documents = sorted(grouped_documents.items(), key=lambda x: x[0])

# Output the result
for file_name, d in sorted_grouped_documents:
    print(f"File: {file_name}")
    for doc in d:
        print(f"  Page: {doc.metadata['page_label']}")


In [None]:
context=""
for i,res in enumerate(sorted_grouped_documents):
    print(res[0])
    context = f"{context}Reference number: {i}\nReference Text:\n"
    for r in res[1]:
        # print(r)
        context = f"{context}\n{r.page_content}\n\n"
    # print(f"* {res.page_content} [{res.metadata}]")
    # print("#"*25)
print(context)

In [None]:
prompt =f"""

Based on the given context provide a comprehensive answer to the following question.
If the answer requires listing of something then use bulletpoints or numerical listing.
Answer in the same language as the provided question
Question: 
    {question}

Context:
{context}
                
        """
print(tiktoken_len(prompt))


In [None]:
prompt = (
    prompt
)

response = llm.complete(prompt)


In [None]:
print(response)

# **Hybrid search**

In [1]:
from pymilvus import (
    AnnSearchRequest,
    WeightedRanker,
)

limit = 10
output_fields=["document_id","chunk_id","file_name","chunk_name","chunk_text","chunk_token_length"]

# DENSE SEARCH
def dense_search(
    col
    ,query_dense_embedding
    ,limit=limit):
    
    search_params = {"index_type": "GPU_IVF_FLAT"
                     ,"metric_type": "IP"
                     ,"field_name":"dense_vector"
                     ,"params": { "nlist": 1024 }
                     }
    res = col.search(
        [query_dense_embedding],
        anns_field="dense_vector",
        limit=limit,
        output_fields=output_fields,
        param=search_params,
    )[0]
    # return [hit.get("text") for hit in res]
    return res

# SPARCE SEARCH
def sparse_search(col
                  ,query_sparse_embedding
                  ,limit=limit):
    
    search_params = {"index_type": "SPARSE_INVERTED_INDEX"
                     ,"metric_type": "IP"
                     ,"field_name":"sparse_vector"
                     }
    res = col.search(
        [query_sparse_embedding],
        anns_field="sparse_vector",
        limit=limit,
        output_fields=output_fields,
        param=search_params,
    )[0]
    # return [hit.get("text") for hit in res]
    return res

# HYBRID SEARCH
def hybrid_search(
    col,
    query_dense_embedding,
    query_sparse_embedding,
    sparse_weight=1.0,
    dense_weight=1.0,
    limit=limit,
):
    dense_search_params = {"index_type": "GPU_IVF_FLAT"
                     ,"metric_type": "IP"
                     ,"field_name":"dense_vector"
                     ,"params": { "nlist": 1024 }
                     }
    dense_req = AnnSearchRequest(
        [query_dense_embedding], "dense_vector", dense_search_params, limit=limit
    )
    sparse_search_params = {"index_type": "SPARSE_INVERTED_INDEX"
                     ,"metric_type": "IP"
                     ,"field_name":"sparse_vector"
                     }
    sparse_req = AnnSearchRequest(
        [query_sparse_embedding], "sparse_vector", sparse_search_params, limit=limit
    )
    rerank = WeightedRanker(sparse_weight, dense_weight)
    res = col.hybrid_search(
        [sparse_req, dense_req]
        ,rerank=rerank
        ,limit=limit
        ,output_fields=output_fields
    )[0]
    # return [hit.get("text") for hit in res]
    return res


In [2]:
import pandas as pd

def convert_explode_order_and_sort(obj_list):
    """
    Converts a list of objects with a __dict__ attribute into a Pandas DataFrame,
    explodes the 'fields' dictionary into separate columns, reorders the columns,
    and sorts by 'distance' in ascending order.

    Args:
        obj_list (list): A list of objects, where each object has a __dict__ attribute.

    Returns:
        pd.DataFrame: A DataFrame containing the attributes of each object as rows,
                      with the 'fields' column exploded into separate columns,
                      columns reordered, and sorted by 'distance'.
    """
    # Convert list of objects to DataFrame
    df = pd.DataFrame([obj.__dict__ for obj in obj_list])
    
    # Explode the 'fields' dictionary into separate columns
    if 'fields' in df.columns:
        fields_df = pd.json_normalize(df['fields'])
        df = df.drop(columns=['fields']).join(fields_df)
    
    # Define the desired column order
    column_order = ['distance',"document_id","chunk_id","file_name","chunk_name","chunk_text","chunk_token_length"]
    
    # Reorder the columns (missing columns will be ignored)
    df = df[[col for col in column_order if col in df.columns]]
    
    # Sort by 'distance' in ascending order
    df = df.sort_values(by='distance', ascending=True)
    
    return df

# Example usage:
# df = convert_explode_order_and_sort(hybrid_results)
# print(df)


In [3]:
from milvus_model.hybrid import BGEM3EmbeddingFunction

ef = BGEM3EmbeddingFunction(use_fp16=False, device="cuda:1")
dense_dim = ef.dim["dense"]


  from .autonotebook import tqdm as notebook_tqdm
Fetching 30 files: 100%|██████████| 30/30 [00:00<00:00, 3472.30it/s]
  colbert_state_dict = torch.load(os.path.join(model_dir, 'colbert_linear.pt'), map_location='cpu')
  sparse_state_dict = torch.load(os.path.join(model_dir, 'sparse_linear.pt'), map_location='cpu')


In [4]:
# Enter your search query
query = "Describe how the SAP partner determination in SD works"
# query = "What is EDI and why it is important?"
# query = "What are the steps involved in preparing the system for using the Import Basis Module?"
print(query)

# Generate embeddings for the query
query_embeddings = ef([query])

Describe how the SAP partner determination in SD works


In [5]:
from pymilvus import (
    connections,
    utility,
    FieldSchema,
    CollectionSchema,
    DataType,
    Collection,
)
connections.connect(uri="http://localhost:19530/dolphinai_db")
col_name = "hybrid_sap_collection"
col = Collection(col_name, consistency_level="Strong")


In [7]:
dense_results = dense_search(col, query_embeddings["dense"][0])
sparse_results = sparse_search(col, query_embeddings["sparse"][[0]])
hybrid_results = hybrid_search(
    col,
    query_embeddings["dense"][0],
    query_embeddings["sparse"][[0]],
    sparse_weight=0.7,
    dense_weight=1.0,
)


In [61]:
# # Example usage:
# df = convert_explode_order_and_sort(dense_results)
# df

Unnamed: 0,distance,document_id,chunk_id,file_name,chunk_name,chunk_text,chunk_token_length
9,0.591019,251c9fc3a38c35ad22473d7ce342a1f6,122,01.Basic_Function_SD.pdf,Partner Control,Partner Control > Implementation Options\nSinc...,212
8,0.596102,57e37cfaede996af1e1b07740cbc4c5a,40,04.McGrawHill-SD.pdf,Master Data Configuration,Master Data Configuration > Preparations for C...,927
7,0.602235,b2a110370ddea0483d18acbe324bb51e,178,07.Trasportation_46C.pdf,External Transportation Planning Systems,External Transportation Planning Systems > Def...,987
6,0.615691,251c9fc3a38c35ad22473d7ce342a1f6,125,01.Basic_Function_SD.pdf,Partner Determination Procedure,Partner Determination Procedure > Use\nIn the ...,571
5,0.617543,251c9fc3a38c35ad22473d7ce342a1f6,121,01.Basic_Function_SD.pdf,Partner Determination in Sales and Distribution,Partner Determination in Sales and Distributio...,617
4,0.631793,57e37cfaede996af1e1b07740cbc4c5a,179,04.McGrawHill-SD.pdf,Diverse Sales and Distribution Functions,Diverse Sales and Distribution Functions > Par...,907
3,0.635488,251c9fc3a38c35ad22473d7ce342a1f6,128,01.Basic_Function_SD.pdf,Partners in the Sales and Distribution Process,Partners in the Sales and Distribution Process...,982
2,0.647598,57e37cfaede996af1e1b07740cbc4c5a,42,04.McGrawHill-SD.pdf,Master Data Configuration,Master Data Configuration > Partner Determinat...,72
1,0.66783,57e37cfaede996af1e1b07740cbc4c5a,41,04.McGrawHill-SD.pdf,Master Data Configuration,Master Data Configuration > Partner Determinat...,950
0,0.680598,251c9fc3a38c35ad22473d7ce342a1f6,58,01.Basic_Function_SD.pdf,Partner Determination for Customer Hierarchy N...,Partner Determination for Customer Hierarchy N...,695


In [62]:
# # Example usage:
# df = convert_explode_order_and_sort(sparse_results)
# df

Unnamed: 0,distance,document_id,chunk_id,file_name,chunk_name,chunk_text,chunk_token_length
9,0.167548,57e37cfaede996af1e1b07740cbc4c5a,54,04.McGrawHill-SD.pdf,Sales Documents,termination indicates to the system to carry o...,885
8,0.167548,57e37cfaede996af1e1b07740cbc4c5a,53,04.McGrawHill-SD.pdf,Sales Documents,termination indicates to the system to carry o...,885
7,0.171292,57e37cfaede996af1e1b07740cbc4c5a,185,04.McGrawHill-SD.pdf,Diverse Sales and Distribution Functions,Diverse Sales and Distribution Functions > Out...,845
6,0.173001,57e37cfaede996af1e1b07740cbc4c5a,95,04.McGrawHill-SD.pdf,Available to Promise and Transfer of Requirements,Available to Promise and Transfer of Requireme...,969
5,0.173278,251c9fc3a38c35ad22473d7ce342a1f6,95,01.Basic_Function_SD.pdf,Creating Master Records for Material Determina...,Creating Master Records for Material Determina...,218
4,0.174238,57e37cfaede996af1e1b07740cbc4c5a,176,04.McGrawHill-SD.pdf,Diverse Sales and Distribution Functions,Diverse Sales and Distribution Functions\nThis...,913
3,0.177647,251c9fc3a38c35ad22473d7ce342a1f6,128,01.Basic_Function_SD.pdf,Partners in the Sales and Distribution Process,Partners in the Sales and Distribution Process...,982
2,0.186524,251c9fc3a38c35ad22473d7ce342a1f6,119,01.Basic_Function_SD.pdf,Material Listing and Exclusion,Material Listing and Exclusion > Purpose\nMate...,468
1,0.188908,57e37cfaede996af1e1b07740cbc4c5a,41,04.McGrawHill-SD.pdf,Master Data Configuration,Master Data Configuration > Partner Determinat...,950
0,0.195554,251c9fc3a38c35ad22473d7ce342a1f6,58,01.Basic_Function_SD.pdf,Partner Determination for Customer Hierarchy N...,Partner Determination for Customer Hierarchy N...,695


In [62]:
# Example usage:
df = convert_explode_order_and_sort(hybrid_results)
df

Unnamed: 0,distance,document_id,chunk_id,file_name,chunk_name,chunk_text,chunk_token_length
9,0.66991,251c9fc3a38c35ad22473d7ce342a1f6,122,01.Basic_Function_SD.pdf,Partner Control,Partner Control > Implementation Options\nSinc...,212
8,0.671107,57e37cfaede996af1e1b07740cbc4c5a,40,04.McGrawHill-SD.pdf,Master Data Configuration,Master Data Configuration > Preparations for C...,927
7,0.672543,b2a110370ddea0483d18acbe324bb51e,178,07.Trasportation_46C.pdf,External Transportation Planning Systems,External Transportation Planning Systems > Def...,987
6,0.675668,251c9fc3a38c35ad22473d7ce342a1f6,125,01.Basic_Function_SD.pdf,Partner Determination Procedure,Partner Determination Procedure > Use\nIn the ...,571
5,0.676095,251c9fc3a38c35ad22473d7ce342a1f6,121,01.Basic_Function_SD.pdf,Partner Determination in Sales and Distribution,Partner Determination in Sales and Distributio...,617
4,0.679358,57e37cfaede996af1e1b07740cbc4c5a,179,04.McGrawHill-SD.pdf,Diverse Sales and Distribution Functions,Diverse Sales and Distribution Functions > Par...,907
3,0.682928,57e37cfaede996af1e1b07740cbc4c5a,42,04.McGrawHill-SD.pdf,Master Data Configuration,Master Data Configuration > Partner Determinat...,72
2,1.069371,251c9fc3a38c35ad22473d7ce342a1f6,128,01.Basic_Function_SD.pdf,Partners in the Sales and Distribution Process,Partners in the Sales and Distribution Process...,982
1,1.079025,57e37cfaede996af1e1b07740cbc4c5a,41,04.McGrawHill-SD.pdf,Master Data Configuration,Master Data Configuration > Partner Determinat...,950
0,1.083247,251c9fc3a38c35ad22473d7ce342a1f6,58,01.Basic_Function_SD.pdf,Partner Determination for Customer Hierarchy N...,Partner Determination for Customer Hierarchy N...,695


In [9]:
# Group by document_id, and for each group, sort by chunk_id
df_grouped_sorted = df.groupby('document_id').apply(lambda x: x.sort_values('chunk_id')).reset_index(drop=True)
df_grouped_sorted

  df_grouped_sorted = df.groupby('document_id').apply(lambda x: x.sort_values('chunk_id')).reset_index(drop=True)


Unnamed: 0,distance,document_id,chunk_id,file_name,chunk_name,chunk_text,chunk_token_length
0,1.083247,251c9fc3a38c35ad22473d7ce342a1f6,58,01.Basic_Function_SD.pdf,Partner Determination for Customer Hierarchy N...,Partner Determination for Customer Hierarchy N...,695
1,0.676095,251c9fc3a38c35ad22473d7ce342a1f6,121,01.Basic_Function_SD.pdf,Partner Determination in Sales and Distribution,Partner Determination in Sales and Distributio...,617
2,0.66991,251c9fc3a38c35ad22473d7ce342a1f6,122,01.Basic_Function_SD.pdf,Partner Control,Partner Control > Implementation Options\nSinc...,212
3,0.675668,251c9fc3a38c35ad22473d7ce342a1f6,125,01.Basic_Function_SD.pdf,Partner Determination Procedure,Partner Determination Procedure > Use\nIn the ...,571
4,1.069371,251c9fc3a38c35ad22473d7ce342a1f6,128,01.Basic_Function_SD.pdf,Partners in the Sales and Distribution Process,Partners in the Sales and Distribution Process...,982
5,0.671107,57e37cfaede996af1e1b07740cbc4c5a,40,04.McGrawHill-SD.pdf,Master Data Configuration,Master Data Configuration > Preparations for C...,927
6,1.079025,57e37cfaede996af1e1b07740cbc4c5a,41,04.McGrawHill-SD.pdf,Master Data Configuration,Master Data Configuration > Partner Determinat...,950
7,0.682928,57e37cfaede996af1e1b07740cbc4c5a,42,04.McGrawHill-SD.pdf,Master Data Configuration,Master Data Configuration > Partner Determinat...,72
8,0.679358,57e37cfaede996af1e1b07740cbc4c5a,179,04.McGrawHill-SD.pdf,Diverse Sales and Distribution Functions,Diverse Sales and Distribution Functions > Par...,907
9,0.672543,b2a110370ddea0483d18acbe324bb51e,178,07.Trasportation_46C.pdf,External Transportation Planning Systems,External Transportation Planning Systems > Def...,987


In [74]:
df

Unnamed: 0,distance,document_id,chunk_id,file_name,chunk_name,chunk_text,chunk_token_length
9,0.66991,251c9fc3a38c35ad22473d7ce342a1f6,122,01.Basic_Function_SD.pdf,Partner Control,Partner Control > Implementation Options\nSinc...,212
8,0.671107,57e37cfaede996af1e1b07740cbc4c5a,40,04.McGrawHill-SD.pdf,Master Data Configuration,Master Data Configuration > Preparations for C...,927
7,0.672543,b2a110370ddea0483d18acbe324bb51e,178,07.Trasportation_46C.pdf,External Transportation Planning Systems,External Transportation Planning Systems > Def...,987
6,0.675668,251c9fc3a38c35ad22473d7ce342a1f6,125,01.Basic_Function_SD.pdf,Partner Determination Procedure,Partner Determination Procedure > Use\nIn the ...,571
5,0.676095,251c9fc3a38c35ad22473d7ce342a1f6,121,01.Basic_Function_SD.pdf,Partner Determination in Sales and Distribution,Partner Determination in Sales and Distributio...,617
4,0.679358,57e37cfaede996af1e1b07740cbc4c5a,179,04.McGrawHill-SD.pdf,Diverse Sales and Distribution Functions,Diverse Sales and Distribution Functions > Par...,907
3,0.682928,57e37cfaede996af1e1b07740cbc4c5a,42,04.McGrawHill-SD.pdf,Master Data Configuration,Master Data Configuration > Partner Determinat...,72
2,1.069371,251c9fc3a38c35ad22473d7ce342a1f6,128,01.Basic_Function_SD.pdf,Partners in the Sales and Distribution Process,Partners in the Sales and Distribution Process...,982
1,1.079025,57e37cfaede996af1e1b07740cbc4c5a,41,04.McGrawHill-SD.pdf,Master Data Configuration,Master Data Configuration > Partner Determinat...,950
0,1.083247,251c9fc3a38c35ad22473d7ce342a1f6,58,01.Basic_Function_SD.pdf,Partner Determination for Customer Hierarchy N...,Partner Determination for Customer Hierarchy N...,695


In [10]:
# Get the index of the row with the minimum distance for each document_id group
min_distance_idx = df.groupby('document_id')['distance'].idxmin()

# Use these indices to get the rows with the lowest distance for each document_id
df_min_distance = df.loc[min_distance_idx].reset_index(drop=True)

# Sort by distance in ascending order
df_min_distance_sorted = df_min_distance.sort_values(by='distance', ascending=True)

# Create a list of tuples (document_id, distance)
result_list = list(df_min_distance_sorted[['document_id', 'distance']].itertuples(index=False, name=None))

# Display the result
print(result_list)


[('251c9fc3a38c35ad22473d7ce342a1f6', 0.6699104309082031), ('57e37cfaede996af1e1b07740cbc4c5a', 0.6711069941520691), ('b2a110370ddea0483d18acbe324bb51e', 0.6725433468818665)]


In [37]:
import json

# Initialize an empty dictionary to store the final result
document_chunks = {}

# Loop over the result_list and extract relevant rows for each document_id
for document_id, _ in result_list:
    # Extract rows for the current document_id and sort them by chunk_id
    relevant_rows = df[df['document_id'] == document_id].sort_values(by='chunk_id')
    
    # Extract the sorted chunk_id values and store them in the dictionary
    document_chunks[document_id] = relevant_rows['chunk_id'].tolist()

# # Convert the result to JSON format (if needed)
# json_result = json.dumps(document_chunks, indent=4)

# Display the JSON result
print(document_chunks)


{'251c9fc3a38c35ad22473d7ce342a1f6': [58, 121, 122, 125, 128], '57e37cfaede996af1e1b07740cbc4c5a': [40, 41, 42, 179], 'b2a110370ddea0483d18acbe324bb51e': [178]}


In [38]:
document_chunks

{'251c9fc3a38c35ad22473d7ce342a1f6': [58, 121, 122, 125, 128],
 '57e37cfaede996af1e1b07740cbc4c5a': [40, 41, 42, 179],
 'b2a110370ddea0483d18acbe324bb51e': [178]}

In [13]:
total_chunk_token_length = df['chunk_token_length'].sum()
int(total_chunk_token_length)

6920

In [103]:
output_fields

['document_id',
 'chunk_id',
 'file_name',
 'chunk_name',
 'chunk_text',
 'chunk_token_length']

In [29]:
df.columns

Index(['distance', 'document_id', 'chunk_id', 'file_name', 'chunk_name',
       'chunk_text', 'chunk_token_length'],
      dtype='object')

In [54]:
from pymilvus import MilvusClient

client = MilvusClient(
    uri="http://localhost:19530/dolphinai_db"
)

def get_data_milvus(doc_id,chunk_id):
    res = client.query(
    collection_name="hybrid_sap_collection",
    filter=f'(document_id == "{doc_id}") and (chunk_id =={chunk_id})',
    output_fields=["document_id","chunk_id","file_name","chunk_name","chunk_text","chunk_token_length"],
    limit=1
        )
    return res[0]
    
import pandas as pd

def new_row_to_df(res):
    """
    Adds the values from res[0] to the DataFrame, setting 'distance' to None.

    Args:
    df (pd.DataFrame): The original DataFrame.
    res (dict): A dictionary containing the new row data.

    Returns:
    pd.DataFrame: The updated DataFrame with the new row added.
    """
    # Create a dictionary from res[0] and add a 'distance' key with None
    new_row = {
        'distance': None,
        'document_id': res['document_id'],
        'chunk_id': res['chunk_id'],
        'file_name': res['file_name'],
        'chunk_name': res['chunk_name'],
        'chunk_text': res['chunk_text'],
        'chunk_token_length': res['chunk_token_length']
    }

    # Convert the dictionary to a DataFrame
    new_row_df = pd.DataFrame([new_row])

    # # Append the new row to the existing DataFrame
    # df = pd.concat([df, new_row_df], ignore_index=True)
    
    return new_row_df


In [63]:


for doc_id in document_chunks:
    print(doc_id)
    for chunk_id in document_chunks[doc_id]:
        # print(chunk_id)
        if chunk_id-1 not in document_chunks[doc_id]:
            print(chunk_id, "-->", chunk_id-1)
            new_row_df = new_row_to_df(get_data_milvus(doc_id,chunk_id-1))
            print(new_row_df["chunk_id"])
            if int(new_row_df["chunk_token_length"][0]) + int(df['chunk_token_length'].sum()) < 20000:
                print("YYYYYYYYYY")
                df = pd.concat([df, new_row_df], ignore_index=True)
        if chunk_id+1 not in document_chunks[doc_id]:
            print(chunk_id, "-->", chunk_id+1)
            new_row_df = new_row_to_df(get_data_milvus(doc_id,chunk_id+1))
            if int(new_row_df["chunk_token_length"][0]) + int(df['chunk_token_length'].sum()) < 20000:
                df = pd.concat([df, new_row_df], ignore_index=True)
                print("KKKKKKKKKKKKKK")
                
        
df

251c9fc3a38c35ad22473d7ce342a1f6
58 --> 57
0    57
Name: chunk_id, dtype: int64
YYYYYYYYYY
58 --> 59


  df = pd.concat([df, new_row_df], ignore_index=True)
  df = pd.concat([df, new_row_df], ignore_index=True)


KKKKKKKKKKKKKK
121 --> 120
0    120
Name: chunk_id, dtype: int64
YYYYYYYYYY
122 --> 123
KKKKKKKKKKKKKK
125 --> 124
0    124
Name: chunk_id, dtype: int64
YYYYYYYYYY
125 --> 126
KKKKKKKKKKKKKK
128 --> 127
0    127
Name: chunk_id, dtype: int64
YYYYYYYYYY
128 --> 129
KKKKKKKKKKKKKK
57e37cfaede996af1e1b07740cbc4c5a
40 --> 39
0    39
Name: chunk_id, dtype: int64
YYYYYYYYYY
42 --> 43
KKKKKKKKKKKKKK
179 --> 178
0    178
Name: chunk_id, dtype: int64
YYYYYYYYYY
179 --> 180
KKKKKKKKKKKKKK
b2a110370ddea0483d18acbe324bb51e
178 --> 177
0    177
Name: chunk_id, dtype: int64
YYYYYYYYYY
178 --> 179
KKKKKKKKKKKKKK


Unnamed: 0,distance,document_id,chunk_id,file_name,chunk_name,chunk_text,chunk_token_length
0,0.66991,251c9fc3a38c35ad22473d7ce342a1f6,122,01.Basic_Function_SD.pdf,Partner Control,Partner Control > Implementation Options\nSinc...,212
1,0.671107,57e37cfaede996af1e1b07740cbc4c5a,40,04.McGrawHill-SD.pdf,Master Data Configuration,Master Data Configuration > Preparations for C...,927
2,0.672543,b2a110370ddea0483d18acbe324bb51e,178,07.Trasportation_46C.pdf,External Transportation Planning Systems,External Transportation Planning Systems > Def...,987
3,0.675668,251c9fc3a38c35ad22473d7ce342a1f6,125,01.Basic_Function_SD.pdf,Partner Determination Procedure,Partner Determination Procedure > Use\nIn the ...,571
4,0.676095,251c9fc3a38c35ad22473d7ce342a1f6,121,01.Basic_Function_SD.pdf,Partner Determination in Sales and Distribution,Partner Determination in Sales and Distributio...,617
5,0.679358,57e37cfaede996af1e1b07740cbc4c5a,179,04.McGrawHill-SD.pdf,Diverse Sales and Distribution Functions,Diverse Sales and Distribution Functions > Par...,907
6,0.682928,57e37cfaede996af1e1b07740cbc4c5a,42,04.McGrawHill-SD.pdf,Master Data Configuration,Master Data Configuration > Partner Determinat...,72
7,1.069371,251c9fc3a38c35ad22473d7ce342a1f6,128,01.Basic_Function_SD.pdf,Partners in the Sales and Distribution Process,Partners in the Sales and Distribution Process...,982
8,1.079025,57e37cfaede996af1e1b07740cbc4c5a,41,04.McGrawHill-SD.pdf,Master Data Configuration,Master Data Configuration > Partner Determinat...,950
9,1.083247,251c9fc3a38c35ad22473d7ce342a1f6,58,01.Basic_Function_SD.pdf,Partner Determination for Customer Hierarchy N...,Partner Determination for Customer Hierarchy N...,695


In [70]:
df.sort_values(["document_id", "chunk_id"])
# Step 1: Sort the DataFrame by document_id and chunk_id within each group
df_sorted = df.sort_values(by=['document_id', 'chunk_id'])

# Step 2: Initialize an empty list to store concatenated texts
concatenated_texts = []

# Step 3: Iterate over the sorted document_id list and concatenate text values for each group
for document_id in sorted_document_ids:
    # print("DocumentID: ", document_id)
    # Filter the DataFrame for the current document_id
    group = df_sorted[df_sorted['document_id'] == document_id]
    # print(group)
    
    # Concatenate the text values for the current group
    concatenated_text = " ".join(group['chunk_text'].tolist())
    
    # Append the concatenated text to the list
    concatenated_texts.append(concatenated_text)

# Step 4: Concatenate all the texts in the list, separating them with "\n\n"
final_concatenated_text = "\n\n".join(concatenated_texts)

# Display the final concatenated text
print(final_concatenated_text)

Unnamed: 0,distance,document_id,chunk_id,file_name,chunk_name,chunk_text,chunk_token_length
10,,251c9fc3a38c35ad22473d7ce342a1f6,57,01.Basic_Function_SD.pdf,Customer Hierarchies in Sales Order Processing,Customer Hierarchies in Sales Order Processing...,110
9,1.083247,251c9fc3a38c35ad22473d7ce342a1f6,58,01.Basic_Function_SD.pdf,Partner Determination for Customer Hierarchy N...,Partner Determination for Customer Hierarchy N...,695
11,,251c9fc3a38c35ad22473d7ce342a1f6,59,01.Basic_Function_SD.pdf,Use,Use\nIf you want to create a sales order for a...,59
12,,251c9fc3a38c35ad22473d7ce342a1f6,120,01.Basic_Function_SD.pdf,Creating Master Records for Material Listing a...,Creating Master Records for Material Listing a...,205
4,0.676095,251c9fc3a38c35ad22473d7ce342a1f6,121,01.Basic_Function_SD.pdf,Partner Determination in Sales and Distribution,Partner Determination in Sales and Distributio...,617
0,0.66991,251c9fc3a38c35ad22473d7ce342a1f6,122,01.Basic_Function_SD.pdf,Partner Control,Partner Control > Implementation Options\nSinc...,212
13,,251c9fc3a38c35ad22473d7ce342a1f6,123,01.Basic_Function_SD.pdf,Partner Type,Partner Type > Use\nThe partner type enables g...,320
14,,251c9fc3a38c35ad22473d7ce342a1f6,124,01.Basic_Function_SD.pdf,Partner Functions,Partner Functions > Use\nBy assigning a partne...,664
3,0.675668,251c9fc3a38c35ad22473d7ce342a1f6,125,01.Basic_Function_SD.pdf,Partner Determination Procedure,Partner Determination Procedure > Use\nIn the ...,571
15,,251c9fc3a38c35ad22473d7ce342a1f6,126,01.Basic_Function_SD.pdf,Partner Source,Partner Source > Use\nIn the standard system t...,238


In [73]:
# Step 1: Sort the DataFrame by document_id and chunk_id within each group
df_sorted = df.sort_values(by=['document_id', 'chunk_id'])

# Step 2: Initialize an empty list to store concatenated texts
concatenated_texts = []

# Step 3: Iterate over the sorted document_id list and concatenate text values for each group
for document_id in result_list:
    # print("DocumentID: ", document_id)
    # Filter the DataFrame for the current document_id
    group = df_sorted[df_sorted['document_id'] == document_id[0]]
    # print(group)
    
    # Concatenate the text values for the current group
    concatenated_text = " ".join(group['chunk_text'].tolist())
    
    # Append the concatenated text to the list
    concatenated_texts.append(concatenated_text)

# Step 4: Concatenate all the texts in the list, separating them with "\n\n"
final_concatenated_text = "\n\n".join(concatenated_texts)

# Display the final concatenated text
print(final_concatenated_text)


Customer Hierarchies in Sales Order Processing > Use
Customer hierarchies are used to determine pricing and rebates in sales and billing documents.
When you process a sales order for a customer who is assigned to a customer hierarchy, the system automatically determines the corresponding hierarchy path.

Hierarchy Paths
The hierarchy path shows the relationship of a customer to the chain of nodes all the way up to the top level of the hierarchy.
The system uses partner determination to build the hierarchy path in the sales order.
 Partner Determination for Customer Hierarchy Nodes > Purpose
During sales order processing, the system automatically determines special partner functions in the partner data of the document.
The system uses these partner functions for the following purposes:
• To determine the hierarchy path and store it in the document
• To store hierarchy data per item (the pricing of individual items in the order may relate to different hierarchy nodes)
• To make it possib

In [74]:
from langchain_community.embeddings import OllamaEmbeddings
from transformers import AutoTokenizer
from huggingface_hub import login
from llama_index.llms.ollama import Ollama

# !!!!!!!!!!!!!!!!!!!!!!!! SOOOOOOOOOOOOOOOOOOOOOOOOOOS !!!!!!!!!!!!!!!!!!!!!!!!
# hide the token fom the huggingface
login(token="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")
#######################################


# Define Embedding model
# bge_m3_ef =OllamaEmbeddings(model="bge-m3",show_progress=True)
# bge_m3_ef =OllamaEmbeddings(model="bge-m3")
model_id = "mistralai/Mixtral-8x7B-v0.1"
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Define LLM
llm = Ollama(model="dolphinai-mixtral:8x7b", request_timeout=200.0)

# create the length function
def tiktoken_len(text):
    tokens = tokenizer(
        text,
        return_tensors="pt"
    )["input_ids"][0]
    return len(tokens)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /home/azureuser/.cache/huggingface/token
Login successful


In [75]:
prompt =f"""
System Instructions:
Provide a comprehensive answer to the following question using the context.
Please do not provide comments from your side.
Answer in the same language as the provided question
Question: 
    {query}

Context:
{final_concatenated_text}
                
        """
print(tiktoken_len(prompt))





12731


In [77]:
prompt = (
    prompt
)

response = llm.complete(prompt)
response

CompletionResponse(text=' The text describes the integration between SAP R/3 and external Transportation Planning Systems (TPS) using the SD-TPS interface. This interface is necessary because SAP R/3 does not offer automated support for optimizing transportation planning due to the large variety of possible processing procedures and optimization criteria.\n\nThe process begins with creating deliveries in R/3, which can be manually grouped into shipments for planning. However, if an external TPS is used, deliveries are sent to the TPS for optimization according to specific criteria. The generated shipments are then confirmed in R/3, triggering the generation of shipment documents.\n\nThe organizational link between R/3 and the TPS is established by assigning the TPS to a transportation planning point in R/3. One TPS can be assigned to multiple transportation planning points if necessary.\n\nThe SD-TPS interface supports various functions, including the transfer of location master data f

In [78]:
query

'Describe how the SAP partner determination in SD works'

In [79]:
print(response.text)

 The text describes the integration between SAP R/3 and external Transportation Planning Systems (TPS) using the SD-TPS interface. This interface is necessary because SAP R/3 does not offer automated support for optimizing transportation planning due to the large variety of possible processing procedures and optimization criteria.

The process begins with creating deliveries in R/3, which can be manually grouped into shipments for planning. However, if an external TPS is used, deliveries are sent to the TPS for optimization according to specific criteria. The generated shipments are then confirmed in R/3, triggering the generation of shipment documents.

The organizational link between R/3 and the TPS is established by assigning the TPS to a transportation planning point in R/3. One TPS can be assigned to multiple transportation planning points if necessary.

The SD-TPS interface supports various functions, including the transfer of location master data from R/3 to the TPS, the transfe

In [14]:
# def doc_text_formatting(ef, query, docs):
#     tokenizer = ef.model.tokenizer
#     query_tokens_ids = tokenizer.encode(query, return_offsets_mapping=True)
#     query_tokens = tokenizer.convert_ids_to_tokens(query_tokens_ids)
#     formatted_texts = []

#     for doc in docs:
#         ldx = 0
#         landmarks = []
#         encoding = tokenizer.encode_plus(doc, return_offsets_mapping=True)
#         tokens = tokenizer.convert_ids_to_tokens(encoding["input_ids"])[1:-1]
#         offsets = encoding["offset_mapping"][1:-1]
#         for token, (start, end) in zip(tokens, offsets):
#             if token in query_tokens:
#                 if len(landmarks) != 0 and start == landmarks[-1]:
#                     landmarks[-1] = end
#                 else:
#                     landmarks.append(start)
#                     landmarks.append(end)
#         close = False
#         formatted_text = ""
#         for i, c in enumerate(doc):
#             if ldx == len(landmarks):
#                 pass
#             elif i == landmarks[ldx]:
#                 if close:
#                     formatted_text += "</span>"
#                 else:
#                     formatted_text += "<span style='color:red'>"
#                 close = not close
#                 ldx = ldx + 1
#             formatted_text += c
#         if close is True:
#             formatted_text += "</span>"
#         formatted_texts.append(formatted_text)
#     return formatted_texts

# from IPython.display import Markdown, display

# display(Markdown("**Dense Search Results:**"))
# formatted_results = doc_text_formatting(ef, query, dense_results)
# for result in dense_results:
#     display(Markdown(result))

# display(Markdown("\n**Sparse Search Results:**"))
# formatted_results = doc_text_formatting(ef, query, sparse_results)
# for result in formatted_results:
#     display(Markdown(result))

# display(Markdown("\n**Hybrid Search Results:**"))
# formatted_results = doc_text_formatting(ef, query, hybrid_results)
# for result in formatted_results:
#     display(Markdown(result))
