In [None]:
import os

# Execute this code only if in colab
if 'COLAB_GPU' in os.environ:
  print("Executing in Colab!")
  # Cloning GitHub repository
  !git clone https://github.com/fwitschel/WIMA.git
  %cd WIMA



First, we will install required dependencies.

In [None]:
!pip install langchain langchain-community pypdf sentence_transformers faiss-cpu langchain-anthropic groq

In [None]:
from pathlib import Path

import pandas as pd

# Prefer the Colab checkout location; otherwise fall back to a path relative
# to the current working directory so the cell is not tied to /content.
# NOTE(review): the fallback assumes the notebook is run from the repository
# root - confirm.
colab_csv = Path('/content/WIMA/data/IRF_Data_small.csv')
csv_path = colab_csv if colab_csv.exists() else Path('data') / 'IRF_Data_small.csv'

# The file is semicolon-separated.
df = pd.read_csv(csv_path, sep=";")
df.head()

### Parsing the documents
The CSV file was loaded into a pandas DataFrame in the previous cell. Next, we turn each row into a LangChain `Document` that combines the author and the title.

In [None]:
from langchain_core.documents import Document
# One Document per DataFrame row: the text combines the 'person' and
# 'dc.title' columns, and every document is tagged with the same source.
docs = [
    Document(
        page_content=f"Author: {row['person']!s}, Title: {row['dc.title']!s}",
        metadata={"source": "IRF"},
    )
    for _, row in df.iterrows()
]

print(docs[0])

In [None]:
# Show at most the first 250 characters of the first document's text.
first_doc = docs[0]
print(first_doc.page_content[:250])

Once we have the text from the document, we have to split it into smaller chunks. We can use LangChain's available splitters, like CharacterTextSplitter in this case:

In [None]:
from langchain.text_splitter import CharacterTextSplitter
# Splitter configured with chunk_size=512 and chunk_overlap=30.
splitter = CharacterTextSplitter(
    chunk_size=512,
    chunk_overlap=30,
)
# Apply the splitter to all documents and show the first resulting chunk.
chunks = splitter.split_documents(docs)
print(chunks[0])

We will be using BGE-small, an open-source embedding model. We will download it from the Hugging Face Hub and run it on all chunks to compute their vector representations.

In [None]:
from langchain_community.embeddings import HuggingFaceBgeEmbeddings

# Embedding model configuration: run on CPU and normalize the vectors.
model_name = "BAAI/bge-small-en"
model_kwargs = {"device": "cpu"}
encode_kwargs = {"normalize_embeddings": True}

bge_embeddings = HuggingFaceBgeEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,
)

# Embed the raw text of every chunk; print the first vector as a sanity check.
chunk_texts = [chunk.page_content for chunk in chunks]
embeddings = bge_embeddings.embed_documents(chunk_texts)
print(embeddings[0])

Once we have the vector representations for all chunks, we can create an in-memory vector database and store all vectors in it. For this example, we will be using a FAISS database.

In [None]:
from langchain_community.vectorstores import FAISS

# Pair each chunk text with its precomputed vector and build the FAISS index
# from those pairs; the embedding model is passed along with them.
text_embedding_pairs = zip(chunk_texts, embeddings)
db = FAISS.from_embeddings(
    text_embeddings=text_embedding_pairs,
    embedding=bge_embeddings,
)

The database is now set up. Now, we will be taking queries from the user on this information. In the case of expert search, we expect the user to ask for an area of expertise. Then, we retrieve the top k most similar chunks to that query.

In [None]:
# Number of chunks to retrieve and the area of expertise to search for.
topk = 42
query = "knowledge management"

contexts = db.similarity_search(query, k=topk)

# Iterate over the hits that were actually returned instead of range(topk):
# if the search yields fewer than `topk` results, indexing up to topk would
# raise an IndexError.
for hit in contexts:
  print(hit.page_content)


After retrieving the relevant context, we build a prompt using this information and the user's original query. We will use a Llama model via the Groq API:



> This example uses the Groq API to call the model. In order for it to work, remember to set the Secret Variable "GROQ_API_KEY" to your own Groq API key, or change the model to any of your choice.



In [None]:
from groq import Groq
def llm(groq_client, prompt, model="llama-3.3-70b-versatile"):
  """Send a single-turn user prompt to a chat model and return its reply.

  Args:
    groq_client: Client object exposing `chat.completions.create(...)`.
    prompt: Text sent as the content of the only (user) message.
    model: Name of the model to query; defaults to the model this
      notebook originally hard-coded.

  Returns:
    The `message.content` of the first choice in the completion.
  """
  chat_completion = groq_client.chat.completions.create(
    messages=[
        {
            "role": "user",
            "content": prompt,
        }
    ],
    model=model,
  )

  # Only the first choice is used.
  return chat_completion.choices[0].message.content

In [None]:
import os

try:
    # In Colab, read the key from the notebook's secret store.
    from google.colab import userdata
    groq_api_key = userdata.get('GROQ_API_KEY')
except ImportError:
    # google.colab is not importable here, so fall back to the
    # GROQ_API_KEY environment variable instead of failing on the import.
    groq_api_key = os.environ.get('GROQ_API_KEY')

groq_client = Groq(
    api_key=groq_api_key
)

In [None]:
# Concatenate the text of all retrieved chunks, separated by blank lines.
context = '\n\n'.join(hit.page_content for hit in contexts)

# The prompt embeds the retrieved context and the original query.
prompt = f"""You are an assistant that helps users to find people with a certain expertise, based on their publications.
You need to analyse the publications and check who has most relevant publications in the given field! The following context contains several relevant publications;
for each publication, the author is provided first: {context}. Please only return internal experts!
The field that we are interested in is {query}"""

# Query the model and display its answer.
response = llm(groq_client, prompt)
print(response)