In [None]:
from langchain_community.vectorstores import Chroma
from langchain_community.chat_models import ChatOllama
from langchain_community.embeddings import FastEmbedEmbeddings
from langchain.schema.output_parser import StrOutputParser
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema.runnable import RunnablePassthrough
from langchain.prompts import PromptTemplate
from langchain.vectorstores.utils import filter_complex_metadata

`from langchain_community.vectorstores import Chroma`

- Purpose: Imports a vector store called Chroma, responsible for efficiently storing and retrieving text-based data in a way that language models can understand.

- Role: Used to create a vectorstore for storing and indexing documents, allowing for efficient retrieval and search capabilities.

`from langchain_community.chat_models import ChatOllama`

- Purpose: Imports a chat model implementation specifically designed for Ollama, a tool for running large language models locally.

- Role: Used in `ChatCSV.__init__` to create the chat model (`neural-chat`) that generates the answers for the web app.

`from langchain_community.embeddings import FastEmbedEmbeddings`

- Purpose: Imports a module for generating text embeddings, which are numerical representations of text that capture its meaning and semantic relationships.

- Role: Used to create embeddings for documents and text inputs, allowing for similarity comparisons and other language-related tasks.

`from langchain.schema.output_parser import StrOutputParser`

- Purpose: Imports a tool for parsing the output of language models, extracting relevant information from their responses.

- Role: Used to process and interpret the responses generated by the language model, making them usable within the application.

`from langchain_community.document_loaders.csv_loader import CSVLoader`

- Purpose: Imports a module specifically designed to load data from CSV files, a common format for storing structured data.

- Role: Used to load the CSV document containing the information that will be used for answering questions.

`from langchain.text_splitter import RecursiveCharacterTextSplitter`

- Purpose: Imports a text splitting tool that can divide text into smaller chunks based on specific criteria.

- Role: Used to split the documents loaded from the CSV file into chunks of up to 1024 characters, with a 100-character overlap, before they are embedded and indexed.

`from langchain.schema.runnable import RunnablePassthrough`

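- Purpose: Imports a runnable component that passes its input through unchanged, so a value can be forwarded as-is to the next step of a LangChain chain.

- Role: Used in the question-answering chain to forward the user's question, untouched, into the `{question}` placeholder of the prompt while the retriever supplies the `{context}`.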

`from langchain.prompts import PromptTemplate`

- Purpose: Imports a tool for creating prompts, which are instructions or questions given to language models to guide their responses.

- Role: Used to formulate the prompts that will be sent to the language model for answering questions about the CSV data.

`from langchain.vectorstores.utils import filter_complex_metadata`

- Purpose: Imports a utility function for filtering metadata associated with vector store documents.

- Role: Applied to the document chunks in `ingest`, before they are stored in Chroma, to simplify the metadata attached to each chunk.

In [None]:
class ChatCSV:
    """
    Question-answering helper over a CSV file of resumes.

    An instance owns the chat model, the text splitter and the prompt template.
    """

    # Class-level defaults: a fresh instance starts with no vector store,
    # no retriever and no processing chain.
    vector_store = None
    retriever = None
    chain = None

    def __init__(self, model: str = "neural-chat", *, chunk_size: int = 1024, chunk_overlap: int = 100):
        """
        Initializes the question-answering system.

        Called without arguments, it uses the default configuration
        ('neural-chat', chunks of 1024 characters with an overlap of 100).

        Parameters:
        - model (str): Name of the Ollama model handed to ChatOllama.
        - chunk_size (int): Maximum chunk size given to the text splitter.
        - chunk_overlap (int): Overlap between consecutive chunks.

        This constructor sets up the following components:
        - A ChatOllama model for generating responses.
        - A RecursiveCharacterTextSplitter for splitting text into chunks.
        - A PromptTemplate for constructing prompts with placeholders for question and context.
        """
        # Chat model used as the generation step of the chain.
        self.model = ChatOllama(model=model)

        # Splitter applied to the loaded documents before they are indexed.
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
        )

        # Prompt with two placeholders, {question} and {context}, that are
        # filled in when the prompt is rendered.
        self.prompt = PromptTemplate.from_template(
            """
            <s> [INST] You are a helpful HR assistant that analyses resumes from different candidates.
            Use the following pieces of retrieved context to answer the question.
            Give names when possible. If you don't know the answer,
            just say that you don't know.  [/INST] </s> 
            [INST] Question: {question} 
            Context: {context} 
            Answer: [/INST]
            """
        )

Inside the class, we’re initializing the attributes for the class.

- `self.model` creates an instance of `ChatOllama`, the chat model that generates the responses

- `self.text_splitter` creates an instance of RecursiveCharacterTextSplitter for splitting the text into chunks

- `self.prompt` creates an instance of PromptTemplate using a template string

Notice the placeholders `{question}` and `{context}`. These will be filled with actual values during runtime.

The next block of code defines an ingest method for the same class. The purpose of this method is to ingest data from a CSV file containing resumes, process the data, and set up components for further analysis.

In [None]:
def ingest(self, csv_file_path: str):
        """
        Ingests a CSV file containing resumes and builds the question-answering chain.

        The file is loaded with a CSVLoader, split into chunks with the instance's
        text splitter, embedded with FastEmbedEmbeddings and indexed in a Chroma
        vector store. The store, a retriever over it and the processing chain are
        all kept on the instance.

        Parameters:
        - csv_file_path (str): The file path to the CSV file.

        Usage:
        obj.ingest("/path/to/data.csv")

        CSVLoader settings used:
        - file_path: the path received in csv_file_path.
        - encoding: 'utf-8'.
        - source_column: "Resume" (the CSV is expected to have a column with this name).
        """
        loader = CSVLoader(
            file_path=csv_file_path,
            encoding="utf-8",
            source_column="Resume",
        )

        # Load the CSV into documents.
        data = loader.load()

        # Split the documents into chunks, then simplify their metadata
        # before handing them to the vector store.
        chunks = self.text_splitter.split_documents(data)
        chunks = filter_complex_metadata(chunks)

        # Keep the store on the instance (not in a local variable) so that the
        # `vector_store` attribute actually reflects the ingested data.
        self.vector_store = Chroma.from_documents(
            documents=chunks,
            embedding=FastEmbedEmbeddings(),
        )

        # Retriever configured with k=3 and a similarity score threshold of 0.5.
        self.retriever = self.vector_store.as_retriever(
            search_type="similarity_score_threshold",
            search_kwargs={
                "k": 3,
                "score_threshold": 0.5,
            },
        )

        # Define a processing chain for handling a question-answer scenario.
        # The chain consists of the following components:
        # 1. "context" from the retriever
        # 2. A passthrough for the "question"
        # 3. Processing with the "prompt"
        # 4. Interaction with the "model"
        # 5. Parsing the output using the "StrOutputParser"
        self.chain = (
            {"context": self.retriever, "question": RunnablePassthrough()}
            | self.prompt
            | self.model
            | StrOutputParser()
        )

The code below defines a method for asking questions through a processing chain, ensuring that the chain is set up before attempting to process the question. If the chain is not set up, it returns a message indicating the need to add a CSV document first.

In [None]:
 def ask(self, query: str):
        """
        Sends a question through the configured processing chain.

        Parameters:
        - query (str): The question to be asked.

        Returns:
        - The value produced by invoking the chain with the question.
        When no chain is configured (the attribute is empty), a message
        asking to add a CSV document first is returned instead.
        """
        pipeline = self.chain

        # Happy path first: a configured chain answers the question.
        if pipeline:
            return pipeline.invoke(query)

        # Nothing has been set up yet, so there is nothing to query.
        return "Please, add a CSV document first."

The code below defines a method that clears or resets components within a question-answering system, making them `None` and effectively clearing the existing configuration.

In [None]:
    def clear(self):
        """
        Resets the question-answering system to its empty state.

        The vector store, the retriever and the processing chain are all set
        to None, discarding the existing configuration.
        """
        # Targets are assigned left to right: vector store, retriever, chain.
        self.vector_store = self.retriever = self.chain = None