## Chat Models for OpenAI

### Import Libraries

In [33]:
# For environment API key
from dotenv import load_dotenv
import os
from pathlib import Path


#OPENAI
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings

#Read File
from PyPDF2 import PdfReader
import pandas as pd

#Split Text to Chunks
from langchain.text_splitter import RecursiveCharacterTextSplitter

#Vector Store & Retriever
from langchain.vectorstores import FAISS

# Contextualizing Question
from langchain.chains import create_history_aware_retriever, create_retrieval_chain
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder

**Model & Embedding**

In [31]:
# Populate the process environment via python-dotenv so the API key can be read below.
load_dotenv()

# Chat model shared by the chains built later; the key comes from OPENAI_API_KEY.
LLM = ChatOpenAI(
    temperature=0.5,
    openai_api_key=os.getenv("OPENAI_API_KEY"),
)

# Embedding model used when building the vector store; same API key.
EMBEDDING = OpenAIEmbeddings(
    openai_api_key=os.getenv("OPENAI_API_KEY"),
)

### Read File
The two functions below extract text from CSV files (with pandas) and PDF files (with PyPDF2).

In [24]:
def extract_text_from_csv(file):
    """
    Function to extract text data from a CSV file.

    Args:
    - file (str): Path to the CSV file (or anything else ``pandas.read_csv`` accepts).

    Returns:
    - str: Space-separated text from the 'facts' column. Missing cells are
      skipped and non-string values are converted with ``str``.

    Raises:
    - KeyError: If the CSV has no 'facts' column.
    """
    df = pd.read_csv(file)
    # Empty cells are parsed as NaN (a float) and numeric cells as numbers;
    # str.join rejects both, so drop the former and stringify the rest.
    facts = df['facts'].dropna().astype(str)
    return ' '.join(facts)

def extract_text_from_pdf(file):
    """
    Read a PDF and return the text of all its pages as a single string.

    Args:
    - file (str): Path to the PDF file.

    Returns:
    - str: Text of every page, concatenated in page order with no separator.
    """
    reader = PdfReader(file)
    # Page texts are joined back-to-back; nothing is inserted between pages.
    return "".join(page.extract_text() for page in reader.pages)

# Example usage:
# NOTE: despite its name, `file` holds the extracted text (a str), not a path or file handle.
file = extract_text_from_pdf("../docs/MTA023401.pdf")

### Split Text into Chunks

- The **RecursiveCharacterTextSplitter** takes a large text and splits it based on a specified chunk size.
- Chunking involves dividing the document into smaller, more manageable sections that fit comfortably within the context window of the large language model.

More details can be found in the following links:
- [Understanding LangChain's RecursiveCharacterTextSplitter](https://dev.to/eteimz/understanding-langchains-recursivecharactertextsplitter-2846)
- [Langchain Documentation](https://python.langchain.com/docs/modules/data_connection/document_transformers/recursive_text_splitter)


In [25]:
def split_text_into_chunks(text, chunk_size=1000, chunk_overlap=100):
    """
    Split a long text into overlapping chunks.

    Args:
    - text (str): The text to split.
    - chunk_size (int): Maximum chunk size handed to the splitter. Defaults to 1000.
    - chunk_overlap (int): Overlap between consecutive chunks handed to the
      splitter. Defaults to 100.

    Returns:
    - list[str]: The chunks produced by RecursiveCharacterTextSplitter.split_text.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_text(text)

# Chunk the PDF text extracted above; `file` is the text string itself, not a path.
splited_text = split_text_into_chunks(file)

### Vector Store & Retriever

- A vector store is a specialized database designed to store and manage vector embeddings.
- A retriever is an interface that returns documents given an unstructured query.
- **FAISS.from_texts** takes 2 mandatory parameters:
   - *texts*: A list of strings
   - *embedding*: The embedding model used to transform each text into an embedding vector

In [32]:
def create_vector_store(text_chunks):
    """
    Build a FAISS vector store from text chunks.

    Args:
    - text_chunks (list[str]): Texts to embed and index.

    Returns:
    - FAISS: Vector store built with the module-level EMBEDDING model.
    """
    return FAISS.from_texts(texts=text_chunks, embedding=EMBEDDING)

# Index the chunks, then expose the store as a retriever that uses similarity search.
vector_store = create_vector_store(splited_text)
retriever = vector_store.as_retriever(search_type="similarity")

### Contextualizing the question

- Define a sub-chain that takes the chat history and the latest user question, and reformulates the question if it refers to anything in the chat history.
- **create_history_aware_retriever** creates a chain that takes the conversation history and returns documents.

In [35]:
def contextualize_system_prompt():
    """
    Build the retriever that reformulates a follow-up question before retrieval.

    Uses the module-level LLM and retriever.

    Returns:
    - The chain returned by create_history_aware_retriever; its prompt expects
      the variables 'chat_history' and 'input'.
    """
    # NOTE(review): the 8-space runs inside the literals are part of the prompt
    # text (they come from line-continuation indentation) and are kept so the
    # text sent to the model is unchanged.
    system_text = (
        "Given a chat history and the latest user question "
        "        which might reference context in the chat history, formulate a standalone question "
        "        which can be understood without the chat history. Do NOT answer the question, "
        "        If theres no chat history before then return it as it "
        "        just reformulate it if needed and otherwise return it as is."
    )
    messages = [
        ("system", system_text),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
    rewrite_prompt = ChatPromptTemplate.from_messages(messages)
    return create_history_aware_retriever(LLM, retriever, rewrite_prompt)

# Build the history-aware retriever at module level from the global LLM and retriever.
history_aware_retriever = contextualize_system_prompt()

### Chain with History