# LangChain

ChatGPT, Custom Corpora, and Chat History

### (1) Environment Setup

In [None]:
!pip install langchain
!pip install openai
!pip install chromadb
!pip install tiktoken

# Install package
!pip install "unstructured[all-docs]"

In [None]:
import os
import openai

# Provide the OpenAI API key through the environment without hardcoding it.
# A key that is already exported is left untouched (the previous version
# overwrote it with an empty string); otherwise prompt for it without echoing,
# so the secret never ends up saved in the notebook.
from getpass import getpass

if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

In [None]:
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI

from langchain.document_loaders import UnstructuredFileLoader
from langchain.document_loaders import DirectoryLoader


from langchain.embeddings import OpenAIEmbeddings

# VectorstoreIndexCreator
# Def: Indexes are lookup data structures for looking up
# words within documents
from langchain.indexes import VectorstoreIndexCreator

# Vectorstore
# Def: Vectors are embeddings (or numerical representations) of words
# within documents.
from langchain.vectorstores import Chroma

# RecursiveCharacterTextSplitter
# Def: A text splitter is used to split larger text documents
# into smaller chunks so each piece is manageable to embed and retrieve
from langchain.text_splitter import RecursiveCharacterTextSplitter

from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI


# Chat Memory
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain


# Instructions for ChatGPT
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

### (2) Basic Example with Unstructured

In [20]:
# Loading a single document
# loader = UnstructuredFileLoader("./mathematical_notation.md")
# docs = loader.load()

In [21]:
# Loading multiple documents
# Build a loader over the Markdown files matched by `*.md` in the notebook's
# working directory, then read them all into `docs`.
loader = DirectoryLoader(
    path='./',
    glob="*.md",
)
docs = loader.load()

In [22]:
# Preview an example document
# Take the first loaded document and display its raw text
# (the bare last expression is what the cell renders).
first_doc = docs[0]
first_doc.page_content

'Measures of Central Tendency\n\nThe measures of central tendency are numbers that represent the location of averages in the data.\n\nThe measures of central tendency are mean, median, and mode; they are calculated as follows:\n\nConsider this set of values\n\n$$X = { 3,3,3,4,6,8,8,9 } $$\n\nThen, the measures are found as follows,\n\nMedian\n\nThe number right in the center of $X$. Since there are an even number of values in $X$, we take the two number in the middle and take their average.\n\nmedian $= \\dfrac{4+6}{2} = 5$\n\nMode\n\nThe most recurring number in $X$.\n\nmode = $3$\n\nMean\n\nThe mean is used to determine an absolute number by which to compare all other values in the data, With the mean, we can determine both (1) whether some value is above or below the average and (2) the degree of variation in the data.\n\nMean is related to variance (and standard deviation) which is covered in its own section\n\nTo calculate the mean average of $X$, sum all the values of $X$ and div

In [37]:
# Build a queryable index straight from the loader.
# Note: no vector store is specified here, so LangChain's default
# (ChromaDB) is used.
index_creator = VectorstoreIndexCreator()
index = index_creator.from_loaders([loader])

In [24]:
# index.query("How is a set written mathematically?")
# index.query("What is abstract algebra? ")
# index.query("What's the best way to learn probability theory?")
# index.query("Central tendency vs dispersion?")

### (3) Fallback to ChatGPT

In [25]:
# Cut the loaded documents into 500-character chunks with no overlap.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0)
all_splits = text_splitter.split_documents(docs)

# Embed every chunk with OpenAI embeddings and store the vectors in Chroma.
embedding_model = OpenAIEmbeddings()
vectorstore = Chroma.from_documents(
    documents=all_splits,
    embedding=embedding_model,
)

In [35]:
# Prompt for the answering step: the retrieved chunks are injected as
# {context} and the user's query as {question}. The leading instruction tells
# the model to fall back on its own knowledge when the retrieved context does
# not help.
prompt_template = """If the context is not relevant,
please answer the question by using your own knowledge about the topic

{context}

Question: {question}
"""

QA_PROMPT = PromptTemplate.from_template(
    template=prompt_template
)

# No ConversationBufferMemory is created here: this chain is never given a
# memory object, and the chat-history section builds its own before using it.

# temperature=0 is the lowest sampling temperature, chosen for repeatable answers.
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)

# Single-turn question answering over the Chroma store built from the chunked docs.
qa_chain = RetrievalQA.from_chain_type(
    llm,
    retriever=vectorstore.as_retriever(), #index.vectorstore.as_retriever()
    chain_type_kwargs={"prompt": QA_PROMPT}
)

In [34]:
# Ask the retrieval QA chain one question; the returned dict is displayed
# as the cell's output.
set_query = {"query": "What is a set?"}
qa_chain(set_query)

{'query': 'What is a set?',
 'result': 'A set is a collection of distinct objects, called elements, that are grouped together based on a common characteristic or property. Sets are often represented by listing their elements inside curly braces, such as {1, 2, 3}, where 1, 2, and 3 are the elements of the set. Sets can be finite or infinite, and the elements can be numbers, letters, or any other type of object. The concept of sets is fundamental in mathematics and is used in various branches, such as set theory, algebra, and calculus.'}

### (4) Chat History

In [28]:
# Section (3) already chunks `docs` and embeds them into `vectorstore` with
# exactly these settings. Rebuilding unconditionally would trigger another
# round of OpenAI embedding calls and, depending on the Chroma version, may
# insert the same chunks a second time into the default in-memory collection
# -- TODO confirm against the installed chromadb.
# Build the store here only when it does not exist yet (i.e. this section is
# being run on its own).
if "vectorstore" not in globals():
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0)
    all_splits = text_splitter.split_documents(docs)
    vectorstore = Chroma.from_documents(documents=all_splits, embedding=OpenAIEmbeddings())

In [29]:
# Answering prompt: retrieved chunks are substituted for {context} and the
# user's question for {question}; the first sentence lets the model answer
# from its own knowledge when the context is not relevant.
prompt_template = (
    "If the context is not relevant,\n"
    "please answer the question by using your own knowledge about the topic\n"
    "\n"
    "{context}\n"
    "\n"
    "Question: {question}\n"
)
QA_PROMPT = PromptTemplate.from_template(prompt_template)

# Conversation buffer handed to the chain below; history is kept under the
# "chat_history" key as message objects.
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)

# Conversational retrieval chain: same retriever as before, plus memory and
# the custom answering prompt for the document-combining step.
doc_retriever = vectorstore.as_retriever()
chat = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=doc_retriever,
    memory=memory,
    combine_docs_chain_kwargs={"prompt": QA_PROMPT},
)

In [30]:
# chat({
#       "question": "What is a set?"
# })

In [31]:
# chat({
#       "question": "What is brownian motion?"
# })

In [32]:
# chat({
#       "question": "So what's a set?"
# })