<a href="https://colab.research.google.com/github/daniel-hain/workshop_london_nlp_2023/blob/main/LMM_vectordb_agents.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

In [None]:
# Installing dependencies needed for this notebook
!pip install langchain openai sentence_transformers unstructured chromadb -q

In [None]:
# Mount Google Drive under /content/gdrive/; the key file, dataset and Chroma
# paths used in later cells all live below this mount point.
# force_remount=True requests a fresh mount even when one is already present
# (reading of the flag name — confirm against the Colab docs).
from google.colab import drive
drive.mount('/content/gdrive/', force_remount=True)

In [None]:
# Set the environment variable for the OpenAI API Key
import os

# Read the key with a context manager so the file handle is closed, and strip
# surrounding whitespace so a trailing newline in the file does not end up
# inside the key. The builtin name `str` is intentionally not reused here.
with open('/content/gdrive/MyDrive/00_projects/apy_keys.txt', 'r') as key_file:
    os.environ["OPENAI_API_KEY"] = key_file.read().strip()

# Dataset

In [None]:
import pandas as pd

In [None]:
# CSV file with the corpus, stored on the mounted Drive
path_data = '/content/gdrive/MyDrive/00_projects/project_2023_llm_tryout/data/data_ai_issues_policy.csv'

# Read the whole file into a DataFrame
df = pd.read_csv(filepath_or_buffer=path_data)

In [None]:
# Keep only the rows whose `source` column equals 'Parliament'
df = df[df["source"] == "Parliament"]

In [None]:
# Preview the first five rows of the filtered frame
df.head(n=5)

# Chroma

## Embeddings

In [None]:
# Chroma
import chromadb
from langchain.vectorstores import Chroma
from langchain.document_loaders import TextLoader

# embeddings
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings

# Load documents and split text
from langchain.document_loaders import DataFrameLoader #-> types of data loader
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter  #-> to splits data to chunks

In [None]:
# Embedding model: the "all-MiniLM-L6-v2" sentence-transformer
embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

# Document loader over the DataFrame; the "text" column supplies the page content
# (presumably the remaining columns become document metadata — confirm in the loader docs)
loader = DataFrameLoader(df, page_content_column="text")

# Chunking: at most 1000 characters per chunk (length measured with `len`),
# with 100 characters of overlap between neighbouring chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100, length_function=len)

# Load every row as a document and split the documents into chunks
texts = text_splitter.split_documents(loader.load())

In [None]:
# Directory on the mounted Drive where the Chroma index is persisted
path_chroma = "/content/gdrive/MyDrive/00_projects/project_2023_llm_tryout/data/chroma_db"

In [None]:
# save to disk
#db = Chroma.from_documents(texts, embeddings,
#                           persist_directory=path_chroma)

In [None]:
# Load the persisted index from disk. If nothing has been persisted yet, build
# the index from the chunked documents and persist it, so that a fresh run does
# not end up querying an empty collection.
# NOTE(review): depending on the installed chromadb version an explicit
# `db.persist()` may be required after building — confirm.
if os.path.isdir(path_chroma) and os.listdir(path_chroma):
    db = Chroma(persist_directory=path_chroma, embedding_function=embeddings)
else:
    db = Chroma.from_documents(texts, embeddings,
                               persist_directory=path_chroma)

In [None]:
# Sample question used to sanity-check retrieval from the vector store
query = "Problems with AI?"

In [None]:
# Retrieve the chunks closest to the sample question, together with their scores
db.similarity_search_with_score(query=query)

# LLM

In [None]:
from langchain.llms import OpenAI
from langchain.chat_models import ChatOpenAI
from langchain.memory import ConversationBufferMemory

from langchain import PromptTemplate, LLMChain
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA, RetrievalQAWithSourcesChain

In [None]:
# Chat model shared by the QA chain and the agent below; temperature is set to 0
llm = ChatOpenAI(
    model_name='gpt-3.5-turbo',  # or 'gpt-4'
    temperature=0,
)
# memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True) # If you want a memory for chat

In [None]:
# Prompt text for the question-answering chain. `{context}` and `{question}`
# are the template's two input variables. The trailing backslashes join the
# three instruction sentences into a single line of the prompt.
prompt_template = """You are provided with parts of speeches at the UK parliament related to the question. \
Use them to answer the question at the end. \
If you don't know the answer, just say that you don't know, don't try to make up an answer.

{context}

Question: {question}
"""
PROMPT = PromptTemplate(
    template=prompt_template, input_variables=["context", "question"]
)

In [None]:
# Extra arguments for the underlying "stuff" chain: use the custom prompt
chain_type_kwargs = dict(prompt=PROMPT)

# Retrieval-QA chain: the Chroma store acts as retriever, the chat model answers
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=db.as_retriever(),
    chain_type_kwargs=chain_type_kwargs,
)

In [None]:
# Question for the retrieval-QA chain (a stray leading apostrophe was removed
# from the text); `result` holds whatever `qa.run` returns for it.
query = "ethical concerns regarding the use of artificial intelligence (AI)?"
result = qa.run(query)


In [None]:
# Display the answer already stored in `result` instead of calling `qa.run`
# again, which would send the same query to the LLM a second time.
result

In [None]:
# Inspect which chunks the vector store returns for the same query
db.similarity_search(query=query)

# LLM Agents

In [None]:
from glob import glob
from langchain.agents import AgentType
from langchain.schema.messages import SystemMessage
from langchain.agents.openai_functions_agent.base import OpenAIFunctionsAgent
from langchain.prompts import MessagesPlaceholder

In [None]:
from langchain.agents.agent_toolkits import (
    create_vectorstore_agent,
    VectorStoreToolkit,
    VectorStoreInfo,
)

In [None]:
# Describe the Chroma store for the agent: the name and description are plain
# text handed to the toolkit, so spelling matters for what the model reads.
vectorstore_info = VectorStoreInfo(
    name="Scottish Parliamentary Speeches",
    description="A vectorstore of all recent speeches in the Scottish Parliament related to AI, and relevant meta data",
    vectorstore=db,
)
# Toolkit wrapping the described vector store, passed to the agent constructor
toolkit = VectorStoreToolkit(vectorstore_info=vectorstore_info)

In [None]:
# Build the vector-store agent.
# - The prefix is written as adjacent string literals so that no source-code
#   indentation leaks into the prompt text.
# - `handle_parsing_errors` is an option of the agent executor, so it is passed
#   through `agent_executor_kwargs`.
#   NOTE(review): confirm the installed langchain version accepts `agent_executor_kwargs`.
agent_executor = create_vectorstore_agent(
    llm=llm,
    toolkit=toolkit,
    prefix=(
        'You have access to a database on Scottish parliament speeches. '
        'Use it to answer the question at the end. '
        'If you do not know the answer, just say that you do not know, do not try to make up an answer.'
    ),
    agent_executor_kwargs={
        'handle_parsing_errors': 'Check your output and make sure it conforms! Avoid output that cannot be parsed.',
    },
    #memory=memory,
    verbose=True,
)

In [None]:
# Task for the agent: summarise and contrast two speakers' positions on AI
promt_text = (
    "Summarize how speaker 'Emma Harper' and speaker 'Patrick Harvie' discuss artificial intelligence. "
    "In particular, contrast their opinions."
)

In [None]:
# Run the agent on the text held in `promt_text`; the value returned by `run`
# is shown as the cell output.
agent_executor.run(promt_text)

In [None]:
# Up to five closest chunks for the query, restricted to entries whose
# `speaker` metadata is 'Patrick Harvie'
db.similarity_search('artificial intelligence (AI)', k=5, filter=dict(speaker='Patrick Harvie'))

In [None]:
# Up to five closest chunks for the query, restricted to entries whose
# `speaker` metadata is 'Emma Harper'
db.similarity_search('artificial intelligence (AI)', k=5, filter=dict(speaker='Emma Harper'))

# Other application: Arxiv summaries

To be continued...

In [None]:
!pip install arxiv
!pip install pymupdf

In [None]:
from langchain.document_loaders import ArxivLoader

In [None]:
# Fetch the paper matching the arXiv query "2303.10130" and load it as documents
arxiv_loader = ArxivLoader(query="2303.10130")
docs = arxiv_loader.load()

# Number of documents returned by the loader
len(docs)

In [None]:
# Metadata attached to the first loaded Document
docs[0].metadata  # meta-information of the Document

In [None]:
# Complete text content of the first loaded Document (all pages)
docs[0].page_content