In [3]:
from config import aws_region_name, aws_access_key, aws_secret_key
import boto3
def create_client():
    """Build the boto3 `bedrock-runtime` client used to talk to the LLM.

    The region and the access/secret key pair are the values imported from
    the local `config` module.
    """
    return boto3.client(
        service_name='bedrock-runtime',
        region_name=aws_region_name,
        aws_access_key_id=aws_access_key,
        aws_secret_access_key=aws_secret_key,
    )


# Module-level client reused by later cells
bedrock_client = create_client()

## Load PDF Documents

In [1]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import PyPDFDirectoryLoader

def load_pdfs(chunk_size=3000, chunk_overlap=100, directory="PDF Documents"):
    """Load every PDF found in `directory` and split the result into chunks.

    Args:
        chunk_size: Chunk size forwarded to RecursiveCharacterTextSplitter.
        chunk_overlap: Chunk overlap forwarded to RecursiveCharacterTextSplitter.
        directory: Folder handed to PyPDFDirectoryLoader. Defaults to
            "PDF Documents", the folder this notebook has always read from.

    Returns:
        The chunked documents returned by `split_documents`.
    """
    # load the pdf documents
    loader = PyPDFDirectoryLoader(directory)
    documents = loader.load()

    # split the documents into chunks
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size,
                                                   chunk_overlap=chunk_overlap)
    return text_splitter.split_documents(documents=documents)

## Build vector store

In [2]:
from langchain_community.embeddings import BedrockEmbeddings
from langchain_community.vectorstores.faiss import FAISS

def create_vector_store(docs):
    """Embed `docs` and persist the resulting FAISS index to disk.

    A new Bedrock client is created for the 'amazon.titan-embed-text-v1'
    embedding model, the index is built from `docs`, and it is saved locally
    under "faiss_index". Nothing is returned.
    """
    embedder = BedrockEmbeddings(
        model_id='amazon.titan-embed-text-v1',
        client=create_client(),
    )

    # build the index and write it straight to disk
    FAISS.from_documents(docs, embedder).save_local("faiss_index")

## Load LLM

In [32]:
from langchain.llms.bedrock import Bedrock

def create_llm(bedrock_client, model_id='meta.llama2-13b-chat-v1', model_kwargs=None):
    """Create a LangChain `Bedrock` LLM wrapper for the requested model.

    Args:
        bedrock_client: Bedrock runtime client the LLM sends its requests through.
        model_id: Bedrock model identifier to load. Defaults to
            'meta.llama2-13b-chat-v1', the model this function previously
            loaded regardless of the argument.
        model_kwargs: Optional dict of generation settings (for example
            {'temperature': 0}) passed through to `Bedrock` unchanged.

    Returns:
        The configured `Bedrock` LLM instance.
    """
    # honour the caller's model_id instead of a hard-coded model
    llm = Bedrock(model_id=model_id,
                  client=bedrock_client,
                  model_kwargs=model_kwargs)
    return llm

In [17]:
# Generation settings used by the model cells below
model_kwargs = dict(temperature=0)

In [18]:
# Llama 2 13B chat
llm = create_llm(
    bedrock_client=bedrock_client,
    model_id='meta.llama2-13b-chat-v1',
    model_kwargs=model_kwargs,
)

In [29]:
# Llama 3 70B instruct
llm = create_llm(
    bedrock_client=bedrock_client,
    model_id='meta.llama3-70b-instruct-v1:0',
    model_kwargs=model_kwargs,
)

## Create retrieval chain

The chain receives the user query -> retrieves the most similar chunks from the vector store -> sends the query together with the retrieved chunks to the loaded LLM, which generates a response grounded in that context.

In [None]:
from langchain.llms.bedrock import Bedrock
from langchain.memory import ChatMessageHistory, ConversationBufferMemory
from langchain.chains import RetrievalQA, ConversationalRetrievalChain

# load llm with an explicit model id and generation settings
llm = create_llm(bedrock_client=bedrock_client,
                 model_id='meta.llama2-13b-chat-v1',
                 model_kwargs={'temperature': 0})

# load embeddings and vector store
bedrock_embeddings=BedrockEmbeddings(model_id='amazon.titan-embed-text-v1', client=bedrock_client)
# NOTE: allow_dangerous_deserialization=True unpickles the saved index, so only
# load a 'faiss_index' folder that was produced by this notebook.
vector_store = FAISS.load_local('faiss_index', bedrock_embeddings, allow_dangerous_deserialization=True)

# create memory history
message_history = ChatMessageHistory()
# store past queries for follow up questions
memory = ConversationBufferMemory(
    memory_key="chat_history",
    output_key="answer",
    chat_memory=message_history,
    return_messages=True,
)

# create qa chain
qa_chain = ConversationalRetrievalChain.from_llm(llm, 
                                       chain_type='stuff', 
                                       retriever=vector_store.as_retriever(search_type='similarity', search_kwargs={"k":3}), # pass the 3 most similar chunks as context
                                       return_source_documents=True,
                                       memory=memory)

## Build UI with Chainlit

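Chainlit provides the chat UI. The `on_chat_start` handler below builds the retrieval chain for each chat session, and the `on_message` handler uses that chain to answer each user message.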

In [None]:
import chainlit as cl

@cl.on_chat_start
async def create_qa_chain():
    """Build the conversational retrieval chain when a chat session starts.

    Sends a "Loading the bot..." placeholder, creates the Bedrock client, the
    LLM, the embeddings and the FAISS vector store, wires them into a
    ConversationalRetrievalChain with conversation memory, stores the chain in
    the Chainlit user session under 'qa_chain', and finally replaces the
    placeholder with the welcome message.
    """
    # show the placeholder before the set-up work so it is visible while loading
    msg = cl.Message(content="Loading the bot...")
    await msg.send()

    # create client
    bedrock_client = create_client()

    # load llm with an explicit model id and generation settings
    llm = create_llm(bedrock_client=bedrock_client,
                     model_id='meta.llama2-13b-chat-v1',
                     model_kwargs={'temperature': 0})

    # load embeddings and vector store
    bedrock_embeddings = BedrockEmbeddings(model_id='amazon.titan-embed-text-v1', client=bedrock_client)
    # NOTE: allow_dangerous_deserialization=True unpickles the saved index, so
    # only load a 'faiss_index' folder that was produced by this notebook.
    vector_store = FAISS.load_local('faiss_index', bedrock_embeddings, allow_dangerous_deserialization=True)

    # create memory history so follow-up questions can use earlier turns
    message_history = ChatMessageHistory()
    memory = ConversationBufferMemory(
        memory_key="chat_history",
        output_key="answer",
        chat_memory=message_history,
        return_messages=True,
    )

    # create qa chain
    qa_chain = ConversationalRetrievalChain.from_llm(llm,
                                           chain_type='stuff',
                                           retriever=vector_store.as_retriever(search_type='similarity', search_kwargs={"k":3}),
                                           return_source_documents=True,
                                           memory=memory
                                           )

    # make the chain available to the message handler before inviting questions
    cl.user_session.set('qa_chain', qa_chain)

    msg.content = "Hi, Welcome to the QA Chatbot! Please ask your question."
    await msg.update()

In [None]:
import re

def _format_sources(source_documents):
    """Render one "Source: <file>, page: <n>" line per retrieved document."""
    # strip the "PDF Documents" folder name and any backslashes from the path
    pattern = r'PDF Documents|\\'
    lines = []
    for doc in source_documents:
        metadata = getattr(doc, 'metadata', None) or {}
        source, page = metadata.get('source'), metadata.get('page')
        # only documents that carry both a source and a page are listed
        if source is None or page is None:
            continue
        lines.append(f"Source: {re.sub(pattern, '', str(source))}, page: {page}")
    return "\n".join(lines)


@cl.on_message
async def generate_response(query):
    """Answer an incoming chat message with the session's QA chain.

    Runs the chain stored under 'qa_chain' in the user session on
    `query.content`, appends the list of source files/pages of the retrieved
    documents to the answer, and sends the result back as a Chainlit message.
    """
    qa_chain = cl.user_session.get('qa_chain')

    res = await qa_chain.acall(query.content, callbacks=[cl.AsyncLangchainCallbackHandler(
        stream_final_answer=True,
        )])

    # extract results and source documents
    result, source_documents = res['answer'], res['source_documents']

    # read the metadata directly instead of parsing the documents' repr
    metadata_string = _format_sources(source_documents)

    # add metadata (i.e., sources) to the results when there is any
    if metadata_string:
        result += f'\n\n{metadata_string}'

    # send the generated response to the user
    await cl.Message(content=result).send()

## Testing CrewAI

- Agents: These are standalone units programmed to perform tasks, make decisions and communicate with other agents. They can use Tools, which can be simple search functions or complex integrations involving other chains, APIs, etc.
- Tasks: Tasks are assignments or jobs that an AI agent needs to complete. They can include additional information like which agent should do it and what tools they might need.
- Crews: A Crew is a team of agents, each with a specific role, that work together to achieve a common goal. The process of forming a crew involves assembling agents, defining their tasks and establishing a sequence of task execution.

In [33]:
# Generation settings for the CrewAI experiments
model_kwargs = dict(max_gen_len=512, temperature=0.5, top_p=0.9)
llm = create_llm(
    bedrock_client=bedrock_client,
    model_id='meta.llama3-70b-instruct-v1:0',
    model_kwargs=model_kwargs,
)

In [16]:
from scholarly import scholarly
from bs4 import BeautifulSoup
import requests

In [7]:
# Search phrase used for the Google Scholar test below
query = "Meta learning"

In [8]:
def scrape_pdf_links(url, timeout=30):
    """Collect the hrefs on the page at `url` that look like PDF links.

    An anchor counts as a PDF link when its href ends with ".pdf"
    (case-insensitive) or when its text contains "PDF" (case-insensitive).

    Args:
        url: Page to fetch and scan.
        timeout: Seconds passed to `requests.get` as its timeout.

    Returns:
        List of href values exactly as written in the page (they may be
        relative). Empty when the page cannot be fetched.
    """
    print(url)
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # one unreachable site should not abort the whole search
        print(f"Could not fetch {url}: {exc}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')
    pdf_links = []
    for a_tag in soup.find_all('a'):
        href = a_tag.get('href')
        if not href:
            continue
        # a_tag.text already contains the text of every nested tag, so a single
        # check covers both the anchor itself and its children
        if href.lower().endswith('.pdf') or 'PDF' in a_tag.text.upper():
            pdf_links.append(href)
    return pdf_links

In [9]:
# This function will perform a Google Scholar search, grab the top_n results, and extract the PDF links from those results
# Note: Some websites require purchasing the PDF or logging in. We will not get links back for these cases
def google_scholar_search(query, top_n=5):
    """Search Google Scholar and return PDF links for the first results.

    For each of the first `top_n` results the publication URL is used
    directly when it ends with ".pdf"; otherwise the page is scraped with
    `scrape_pdf_links` and the first link found is kept. Results without a
    publication URL or without any PDF link are skipped.

    Args:
        query: Search phrase passed to `scholarly.search_pubs`.
        top_n: Maximum number of search results to inspect.

    Returns:
        List of absolute PDF URLs (at most `top_n`).
    """
    from itertools import islice
    from urllib.parse import urljoin

    pdf_links = []
    # islice stops after top_n results or when the search runs out
    for result in islice(scholarly.search_pubs(query), top_n):
        try:
            url = result['pub_url']
        except KeyError:
            # nothing to scrape for a result without a publication URL
            continue

        pdf_link = ''
        if url.endswith(".pdf"):
            pdf_link = url
        else:
            scraped_pdf_links = scrape_pdf_links(url)
            if len(scraped_pdf_links) > 0:
                pdf_link = scraped_pdf_links[0]

        if pdf_link == '':
            continue

        if not pdf_link.startswith("http"):
            if "abs/" in url:
                # arxiv and ACM library: replace "abs/" in original link with "pdf/"
                pdf_link = url.replace("abs/", "pdf/")
            else:
                # any other site (e.g. Springer): resolve the relative href
                # against the page URL
                pdf_link = urljoin(url, pdf_link)

        pdf_links.append(pdf_link)

    return pdf_links

In [10]:
# Collect PDF links for the top Google Scholar results of `query`
pdf_links = google_scholar_search(query)

https://link.springer.com/article/10.1023/a:1019956318069
https://arxiv.org/abs/1803.02999
http://proceedings.mlr.press/v97/finn19a.html?ref=https://githubhelp.com
https://ieeexplore.ieee.org/abstract/document/9428530/
https://www.sciencedirect.com/science/article/pii/S2352154621000024


In [12]:
# Display the links that were collected
pdf_links

['https://arxiv.org/pdf/1803.02999',
 'http://proceedings.mlr.press/v97/finn19a/finn19a.pdf']

In [13]:
import os

In [14]:
def download_pdfs(pdf_links):
    """Download each link in `pdf_links` into 'PDF Documents/searched/'.

    Files are written as paper_<i>.pdf, where <i> is the link's position in
    `pdf_links`. Links that do not end with ".pdf" get the suffix appended
    before the request. The link and its HTTP status code are printed for
    every request; responses with an error status are not written to disk.

    Args:
        pdf_links: Iterable of URL strings to download.
    """
    directory = 'PDF Documents/searched/'
    # the target folder may not exist yet on a fresh checkout
    os.makedirs(directory, exist_ok=True)
    for i, link in enumerate(pdf_links):
        if not link.endswith(".pdf"):
            link += ".pdf"
        response = requests.get(link, timeout=30)
        print(link)
        print(response.status_code)
        if not response.ok:
            # do not save an error page under a .pdf name
            continue
        with open(os.path.join(directory, f'paper_{i}.pdf'), 'wb') as f:
            f.write(response.content)

In [15]:
# Download every collected link into 'PDF Documents/searched/'
download_pdfs(pdf_links)

https://arxiv.org/pdf/1803.02999.pdf
200
http://proceedings.mlr.press/v97/finn19a/finn19a.pdf
200


### Multi-agent set-up for CrewAI

We can have multiple agents, each responsible for a particular area of research in AI.

All agents share one tool, which searches Google Scholar for information or searches the agent's vector DB. Whenever relevant Google Scholar papers are found, the agent saves the downloaded PDFs to its vector store, updating it in the process.

In [1]:
from crewai import Agent, Task, Crew, Process
from crewai_tools import tool
from langchain_community.vectorstores import Chroma

In [7]:
from langchain_community.llms import Ollama
import os
# NOTE(review): "NA" is a placeholder value, not a real key; presumably set so
# CrewAI does not complain about a missing OpenAI key when a local model is
# used. Confirm this is still required.
os.environ["OPENAI_API_KEY"] = "NA"

In [14]:
# Rebinds `llm` to an Ollama "llama3" model; the agents below receive this object
llm = Ollama(model="llama3")

The next cell is a smoke test of the agent/task/crew set-up using a simple math question.

In [None]:
# Single agent that answers the math question
general_agent = Agent(
    role="Math Professor",
    goal="""Provide the solution to the students that are asking mathematical questions and give them the answer.""",
    backstory="""You are an excellent math professor that likes to solve math questions in a way that everyone can understand your solution""",
    allow_delegation=False,
    verbose=True,
    llm=llm,
)

# The one task assigned to that agent
task = Task(
    description="""what is 3 + 5""",
    expected_output="A math answer",
    agent=general_agent,
)

# Crew made of the single agent and its single task
crew = Crew(agents=[general_agent], tasks=[task], verbose=2)

result = crew.kickoff()

print(result)

In [21]:
from langchain_community.embeddings import BedrockEmbeddings
# Bedrock client and Titan embedding model for the agents' vector store
bedrock = create_client()
bedrock_embeddings = BedrockEmbeddings(
    model_id='amazon.titan-embed-text-v1',
    client=bedrock,
)

In [21]:
def download_pdfs(pdf_links):
    """Download each link in `pdf_links` into 'PDF Documents/searched/'.

    This is a module-level function, so it takes no `self`; the search tool
    in this cell calls it as `download_pdfs(pdf_links)`.

    Files are written as paper_<i>.pdf, where <i> is the link's position in
    `pdf_links`. Links that do not end with ".pdf" get the suffix appended
    before the request. Responses with an error status are not written to
    disk.

    Args:
        pdf_links: Iterable of URL strings to download.
    """
    directory = 'PDF Documents/searched/'
    # the target folder may not exist yet on a fresh checkout
    os.makedirs(directory, exist_ok=True)
    for i, link in enumerate(pdf_links):
        if not link.endswith(".pdf"):
            link += ".pdf"
        response = requests.get(link, timeout=30)
        if not response.ok:
            # do not save an error page under a .pdf name
            continue
        with open(os.path.join(directory, f'paper_{i}.pdf'), 'wb') as f:
            f.write(response.content)

def scrape_pdf_links(url, timeout=30):
    """Collect the hrefs on the page at `url` that look like PDF links.

    This is a module-level function, so it takes no `self`; it is called in
    this cell as `scrape_pdf_links(url)`.

    An anchor counts as a PDF link when its href ends with ".pdf"
    (case-insensitive) or when its text contains "PDF" (case-insensitive).

    Args:
        url: Page to fetch and scan.
        timeout: Seconds passed to `requests.get` as its timeout.

    Returns:
        List of href values exactly as written in the page (they may be
        relative). Empty when the page cannot be fetched.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # one unreachable site should not abort the whole search
        print(f"Could not fetch {url}: {exc}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')
    pdf_links = []
    for a_tag in soup.find_all('a'):
        href = a_tag.get('href')
        if not href:
            continue
        # a_tag.text already contains the text of every nested tag, so a single
        # check covers both the anchor itself and its children
        if href.lower().endswith('.pdf') or 'PDF' in a_tag.text.upper():
            pdf_links.append(href)
    return pdf_links

def google_scholar_search(query, top_n=5):
    """Search Google Scholar and return PDF links for the first results.

    This is a module-level function, so it takes no `self`; the search tool
    in this cell calls it as `google_scholar_search(query)`.

    For each of the first `top_n` results the publication URL is used
    directly when it ends with ".pdf"; otherwise the page is scraped with
    `scrape_pdf_links` and the first link found is kept. Results without a
    publication URL or without any PDF link are skipped.

    Args:
        query: Search phrase passed to `scholarly.search_pubs`.
        top_n: Maximum number of search results to inspect.

    Returns:
        List of absolute PDF URLs (at most `top_n`).
    """
    from itertools import islice
    from urllib.parse import urljoin

    pdf_links = []
    # islice stops after top_n results or when the search runs out
    for result in islice(scholarly.search_pubs(query), top_n):
        try:
            url = result['pub_url']
        except KeyError:
            # nothing to scrape for a result without a publication URL
            continue

        pdf_link = ''
        if url.endswith(".pdf"):
            pdf_link = url
        else:
            scraped_pdf_links = scrape_pdf_links(url)
            if len(scraped_pdf_links) > 0:
                pdf_link = scraped_pdf_links[0]

        if pdf_link == '':
            continue

        if not pdf_link.startswith("http"):
            if "abs/" in url:
                # arxiv and ACM library: replace "abs/" in original link with "pdf/"
                pdf_link = url.replace("abs/", "pdf/")
            else:
                # any other site (e.g. Springer): resolve the relative href
                # against the page URL
                pdf_link = urljoin(url, pdf_link)

        pdf_links.append(pdf_link)

    return pdf_links

@tool("search-tool")
def search_concept_in_field(query: str):
    """Tool for an AI research agent to look up a concept related to its
    field of specialty on Google Scholar, and process PDFs it finds"""
    # Searches Google Scholar for `query`, downloads the PDFs that were found,
    # and returns a short text summary so the calling agent gets an observation
    # back instead of None.
    print("Performing Google Scholar search...")
    pdf_links = google_scholar_search(query)
    if not pdf_links:
        return f"No PDF links were found on Google Scholar for '{query}'."
    download_pdfs(pdf_links)
    return (
        f"Requested download of {len(pdf_links)} PDF(s) into 'PDF Documents/searched/':\n"
        + "\n".join(pdf_links)
    )


In [22]:
# Research agent for the Generative AI analysis task.
# The Google Scholar search tool is passed through `tools` (plural); with the
# previous `tool=` keyword the agent answered without ever calling the tool.
researcher = Agent(
    role='Senior Research Analyst',
    goal='Uncover cutting-edge developments in AI and data science',
    backstory="""
    You work at a research firm that aims to stay at the forefront of AI technologies.
    The firm aims to invent new technologies that apply the latest research. Your role
    is to supply the latest and greatest findings in Generative AI.
    """,
    verbose=True,
    allow_delegation=False,
    llm=llm,
    tools=[search_concept_in_field]
)

In [23]:
# Analysis task handled by the researcher agent
task1 = Task(
    description="Conduct a brief analysis of the latest GenerativeAI advancements in 2024, using the tool…",
    expected_output="Full analysis report in bullet points",
    agent=researcher,
)

# Crew with the single researcher agent and its task
crew = Crew(agents=[researcher], tasks=[task1], verbose=2)



In [24]:
# Run the crew defined above and keep what kickoff() returns
result = crew.kickoff()

[1m[95m [DEBUG]: == Working Agent: Senior Research Analyst[00m
[1m[95m [INFO]: == Starting Task: Conduct a brief analysis of the latest GenerativeAI advancements in 2024, using the tool…[00m


[1m> Entering new CrewAgentExecutor chain...[0m
[32;1m[1;3mThought: I'm excited to dive into the latest advancements in Generative AI!

Final Answer:

**Generative AI Advancements in 2024 Analysis Report**

• **Text-to-Image Generation**: Researchers from Meta AI and Google Brain introduced a new text-to-image generation model that can generate high-quality images given a textual description. The model uses a combination of transformers and convolutional neural networks to produce photorealistic images.

• **Audio Synthesis**: A team from Stanford University and Google developed an audio synthesis model that can generate realistic audio clips based on a given prompt. The model uses a transformer-based architecture and can be applied to various applications such as music generation, voic