In [1]:
import os
from dotenv import load_dotenv

# Read the local .env file into the process environment; the `True` shown as
# this cell's output is the return value of the call.
load_dotenv()

True

In [2]:
import bs4
from langchain import hub
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import WebBaseLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.schema import StrOutputParser
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain_core.runnables import RunnablePassthrough

In [3]:
# Expose the API key under OPENAI_API_KEY, the variable name the OpenAI
# integrations look up. The key is read from `openai_token` (presumably defined
# in the .env file loaded earlier -- confirm against your local setup).
# `os.getenv` returns None for a missing variable, and writing None into
# `os.environ` fails with an opaque TypeError, so validate explicitly instead.
_openai_token = os.getenv('openai_token')
if _openai_token:
    os.environ["OPENAI_API_KEY"] = _openai_token
elif not os.environ.get("OPENAI_API_KEY"):
    # Neither the project-specific variable nor the standard one is available.
    raise RuntimeError(
        "OpenAI API key not found: define `openai_token` in your .env file "
        "or export OPENAI_API_KEY before running this notebook."
    )

## Step 1. data load
### WebBaseLoader 이용하여 web data load

In [8]:
from langchain.document_loaders import WebBaseLoader

# Candidate Liverpool FC news articles; only the first entry is loaded below.
web_urls = [
    'https://www.liverpoolfc.com/news/talking-points-jota-returns-no50-nunez-delivers-gakpos-landmark-and-more',
    'https://www.liverpoolfc.com/news/liverpool-go-top-league-nunez-and-jota-strike-burnley',
]

# Limit BeautifulSoup parsing to elements carrying these CSS classes
# (presumably the article headline/body containers -- verify against the page markup).
article_strainer = bs4.SoupStrainer(class_=("css-13o9mi9", "css-gtn94w"))

loader = WebBaseLoader(
    web_paths=(web_urls[0],),
    bs_kwargs={"parse_only": article_strainer},  # forwarded to the BeautifulSoup parser
)
docs = loader.load()

In [9]:
# Inspect what the loader returned (a list of Document objects).
docs

[Document(page_content="Darwin Nunez and Diogo Jota were on target as Liverpool beat Burnley 2-0 to move to the top of the Premier League on Boxing Day.Nunez’s early goal set the Reds on their way at Turf Moor, with Jota emerging from the bench to seal the points late on in an action-packed contest.Here are five things we noticed on a positive night in Lancashire…Nunez delivers a late Christmas presentIt took Liverpool fewer than six minutes to break this game open, and in some style too.Cody Gakpo escaped down the left and pulled the ball back for Nunez, who slotted home a brilliant first-time finish.The Uruguay international shaped his body to arc a clinical strike into the bottom right corner to land his first goal for the Reds since November 1.It was the highlight of a fine all-round performance from the No.9, who was a constant menace throughout.A nice late Christmas present for Reds fans.Jota’s timely returnSo often an arch predator for Liverpool, Jota needed only six minutes to 

In [10]:
# Character count of the first loaded document's text.
len(docs[0].page_content)

3536

## Step 2. Split text
* 지정된 `chunk_size`로 split하되 `chunk_overlap` 허용

In [11]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Chunking settings: at most 1000 characters per chunk, with up to 200 characters
# shared between neighbouring chunks. `add_start_index=True` asks the splitter to
# record each chunk's start offset in its metadata (per the LangChain docs).
splitter_options = {
    "chunk_size": 1000,
    "chunk_overlap": 200,
    "add_start_index": True,
}
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)
all_splits = text_splitter.split_documents(docs)

In [12]:
# Inspect the chunks produced by the splitter.
all_splits

[Document(page_content='Darwin Nunez and Diogo Jota were on target as Liverpool beat Burnley 2-0 to move to the top of the Premier League on Boxing Day.Nunez’s early goal set the Reds on their way at Turf Moor, with Jota emerging from the bench to seal the points late on in an action-packed contest.Here are five things we noticed on a positive night in Lancashire…Nunez delivers a late Christmas presentIt took Liverpool fewer than six minutes to break this game open, and in some style too.Cody Gakpo escaped down the left and pulled the ball back for Nunez, who slotted home a brilliant first-time finish.The Uruguay international shaped his body to arc a clinical strike into the bottom right corner to land his first goal for the Reds since November 1.It was the highlight of a fine all-round performance from the No.9, who was a constant menace throughout.A nice late Christmas present for Reds fans.Jota’s timely returnSo often an arch predator for Liverpool, Jota needed only six minutes to 

In [14]:
# (number of chunks, character length of the first chunk)
len(all_splits), len(all_splits[0].page_content)

(5, 998)

## Step 3. Store data in vectorDB
* `OpenAIEmbeddings` model로 embedding 계산 후  vectorDB `Chroma` 에 저장

In [19]:
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma

# Embed every chunk with the OpenAI embedding model and index the vectors in Chroma.
embedding_model = OpenAIEmbeddings()
vectorstore = Chroma.from_documents(
    documents=all_splits,
    embedding=embedding_model,
)

In [20]:
# Confirm the vector store object was created.
vectorstore

<langchain_community.vectorstores.chroma.Chroma at 0xffff7e962650>

## Step 4. Retrieve 
* retriever의 `search_type`
    * similarity (search_kwargs key: 'k')
    * mmr (Maximum marginal relevance retrieval)
    * similarity_score_threshold (search_kwargs key: 'score_threshold')

In [26]:
# Wrap the vector store as a retriever using plain similarity search;
# search_kwargs "k" requests 3 chunks per query.
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 3})

In [27]:
# Retrievers are Runnables, so query through `invoke` -- the same entry point the
# RAG chain is called with -- rather than the legacy `get_relevant_documents`,
# which newer LangChain releases deprecate. The result is still a list of Documents.
retrieved_docs = retriever.invoke("Who scored goals?")

In [28]:
# Number of chunks returned; expected to equal the retriever's k (3).
len(retrieved_docs)

3

In [29]:
# Text of the first retrieved chunk.
print(retrieved_docs[0].page_content)

Darwin Nunez and Diogo Jota were on target as Liverpool beat Burnley 2-0 to move to the top of the Premier League on Boxing Day.Nunez’s early goal set the Reds on their way at Turf Moor, with Jota emerging from the bench to seal the points late on in an action-packed contest.Here are five things we noticed on a positive night in Lancashire…Nunez delivers a late Christmas presentIt took Liverpool fewer than six minutes to break this game open, and in some style too.Cody Gakpo escaped down the left and pulled the ball back for Nunez, who slotted home a brilliant first-time finish.The Uruguay international shaped his body to arc a clinical strike into the bottom right corner to land his first goal for the Reds since November 1.It was the highlight of a fine all-round performance from the No.9, who was a constant menace throughout.A nice late Christmas present for Reds fans.Jota’s timely returnSo often an arch predator for Liverpool, Jota needed only six minutes to make his mark here.The


In [30]:
# Text of the last retrieved chunk, for comparison with the first.
print(retrieved_docs[-1].page_content)

victory for us'. Liverpool captain Virgil van Dijk praised what he believed to be an 'important' victory for the Reds over Burnley in the Premier League on Boxing Day. He might have had a goal of his own soon after, but saw his emphatic strike from a loose ball ruled out for a foul by Nunez.Nevertheless, the 24-year-old can reflect on a job well done and a promising first half-century of games in red, which has included 14 goals.Defence stands firmLiverpool could have been out of sight by the hour mark, but they were forced to weather a storm in the second half as Burnley came on strong in the closing stages.The good news for Klopp, however, is that defensively his team continue to show impressive resilience.After 19 league games, the Reds have conceded only 16 goals, the joint-fewest in the division, and both captain Virgil van Dijk and youngster Jarell Quansah were excellent at Turf Moor.Quansah notably shone on just his third Premier League start, with the manager describing his


## Step 5. Generate
* context로 retriever, question으로 RunnablePassthrough()
    * RunnablePassthrough()는 input을 받아 rag chain에 흘려주는 역할
* llm은 `gpt-3.5-turbo`
* rag chain에 question이 invoke되면 *Step 4* 의 retrieve 부터 실행

In [31]:
from langchain.chat_models import ChatOpenAI

# Chat model used for answer generation; temperature=0 to minimise sampling
# randomness in the answers.
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)

In [33]:
from langchain.schema import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

def format_docs(docs):
    """Merge retrieved documents into a single context string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string attribute.

    Returns:
        The ``page_content`` values in input order, separated by a blank line
        (two newlines). An empty iterable yields an empty string.
    """
    page_texts = [document.page_content for document in docs]
    separator = "\n\n"
    return separator.join(page_texts)

In [34]:
from langchain.prompts import PromptTemplate

# Prompt text assembled line by line; each piece ends with an explicit newline so
# the resulting string matches the original triple-quoted template exactly.
# `{context}` and `{question}` are the two placeholders filled in at run time.
template = (
    "Use the following pieces of context to answer the question at the end.\n"
    "If you don't know the answer, just say that you don't know, don't try to make up an answer.\n"
    "Use three sentences maximum and keep the answer as concise as possible.\n"
    'Always say "thanks for asking! Liverpool will be the Champion of this season" at the end of the answer.\n'
    "{context}\n"
    "Question: {question}\n"
    "Helpful Answer:"
)

rag_prompt = PromptTemplate.from_template(template)

In [35]:
# Inspect the prompt object, including the input variables it detected.
rag_prompt

PromptTemplate(input_variables=['context', 'question'], template='Use the following pieces of context to answer the question at the end.\nIf you don\'t know the answer, just say that you don\'t know, don\'t try to make up an answer.\nUse three sentences maximum and keep the answer as concise as possible.\nAlways say "thanks for asking! Liverpool will be the Champion of this season" at the end of the answer.\n{context}\nQuestion: {question}\nHelpful Answer:')

In [36]:
# Input stage: the incoming question is sent to the retriever (whose results are
# flattened to text by format_docs) and is also passed through unchanged.
chain_inputs = {
    "context": retriever | format_docs,
    "question": RunnablePassthrough(),
}

# Full pipeline: fill the prompt, call the chat model, return the reply as a plain string.
rag_chain = chain_inputs | rag_prompt | llm | StrOutputParser()


In [38]:
# Ask the chain a question; the answer should end with the sign-off required by the prompt.
rag_chain.invoke("Who scored goals?")

'Darwin Nunez and Diogo Jota scored the goals for Liverpool. Thanks for asking! Liverpool will be the Champion of this season.'

In [40]:
# Second question, checking a detail (the assist) mentioned in the article text.
rag_chain.invoke("Who assisted on Nunez's goal?")

"Cody Gakpo assisted on Nunez's goal. Thanks for asking! Liverpool will be the Champion of this season."