<a href="https://colab.research.google.com/github/darinkist/Medium-Article-Transparent-Question-Answering-Bot/blob/main/CodeForArticleWebsiteExample.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import getpass
import os
import pickle

import faiss
import pandas as pd
import requests
import xmltodict
from bs4 import BeautifulSoup
from langchain import OpenAI, PromptTemplate
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.embeddings import OpenAIEmbeddings
from langchain.memory import ConversationBufferMemory
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from tqdm.notebook import tqdm

# Ask for the key interactively only when the environment does not already
# provide one: an existing key is never overwritten by a placeholder, and no
# real key has to be written into (and saved with) the notebook.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API key: ")

In [None]:
# Gather the links to all articles from the site's post sitemap.
# A timeout keeps the cell from hanging forever, and raise_for_status() stops
# an HTTP error page from being parsed as if it were the sitemap.
r = requests.get("https://news.itsfoss.com/sitemap-posts.xml", timeout=30)
r.raise_for_status()
xml = r.text

# force_list keeps "url" a list even when the sitemap holds a single entry;
# without it xmltodict returns a bare dict and the comprehension below would
# iterate over its keys instead of over the entries.
rss = xmltodict.parse(xml, force_list=("url",))

# An empty <urlset/> parses to None, so fall back to "no entries" in that case.
article_links = [
    entry["loc"] for entry in (rss.get("urlset") or {}).get("url", [])
]

In [None]:
# Helper to extract article content
def extract_content(url):
    """Download an article page and return its readable text.

    The text is the concatenation of the headline, the standfirst and the
    article body, located through the CSS selectors listed below. Elements
    that are missing from the page are skipped, so a page matching none of
    the selectors yields an empty string.

    Args:
        url: Address of the article page to download.

    Returns:
        The text of all matched elements joined without a separator.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    response = requests.get(url, timeout=30)
    # Fail loudly on 4xx/5xx instead of scraping the error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, features="html.parser")

    selectors = (
        ".c-topper__headline",
        ".c-topper__standfirst",
        ".c-content",
    )
    elements = (soup.select_one(selector) for selector in selectors)

    # select_one() returns None when nothing matches; calling get_text() on
    # it would raise AttributeError, so drop the misses before joining.
    return "".join(
        element.get_text() for element in elements if element is not None
    )


# Only the first 10 of the > 900 articles are processed in this example;
# drop the [0:10] slice to process every article.
# Each record pairs the article's URL ("source") with its extracted text.
articles = [
    {"source": link, "content": extract_content(link)}
    for link in tqdm(article_links[0:10], desc="Extracting article content")
]

In [None]:
# Split article data into chunks
rec_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=150)

# Parallel lists: meta[i] holds the source URL of the chunk in web_docs[i].
web_docs, meta = [], []

for article in tqdm(articles, desc="Splitting articles into chunks"):
    splits = rec_splitter.split_text(article["content"])
    web_docs.extend(splits)
    # Build one fresh dict per chunk. `[{...}] * len(splits)` would repeat a
    # reference to a single dict, so a later change to one chunk's metadata
    # would silently show up on every chunk of the same article.
    meta.extend({"source": article["source"]} for _ in splits)

In [None]:
# Embed every chunk with the OpenAI embedding model and keep the embeddings,
# the chunk texts and their source metadata together in a FAISS vector store
embedding_model = OpenAIEmbeddings()
article_store = FAISS.from_texts(
    texts=web_docs,
    embedding=embedding_model,
    metadatas=meta,
)

### Question Answering Bot

In [None]:
# Conversation memory shared across questions.
# The prompt this notebook uses is a plain string template for a completion
# model, so the history must be injected as a text transcript:
# return_messages=True would hand the template a list of message objects,
# which gets rendered as its Python repr inside the prompt.
# ai_prefix matches the "Chatbot:" speaker label used by the prompt template.
memory = ConversationBufferMemory(
    memory_key="chat_history",  # name of the prompt variable to fill
    input_key="question",  # chain input stored as the human turn
    output_key="answer",  # chain output stored as the bot turn
    ai_prefix="Chatbot",
    return_messages=False,
)

In [None]:
# Prompt text: retrieved document extracts come first, then the conversation
# so far, then the new question; the model continues after "Chatbot:".
template = """You are a chatbot having a conversation with a human.
Given the following extracted parts of a long document and a question,
create a final answer.
{context}
{chat_history}
Human: {question}
Chatbot:"""

# Placeholders that have to be supplied when the template is formatted.
prompt_variables = ["chat_history", "question", "context"]

question_prompt = PromptTemplate(
    template=template,
    input_variables=prompt_variables,
)

In [None]:
# Build the question-answering chain that also reports its sources
article_chain = RetrievalQAWithSourcesChain.from_llm(
    llm=OpenAI(temperature=0.0),
    # The number of chunks to retrieve is a search option and has to be
    # passed through search_kwargs; a bare `k=4` keyword is not a retriever
    # setting and does not control how many chunks are fetched.
    retriever=article_store.as_retriever(search_kwargs={"k": 4}),
    memory=memory,
    question_prompt=question_prompt,
)

In [None]:
# First turn: a standalone question. The call stays the cell's last
# expression so the notebook displays its result.
first_question = {"question": "What is Skiff?"}
article_chain(first_question, return_only_outputs=True)

In [None]:
# Follow-up turn: the question says "its" without naming the subject, so it
# relies on the conversation memory attached to the chain.
follow_up_question = {"question": "What are its functionalities?"}
article_chain(follow_up_question, return_only_outputs=True)