In [2]:
%pip install openai langchain pypdf chromadb tiktoken jupyterlab

Note: you may need to restart the kernel to use updated packages.


In [3]:
import os
with open("openai.key", "r") as f:
    key = f.read()
    os.environ["OPENAI_API_KEY"] = key


In [4]:
from pathlib import Path

reports_dir = Path("reports/")
reports = []
for file in reports_dir.iterdir():
    reports.append(str(file))

In [5]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

documents = []

for report in reports:
    loader = PyPDFLoader(report)
    documents.extend(loader.load())

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 1500,
    chunk_overlap  = 0,
    length_function = len,
)


In [6]:
from langchain.llms import OpenAI
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.chains.question_answering import load_qa_chain
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQA

texts = text_splitter.split_documents(documents)

embeddings = OpenAIEmbeddings()
vectorstore = Chroma.from_documents(texts, embeddings)
#retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k":1})
#qa = RetrievalQA.from_chain_type(llm=OpenAI(), chain_type="stuff", retriever=retriever, return_source_documents=True)

Using embedded DuckDB without persistence: data will be transient


In [7]:
from langchain.callbacks.base import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.prompts.prompt import PromptTemplate
from langchain.chains import (
    ConversationalRetrievalChain,
    LLMChain
)
from langchain.chat_models import ChatOpenAI

_template = """Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question.
Chat History:
{chat_history}
Follow Up Input: {question}
Standalone question:"""
condense_question_prompt = PromptTemplate.from_template(_template)

template = """You are an AI assistant for answering questions about threat intelligence.
You are given the following extracted parts of a long intelligence document and a question. Provide a concise and accurate answer.
If you don't know the answer, just say "I don't have the data to answer that." Don't try to make up an answer.
Question: {question}
=========
{context}
=========
Answer in Markdown:"""
qa_prompt = PromptTemplate(template=template, input_variables=["question", "context"])


# define two LLM models from OpenAI
llm = ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo")
 
streaming_llm = OpenAI(
    streaming=True,
    callback_manager=CallbackManager([
        StreamingStdOutCallbackHandler()
    ]),
    verbose=True,
    max_tokens=150,
    temperature=0.2
)
 
# use the LLM Chain to create a question creation chain
question_generator = LLMChain(
    llm=llm,
    prompt=condense_question_prompt
)
 
# use the streaming LLM to create a question answering chain
doc_chain = load_qa_chain(
    llm=streaming_llm,
    chain_type="stuff",
    prompt=qa_prompt
)

In [8]:
chatbot = ConversationalRetrievalChain(
    retriever=vectorstore.as_retriever(),
    combine_docs_chain=doc_chain,
    question_generator=question_generator,
    return_source_documents=True
)

In [9]:
# create a chat history buffer
chat_history = []
# gather user input for the first question to kick off the bot
question = input()

# keep the bot running in a loop to simulate a conversation
while True:
    result = chatbot(
        {"question": question, "chat_history": chat_history}
    )
    print("\n\nReferences:")
    [print("{}".format(document.metadata['source'])) for document in result['source_documents']]
    print("========================")
    chat_history.append((result["question"], result["answer"]))
    question = input()


Yes, Russia is seen as a threat to Europe. Russia has used military pressure against Estonia, Latvia and Lithuania, and has conducted military exercises simulating a full-scale war with NATO in Europe. Russia has also used European politicians to disseminate propaganda and has threatened the EU with military conflict.

References:
reports/efia2018.pdf
reports/efia2019.pdf
reports/nis2017.pdf
reports/efia2019.pdf

Russia is seen as a threat to Europe due to its continued opposition to the system of Western values, its military pressure against Estonia, Latvia and Lithuania, and its attempts to challenge the sanctions regime. Russia has also taken steps to ease tensions with the West, and seeks to continue economic and political cooperation with Western countries in general and Europe in particular.

References:
reports/nis2017.pdf
reports/efia2018.pdf
reports/nis2019.pdf
reports/efia2018.pdf

I don't have the data to answer that.

References:
reports/sapo2020.pdf
reports/sapo2021.pdf
r

KeyboardInterrupt: 

In [None]:
'''
result = qa_chain({"question": "What is China's modus operandi?", "chat_history": None})
print(result)
'''

In [None]:
'''
query = "What is China's modus operandi?"
r = qa({"query": query})
print(f"Svar:{r.get('result')}")
print(f"\nReferenser:")
for document in r.get('source_documents'):
    print(f"Rapport: {document.metadata['source']}, sida {document.metadata['page']}")
'''