In [1]:
# Cell 1: Import necessary libraries
import os
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_community.llms import HuggingFacePipeline
from langchain.chains import RetrievalQA
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
from ipywidgets import widgets
from IPython.display import display, clear_output

In [2]:
def simple_text_loader(file_path):
    """Read one UTF-8 text file and wrap it in a single-element Document list.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A list holding one ``Document`` whose ``page_content`` is the whole
        file and whose ``metadata["source"]`` is the file's base name.
        If anything goes wrong while reading or wrapping the file, the error
        is printed and an empty list is returned instead of raising.
    """
    try:
        with open(file_path, encoding='utf-8') as handle:
            text = handle.read()
        doc = Document(
            page_content=text,
            # Only the base name is kept, so sources print without directories.
            metadata={"source": os.path.basename(file_path)},
        )
    except Exception as err:
        # Best effort: report the failure and let the caller carry on.
        print(f"Error loading {file_path}: {err}")
        return []
    return [doc]

def load_documents(directory):
    """Load every ``.txt`` file found directly inside ``directory``.

    Args:
        directory: Path of the folder to scan (sub-folders are not visited).

    Returns:
        A flat list of everything ``simple_text_loader`` returned for each
        matching file, in alphabetical order of file name.

    Raises:
        OSError: If ``directory`` cannot be listed (e.g. it does not exist).
    """
    documents = []
    # os.listdir returns names in arbitrary order; sorting keeps the document
    # order (and everything derived from it) stable across runs and machines.
    for filename in sorted(os.listdir(directory)):
        if filename.endswith('.txt'):
            file_path = os.path.join(directory, filename)
            documents.extend(simple_text_loader(file_path))
    return documents

In [3]:
# Cell 3: Load and process documents
# Tunable settings are named here instead of being buried in the calls below.
POLICY_DIR = './policies/'
CHUNK_SIZE = 1000     # upper bound on the size of each chunk
CHUNK_OVERLAP = 200   # amount shared between neighbouring chunks

# Load documents
documents = load_documents(POLICY_DIR)
print(f"Loaded {len(documents)} documents")
if not documents:
    # Fail here with a clear message rather than later, when an empty corpus
    # reaches the indexing step.
    raise FileNotFoundError(f"No readable .txt documents found in {POLICY_DIR}")

# Split documents into chunks
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)
splits = text_splitter.split_documents(documents)
print(f"Created {len(splits)} splits")

Loaded 4 documents
Created 54 splits


In [4]:
# Cell 4: Create vector store
# Embedding model used to turn each chunk into a vector
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# Build the FAISS vector store from the chunks using those embeddings
vectorstore = FAISS.from_documents(splits, embeddings)
print("Vector store created")


  embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Vector store created


In [5]:
# Cell 5: Set up retriever and language model
# Retriever over the vector store, asking for 5 results per query
retriever = vectorstore.as_retriever(search_kwargs={"k": 5})

# Tokenizer and seq2seq model are loaded from the same checkpoint name
model_name = "google/flan-t5-large"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Generation settings gathered in one place and unpacked into the pipeline
generation_kwargs = {
    "max_new_tokens": 150,  # about 100 words, per the original author's estimate
    "temperature": 0.2,
    "no_repeat_ngram_size": 3,
    "do_sample": True,  # sampling is enabled, so answers may differ between runs
    "top_k": 50,
    "top_p": 0.95,
}
text_generation_pipeline = pipeline(
    "text2text-generation",
    model=model,
    tokenizer=tokenizer,
    **generation_kwargs,
)

# Wrap the transformers pipeline so LangChain can use it as an LLM
llm = HuggingFacePipeline(pipeline=text_generation_pipeline)

# RetrievalQA chain of type "stuff" that also returns the retrieved documents
# NOTE(review): 5 chunks of up to 1000 characters may exceed the model's
# input limit when stuffed into one prompt - confirm.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
)
print("QA chain created")

QA chain created


  llm = HuggingFacePipeline(pipeline=text_generation_pipeline)


In [6]:
# Cell 6: Define question-answering function
def ask_question(question):
    """Run ``question`` through the module-level ``qa_chain``.

    Args:
        question: The question text, passed to the chain under the "query" key.

    Returns:
        A ``(answer, source_documents)`` tuple taken from the chain's
        "result" and "source_documents" outputs.
    """
    # Calling the chain object directly is deprecated in LangChain; invoke()
    # is the supported entry point and takes the same input dict.
    result = qa_chain.invoke({"query": question})
    return result["result"], result["source_documents"]

In [15]:
# Cell 7: Example usage
# Alternative sample query, kept for convenience; it is not used below.
question2 = "What is the purpose of the Lion Group Human Resource Manual?"
question = "Hi how are you"
answer, sources = ask_question(question)

# Same three lines of output as separate prints, emitted in one call
print(f"Question: {question}\n", f"Answer: {answer}\n", "Sources:", sep="\n")
for doc in sources:
    print(f"- Source: {doc.metadata['source']}")

Question: Hi how are you

Answer: I don't know

Sources:
- Source: LGPG-GHR-003.txt
- Source: LGPG-GHR-003.txt
- Source: LGPG-GHR-003.txt
- Source: LGPG-GHR-006.txt
- Source: LGPG-GHR-003.txt


In [14]:
from ipywidgets import widgets
from IPython.display import display, clear_output

def on_ask_button_clicked(b):
    """Button callback: answer the question currently typed in ``question_widget``.

    Empty input is ignored, and typing ``exit`` only prints a farewell message.
    Otherwise the question is answered via ``ask_question``, the cell output
    is cleared, the widgets are shown again, and the question, answer and
    source file names are printed.

    Args:
        b: The button that was clicked; it is disabled while the answer is
            being produced and re-enabled afterwards.
    """
    # Surrounding whitespace is not part of the question and would otherwise
    # defeat the 'exit' check below.
    question = question_widget.value.strip()
    if not question:
        # Nothing typed: skip the model call entirely.
        return
    if question.lower() == 'exit':
        print("Exiting Q&A session.")
        return

    # Prevent repeat clicks from queueing up while the answer is produced;
    # the finally clause re-enables the button even if the call fails.
    b.disabled = True
    try:
        answer, sources = ask_question(question)
    finally:
        b.disabled = False

    # Replace the previous answer: clear the output, then show the widgets again.
    clear_output(wait=True)
    display(question_widget)
    display(ask_button)

    print(f"Question: {question}\n")
    print(f"Answer: {answer}\n")
    print("Sources:")
    for source in sources:
        print(f"- Source: {source.metadata['source']}")

# Free-text box where the user types a question
question_widget = widgets.Text(
    description='Question:',
    placeholder='Type your question here',
    value='',
    disabled=False,
)

# Button that runs on_ask_button_clicked when pressed
ask_button = widgets.Button(
    description='Ask',
    icon='question',
    tooltip='Click to ask the question',
    button_style='',
    disabled=False,
)
ask_button.on_click(on_ask_button_clicked)

# Show the text box first, then the button
display(question_widget, ask_button)

Text(value='Hi how are you', description='Question:', placeholder='Type your question here')

Button(description='Ask', icon='question', style=ButtonStyle(), tooltip='Click to ask the question')

Question: Hi how are you

Answer: I don't know

Sources:
- Source: LGPG-GHR-003.txt
- Source: LGPG-GHR-003.txt
- Source: LGPG-GHR-003.txt
- Source: LGPG-GHR-006.txt
- Source: LGPG-GHR-003.txt
