In [1]:
pip install langchain_community tiktoken langchain-openai langchainhub chromadb langchain

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
from dotenv import load_dotenv

# Load variables from a .env file into the process environment, then read the
# two API keys this notebook needs.
load_dotenv()
LANGCHAIN_API_KEY = os.getenv('LANGCHAIN_API_KEY')
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')

# Fail early with a readable message: os.getenv returns None for an unset
# variable, and assigning None into os.environ raises an opaque TypeError.
missing_keys = [
    name
    for name, value in (
        ('LANGCHAIN_API_KEY', LANGCHAIN_API_KEY),
        ('OPENAI_API_KEY', OPENAI_API_KEY),
    )
    if not value
]
if missing_keys:
    raise EnvironmentError(
        f"Missing required environment variable(s): {', '.join(missing_keys)}. "
        "Define them in your shell or in a .env file before running this notebook."
    )

# LangSmith tracing configuration and credentials used by the cells below.
os.environ['LANGCHAIN_TRACING_V2'] = 'true'
os.environ['LANGCHAIN_ENDPOINT'] = 'https://api.smith.langchain.com'
os.environ['LANGCHAIN_API_KEY'] = LANGCHAIN_API_KEY
os.environ['OPENAI_API_KEY'] = OPENAI_API_KEY
# Identifies this client in the USER_AGENT environment variable.
os.environ['USER_AGENT'] = "chris bot (chriswillsflannery@gmail.com)"

In [3]:
from bs4 import BeautifulSoup, SoupStrainer
from langchain.schema import Document
from langchain import hub
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain.prompts import PromptTemplate

def parse_html(soup):
    """Extract question/answer items from a parsed HTML page.

    Every ``<h3>`` element becomes a ``"question"`` item and every ``<p>``
    element becomes an ``"answer"`` item, in the order ``soup.find_all``
    returns them.

    Args:
        soup: Object exposing ``find_all(names)`` whose results have a
            ``name`` attribute and a ``get_text()`` method.

    Returns:
        list[dict]: Items of the form ``{"type": ..., "content": ...}`` where
        ``content`` is the element text with surrounding whitespace removed.
    """
    kind_by_tag = {'h3': 'question', 'p': 'answer'}
    return [
        {"type": kind_by_tag[element.name], "content": element.get_text().strip()}
        for element in soup.find_all(['h3', 'p'])
        # Anything that is neither <h3> nor <p> is ignored.
        if element.name in kind_by_tag
    ]

class CustomWebLoader(WebBaseLoader):
    """Web loader that turns a page's ``<h3>``/``<p>`` elements into documents."""

    def load(self):
        """Scrape the page and wrap each parsed item in a ``Document``.

        Returns:
            list: One ``Document`` per item produced by ``parse_html``, with
            the item's content as ``page_content`` and its type stored under
            the ``"type"`` metadata key. If anything raises while scraping or
            parsing, the error is printed and an empty list is returned.
        """
        try:
            items = parse_html(self.scrape())
            return [
                Document(page_content=item["content"], metadata={"type": item["type"]})
                for item in items
            ]
        except Exception as err:
            # Best-effort loading: report the problem and hand back no documents.
            print(f"An error occurred while loading documents: {err}")
            return []

# Group the question and answer documents into Q&A pairs
def split_qa_pairs(docs):
    """Group consecutive question/answer documents into Q&A pair documents.

    A ``'question'`` document starts a new pair; the ``'answer'`` documents
    that follow it are collected until the next question. All answer
    paragraphs belonging to a question are kept (joined with newlines), so a
    multi-paragraph answer is no longer reduced to its last paragraph.
    Answers that appear before any question are grouped under the placeholder
    question ``'Unknown Question'``. Documents of any other type are ignored.

    Args:
        docs: Iterable of documents exposing ``metadata['type']`` and
            ``page_content``.

    Returns:
        list: ``Document`` objects whose ``page_content`` has the form
        ``"Q: <question>\\nA: <answer>"`` and whose metadata is
        ``{'type': 'qa_pair'}``.
    """

    def _build_pair(question, answers):
        # Empty paragraphs are dropped so they do not add blank lines.
        answer_text = "\n".join(part for part in answers if part)
        return Document(
            page_content=f"Q: {question}\nA: {answer_text}",
            metadata={'type': 'qa_pair'}
        )

    qa_pairs = []
    question = ''
    answers = []
    for doc in docs:
        kind = doc.metadata['type']
        if kind == 'question':
            # A new question closes the pair collected so far, if any.
            if question or answers:
                qa_pairs.append(_build_pair(question, answers))
            question, answers = doc.page_content, []
        elif kind == 'answer':
            # An answer with no preceding question gets a placeholder question.
            if not question:
                question = 'Unknown Question'
            answers.append(doc.page_content)

    # Flush the final pair, which has no following question to close it.
    if question or answers:
        qa_pairs.append(_build_pair(question, answers))
    return qa_pairs

# Load the Q&A page and print a short summary of what came back
url = "https://chriswillsflannery.vercel.app/posts/ragExamplesForJobApplication"
loader = CustomWebLoader(url)
try:
    docs = loader.load()
    if docs:
        print(f"Number of documents: {len(docs)}")
        # Preview only the first two documents, truncated to 100 characters
        for doc in docs[:2]:
            print(f"Type: {doc.metadata['type']}, Content: {doc.page_content[:100]}...")
    else:
        print("No documents were loaded.")
except Exception as e:
    print(f"An error occurred while loading documents: {e}")

# Build the vector store from the Q&A pairs, if any content was loaded.
# Start from None so a failed or skipped build cannot silently reuse a
# vectorstore left in the kernel by an earlier run of this cell.
vectorstore = None
if docs:
    qa_splits = split_qa_pairs(docs)

    try:
        vectorstore = Chroma.from_documents(documents=qa_splits, embedding=OpenAIEmbeddings())
        print("Vectorstore created")
    except Exception as e:
        print(f"Vectorstore creation failed: {e}")

# Without a vector store there is nothing to retrieve from; stop here with a
# clear message instead of a NameError on the next line.
if vectorstore is None:
    raise RuntimeError(
        "No vectorstore is available: no documents were loaded or vectorstore creation failed."
    )

retriever = vectorstore.as_retriever()

Number of documents: 44
Type: answer, Content: Jupyter notebook with RAG pipeline using these Q&A...
Type: answer, Content: Try it out here (Flask app)...
Vectorstore created


In [4]:
# Retrieval and generation
# Prompt text: the retrieved examples fill {context} and the new application
# question fills {question}.
PROMPT_TEXT = """
You are an assistant helping a job applicant complete job applications. Use the following retrieved context to inform the tone and style of your answers. The context contains examples of the applicant's previous responses to job application questions.

Retrieved context:
{context}

Now, based on the style and content of the above context, please answer the following job application question:

Question: {question}

Answer:
"""
prompt_template = PromptTemplate.from_template(PROMPT_TEXT)

# Chat model that generates the final answer.
llm = ChatOpenAI(
    model_name="gpt-3.5-turbo",
    temperature=0.2,
)

def format_docs(docs):
    """Render documents as one text block for the prompt's context slot.

    Each document becomes ``"<Type>: <page_content>"``, where ``<Type>`` is
    the capitalized ``metadata['type']`` value; entries are separated by a
    blank line.

    Args:
        docs: Iterable of documents exposing ``metadata['type']`` and
            ``page_content``.

    Returns:
        str: The joined text (empty string when ``docs`` is empty).
    """
    rendered = []
    for doc in docs:
        label = doc.metadata['type'].capitalize()
        rendered.append(f"{label}: {doc.page_content}")
    return "\n\n".join(rendered)


# Inputs for the prompt: the question is sent to the retriever, whose results
# are formatted into the context text, and is also passed through unchanged.
rag_inputs = {
    "context": retriever | format_docs,
    "question": RunnablePassthrough(),
}

# Full pipeline: gather inputs -> fill the prompt -> call the model -> plain string.
rag_chain = rag_inputs | prompt_template | llm | StrOutputParser()

# Usage
# Ask the chain a sample application question and print its answer.
# (A stray, truncated `formatted_docs = format_do` statement was removed: it
# raised NameError inside the try, so the chain was never invoked.)
try:
    response = rag_chain.invoke("How do you approach troubleshooting and debugging issues?")
    print(response)
except Exception as e:
    print(f"An error occurred during the RAG chain execution: {e}")

I approach troubleshooting and debugging with a methodical and collaborative mindset. I believe in first understanding the root cause of the issue by analyzing logs, error messages, and code snippets. I also like to consult with team members or experts in the specific technology if needed, to gain different perspectives and insights. Once I have identified the problem, I work on implementing a solution step by step, testing each change to ensure it resolves the issue without causing any new problems. I am persistent and patient in my approach, always striving to improve my problem-solving skills and learn from each debugging experience.
