In [1]:
pip install langchain_community tiktoken langchain-openai langchainhub chromadb langchain

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
from dotenv import load_dotenv

load_dotenv()
LANGCHAIN_API_KEY = os.getenv('LANGCHAIN_API_KEY')
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')

os.environ['LANGCHAIN_TRACING_V2'] = 'true'
os.environ['LANGCHAIN_ENDPOINT'] = 'https://api.smith.langchain.com'
os.environ['LANGCHAIN_API_KEY'] = LANGCHAIN_API_KEY
os.environ['OPENAI_API_KEY'] = OPENAI_API_KEY

In [3]:
os.environ['USER_AGENT'] = "chris bot (chriswillsflannery@gmail.com)"

In [17]:
from bs4 import BeautifulSoup, SoupStrainer
from langchain import hub
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

def parse_html(html):
    soup = BeautifulSoup(html, 'html.parser')
    text = ""
    for tag in soup.find_all(['h1', 'h4', 'p']):
        text += tag.get_text() + "\n\n"
    return text

# create custom WebBaseLoader
class CustomWebLoader(WebBaseLoader):
    def parse(self, raw_html):
        return parse_html(raw_html)

loader = CustomWebLoader("https://chriswillsflannery.vercel.app/posts/willAITakeCodingJobs")
docs = loader.load()

# check loaded content
print(f"Number of documetns: {len(docs)}")
if docs:
    print(f"Content length: {len(docs[0].page_content)}")
    print(f"First 500 characters:\n{docs[0].page_content[:500]}")

# text split if content lodaded successfully
if docs and docs[0].page_content:
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    splits = text_splitter.split_documents(docs)

    try:
        vectorstore = Chroma.from_documents(documents=splits, embedding=OpenAIEmbeddings())
        print("Vectorstore created successfully")
    except Exception as e:
        print(f"Error creating vectorstore: {e}")
else:
    print("No content was loaded form webpage")

retriever = vectorstore.as_retriever()

Number of documetns: 1
Content length: 8258
First 500 characters:
ChrisWF.Will AI replace coding jobs?Short answer, yeah, probably. How soon? Well, that largely remains to be seen.Up until a couple weeks ago, I wasn't afraid of AI taking software engineering jobs. I tend to agree with thinkfluencers in the tech and AI space like Mike Solana, who says in his newsletter Pirate Wires, 'Artificial intelligence is a serious and potentially (probably!) paradigm-altering technology. There are risks here, which will become obvious in a more concrete sense as the techn
Vectorstore created successfully


In [22]:
# retrieval and generation
from langchain.prompts import PromptTemplate

prompt_template = PromptTemplate.from_template("""
You are an assistant for helping a content creator to create scripts for his videos.
Use the following pieces of retrieved context to help inform the tone of voice for the scripts.
Given a topic, you should create a script for a 2-minute video.
The script should include a hook with any of the following components:
Component 1: an extreme result or problem that your video is going to fix
Component 2: Always use the word "You"
Component 3: I'm going to give you some massive amount of value in this one video
Component 4: use extreme words like "exactly"
The script should obliterate the viewer's objections.
It should attempt to infer what the user is thinking about the previously mentioned extreme claim, and tell it straight back to them.
For example: 'I know you're probably thinking this sounds corny or stupid or cliche but hear me out nobody is going to teach you this stuff so listen up'
The script should lock in the scarcity/extremity of the video so viewers have to listen - this can be related to why this video is unique.
Topic: {topic} 
Context: {context} 
Answer:
""")

In [23]:
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)

def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

rag_chain = (
    {"context": retriever | format_docs, "topic": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

rag_chain.invoke("The future of software development")

TypeError: unsupported operand type(s) for |: 'dict' and 'str'