In [1]:
import concurrent.futures
import requests
import re
import tiktoken
import concurrent
import ast
import json
from bs4 import BeautifulSoup
from openai import OpenAI
from csv import reader, writer
from functools import lru_cache
from tenacity import retry, wait_random_exponential, stop_after_attempt
from pathlib import Path
from tqdm import tqdm
from scipy import spatial
from collections import namedtuple
from termcolor import colored
from IPython.display import Markdown

# Base address of the essay site; "{}" is replaced with a page name such as "articles.html".
URL = "https://paulgraham.com/{}"
# Chat model used for chunk summaries, the final summary and tool-calling requests.
GPT_MODEL = "gpt-3.5-turbo-0613"
# Embedding model used for both essay titles and user queries.
EMBEDDING_MODEL = "text-embedding-ada-002"
# CSV file caching one row per essay: title, url, title embedding, essay text.
LIBRARY = "../data/essays.csv"
# Single OpenAI client shared by every API call in this notebook.
client = OpenAI()
# In-memory record for one row of the library CSV.
Essay = namedtuple("Essay", ["title", "url", "embedding", "text"])


def is_library_empty(library):
    """Tell whether the CSV library holds no rows.

    Args:
        library: Path of the CSV file to inspect.

    Returns:
        True when the file does not exist or the CSV reader yields no
        first row; False otherwise.
    """
    try:
        csv_file = open(library, "r", newline="")
    except FileNotFoundError:
        # A library that was never created counts as empty.
        return True
    with csv_file:
        # Only the first row matters: its absence means there is no data.
        return next(reader(csv_file), None) is None


def initiate_library(library):
    """Create (or truncate) an empty library file.

    Args:
        library: Path of the CSV file to create. Missing parent
            directories are created as well.
    """
    filename = Path(library)
    # parents=True so a nested path such as "../data/cache/essays.csv" works
    # even when several directory levels are missing.
    filename.parent.mkdir(parents=True, exist_ok=True)
    # Opening in "w" mode creates the file or empties an existing one.
    with open(filename, "w"):
        pass


@lru_cache
def get_page_content(url):
    """Download one page of the essay site and parse it.

    Results are memoized per page name, so each page is fetched at most
    once per kernel session.

    Args:
        url: Page name relative to the site root (e.g. "articles.html");
            it is substituted into the URL template.

    Returns:
        BeautifulSoup document built from the response body.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond in time.
    """
    # A timeout keeps a stalled connection from hanging the whole scrape.
    page = requests.get(URL.format(url), timeout=30)
    # Fail loudly instead of parsing (and caching) an error page as an essay.
    page.raise_for_status()
    return BeautifulSoup(page.content, "html.parser")


@retry(stop=stop_after_attempt(3), wait=wait_random_exponential(min=1, max=40))
def get_embedding_response(text):
    """Request an embedding for ``text`` from the embeddings endpoint.

    The call is retried up to three attempts with randomized exponential
    back-off between tries.

    Args:
        text: Input passed to the embedding model.

    Returns:
        The raw embeddings response object returned by the client.
    """
    return client.embeddings.create(model=EMBEDDING_MODEL, input=text)


def create_chunks(text, n, tokenizer):
    """Yield successive token slices of roughly ``n`` tokens each.

    Each slice preferably ends where the decoded text ends with "." or "?".
    The search for such an end starts at 1.5 * n tokens and shrinks towards
    0.5 * n; when none is found the slice is cut at exactly ``n`` tokens
    (or at the end of the input).

    Args:
        text: String to split.
        n: Target chunk size in tokens.
        tokenizer: Object exposing ``encode(str)`` and ``decode(tokens)``.

    Yields:
        Slices of the encoded token sequence, in order and without overlap.
    """
    token_ids = tokenizer.encode(text)
    total = len(token_ids)
    longest = int(1.5 * n)
    shortest = int(0.5 * n)

    start = 0
    while start < total:
        floor = start + shortest
        end = min(start + longest, total)
        # Walk the end backwards until the decoded chunk closes a sentence.
        while end > floor and not tokenizer.decode(token_ids[start:end]).endswith(
            (".", "?")
        ):
            end -= 1
        if end == floor:
            # No sentence boundary in range: fall back to a plain n-token cut.
            end = min(start + n, total)
        yield token_ids[start:end]
        start = end


def get_essay_urls():
    """Collect the essay page names linked from the articles index.

    Returns:
        Set of href values ending in ".html", excluding the site's
        "index.html" and "rss.html" pages.
    """
    soup = get_page_content(url="articles.html")
    non_essay_pages = {"index.html", "rss.html"}

    # href=True skips anchors without an href attribute, which would otherwise
    # raise KeyError when indexed.
    return {
        link["href"]
        for link in soup.find_all("a", href=True)
        if link["href"].endswith(".html") and link["href"] not in non_essay_pages
    }


def get_essay_object(url):
    """Fetch an essay page and extract its title, date and body text.

    Everything from the first occurrence of "Notes" or "Thanks" onwards is
    dropped, and footnote markers of the form "[", number, "]" on separate
    lines are removed.

    Args:
        url: Page name of the essay, relative to the site root.

    Returns:
        Dict with keys "title" (first line), "date" (second line, or ""
        when the page has a single line) and "essay_text" (the remaining
        lines joined with single spaces).
    """
    soup = get_page_content(url)
    raw_text = soup.get_text(separator="\n", strip=True)
    body_text = raw_text.split("Notes")[0].split("Thanks")[0]
    clean_text = re.sub(r"\[\n\d+\n\]", "", body_text)

    lines = clean_text.split("\n")
    return {
        "title": lines[0],
        # Guard against pages too short to carry a date line.
        "date": lines[1] if len(lines) > 1 else "",
        # Every fragment was stripped of surrounding whitespace, so joining
        # with "" would run words together; skip the empty lines left behind
        # by removed footnote markers to avoid doubled spaces.
        "essay_text": " ".join(line for line in lines[2:] if line),
    }


def get_essays(urls, library):
    """Scrape every essay and append it to the CSV library.

    Nothing happens when the library already contains rows. Otherwise the
    file is (re)created and one row is appended per essay: title, full URL,
    embedding of the title, essay text.

    Args:
        urls: Iterable of essay page names relative to the site root.
        library: Path of the CSV library file.
    """
    if not is_library_empty(library):
        return

    initiate_library(library)
    for url in tqdm(urls, position=0):
        essay_object = get_essay_object(url)
        embedding_response = get_embedding_response(essay_object["title"])

        file_reference = [
            essay_object["title"],
            URL.format(url),
            embedding_response.data[0].embedding,
            essay_object["essay_text"],
        ]

        # Append row by row so already-fetched essays are on disk if a later
        # request fails. newline="" lets the csv writer control line endings
        # (otherwise platforms that translate "\n" produce blank rows); the
        # with-block closes the file, no explicit close() is needed.
        with open(library, "a", newline="") as f_object:
            writer(f_object).writerow(file_reference)


def read_essay_library(library):
    """Load every essay stored in the CSV library.

    Args:
        library: Path of the CSV library file.

    Returns:
        List of Essay tuples in file order. The embedding column is parsed
        from its textual list form back into a Python list.
    """
    essay_objects = []
    # newline="" is what the csv module expects so it can handle line endings
    # (including ones embedded in quoted fields) itself.
    with open(library, "r", newline="") as f_object:
        for row in reader(f_object):
            if not row:
                # Blank lines (e.g. from a file written without newline="")
                # carry no essay and would fail on row[0].
                continue
            essay_objects.append(
                Essay(
                    title=row[0],
                    url=row[1],
                    # literal_eval only accepts Python literals, unlike eval.
                    embedding=ast.literal_eval(row[2]),
                    text=row[3],
                )
            )
    return essay_objects

In [2]:
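# One-time setup: uncomment the two lines below to scrape the essays and build
# the library CSV. get_essays() does nothing when the library already has rows.
# essay_urls = get_essay_urls()
# get_essays(essay_urls, LIBRARY)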

In [3]:
# Load the cached essays once; get_essays_ranked_by_relatedness and
# read_and_summarize both read this module-level list.
essay_library = read_essay_library(LIBRARY)

In [4]:
def get_essays_ranked_by_relatedness(
    query,
    top_n=5,
    similarity_fn=lambda x, y: 1 - spatial.distance.cosine(x, y),
):
    """Return the essays whose stored embedding is closest to the query.

    Args:
        query: Free-text query; it is embedded with the same model as the
            stored essay embeddings.
        top_n: Maximum number of essays to return.
        similarity_fn: Callable taking (query_embedding, essay_embedding)
            and returning a score where larger means more related.
            Defaults to cosine similarity.

    Returns:
        Tuple of Essay objects sorted from most to least related; empty
        when the essay library is empty.
    """
    query_embedding = get_embedding_response(query).data[0].embedding
    essays_and_relatedness = [
        (essay, similarity_fn(query_embedding, essay.embedding))
        for essay in essay_library
    ]
    essays_and_relatedness.sort(key=lambda pair: pair[1], reverse=True)
    # Slicing the ranked pairs directly (instead of unpacking zip(*pairs))
    # keeps an empty library from raising ValueError.
    return tuple(essay for essay, _ in essays_and_relatedness[:top_n])

In [5]:
def chunk_essay_text(result, chunk_size=1800, encoding_name="cl100k_base"):
    """Split essay text into decoded chunks of roughly ``chunk_size`` tokens.

    Args:
        result: Essay text to split.
        chunk_size: Target number of tokens per chunk.
        encoding_name: Name of the tiktoken encoding used to tokenize.

    Returns:
        List of text chunks, in order.
    """
    tokenizer = tiktoken.get_encoding(encoding_name)
    return [
        tokenizer.decode(chunk)
        for chunk in create_chunks(result, chunk_size, tokenizer)
    ]


def summarize_chunk(template_prompt, content):
    """Ask the chat model to act on one piece of content.

    Args:
        template_prompt: Instruction text placed first in the prompt.
        content: Text appended directly after the instruction.

    Returns:
        Message content of the first completion choice.
    """
    user_message = {"role": "user", "content": template_prompt + content}
    completion = client.chat.completions.create(
        messages=[user_message],
        model=GPT_MODEL,
        temperature=0,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content


def read_and_summarize(title):
    """Summarize the library essay with the given title.

    The essay text is split into chunks, each chunk is summarized in its own
    worker thread, and the partial summaries are combined by a final chat
    completion.

    Args:
        title: Exact title of an essay in the essay library.

    Returns:
        The chat completion response holding the final summary.

    Raises:
        IndexError: If no essay in the library has that title.
    """
    matching_texts = [essay.text for essay in essay_library if essay.title == title]
    if not matching_texts:
        raise IndexError(f"No essay titled {title!r} found in the essay library")
    text_chunks = chunk_essay_text(matching_texts[0])

    summary_prompt = """Summarize this text from Paul Grahams esssay.\n\nContent:"""

    # ThreadPoolExecutor rejects max_workers=0, which an empty essay text
    # would otherwise produce.
    with concurrent.futures.ThreadPoolExecutor(
        max_workers=max(1, len(text_chunks))
    ) as executor:
        # summarize_chunk takes (template_prompt, content): the instruction
        # must come first so the chunk follows "Content:".
        futures = [
            executor.submit(summarize_chunk, summary_prompt, text_chunk)
            for text_chunk in text_chunks
        ]
        with tqdm(total=len(text_chunks)) as pbar:
            for _ in concurrent.futures.as_completed(futures):
                pbar.update(1)
        # Collect in submission order so the summaries follow the essay.
        results = "".join(future.result() for future in futures)

    response = client.chat.completions.create(
        model=GPT_MODEL,
        messages=[
            {
                "role": "user",
                "content": f"""Write a final summary for the requested Paul Graham essay from small summaries extracted from chunks of the same essay.
                            Chunked summaries:\n{results}\nSummary:\n""",
            }
        ],
        temperature=0,
    )

    return response

In [6]:
@retry(wait=wait_random_exponential(min=1, max=40), stop=stop_after_attempt(3))
def chat_completion_request(messages, tools=None, model=GPT_MODEL):
    """Send a chat completion request, retrying on failure.

    Args:
        messages: List of chat message dicts forming the conversation.
        tools: Optional list of tool definitions offered to the model.
        model: Name of the chat model to use.

    Returns:
        The chat completion response.

    Raises:
        tenacity.RetryError: When all three attempts have failed.
    """
    try:
        return client.chat.completions.create(
            model=model, messages=messages, tools=tools
        )
    except Exception as e:
        print("Unable to generate ChatCompletion response")
        print(f"Exception: {e}")
        # Re-raise so the retry decorator actually sees the failure; returning
        # the exception object would skip the retries and hand callers a value
        # without a `.choices` attribute.
        raise

In [7]:
class Conversation:
    """Ordered log of chat messages exchanged in one session."""

    def __init__(self):
        # Each entry is a dict with "role" and "content" keys.
        self.conversation_history = []

    def add_message(self, role, content):
        """Append one message with the given role and content to the log."""
        self.conversation_history.append({"role": role, "content": content})

    def display_conversation(self):
        """Print every logged message, colored according to its role."""
        color_for_role = {
            "system": "red",
            "user": "green",
            "assistant": "blue",
            "function": "magenta",
        }
        for entry in self.conversation_history:
            rendered = f"{entry['role']}: {entry['content']}\n\n"
            print(colored(rendered, color_for_role[entry["role"]]))

In [8]:
# Tool definitions offered to the chat model. Each "name" must match a branch
# in call_pg_essay_function, and each "parameters" schema must match the
# arguments that branch reads from the parsed tool call.
tools = [
    {
        "type": "function",
        "function": {
            "name": "get_essays_ranked_by_relatedness",
            "description": "Use this function to get Paul Graham essays to answer user questions.",
            "parameters": {
                "type": "object",
                "properties": {
                    "query": {"type": "string", "description": "User query in JSON."},
                },
                "required": ["query"],
            },
        },
    },
    {
        "type": "function",
        "function": {
            "name": "read_and_summarize",
            "description": """Use this function to read the whole Paul Graham essay and provide summary for user.
                        You should NEVER call this function before get_essays_ranked_by_relatedness has been called in the conversation.""",
            "parameters": {
                "type": "object",
                "properties": {
                    "title": {
                        "type": "string",
                        "description": "Title of the requested Paul Graham essay to be summarized",
                    },
                },
                # The dispatcher reads parsed_output["title"] unconditionally,
                # so the model must always supply it.
                "required": ["title"],
            },
        },
    },
]

In [9]:
def call_pg_essay_function(messages, full_message):
    """Execute the first tool call requested by the model.

    Args:
        messages: Conversation message list. For an essay search the
            formatted result is appended to this list in place.
        full_message: Completion choice whose message carries the tool call.

    Returns:
        For "get_essays_ranked_by_relatedness": the follow-up chat
        completion produced after the search result was appended.
        For "read_and_summarize": the chat completion holding the summary.

    Raises:
        Exception: If the requested function is unknown, if executing the
            search fails, or if the follow-up chat request fails.
    """
    # Only the first tool call of the message is handled.
    tool_call = full_message.message.tool_calls[0]
    function_name = tool_call.function.name

    if function_name == "get_essays_ranked_by_relatedness":
        try:
            parsed_arguments = json.loads(tool_call.function.arguments)
            print("Getting most relevant Paul Graham essays")
            results = get_essays_ranked_by_relatedness(parsed_arguments["query"])
        except Exception as e:
            # Print the raw arguments: the parsed form does not exist when
            # JSON decoding itself is what failed.
            print(tool_call.function.arguments)
            print("Function execution failed")
            print(f"Error message: {e}")
            # Without results there is nothing to report back to the model;
            # continuing would only fail later on an undefined variable.
            raise

        # Include each essay's URL so the model can quote the real address
        # instead of guessing one from the title.
        str_result = "".join(
            f"{index}. {essay.title} - {essay.url}\n"
            for index, essay in enumerate(results, start=1)
        )

        messages.append(
            {
                "role": full_message.message.role,
                "name": function_name,
                "content": str_result,
            }
        )

        try:
            print("Got search results, summarizing content")
            response = chat_completion_request(messages)
            return response
        except Exception as e:
            print(type(e))
            # Chain the original error so its traceback is not lost.
            raise Exception("Function chat request failed") from e

    elif function_name == "read_and_summarize":
        parsed_output = json.loads(tool_call.function.arguments)
        print("Finding and reading essay")
        summary = read_and_summarize(parsed_output["title"])
        return summary
    else:
        raise Exception("Function does not exist and can't be called")


def chat_completion_with_tool_execution(messages, tools=None):
    """Run one chat turn, executing a tool call when the model asks for one.

    Args:
        messages: Conversation message list sent to the model.
        tools: Optional tool definitions the model may call.

    Returns:
        The model's direct response when no tool was requested, otherwise
        whatever call_pg_essay_function returns for the requested tool.
    """
    response = chat_completion_request(messages, tools)
    first_choice = response.choices[0]

    if first_choice.finish_reason != "tool_calls":
        print("Function not required, responding to user.")
        return response

    print("Function generation requested, calling function")
    return call_pg_essay_function(messages, first_choice)

In [10]:
# Start a fresh conversation whose first entry is the system message that
# tells the assistant to surface essay titles and URLs.
system_message = """You are a helpful assistant that pulls relevant Paul Graham essays when needed to answer user questions.
                    You always provide essay title and url so that user could decide which essay to read to answer their question.
                    Begin!"""
conversation = Conversation()
conversation.add_message("system", system_message)

In [11]:
# Add a user message and let the model answer, using the tools if it asks to.
# The history list itself is passed in, so a search result appended by
# call_pg_essay_function also lands in conversation.conversation_history.
conversation.add_message("user", "Hi, what advice does Paul Graham like to give on the topic of starting a startup?")
chat_response = chat_completion_with_tool_execution(conversation.conversation_history, tools)
assistant_message = chat_response.choices[0].message.content
# Record the reply so the next turn can refer back to the listed essays.
conversation.add_message("assistant", assistant_message)
display(Markdown(assistant_message))

Function generation requested, calling function
Getting most relevant Paul Graham essays
Got search results, summarizing content


Here are the top 5 Paul Graham essays related to starting a startup:

1. Essay: How to Start a Startup
   URL: http://www.paulgraham.com/startuplessons.html

2. Essay: Ideas for Startups
   URL: http://www.paulgraham.com/startupideas.html

3. Essay: How to Fund a Startup
   URL: http://www.paulgraham.com/startupfunding.html

4. Essay: A Student's Guide to Startups
   URL: http://www.paulgraham.com/hs.html

5. Essay: The Future of Web Startups
   URL: http://www.paulgraham.com/webstartups.html

Feel free to click on the URLs to read any of these essays that you find most relevant to your question!

In [12]:
# Follow-up turn: ask for a summary of the first essay from the previous answer.
conversation.add_message("user", "Could you please read and summarize the first essay?")
updated_response = chat_completion_with_tool_execution(conversation.conversation_history, tools)
summary_message = updated_response.choices[0].message.content
# Record the reply, as the previous turn does, so the history stays complete
# for any further questions.
conversation.add_message("assistant", summary_message)
display(Markdown(summary_message))

Function generation requested, calling function
Finding and reading essay


100%|██████████| 5/5 [00:03<00:00,  1.56it/s]


In this essay, Paul Graham discusses the key factors for creating a successful startup. He emphasizes the importance of having good people, creating a product that customers actually want, and minimizing expenses. Graham argues that a brilliant idea is not necessary to start a successful startup, as long as the product or service offers better technology than what is currently available. He also highlights the importance of addressing intellectual property issues and being cautious when approaching investors. Graham advises startups to focus on niche markets, target smaller customers, and be cautious with spending. He emphasizes the need to understand the business and put users first in order to succeed. Graham also discusses the challenges and considerations related to age when starting a startup. Ultimately, he suggests that starting a startup can be a way to solve the money problem and offers advice on how to approach it: build something users love and spend less than you make.