In [3]:
import concurrent.futures
import requests
import re
import tiktoken
import concurrent
import ast
from bs4 import BeautifulSoup
from openai import OpenAI
from csv import reader, writer
from functools import lru_cache
from tenacity import retry, wait_random_exponential, stop_after_attempt
from pathlib import Path
from tqdm import tqdm
from scipy import spatial
from collections import namedtuple


URL = "https://paulgraham.com/{}"
GPT_MODEL = "gpt-3.5-turbo-0613"
EMBEDDING_MODEL = "text-embedding-ada-002"
LIBRARY = "../data/essays.csv"
client = OpenAI()
Essay = namedtuple("Essay", ["title", "url", "embedding"])


def is_library_empty(library):
    try:
        with open(library, "r", newline="") as f_object:
            reader_object = reader(f_object)
            first_data_row = next(reader_object, None)
            return first_data_row is None
    except FileNotFoundError:
        return True


def initiate_library(library):
    filename = Path(library)
    filename.parent.mkdir(exist_ok=True)
    with open(filename, "w"):
        pass


@lru_cache
def get_page_content(url):
    page = requests.get(URL.format(url))
    return BeautifulSoup(page.content, "html.parser")


@retry(wait=wait_random_exponential(min=1, max=40), stop=stop_after_attempt(3))
def get_embedding_response(text):
    response = client.embeddings.create(input=text, model=EMBEDDING_MODEL)
    return response


def create_chunks(text, n, tokenizer):
    tokens = tokenizer.encode(text)
    i = 0
    while i < len(tokens):
        j = min(i + int(1.5 * n), len(tokens))
        while j > i + int(0.5 * n):
            chunk = tokenizer.decode(tokens[i:j])
            if chunk.endswith(".") or chunk.endswith("?"):
                break
            j -= 1
        if j == i + int(0.5 * n):
            j = min(i + n, len(tokens))
        yield tokens[i:j]
        i = j


def get_essay_urls():
    soup = get_page_content(url="articles.html")
    links = soup.findAll("a")

    essay_urls = {
        link["href"]
        for link in links
        if (link["href"].endswith(".html"))
        and link["href"] not in {"index.html", "rss.html"}
    }
    return essay_urls


def get_essay_object(url):
    soup = get_page_content(url)
    raw_text = soup.get_text(separator="\n", strip=True)
    raw_text_without_notes = raw_text.split("Notes")[0]
    raw_text_without_thankyous = raw_text_without_notes.split("Thanks")[0]
    clean_text = re.sub(r"\[\n\d+\n\]", "", raw_text_without_thankyous)
    result = {
        "title": clean_text.split("\n")[0],
        "date": clean_text.split("\n")[1],
        "essay_text": "".join(clean_text.split("\n")[2:]),
    }
    return result


def chunk_essay_text(result):
    tokenizer = tiktoken.get_encoding("cl100k_base")
    chunks = create_chunks(result["essay_text"], 1800, tokenizer)
    text_chunks = [tokenizer.decode(chunk) for chunk in chunks]
    return text_chunks


def summarize_chunk(template_prompt, content):
    full_prompt = template_prompt + content
    response = client.chat.completions.create(
        model=GPT_MODEL, messages=[{"role": "user", "content": full_prompt}]
    )
    return response.choices[0].message.content


def summarize_essay(result):
    results = ""
    text_chunks = chunk_essay_text(result)

    summary_prompt = """Summarize this text from Paul Grahams esssay.\n\nContent:"""

    with concurrent.futures.ThreadPoolExecutor(
        max_workers=len(text_chunks)
    ) as executor:
        futures = [
            executor.submit(summarize_chunk, text_chunk, summary_prompt)
            for text_chunk in text_chunks
        ]
        with tqdm(total=len(text_chunks), position=1, leave=False) as pbar:
            for _ in concurrent.futures.as_completed(futures):
                pbar.update(1)
        for future in futures:
            data = future.result()
            results += data

    return results


def get_essays(urls, library):
    if is_library_empty(library):
        initiate_library(library)
        results = []
        for url in tqdm(urls, position=0):
            essay_object = get_essay_object(url)
            summarized_essay = summarize_essay(essay_object)
            embedding_response = get_embedding_response(summarized_essay)

            results.append(essay_object)

            file_reference = [
                essay_object["title"],
                URL.format(url),
                embedding_response.data[0].embedding,
            ]

            with open(library, "a") as f_object:
                writer_object = writer(f_object)
                writer_object.writerow(file_reference)
                f_object.close()

        return results


def read_essay_library(library):
    essay_objects = []
    with open(library, "r", newline="\n") as f_object:
        reader_object = reader(f_object)
        for row in reader_object:
            essay = Essay(title=row[0], url=row[1], embedding=ast.literal_eval(row[2]))
            essay_objects.append(essay)
    return essay_objects

In [52]:
essay_urls = get_essay_urls()

In [53]:
get_essays(essay_urls, LIBRARY)

100%|██████████| 220/220 [18:01<00:00,  4.92s/it]


[{'title': 'Keep Your Identity Small',
  'date': 'February 2009',
  'essay_text': "I finally realized today why politics and religion yield suchuniquely useless discussions.As a rule, any mention of religion on an online forum degeneratesinto a religious argument.  Why?  Why does this happen with religionand not with Javascript or baking or other topics people talk abouton forums?What's different about religion is that people don't feel they needto have any particular expertise to have opinions aboutit.  All they need is strongly held beliefs, and anyone can havethose.  No thread about Javascript will grow as fast as one aboutreligion, because people feel they have to be over some thresholdof expertise to post comments about that.  But on religion everyone'san expert.Then it struck me: this is the problem with politics too.  Politics,like religion, is a topic where there's no threshold of expertisefor expressing an opinion.  All you need is strong convictions.Do religion and politics h

In [11]:
essay_library = read_essay_library(LIBRARY)

In [41]:
def get_essays_ranked_by_relatedness(
    query,
    top_n=5,
    similarity_fn=lambda x, y: 1 - spatial.distance.cosine(x, y),
):
    essay_library = read_essay_library(LIBRARY)
    query_embedding_response = get_embedding_response(query)
    query_embedding = query_embedding_response.data[0].embedding
    essays_and_relatedness = [
        (essay, similarity_fn(query_embedding, essay.embedding))
        for essay in essay_library
    ]
    essays_and_relatedness.sort(key=lambda x: x[1], reverse=True)
    essays, relatedness = zip(*essays_and_relatedness)
    return essays[:top_n]

In [42]:
query = "How to start a startup"
get_essays_ranked_by_relatedness(query)

(Essay(title='How to Start Google', url='https://paulgraham.com/google.html', embedding=[0.016592775, -0.008017463, 0.020809079, -0.04548418, -0.010307239, 0.0036519673, -0.029734667, 0.019563647, -0.006126613, -0.016956026, 0.014478137, 0.0072131217, 0.0032952032, 0.009833717, -0.009678038, -0.007556913, 0.035053696, -0.012292146, -0.001316784, -0.0043200892, -0.035520732, -0.025894588, -0.002054637, -0.012357011, -0.013505144, 0.018876065, 0.018538762, 0.0018324703, 0.007777458, -0.006545, 0.0061233696, 0.011345099, -0.011779702, 0.013557036, -0.012525664, -0.009457492, -0.011799162, 0.0141927255, 0.014633816, -0.013595956, 0.023209129, 0.017280357, -0.0052995686, -0.011656457, -0.011552671, 0.00700555, -0.008173142, -0.017384144, 0.006619596, 0.022768037, 0.011429425, 0.038452685, -0.031836335, 5.1994317e-05, -0.02742543, 0.008205575, -0.024311854, 0.019797165, 0.0023708597, -0.002163288, 0.008095303, 0.0022362624, 0.00048406405, 0.0014797603, 0.006914737, 0.018512815, -0.030175759,