In [11]:
import math
import os
import pickle

import pandas as pd
import pinecone
import requests
import xmltodict
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from langchain import OpenAI
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS, Pinecone

In [2]:
# Load variables from a local .env file into the process environment;
# PINECONE_API_KEY is read back from the environment later via os.getenv.
load_dotenv()

True

In [3]:
import sys

# Put the parent directory on the import path so that the `logger` and
# `config` modules imported below can be resolved (presumably they live
# one level above this notebook — confirm against the repo layout).
sys.path.append('..')

from logger import logger
from config import PINECONE_ENV

## Scrape URLs

In [4]:
# Download the sitemap; its <url>/<loc> entries are iterated further down
# to find the posts to scrape.
# HTTPS matches the page fetches elsewhere in this notebook, and the timeout
# keeps the cell from hanging forever on a stalled connection.
r = requests.get("https://eugeneyan.com/sitemap.xml", timeout=30)
r.raise_for_status()  # fail loudly instead of parsing an error page as XML
xml = r.text
raw = xmltodict.parse(xml)  # nested dict mirroring the XML structure

In [5]:
# Grab a single post and parse it so the page structure can be explored
# interactively before writing the extraction function.
html = requests.get(
    'https://eugeneyan.com/writing/content-moderation/'
).text
soup = BeautifulSoup(
    html,
    features="html.parser",
)

In [255]:
# _paragraphs = soup.find_all('p')
# _paragraphs[:5]

In [248]:
# paragraphs = []

# for p in _paragraphs:
#     if'class' in p.attrs and 'date' in p['class']:
#         continue
#     if p.get_text() == 'To cite this content, please use:':
#         break
#     paragraphs.append(p.get_text())

In [256]:
# lines = [line.strip() for line in paragraphs]
# lines[:5]

In [257]:
# lines = [line for line in lines if len(line) > 15]
# lines[:5]

In [259]:
# print('\n'.join(line for line in lines if line))

In [6]:
def extract_text_from(url, min_line_length=20, last_paragraph='To cite this content, please use:'):
    """Download a page and return the text of its <p> elements, one per line.

    Args:
        url: Address of the page to fetch.
        min_line_length: A paragraph is kept only when its stripped length is
            strictly greater than this value.
        last_paragraph: Exact text of the paragraph that ends the content;
            that paragraph and everything after it are ignored.

    Returns:
        The kept paragraphs, stripped of surrounding whitespace and joined
        with newlines. An empty string when no paragraph qualifies.
    """
    html = requests.get(url).text
    soup = BeautifulSoup(html, features="html.parser")

    # Collect paragraph texts up to (but excluding) the `last_paragraph` marker.
    paragraphs = []
    for p in soup.find_all('p'):
        # Skip paragraphs that carry the CSS class "date".
        if 'date' in p.get('class', []):
            continue
        text = p.get_text()
        if text == last_paragraph:
            break
        paragraphs.append(text)

    # Only preview the first paragraph when there is one: a page without any
    # usable <p> element previously raised IndexError here.
    if paragraphs:
        logger.debug(f'Paragraphs: {paragraphs[0]}')
    else:
        logger.debug(f'No paragraphs found at {url}')

    # Trim surrounding whitespace from each paragraph.
    lines = (line.strip() for line in paragraphs)

    # Keep non-empty lines longer than `min_line_length` characters.
    return '\n'.join(line for line in lines if line and len(line) > min_line_length)

In [7]:
# Smoke-test the extractor on a single post and eyeball the cleaned text.
print(extract_text_from('https://eugeneyan.com/writing/mechanisms-for-projects/'))

How can we improve a machine learning project’s chance of success? Over the years, I’ve explored various mechanisms in both my own projects and those of my team members. Most people who tried these mechanisms ended up adopting them in future projects.
While these mechanisms were developed with machine learning projects in mind, with a few tweaks, they can be applied to other technical endeavors too.
If your team is like most teams I’ve been on, you have 2 - 3 problems for every available person. Thus, each member works on 1 or 2 problems simultaneously, with some folks taking 3 or more. And because everyone’s so busy, we barely have time to check in on each other’s projects outside of standup, planning, retrospective, etc.
This is an anti-pattern. It can lead to a project going off-track for months, or a critical error (e.g., incorrect training data, invalid train-validation split) going undetected until late in the implementation phase.
One solution is to have a pilot and copilot for 

In [8]:
# Scrape every sitemap entry that lives under /writing/.
url_entries = raw['urlset']['url']
# xmltodict yields a single dict (not a list) when an element occurs only
# once, so normalise to a list before iterating.
if isinstance(url_entries, dict):
    url_entries = [url_entries]

pages = []

for info in url_entries:
    url = info['loc']
    # Prefix match rather than substring match, so only URLs that actually
    # start with the /writing/ path are scraped.
    if url.startswith('https://eugeneyan.com/writing/'):
        pages.append({'text': extract_text_from(url), 'url': url})

In [12]:
# Urls to leave out regardless of their length
excluded_urls = {''}

df = pd.DataFrame(pages)

# Short posts may be talks and mostly images, so keep only texts longer
# than 500 characters and drop any explicitly excluded url.
is_long_enough = df['text'].map(len) > 500
is_excluded = df['url'].isin(excluded_urls)
df = df[is_long_enough & ~is_excluded]

In [15]:
# Persist the filtered posts (text + url) as a gzip-compressed Parquet file.
df.to_parquet('../data/eugeneyan.parquet', compression='gzip')

## Split each page into documents

In [264]:
# Split on newlines, i.e. on the paragraph boundaries produced by
# extract_text_from, with a configured chunk size of 1500.
text_splitter = CharacterTextSplitter(chunk_size=1500, separator='\n')

docs, metadata = [], []

# NOTE(review): this iterates the unfiltered `pages` list, so the short-post
# and excluded-url filters applied to `df` do not affect what gets indexed —
# confirm whether that is intended.
for page in pages:
    splits = text_splitter.split_text(page['text'])
    docs.extend(splits)
    # Build one independent dict per chunk. `[{...}] * len(splits)` would
    # repeat the *same* dict object, so any later in-place change to one
    # chunk's metadata (e.g. by a vector store) would show up on all of them.
    metadata.extend({'source': page['url']} for _ in splits)  # This Q&A chain relies on the url being in the 'source' key
    logger.info(f'Split {page["url"]} into {len(splits)} docs')

2023-03-26 17:06:54,129 - Split https://eugeneyan.com/writing/llm-bio/ into 10 docs
2023-03-26 17:06:54,131 - Split https://eugeneyan.com/writing/labeling-guidelines/ into 5 docs
2023-03-26 17:06:54,133 - Split https://eugeneyan.com/writing/content-moderation/ into 14 docs
2023-03-26 17:06:54,134 - Split https://eugeneyan.com/writing/mechanisms-for-teams/ into 6 docs
2023-03-26 17:06:54,134 - Split https://eugeneyan.com/writing/mechanisms-for-projects/ into 5 docs
2023-03-26 17:06:54,135 - Split https://eugeneyan.com/writing/roam-to-obsidian/ into 2 docs
2023-03-26 17:06:54,135 - Split https://eugeneyan.com/writing/getting-help/ into 3 docs
2023-03-26 17:06:54,136 - Split https://eugeneyan.com/writing/2022-in-review/ into 6 docs
2023-03-26 17:06:54,137 - Split https://eugeneyan.com/writing/autoencoders-vs-diffusers/ into 3 docs
2023-03-26 17:06:54,138 - Split https://eugeneyan.com/writing/text-to-image/ into 16 docs
2023-03-26 17:06:54,138 - Split https://eugeneyan.com/writing/recsys20

## Create a FAISS vector store for offline prototyping

In [265]:
# Embed every chunk and build the FAISS index, keeping the per-chunk
# metadata alongside the vectors.
store = FAISS.from_texts(
    docs,
    OpenAIEmbeddings(),
    metadatas=metadata,
)

# Serialise the store to disk so it can be reloaded without re-embedding.
with open('../data/faiss_store.pkl', 'wb') as f:
    pickle.dump(store, f)  # This is a 10mb file

In [266]:
# question = 'Question for eugeneyan.com: Why is writing important?'

# with open('../data/faiss_store.pkl', 'rb') as f:
#     store = pickle.load(open('../data/faiss_store.pkl', 'rb'))

# chain = load_qa_with_sources_chain(ChatOpenAI(temperature=0), verbose=False)
# response = chain({'input_documents': store.similarity_search(question, 4), 
#                   'question': question})

In [267]:
# response

In [268]:
# # VectorDBQAWithSourcesChain is DEPRECATED

# question = 'Question for eugeneyan.com: Why is writing important?'

# with open('../data/faiss_store.pkl', 'rb') as f:
#     store = pickle.load(open('../data/faiss_store.pkl', 'rb'))

# llm=ChatOpenAI(temperature=0)
# chain = VectorDBQAWithSourcesChain.from_chain_type(llm, chain_type='stuff', vectorstore=store)
# response = chain({'question': question})

In [303]:
# Use this instead of load_qa_with_sources_chain for more control
question = 'Question for eugeneyan.com: What is content moderation?'

# Reload the store written by the previous cell. Read from the handle the
# `with` block manages; opening the file a second time inside pickle.load
# left that extra handle unclosed.
# NOTE: pickle.load executes arbitrary code from the file — only load
# pickles this notebook produced itself.
with open('../data/faiss_store.pkl', 'rb') as f:
    store = pickle.load(f)

llm = OpenAI(temperature=0)
chain = RetrievalQAWithSourcesChain.from_chain_type(
    llm,
    chain_type='stuff',
    retriever=store.as_retriever(),
    return_source_documents=True,  # keep the retrieved chunks in the response
)
response = chain({'question': question}, return_only_outputs=False)

In [304]:
# Display the full chain output (question, answer, sources, source documents).
response

{'question': 'Question for eugeneyan.com: What is content moderation?',
 'answer': ' Content moderation is the process of learning and inferring the quality of human-generated content such as product reviews, social media posts, and ads. It involves collecting a set of ground truth, using supervised ML models, and applying heuristics and unsupervised models.\n',
 'sources': 'https://eugeneyan.com/writing/content-moderation/',
 'source_documents': [Document(page_content='Content moderation is the process of learning and inferring the quality of human-generated content such as product reviews, social media posts, and ads. How do we know which are irrelevant, incorrect, or downright harmful? A related problem is detecting anomalous activity such as fraudulent transactions or malicious traffic.\nTo learn more about building robust content moderation systems, I dug into industry papers and tech blogs on classification, anomaly detection, and search relevance. Here are five patterns I observ

In [305]:
# Log the question/answer pair and the sources string returned by the chain.
logger.info(f'Question: {response["question"]}')
logger.info(f'Answer: {response["answer"]}')
logger.info(f'Sources: {response["sources"]}')

# The sources arrive as one ', '-separated string; turn it into a set for
# membership checks.
sources = set(response['sources'].split(', '))

# Log only the retrieved chunks whose url appears among the cited sources.
for doc in response['source_documents']:
    if doc.metadata["source"] not in sources:
        continue
    logger.info(f'URL: {doc.metadata["source"]}\n')
    logger.info(f'Source: {doc.page_content}\n')
    logger.info('=====================')

2023-03-26 17:14:41,484 - Question: Question for eugeneyan.com: What is content moderation?
2023-03-26 17:14:41,486 - Answer:  Content moderation is the process of learning and inferring the quality of human-generated content such as product reviews, social media posts, and ads. It involves collecting a set of ground truth, using supervised ML models, and applying heuristics and unsupervised models.

2023-03-26 17:14:41,488 - Sources: https://eugeneyan.com/writing/content-moderation/
2023-03-26 17:14:41,489 - URL: https://eugeneyan.com/writing/content-moderation/

2023-03-26 17:14:41,491 - Source: Content moderation is the process of learning and inferring the quality of human-generated content such as product reviews, social media posts, and ads. How do we know which are irrelevant, incorrect, or downright harmful? A related problem is detecting anomalous activity such as fraudulent transactions or malicious traffic.
To learn more about building robust content moderation systems, I du

In [306]:
# The chain returns the cited sources as a single string; split it on ', '
# to inspect the individual urls.
response['sources'].split(', ')

['https://eugeneyan.com/writing/content-moderation/']

In [149]:
# NOTE(review): `doc` is only ever bound by the `for doc in ...` loops in
# other cells, so what this cell shows depends on which of them ran last —
# confirm whether this leftover inspection cell is still needed.
doc

Document(page_content='How to Write: Advice from David Perell and Sahil Lavingia\neugeneyan\nStart Here\nWriting\nSpeaking\nNewsletter\nAbout\nHow to Write: Advice from David Perell and Sahil Lavingia\n[\nwriting\n]\n· 4 min read\nWriting is a superpower. Telepathy to be exact.\nThink about it—through writing, I broadcast ideas from my mind to yours. Across time. Across space. (Yes, the internet plays a big role, but let’s focus on writing.) The more effective your writing, the stronger your telepathic ability.\nWhy write about writing on this site?\nWriting is essential for effective data science. Good writing means you get buy-in on ideas, your methodology and experiments can be replicated, and readers understand enough to give feedback. Poor writing gets you zilch (and snores). Business folk have enough trouble understanding data geeks as it is—don’t make it harder with your writing.\nWriting is an important way to learn. (The other important way is learning.) When writing, you have

## Using Pinecone

In [307]:
# Configure the Pinecone client. The API key is read from the
# PINECONE_API_KEY environment variable (os.getenv yields None when it is
# unset) and the environment name comes from the imported PINECONE_ENV.
pinecone.init(api_key=os.getenv('PINECONE_API_KEY'), environment=PINECONE_ENV)

In [311]:
index_name = 'ask-ey'

# Delete and recreate index
# pinecone.delete_index(index_name)

# Create the index only when it is missing, so re-running this cell does not
# fail once the index already exists.
# dimension=1536 presumably matches the vector size produced by
# OpenAIEmbeddings — confirm if the embedding model changes.
if index_name not in pinecone.list_indexes():
    pinecone.create_index(index_name, dimension=1536, metric='cosine', pod_type='p2.x1')

# Build the handle after the index is known to exist.
index = pinecone.Index(index_name)
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

In [313]:
# # Initialize with small set of data - 
# p = Pinecone.from_texts(docs[0:2], 
#                         embeddings, 
#                         index_name=index_name, 
#                         metadatas=metadata[0:2])

# index.describe_index_stats()

In [314]:
# Embedding model handed to the Pinecone-backed store below.
embeddings = OpenAIEmbeddings()

# Load existing pinecone index
store = Pinecone.from_existing_index(
    index_name=index_name,
    embedding=embeddings,
)

# Show the current vector count of the index.
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

In [315]:
# Add data to pinecone in chunks to avoid errors
chunk_size = 100
last_chunk = 0  # first chunk to upload; presumably raised to resume a partial run
num_chunks = math.ceil(len(docs) / chunk_size)

for i in range(last_chunk, num_chunks):
    start_idx = i * chunk_size
    end_idx = min(start_idx + chunk_size, len(docs))
    # Log the half-open slice being uploaded (stray closing paren removed).
    logger.info(f'Adding chunk {i+1} of {num_chunks} ({start_idx} to {end_idx})')

    _docs = docs[start_idx:end_idx]
    # Pass a private copy of each metadata dict. The dicts for one page can
    # be the very same object repeated, and add_texts may write per-chunk
    # fields into the dict it is given, which would then overwrite each other.
    _metadata = [dict(m) for m in metadata[start_idx:end_idx]]

    store.add_texts(_docs, _metadata)

2023-03-26 17:21:12,696 - Adding chunk 1 of 10 (0 to 100))


Upserted vectors:   0%|          | 0/100 [00:00<?, ?it/s]

2023-03-26 17:21:38,225 - Adding chunk 2 of 10 (100 to 200))


Upserted vectors:   0%|          | 0/100 [00:00<?, ?it/s]

2023-03-26 17:21:58,889 - Adding chunk 3 of 10 (200 to 300))


Upserted vectors:   0%|          | 0/100 [00:00<?, ?it/s]

2023-03-26 17:22:17,041 - Adding chunk 4 of 10 (300 to 400))


Upserted vectors:   0%|          | 0/100 [00:00<?, ?it/s]

2023-03-26 17:22:39,901 - Adding chunk 5 of 10 (400 to 500))


Upserted vectors:   0%|          | 0/100 [00:00<?, ?it/s]

2023-03-26 17:22:58,653 - Adding chunk 6 of 10 (500 to 600))


Upserted vectors:   0%|          | 0/100 [00:00<?, ?it/s]

2023-03-26 17:23:19,250 - Adding chunk 7 of 10 (600 to 700))


Upserted vectors:   0%|          | 0/100 [00:00<?, ?it/s]

2023-03-26 17:23:40,646 - Adding chunk 8 of 10 (700 to 800))


Upserted vectors:   0%|          | 0/100 [00:00<?, ?it/s]

2023-03-26 17:23:58,679 - Adding chunk 9 of 10 (800 to 900))


Upserted vectors:   0%|          | 0/100 [00:00<?, ?it/s]

2023-03-26 17:24:18,642 - Adding chunk 10 of 10 (900 to 1000))


Upserted vectors:   0%|          | 0/100 [00:00<?, ?it/s]

In [317]:
# Re-check the index statistics now that the upload loop has run.
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 1000}},
 'total_vector_count': 1000}

In [325]:
# Use this instead of load_qa_with_sources_chain for more control
question = 'Question for eugeneyan.com: Why is writing important?'

# Retrieve from the Pinecone index this time rather than the local FAISS store.
store = Pinecone.from_existing_index(
    index_name=index_name,
    embedding=embeddings,
)

llm = OpenAI(temperature=0)
chain = RetrievalQAWithSourcesChain.from_chain_type(
    llm,
    chain_type='stuff',
    retriever=store.as_retriever(),
    return_source_documents=True,
)
response = chain(
    {'question': question},
    return_only_outputs=False,
)

In [326]:
# Display the full chain output for the Pinecone-backed query.
response

{'question': 'Question for eugeneyan.com: Why is writing important?',
 'answer': ' Writing is important because it helps to clarify thinking, further learning, and share ideas with others.\n',
 'sources': 'https://eugeneyan.com/writing/how-to-write-david-x-sahil/, https://eugeneyan.com/writing/informal-mentors-chip-huyen/, https://eugeneyan.com/writing/writing-and-coding/, https://eugeneyan.com/writing/reading-note-taking-writing/',
 'source_documents': [Document(page_content='“There’s no such thing as good writing, only good rewriting” - Robert Graves\nWrite evergreen content. Focus on topics that will always add value and be helpful. Perhaps a short essay on why and how to write. Such writing stays relevant for years. (Nonetheless, many people write such pieces, so you might not contribute much. But hey, you learn a lot by writing about it). So reconsider writing that 183,768th piece of COVID-19 visualisation/dashboarding.\nIf you found this post useful, share this viral tweet with y

In [338]:
# Markdown-formatted summary: question, answer and the cited sources.
pretty_qa = (
    f'**Question:** {response["question"]}\n'
    f'**Answer:** {response["answer"]}\n'
    f'**Sources:** {response["sources"]}\n\n'
)
result_list = [pretty_qa]

# One further entry per retrieved chunk: its text followed by its url.
for doc in response['source_documents']:
    pretty_source = (
        f'**Source:** {doc.page_content}\n'
        f'**URL:** {doc.metadata["source"]}\n\n'
    )
    result_list.append(pretty_source)

In [339]:
# `pretty_result` was not defined in any visible cell, so this line raised
# NameError on a fresh kernel. Build it from `result_list`, whose entries
# already end with a blank line, so they are joined without a separator.
pretty_result = ''.join(result_list)
print(pretty_result)

Question: Question for eugeneyan.com: Why is writing important?
Answer:  Writing is important because it helps to clarify thinking, further learning, and share ideas with others.

Sources: https://eugeneyan.com/writing/how-to-write-david-x-sahil/, https://eugeneyan.com/writing/informal-mentors-chip-huyen/, https://eugeneyan.com/writing/writing-and-coding/, https://eugeneyan.com/writing/reading-note-taking-writing/

Source: “There’s no such thing as good writing, only good rewriting” - Robert Graves
Write evergreen content. Focus on topics that will always add value and be helpful. Perhaps a short essay on why and how to write. Such writing stays relevant for years. (Nonetheless, many people write such pieces, so you might not contribute much. But hey, you learn a lot by writing about it). So reconsider writing that 183,768th piece of COVID-19 visualisation/dashboarding.
If you found this post useful, share this viral tweet with your friends. Spread the word on writing effectively. =)
1

In [327]:
# Log the question/answer pair and the sources string returned by the chain.
logger.info(f'Question: {response["question"]}')
logger.info(f'Answer: {response["answer"]}')
logger.info(f'Sources: {response["sources"]}')

# The sources arrive as one ', '-separated string; turn it into a set for
# membership checks.
sources = set(response['sources'].split(', '))

# Log only the retrieved chunks whose url appears among the cited sources.
for doc in response['source_documents']:
    if doc.metadata["source"] not in sources:
        continue
    logger.info(f'URL: {doc.metadata["source"]}\n')
    logger.info(f'Source: {doc.page_content}\n')
    logger.info('=====================')

2023-03-26 17:28:06,037 - Question: Question for eugeneyan.com: Why is writing important?
2023-03-26 17:28:06,038 - Answer:  Writing is important because it helps to clarify thinking, further learning, and share ideas with others.

2023-03-26 17:28:06,039 - Sources: https://eugeneyan.com/writing/how-to-write-david-x-sahil/, https://eugeneyan.com/writing/informal-mentors-chip-huyen/, https://eugeneyan.com/writing/writing-and-coding/, https://eugeneyan.com/writing/reading-note-taking-writing/
2023-03-26 17:28:06,040 - URL: https://eugeneyan.com/writing/how-to-write-david-x-sahil/

2023-03-26 17:28:06,040 - Source: “There’s no such thing as good writing, only good rewriting” - Robert Graves
Write evergreen content. Focus on topics that will always add value and be helpful. Perhaps a short essay on why and how to write. Such writing stays relevant for years. (Nonetheless, many people write such pieces, so you might not contribute much. But hey, you learn a lot by writing about it). So reco