# PyTorch-Assistant

Given a query, we:

a) find `k` context documents most likely to contain the answer (k-nearest-neighbors)

b) engineer a prompt with these `k` documents along with 2 generic examples of Q&A (see [ref](https://github.com/hwchase17/langchain/blob/0b204d8c2134cc488c333e86e0440977bdacf216/langchain/chains/qa_with_sources/stuff_prompt.py#L4))

c) hope GPT infers the right answer from the given context

d) verify GPT's answer by cross-checking the cited context document

We use `langchain`, an OSS library that has helper functions for the steps above. 

In [1]:
# from langchain import llms 
# dir(llms)

In [3]:
import os
import requests
import yaml
import time
import pickle
import tempfile
import subprocess
import pathlib
from bs4 import BeautifulSoup as BSHTML
from requests.models import JSONDecodeError
from dotenv import load_dotenv, find_dotenv

from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.text_splitter import MarkdownTextSplitter
from langchain.vectorstores.faiss import FAISS
from langchain.chains.qa_with_sources import load_qa_with_sources_chain
from langchain.llms import OpenAI, GPT4All

load_dotenv()

True

## BYO API Key

Get one here https://beta.openai.com/account/api-keys

In [4]:
# EMBED = "openai"
# embeddings = OpenAIEmbeddings()
EMBED = "hg"
embeddings = HuggingFaceEmbeddings()

## Get Data

### Chunking
Each page is chunked into smaller sub-pages. This is a workaround for LLM prompt size limits.

In [5]:
base_folder = "knowledgebase"
vector_folder = f"vectorstore/{EMBED.lower()}_embeddings"

In [5]:
def preprocess_and_pickle(page_iter, src_name):
    docs = []
    splitter = MarkdownTextSplitter(chunk_size=1024, chunk_overlap=0)
    for page in page_iter:
        docs.extend((splitter.create_documents([page['text']], [page['metadata']])))
    pickle.dump(docs, open(f'{base_folder}/{src_name}.pkl', 'wb'))

### blogs

In [9]:
def get_blogs(repo_owner='pytorch', repo_name='pytorch.github.io'):
    with tempfile.TemporaryDirectory() as d:
#         subprocess.check_call(
#             f"git clone --depth 1 https://github.com/{repo_owner}/{repo_name}.git .",
#             cwd=d,
#             shell=True,
#         )
#         git_sha = (
#             subprocess.check_output("git rev-parse HEAD", shell=True, cwd=d)
#             .decode("utf-8")
#             .strip()
#         )
        repo_path = pathlib.Path("/home/ubuntu/Repositories/fb/langchain/pytorch.github.io")
        markdown_files = list(repo_path.glob("_posts/*.md"))
        for markdown_file in markdown_files:
            with open(markdown_file, "r") as f:
                filename = markdown_file.parts[-1]
                title = os.path.splitext('-'.join(filename.split('-')[3:]))[0]
                blog_url = f"https://pytorch.org/blog/{title}/"
                yield {'text': f.read(), 'metadata': {"source": blog_url}}

# if you're in a hurry
# !curl "https://pytorch-qabot.s3.amazonaws.com/knowledgebase/blogs.pkl" --create-dirs -O --output-dir ./knowledgebase

In [10]:
preprocess_and_pickle(get_blogs(), 'blogs')


### forum

In [None]:
# def get_forum(period='weekly'):
#     host = "https://discuss.pytorch.org"

#     def _get_accepted_topics(period, page=0, dst=[]):
#         resp = requests.get(host+f'/top.json?page={page}&period={period}&per_page=100').json()
#         dst.extend([(d['id'], d['title']) for d in resp['topic_list']['topics'] if d['has_accepted_answer'] is True])
#         if 'more_topics_url' in resp['topic_list'].keys():
#             page += 1
#             _get_accepted_topics(period=period, page=page, dst=dst)
#         return dst

#     def _process_cooked(cooked):
#         bs = BSHTML(cooked)
#         p = ' '.join([x.get_text() for x in bs.find_all('p')])
#         return p

#     solved_topics = _get_accepted_topics(period)
#     for t, title in solved_topics:
#         try:
#             r = requests.get(host+f'/t/{t}/posts.json').json()
#         except JSONDecodeError:
#             continue
#         try:
#             q = title + '? ' + _process_cooked(r['post_stream']['posts'][0]['cooked'])
#             a = _process_cooked([x['cooked'] for x in r['post_stream']['posts'] if x['accepted_answer'] is True][0])
#         except IndexError:
#             print(f"Skipping https://discuss.pytorch.org/t/{t}/")
#             continue
#         text = "QUESTION: " + q + ' ANSWER: ' + a
#         yield {'text': text, 'metadata': {'source': f"https://discuss.pytorch.org/t/{t}/"}}

# preprocess_and_pickle(get_forum(), 'forum')

# # if you're in a hurry
# # !curl "https://pytorch-qabot.s3.amazonaws.com/knowledgebase/forum.pkl" --create-dirs -O --output-dir ./knowledgebase

### pytorch/docs/stable/*.html

In [None]:
# def get_docs(repo_owner='pytorch', repo_name='pytorch'):
#     with tempfile.TemporaryDirectory() as d:
#         subprocess.check_call(
#             f"git clone --depth 1 https://github.com/{repo_owner}/{repo_name}.git .",
#             cwd=d,
#             shell=True,
#         )
#         repo_path = pathlib.Path(d + '/docs/source')
#         # repo_path = pathlib.Path('pytorch/docs/source')
#         markdown_files = list(repo_path.glob("**/*.rst"))
#         for markdown_file in markdown_files:
#             relative_path = markdown_file.relative_to(repo_path)
#             if '_' in markdown_file.name:
#                 continue
#             with open(markdown_file, "r") as f:
#                 i = markdown_file.parts.index('source')
#                 filename = os.path.splitext(relative_path)[0]
#                 page_url = f"https://pytorch.org/docs/stable/{filename}.html"
#                 yield {'text': f.read(), 'metadata': {"source": page_url}, "file":relative_path}

# preprocess_and_pickle(get_docs(), 'docs')


# # if you're in a hurry
# # !curl "https://pytorch-qabot.s3.amazonaws.com/knowledgebase/docs.pkl" --create-dirs -O --output-dir ./knowledgebase

## Create vector DB

In [9]:
from openai.error import RateLimitError

def create_vectorstores():
    pathlib.Path(vector_folder).mkdir(parents=True, exist_ok=True)
    for pages_path in os.listdir(base_folder):
        source = os.path.splitext(pages_path)[0]
        out_path = f"vectorstore/{EMBED.lower()}_embeddings/{source}.pkl"
        if os.path.exists(out_path):
            continue
            
        pages = pickle.load(open(os.path.join(base_folder, pages_path), 'rb'))
        docsearch = FAISS.from_documents([pages.pop(0)], embeddings)
        i, step = 0, 30
        while i<len(pages):
            texts = [d.page_content for d in pages[i:i+step]]
            meta = [d.metadata for d in pages[i:i+step]]
            try:
                docsearch.add_texts(texts, meta)
                i += step
            except Exception as err:
                print("Hit RateLimit @ i=", i, err)
                time.sleep(60)
        pickle.dump(docsearch, open(out_path, "wb"))

create_vectorstores()

# if you're in a hurry and on an arm64 
# !curl "https://pytorch-qabot.s3.amazonaws.com/vectorstore/openai_embeddings/blogs.pkl" --create-dirs -O --output-dir ./vectorstore/openai_embeddings
# !curl "https://pytorch-qabot.s3.amazonaws.com/vectorstore/openai_embeddings/docs.pkl" --create-dirs -O --output-dir ./vectorstore/openai_embeddings
# !curl "https://pytorch-qabot.s3.amazonaws.com/vectorstore/openai_embeddings/forum.pkl" --create-dirs -O --output-dir ./vectorstore/openai_embeddings

## Run it

In [1]:
%%bash
pip install pyllamacpp





In [8]:
import pyllamacpp

In [7]:
# qa_chain formats the prompt with context docs, passes it to the LLM and returns the answer

# base_llm = OpenAI(temperature=0.2)
base_llm = GPT4All()
qa_chain = load_qa_with_sources_chain(base_llm, chain_type="stuff")

KeyError: 'model'

In [6]:
queries = [
    "How to properly save embeddings?",
    "How to correctly access attribute of a custom layer inherited nn.Sequential class?",
    "How to assign a tensor to another tensor at different rows and columns?",
    "How to Recursively transforming Pytorch code to JIT script?",
    "Is current pytorch 2.0 version stable?"
]

for query in queries:
    print("QUERY: ", query)
    for vectordb in os.listdir('vectorstore/openai_embeddings'):
        source = os.path.splitext(vectordb)[0]
        vectordb = f'vectorstore/{EMBED}_embeddings/{vectordb}'
        db = pickle.load(open(vectordb, 'rb'))
        relevant_docs = db.similarity_search(query, k=4)
        print("From ", source)
        print(qa_chain.run(input_documents=relevant_docs, question=query))
        print("------")
    print("\n============\n")

QUERY:  How to properly save embeddings?
From  blogs


NameError: name 'qa_chain' is not defined