## Set-Up

In [1]:
import os
import pinecone
import pandas as pd
import openai

from bs4 import BeautifulSoup
from tqdm.auto import tqdm
from dotenv import find_dotenv, load_dotenv

  from tqdm.autonotebook import tqdm


In [2]:
# Model identifiers passed to the OpenAI embedding and chat-completion calls below.
EMBEDDING_MODEL = "text-embedding-ada-002"
COMPLETION_MODEL = "gpt-3.5-turbo"

## Source the Data 

In [7]:
def load_data(fp):
    """Return the paths of the HTML files located directly inside ``fp``.

    Args:
        fp: Path of the directory to scan. Sub-directories are not searched.

    Returns:
        A list of paths (``fp`` joined with the file name) for every entry
        whose name ends in ``.html``, sorted by file name.

    Raises:
        OSError: Propagated from ``os.listdir`` when ``fp`` cannot be listed
            (for example, it does not exist or is not a directory).
    """
    # os.listdir returns entries in arbitrary order; sorting keeps the page
    # order (and therefore anything numbered from it) stable between runs.
    names = sorted(os.listdir(fp))

    # Match on the suffix so names like 'page.html.bak' are not picked up.
    return [os.path.join(fp, name) for name in names if name.endswith('.html')]

# Directories that are scanned for HTML pages.
backend_path = 'data/backend_html'
frontend_path = 'data/frontend_html'

# One flat list: every backend page first, followed by every frontend page.
all_pages = [
    page_path
    for directory in (backend_path, frontend_path)
    for page_path in load_data(directory)
]

## Scrape Information 

In [8]:
# Collect the text of every <p> element found in the pages.
text = []     # paragraph texts, in the order they are encountered
source = {}   # paragraph text -> path of the page it was read from

for page in all_pages:
    # Open in binary mode so BeautifulSoup detects the document's encoding
    # itself instead of decoding with the platform's default text codec.
    with open(page, 'rb') as s:
        soup = BeautifulSoup(s, 'html.parser')

    ps = soup.find_all('p')
    for p in ps:
        # Skip empty / whitespace-only paragraphs: they hold nothing worth
        # retrieving, and the embeddings endpoint rejects empty strings.
        if not p.text.strip():
            continue

        text.append(p.text)
        # When the same text occurs on several pages, the last page seen wins.
        source[p.text] = page

## Create Embeddings & Index

In [6]:
# Load variables from a .env file (when one is found) into the process environment.
load_dotenv(find_dotenv())

openai.api_key = os.environ.get('OPENAI_API_KEY')
pinecone.init(api_key = os.environ.get('PINECONE_API_KEY'), environment = os.environ.get('PINECONE_ENV'))

# text-embedding-ada-002 returns 1536-dimensional vectors. The index dimension
# must match the vector length, otherwise upserts and queries are rejected.
EMBEDDING_DIMENSION = 1536

index_name = 'quads'
# Only create the index when it is missing. An index that already exists with
# a different dimension is NOT changed here and has to be recreated manually.
if index_name not in pinecone.list_indexes():
    pinecone.create_index(index_name, dimension=EMBEDDING_DIMENSION)

index = pinecone.Index(index_name)

In [9]:
# Number of paragraphs embedded and upserted per request.
batch_size = 32

for i in tqdm(range(0, len(text), batch_size)):

    # set end position of batch (the final batch may be shorter)
    i_end = min(i + batch_size, len(text))

    # get batch of lines and IDs; each ID is the line's position in `text`
    lines_batch = text[i:i_end]
    ids_batch = [str(n) for n in range(i, i_end)]

    # create embeddings (`model=` is the same keyword the query-time call in get_answer uses)
    res = openai.Embedding.create(input=lines_batch, model=EMBEDDING_MODEL)
    embeds = [record['embedding'] for record in res['data']]

    # prep metadata: the original paragraph is stored so it can be read back at query time
    meta = [{'text': line} for line in lines_batch]
    to_upsert = list(zip(ids_batch, embeds, meta))

    # upsert to Pinecone
    index.upsert(vectors=to_upsert)

  0%|          | 0/38 [00:00<?, ?it/s]

## Query with Natural Language

In [10]:
# Read the text that get_answer sends as the 'system' message.
# The encoding is given explicitly so the result does not depend on the
# platform's default codec (assumes the file is UTF-8/ASCII — TODO confirm).
with open('prompts/rules.txt', encoding='utf-8') as r:
    rules = r.read()

In [15]:
def get_answer(question, top_k=6, min_score=0.75):
    """Answer ``question`` from the indexed paragraphs and print the result.

    The question is embedded with ``EMBEDDING_MODEL``, the ``top_k`` closest
    entries are requested from the Pinecone ``index``, and the stored text of
    every match scoring at least ``min_score`` is passed to
    ``COMPLETION_MODEL`` (with ``rules`` as the system message) to be merged
    into one answer.

    Args:
        question: The natural-language question to answer.
        top_k: Number of matches to request from the index.
        min_score: Lowest match score for a paragraph to be used.

    Returns:
        None. The generated answer and the paragraphs given to the model are
        printed.
    """
    xq = openai.Embedding.create(model=EMBEDDING_MODEL, input=question)['data'][0]['embedding']
    answers = index.query([xq], top_k=top_k, include_metadata=True)

    # Keep the matches that clear the threshold and number them from 0, in
    # the order the index returned them.
    kept = [
        match['metadata']['text']
        for match in answers['matches']
        if match['score'] >= min_score
    ]
    plausible = ''.join(f'{n}. {passage}\n\n' for n, passage in enumerate(kept))

    # NOTE(review): when no match reaches min_score, `plausible` is empty and
    # the model is still asked to combine answers — confirm that
    # prompts/rules.txt covers this case.
    prompt = (
        'You are given the following question: ' + question + '\n'
        + 'A crash-course on development gives the following possible answers: ' + plausible + '\n'
        + 'Combine the plausible answers to be a coherent and grammatically-correct answer.'
    )

    chatgpt = openai.ChatCompletion.create(
        model=COMPLETION_MODEL,
        messages=[
            {'role': 'system', 'content': rules},
            {'role': 'user', 'content': prompt},
        ],
    )

    cleaned = chatgpt['choices'][0]['message']['content'].strip()

    print('The model finds that: \n' + cleaned)
    print('---------------------- \n\n')
    print('Some of the relevant sources include: \n' + plausible)

In [16]:
# Read the question interactively; this cell waits for typed input, so it pauses a "Run All".
question = input()

What are some JavaScript best practices?


In [17]:
# Embed the question, fetch the matching paragraphs and print the generated answer.
get_answer(question)

The model finds that: 
Some best practices in JavaScript include staying consistent with your coding style, whether it be naming variables or defining functions. It is also advisable to check for existing libraries before starting work on a complicated feature, but if it is not too difficult to implement yourself, it's usually better to not use a library. Another tip is to use newer code features that are widely supported, even though it may require additional effort to maintain.
---------------------- 


Some of the relevant sources include: 
0. Below are some general opinions for writing cleaner Javascript. They are not absolute, and you definitely may have some reasons to not follow some if not all of them. Otherwise, the only thing more perfect than perfect is consistency.

1. Javascript tends to have multiple ways of doing things. Defining functions, variables, and more. Usually there are good reasons to pick one over another, but the main thing to pay attention to is to stay cons