## Set-Up

In [53]:
import os
import pinecone
import pandas as pd
import openai
import warnings

from bs4 import BeautifulSoup
from tqdm.autonotebook import tqdm
from dotenv import find_dotenv, load_dotenv

warnings.filterwarnings('ignore')

In [54]:
# OpenAI model names: the first is passed to openai.Embedding.create, the
# second to openai.ChatCompletion.create.
EMBEDDING_MODEL = 'text-embedding-ada-002'
COMPLETION_MODEL = 'gpt-3.5-turbo'

## Source the Data 

In [55]:
def load_data(fp):
    """Return the paths of the HTML files located directly inside a directory.

    Parameters
    ----------
    fp : str
        Directory to scan. Sub-directories are not searched.

    Returns
    -------
    list of str
        ``fp + '/' + name`` for every entry whose name ends with ``.html``,
        sorted by name so repeated runs see the pages in the same order.

    Raises
    ------
    OSError
        Propagated from ``os.listdir`` when ``fp`` cannot be listed.
    """
    # os.listdir returns entries in arbitrary order; sort for reproducibility.
    # Match on the suffix so names that merely contain '.html' (for example
    # 'page.html.bak') are not picked up.
    # The literal '/' separator is kept because later code looks for '_html/'
    # inside these paths.
    return [fp + '/' + file for file in sorted(os.listdir(fp)) if file.endswith('.html')]

# Directories that are scanned for .html pages.
backend_path = '../data/backend_html'
frontend_path = '../data/frontend_html'

# One flat list: pages from backend_path first, then those from frontend_path.
all_pages = [*load_data(backend_path), *load_data(frontend_path)]

## Scrape Information 

In [56]:
text = []    # paragraph texts in page order; these are what gets embedded
source = {}  # paragraph text -> name of the page file it was read from

for page in all_pages:
    # Open in binary mode so BeautifulSoup detects the document's encoding
    # itself rather than depending on the platform's default text encoding.
    with open(page, 'rb') as s:
        soup = BeautifulSoup(s, 'html.parser')

    # Part of the path after '_html/' (6 characters long). It is the same for
    # every paragraph of the page, so it is computed once per page.
    marker = page.index('_html/') + 6
    page_name = page[marker:]

    for p in soup.find_all('p'):
        paragraph = p.text

        # Skip empty / whitespace-only paragraphs: they add nothing to the
        # index, and the embeddings endpoint does not accept empty strings.
        if not paragraph.strip():
            continue

        text.append(paragraph)

        # NOTE: identical paragraphs on different pages share one key, so the
        # page processed last is the one recorded.
        source[paragraph] = page_name

## Create Embeddings & Index

In [57]:
# Load variables from a .env file (located by find_dotenv) into os.environ
# before any of the keys below are read.
load_dotenv(find_dotenv())

openai.api_key = os.environ.get('OPENAI_API_KEY')
pinecone.init(
    api_key=os.environ.get('PINECONE_API_KEY'),
    environment=os.environ.get('PINECONE_ENV'),
)

# Rebuild the index from scratch on every run: an existing index with this
# name is deleted (together with its contents) before a new one is created.
index_name = 'quads'
if index_name in pinecone.list_indexes():
    pinecone.delete_index(index_name)

# The dimension must equal the length of the vectors stored in the index.
pinecone.create_index(index_name, dimension=1536)
index = pinecone.Index(index_name)

In [58]:
batch_size = 32  # number of paragraphs embedded and upserted per request

for i in tqdm(range(0, len(text), batch_size)):

    # set end position of batch (the last batch may be shorter)
    i_end = min(i + batch_size, len(text))

    # get batch of lines and IDs; an ID is the line's position in `text`
    lines_batch = text[i:i_end]
    ids_batch = [str(n) for n in range(i, i_end)]

    # create embeddings (`model=` is the same keyword get_answer uses for
    # the query-time embedding)
    res = openai.Embedding.create(input=lines_batch, model=EMBEDDING_MODEL)
    embeds = [record['embedding'] for record in res['data']]

    # keep each line's text as metadata so a query match can return it
    meta = [{'text': line} for line in lines_batch]

    # upsert (id, vector, metadata) tuples to Pinecone
    index.upsert(vectors=list(zip(ids_batch, embeds, meta)))

  0%|          | 0/38 [00:00<?, ?it/s]

## Query with Natural Language

In [59]:
# Text used as the 'system' message of the chat completion in get_answer.
with open('../prompts/rules.txt') as rules_file:
    rules = rules_file.read()

In [61]:
def get_answer(question, top_k=6, min_score=0.75):
    """Answer a question from the indexed paragraphs and print the result.

    The question is embedded, the nearest paragraphs are queried from the
    Pinecone index, and those whose score is at least ``min_score`` are sent
    to the chat model to be combined into a single answer. The answer and the
    pages the kept paragraphs came from are printed.

    Parameters
    ----------
    question : str
        The question to answer.
    top_k : int, optional
        Number of matches requested from the index (default 6).
    min_score : float, optional
        Matches with a score below this value are discarded (default 0.75).

    Returns
    -------
    None
        The result is printed, not returned.

    Notes
    -----
    Uses the notebook-level names ``index``, ``source``, ``rules``,
    ``EMBEDDING_MODEL`` and ``COMPLETION_MODEL``.
    """
    xq = openai.Embedding.create(model=EMBEDDING_MODEL, input=question)['data'][0]['embedding']
    answers = index.query([xq], top_k=top_k, include_metadata=True)

    # Paragraph texts of the matches that clear the score threshold, in the
    # order the index returned them.
    kept = [
        match['metadata']['text']
        for match in answers['matches']
        if match['score'] >= min_score
    ]

    # Numbered list ("0. ...", "1. ...") that is embedded in the prompt.
    plausible = ''.join(
        str(i) + '. ' + passage + '\n\n' for i, passage in enumerate(kept)
    )

    # De-duplicate the source pages while keeping the order in which they
    # were first seen, so the printed list is the same on every run (a set
    # has no defined iteration order).
    source_list = list(dict.fromkeys(source[passage] for passage in kept))

    header = 'You are given the following question: ' + question + '\n'
    body = 'A crash-course on development gives the following possible answers: ' + plausible + '\n'
    footer = 'Combine the plausible answers to be a coherent and grammatically-correct answer.'

    prompt = header + body + footer

    chatgpt = openai.ChatCompletion.create(
        model=COMPLETION_MODEL,
        messages=[
            {'role': 'system', 'content': rules},
            {'role': 'user', 'content': prompt},
        ],
    )

    cleaned = chatgpt['choices'][0]['message']['content'].strip()

    print('The model finds that: \n' + cleaned)
    print('---------------------- \n\n')
    print('SOURCES:')
    for elem in source_list:
        print(elem)

In [62]:
# Reads the question from an interactive prompt; the cell blocks until a line
# is entered, so a "Run All" stops here waiting for input.
question = input()

What is React?


In [63]:
# Prints the generated answer followed by the list of source pages.
get_answer(question)

The model finds that: 
React is a JavaScript library that is widely used for building user interfaces. It was created and is maintained by Facebook and is popular for creating reusable UI components for web applications. Despite not being the most performant, it is relatively older, more mature, has a large and robust community, and is the most used JS library/framework. It is based on the SPA (single page application) model, where the webpage starts with a blank HTML template and relies entirely on JavaScript to manage the application's state and create/manage all necessary HTML elements. As a result, React uses JSX, a sort-of JavaScript version of HTML, inside JavaScript files to dictate the webpage's structure. React has a "virtual DOM" in the background that keeps track of how the page should look, changing the actual page to apply changes when necessary.
---------------------- 


SOURCES:
HW Adding ESLint f43800236d794c108cbe8507b3b482bd.html
Connecting to React a78ea8448d6a4a1f94