# Azure OpenAI Service - Q&A with semantic answering Quickstart app

This notebook helps you build a simple Q&A demo application by following these steps:

1. Data preparation - you will need to adapt this code to have it work with your data
1. Embedding creation - this will mostly work out of the box
1. Prompt creation - this will mostly work out of the box, but you could adapt this a little bit
1. App creation - this will mostly work out of the box, but you can make changes if needed

First, create a file called `.env` in this folder and add the following content, replacing the placeholders with your own values:

```
OPENAI_API_KEY=xxxxxx
OPENAI_API_BASE=https://xxxxxxx.openai.azure.com/
```

In [None]:
!pip install -r requirements.txt

In [None]:
import os
import json
import tiktoken
import openai
import numpy as np
import pickle
from dotenv import load_dotenv
from openai.embeddings_utils import cosine_similarity
from tenacity import retry, wait_random_exponential, stop_after_attempt

# Read the settings from the local .env file into the process environment
load_dotenv()

# Point the openai client at the Azure OpenAI resource
openai.api_key = os.getenv("OPENAI_API_KEY")
openai.api_base = os.getenv("OPENAI_API_BASE")
openai.api_type = "azure"
openai.api_version = "2022-12-01"

# Model names (passed as `engine` to the API calls) and tokenizer settings
COMPLETION_MODEL = "text-davinci-003"
EMBEDDING_MODEL = "text-embedding-ada-002"
EMBEDDING_ENCODING = "cl100k_base"
EMBEDDING_CHUNK_SIZE = 8000

# Tokenizer used below to count the tokens of each document
encoding = tiktoken.get_encoding(EMBEDDING_ENCODING)

## Data preparation

Adapt this code to read in your data. The output should be a Python list of dicts, each containing the keys `filename` and `content`.

In [None]:
# Folder that holds the source documents, one plain-text file per document
data_dir = os.path.join(os.getcwd(), "data/")

# Keep regular files only (a sub-directory would make open() fail) and sort
# the names so the document order is the same on every run
files = sorted(
    name for name in os.listdir(data_dir)
    if os.path.isfile(os.path.join(data_dir, name))
)

# read content from each file and append it to documents
documents = []
for file in files:
    # Decode explicitly as UTF-8 instead of relying on the platform default
    # encoding (assumes the data files are UTF-8 - adjust if yours are not)
    with open(os.path.join(data_dir, file), "r", encoding="utf-8") as f:
        documents.append({
            "filename": file,
            "content": f.read(),
        })

# print some stats about the documents
print(f"Loaded {len(documents)} documents")
for doc in documents:
    num_tokens = len(encoding.encode(doc['content']))
    print(f"Filename: {doc['filename']} Content: {doc['content'][:80]}... \n---> Tokens: {num_tokens}\n")

Let's create the function to embed a single document:

In [None]:
@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
def get_embedding(text):
    """Return the embedding vector for ``text``.

    Newlines are replaced by spaces and every run of spaces is collapsed to a
    single space before the text is sent to the embeddings endpoint. The call
    is retried with random exponential back-off, up to 6 attempts.

    Args:
        text: The string to embed.

    Returns:
        The value stored under ``["data"][0]["embedding"]`` in the response.
    """
    text = text.replace("\n", " ")
    # A single replace("  ", " ") only halves each run of spaces, so repeat
    # until no double space is left (e.g. after several consecutive newlines)
    while "  " in text:
        text = text.replace("  ", " ")
    return openai.Embedding.create(input=text, engine=EMBEDDING_MODEL)["data"][0]["embedding"]

In [None]:
# Create embeddings for all docs
for doc in documents:
    doc['embedding'] = get_embedding(doc['content'])
    print(f"Created embedding for {doc['filename']}")

# Save documents to disk; the context manager closes (and thereby flushes)
# the file even if pickling fails, instead of leaving the handle open
with open("documents.pkl", "wb") as f:
    pickle.dump(documents, f)

Now let's write our app template, feel free to change the title, etc.:

In [None]:
%%writefile ./app.py

import os
import openai
import streamlit as st
import tiktoken
import numpy as np
import pickle
from openai.embeddings_utils import cosine_similarity
from dotenv import load_dotenv
from tenacity import retry, wait_random_exponential, stop_after_attempt

# Load environment variables
load_dotenv()

# Configure OpenAI API
openai.api_type = "azure"
openai.api_version = "2022-12-01"
openai.api_base = os.getenv('OPENAI_API_BASE')
openai.api_key = os.getenv("OPENAI_API_KEY")

# Model names passed as `engine` to the API calls
COMPLETION_MODEL = 'text-davinci-003'
EMBEDDING_MODEL = 'text-embedding-ada-002'

# Load the documents together with their precomputed embeddings.
# NOTE: unpickling can execute arbitrary code - only load a documents.pkl
# that you created yourself. The `with` block closes the file after loading.
with open("documents.pkl", "rb") as documents_file:
    documents = pickle.load(documents_file)

@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
def get_embedding(text):
    """Return the embedding vector for ``text``.

    Newlines are replaced by spaces and every run of spaces is collapsed to a
    single space before the text is sent to the embeddings endpoint. The call
    is retried with random exponential back-off, up to 6 attempts.

    Args:
        text: The string to embed.

    Returns:
        The value stored under ``["data"][0]["embedding"]`` in the response.
    """
    text = text.replace("\n", " ")
    # A single replace("  ", " ") only halves each run of spaces, so repeat
    # until no double space is left (e.g. after several consecutive newlines)
    while "  " in text:
        text = text.replace("  ", " ")
    return openai.Embedding.create(input=text, engine=EMBEDDING_MODEL)["data"][0]["embedding"]

@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(10))
def run_prompt(prompt, max_tokens=1000):
    """Send ``prompt`` to the completion model and return the generated text.

    The call is retried with random exponential back-off, up to 10 attempts.

    Args:
        prompt: The full prompt text.
        max_tokens: Value forwarded as ``max_tokens`` to the completion call.

    Returns:
        The ``text`` field of the first choice in the response.
    """
    request = {
        "engine": COMPLETION_MODEL,
        "prompt": prompt,
        "temperature": 0.7,
        "max_tokens": max_tokens,
    }
    completion = openai.Completion.create(**request)
    first_choice = completion['choices'][0]
    return first_choice['text']

# configure UI elements with Streamlit

st.title('Demo app')
question = st.text_input('Question')
answer_button = st.button('Generate answer')

if answer_button:
    # Without a question or without documents there is nothing to answer, so
    # end this script run before spending any API calls
    if not question.strip():
        st.warning('Please enter a question first.')
        st.stop()
    if not documents:
        st.error('No documents available - create documents.pkl with the notebook first.')
        st.stop()

    # Step 1: let the completion model reduce the user input to a search query
    question_prompt = f"""You extract search queries from prompts and remove all styling options or other things (e.g., the formatting the user asks for). You do not answer the question.
Prompt: {question}\n
Query:"""
    # The completion may carry surrounding whitespace; if nothing usable comes
    # back, search with the original question instead of embedding an empty string
    actual_question = run_prompt(question_prompt, max_tokens=100).strip() or question

    # Step 2: pick the document whose embedding is most similar to the query
    qe = get_embedding(actual_question)
    similarities = [cosine_similarity(qe, doc['embedding']) for doc in documents]
    max_i = np.argmax(similarities)

    st.write(f"Searching for: {actual_question} --> found answer in document: {documents[max_i]['filename']}")

    # Step 3: answer the original question using only the selected document
    prompt = f"""
    Content:
    {documents[max_i]['content']}
    Please answer the question below using only the content from above. If you don't know the answer or can't find it, say "I couldn't find the answer".
    Question: {question}
    Answer:"""
    answer = run_prompt(prompt)

    st.write('Answer:')
    st.write(answer)