# Questions and Answers

In [None]:
import openai, os, glob, pandas as pd, numpy as np
from typing import List
import logging
# Send log records of level ERROR and above to the file qa.log (UTF-8 encoded).
# NOTE: the `encoding` argument of basicConfig requires Python 3.9 or newer.
logging.basicConfig(filename='qa.log', encoding='utf-8', level=logging.ERROR)

In [None]:
# Prefer the OPENAI_API_KEY environment variable so the secret does not have to
# be pasted into (and saved with) the notebook. When the variable is not set,
# fall back to the placeholder, which must then be replaced by hand.
openai.api_key = os.environ.get('OPENAI_API_KEY', 'ENTER KEY HERE')
# Directory path containing text files
TEXTDIR = "btext/"
# File name for saving embeddings
EMBEDDINGSF = "embeddings.csv"

In [None]:
def get_all_text_to_df(dir: str) -> pd.DataFrame:
    """Load every ``*.txt`` file found directly in `dir` into a DataFrame.

    Each file becomes one row. Newlines (real ones and literal backslash-n
    sequences) are replaced by spaces, and runs of spaces are then collapsed
    to a single space.

    Args:
        dir: Directory to search (not recursive).

    Returns:
        A DataFrame with columns ``title`` (the file's base name) and ``text``
        (the cleaned contents), ordered by file path. It has no rows when the
        directory contains no ``.txt`` files.
    """
    rows = []
    # Use the `dir` argument (not a module-level constant) and sort the matches:
    # glob returns files in a filesystem-dependent order.
    for path in sorted(glob.glob(os.path.join(dir, "*.txt"))):
        with open(path, "r", encoding="utf-8") as f:
            text = f.read().replace('\n', ' ').replace("\\n", " ")
        # One replace('  ', ' ') pass only halves a run of spaces; repeat until
        # no double space is left.
        while '  ' in text:
            text = text.replace('  ', ' ')
        # basename handles both '/' and the platform's own path separator
        rows.append((os.path.basename(path), text))
    return pd.DataFrame(rows, columns=['title', 'text'])

In [None]:
# Load every text file found in TEXTDIR into a DataFrame
df = get_all_text_to_df(TEXTDIR)

# Keep a CSV copy on disk when the DataFrame is large (more than 100 MB in memory)
memory_usage = df.memory_usage(deep=True).sum() / (1024 * 1024)  # bytes -> megabytes
if memory_usage > 100:
    print(f"Memory usage: {memory_usage:.2f} MB")
    df.to_csv('pdqana.csv')

[tiktoken](https://pub.dev/documentation/tiktoken/latest/) is a byte pair encoding (BPE) tokenizer.

In [None]:
import tiktoken

# cl100k_base is the encoding designed to work with the ada-002 model
tokenizer = tiktoken.get_encoding("cl100k_base")

# Store each document's token count in a new 'n_tokens' column
df['n_tokens'] = df['text'].map(lambda text: len(tokenizer.encode(text)))

print(f"The largest document has {df['n_tokens'].max()} tokens, the shortest {df['n_tokens'].min()}, the average has {int(df['n_tokens'].mean())}.")

In [None]:
# Histogram of token counts, one count per row (document)
df['n_tokens'].hist();

The current documents are too long to handle, so we need to split them into smaller documents.
`split_text_into_chunks` splits the text of a document `doc` into sentences using the `nltk.sent_tokenize()` function and then creates `chunks` by concatenating sentences until the chunk size limit is reached. The resulting chunks are returned as a list.

In [None]:
import nltk
# Make sure the punkt tokenizer data is installed, downloading it if needed.
# nltk.data.find() raises LookupError when the resource is missing (it never
# returns a non-existent path), so absence has to be detected via the exception.
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

In [None]:
def split_text_into_chunks(doc:str, max_tokens:int =500) -> List[str]:
    """Split `doc` into chunks made of whole sentences.

    Sentences are obtained with ``nltk.sent_tokenize`` and packed greedily, in
    order, into chunks of at most `max_tokens` whitespace-separated words.
    The size is measured in words, not in tokenizer tokens. Sentences are never
    split, so a single sentence longer than `max_tokens` becomes a chunk of its
    own that exceeds the limit.

    Args:
        doc: The text to split.
        max_tokens: Maximum number of words per chunk.

    Returns:
        The list of chunk strings, in document order. An empty list is returned
        when `doc` contains no sentences, or when an unexpected error occurs
        while grouping (the error is logged).
    """
    sentences = nltk.sent_tokenize(doc)

    chunks = []         # finished chunks
    current = []        # sentences of the chunk being built
    current_words = 0   # running word count of `current` (avoids re-splitting it)
    try:
        for sentence in sentences:
            n_words = len(sentence.split())
            # Close the current chunk when this sentence would overflow it.
            # The `current` check avoids emitting an empty chunk when the very
            # first sentence is already longer than max_tokens.
            if current and current_words + n_words > max_tokens:
                chunks.append(" ".join(current).strip())
                current, current_words = [], 0
            current.append(sentence)
            current_words += n_words

        # Add the last chunk
        if current:
            chunks.append(" ".join(current).strip())
        return chunks
    except Exception as e:
        logging.error(f"Error splitting text into chunks: {e}")
        return []

In [None]:
def create_shortened(df, max_tokens:int = 500) -> list:
    """Return a flat list of texts built from the rows of `df`.

    A row whose 'n_tokens' value is greater than `max_tokens` contributes the
    chunks produced by split_text_into_chunks() for its 'text'; any other row
    contributes its 'text' unchanged. Row order is preserved.
    """
    pieces = []
    for _, row in df.iterrows():
        if row['n_tokens'] > max_tokens:
            pieces.extend(split_text_into_chunks(row['text'], max_tokens))
        else:
            pieces.append(row['text'])
    return pieces

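Create a shortened list and load it into a new df, then apply `tokenizer.encode` and save the number of tokens of each row in a new `n_tokens` column. 
Notice that the BPE `tokenizer.encode()` typically produces more tokens than the whitespace-separated word count that `split_text_into_chunks` uses as its limit, so a chunk can contain more than `max_tokens` BPE tokens.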

In [None]:
# Break long documents into shorter pieces and rebuild df with one piece per row
sh = create_shortened(df, max_tokens=400)
df = pd.DataFrame({'text': sh})
# Recount tokens for the new, shorter rows
df['n_tokens'] = df['text'].map(lambda piece: len(tokenizer.encode(piece)))

In [None]:
# Summary of the token counts after splitting, followed by their histogram
print(f"The largest document has {df['n_tokens'].max()} tokens, the shortest {df['n_tokens'].min()}, the average has {int(df['n_tokens'].mean())}.")

df['n_tokens'].hist();

The cell below generates embeddings for the text in each row of the `df.text` column and stores them in a new column `'embeddings'`.
It uses the `apply()` method of the `df.text` column to apply a `lambda` function to `x` (each row in the column). `openai.Embedding.create()` generates an embedding for the given input text `x` using the `'text-embedding-ada-002'` engine. 
`['data'][0]['embedding']` extracts the embedding from the API response, which is a dictionary.

In [None]:
# The lambda runs once per row, so one embeddings API request is issued for each text;
# the vector is read from ['data'][0]['embedding'] of each response.
# NOTE: %time is an IPython magic, so this line only runs inside IPython/Jupyter.
%time df['embeddings'] = df.text.apply(lambda x: openai.Embedding.create(input=x, engine='text-embedding-ada-002')['data'][0]['embedding'])

### Save and retrieve the embeddings

If we read the embeddings back from a stored file, we need to convert their string representations to the proper types.
The built-in `eval()` function (or, more safely, `ast.literal_eval()`, which only accepts Python literals) converts the string representation of a list to an actual list object. 
Then `np.array()` converts each list object to a NumPy array object.

In [None]:
import ast

# Persist the DataFrame (including the embeddings), then load it back from disk
df.to_csv(EMBEDDINGSF)
df = pd.read_csv(EMBEDDINGSF, index_col=0)
# The CSV holds each embedding as the string form of a list. Parse it with
# ast.literal_eval rather than eval(): literal_eval only accepts Python
# literals, so a modified CSV file cannot make this cell execute arbitrary code.
df['embeddings'] = df['embeddings'].apply(ast.literal_eval).apply(np.array)

### Create context
`create_context` creates a context for a given question by finding the most similar texts in the dataframe `df` of precomputed embeddings and their corresponding texts. 
Arguments:
- `question`: a string representing the question for which the context is being created.
- `df`: a pandas dataframe containing precomputed embeddings and their corresponding texts.
- `max_len=1800`: an integer representing the maximum length of the context that can be returned.
- `size='ada'`: a string representing the size of the language model used to compute embeddings.

It first computes embeddings for the input question using the [text-embedding-ada-002](https://openai.com/blog/new-and-improved-embedding-model) engine, and then computes the distances between the question embeddings and the embeddings of the contexts in the dataframe `df` using the cosine distance metric.
It then sorts `df` by the distances in ascending order and adds the texts to the `context` list for as long as the running token count (each row's `n_tokens` plus 4) stays within the `max_len` limit. The function concatenates the texts in the `context` list using the string `"\n\n###\n\n"` as a delimiter and returns the resulting string as the final context.

In [None]:
from openai.embeddings_utils import distances_from_embeddings

distances_from_embeddings(query_embedding: List[float], embeddings: List[List[float]], distance_metric='cosine',) -> List[List]

[openai.Embedding.create](https://platform.openai.com/docs/api-reference/embeddings)

In [None]:
def create_context(df, question:str = None,  max_len:int =1800, size:str ="ada", separator:str ="\n\n###\n\n"):
    """
    Create a context for a question by finding the most similar texts in the dataframe df.

    The question is embedded with the `text-embedding-{size}-002` engine, every
    row of `df` is ranked by the distance between its embedding and the
    question's, and the closest texts are joined with `separator`.

    Args:
        df: DataFrame with the columns 'text', 'n_tokens' and 'embeddings'.
            A 'distances' column is added to (or overwritten in) `df`.
        question: The question to build a context for.
        max_len: Budget for the context; each selected row costs its
            'n_tokens' plus 4.
        size: Middle part of the embedding engine name.
        separator: String placed between the selected texts.

    Returns:
        The joined context string. An empty string is returned when `question`
        is empty/None or when the embedding request fails (the error is logged
        and printed).
    """
    # There is nothing to embed for an empty question
    if not question or not question.strip():
        return ""

    # Get the embeddings for the question
    try:
        q_embeddings = openai.Embedding.create(input=question, engine=f'text-embedding-{size}-002')['data'][0]['embedding']
    except Exception as e:
        # Without a question embedding nothing can be ranked; stop here instead
        # of failing later on an unassigned `q_embeddings`.
        logging.error(f"Error creating the question embedding: {e}")
        print(e)
        return ""

    # Get the distances from the embeddings
    df['distances'] = distances_from_embeddings(q_embeddings, df['embeddings'].values)

    # Walk the rows from closest to farthest and collect texts until the budget
    # is used up. The running total only grows, so no later row can fit once
    # one row has exceeded the budget: stop at that point.
    ranked = df.sort_values('distances', ascending=True)
    context = []
    context_len = 0
    for text, n_tokens in zip(ranked['text'], ranked['n_tokens']):
        context_len += n_tokens + 4   # presumably 4 extra tokens allow for the separator
        if context_len > max_len:
            break
        context.append(text)

    # Return the context
    return separator.join(context)

### Answer Question
`answer_question()` uses `create_context()` to find the most similar context to a given question in a given dataframe (`df`). It then uses OpenAI's API to generate a text response to the question based on the context.
Parameters:
- `df`: the pandas dataframe containing the texts to search for the context.
- `model`: the OpenAI model used for generating the response. Default is "text-davinci-003".
- `question`: the question to be answered.
- `max_len`: the maximum length of the context to be considered when searching for the most similar context. Default is 1800.
- `size`: the size of the OpenAI API engine to use for embedding the question. Default is "ada".
- `debug`: a boolean indicating whether to print the retrieved context before the model is queried. Default is False.
- `max_tokens`: the maximum number of tokens to generate in the response. Default is 150.
- `stop_sequence`: a string sequence that indicates when the response should stop generating. Default is None.

It first calls `create_context()` to find the most similar context to the question. If `debug`, it prints the raw context.
It then generates a `response` using [openai.Completion.create](https://platform.openai.com/docs/api-reference/completions) by providing a prompt that includes the context and the question to be answered. It sets various parameters such as the maximum number of tokens to generate, and the model (default [text-davinci-003](https://help.openai.com/en/articles/6779149-how-do-text-davinci-002-and-text-davinci-003-differ)) to use. If an exception occurs during the generation of the response, an empty string is returned. Otherwise, the generated response text is returned.

In [None]:
def answer_question(df, model:str="text-davinci-003", question:str="", max_len:int=1800, size:str="ada",
    debug:bool=False, max_tokens:int=150, stop_sequence:str=None):
    """
    Answer a question based on the most similar context from the dataframe texts.

    Args:
        df: DataFrame passed to create_context() to retrieve the context.
        model: Completion model name.
        question: The question to answer.
        max_len: Context budget forwarded to create_context().
        size: Embedding engine size forwarded to create_context().
        debug: When True, print the retrieved context.
        max_tokens: Maximum number of tokens to generate.
        stop_sequence: Optional stop sequence for the completion.

    Returns:
        The stripped completion text. An empty string is returned when
        `question` is empty or when the completion request fails (the error is
        logged and printed).
    """
    # An empty question cannot be embedded or answered; skip the API calls
    if not question or not question.strip():
        return ""

    context = create_context(df, question, max_len=max_len, size=size)

    # If debug, print the retrieved context
    if debug: print(f"Context:\n {context} \n\n")

    # Prompt includes the question and context. Adjacent literals are used
    # instead of a backslash continuation, which would embed the next source
    # line's indentation in the prompt text.
    prompt = (
        "Answer the question based on the context below, and if the question can't be answered based on the context, "
        "say \"I don't know\"\n\n"
        f"Context: {context}\n\n---\n\nQuestion: {question}\nAnswer:"
    )

    params = {"prompt": prompt,
              "temperature": 0,
              "max_tokens": max_tokens,
              "top_p": 1,
              "frequency_penalty": 0,
              "presence_penalty": 0,
              "stop": stop_sequence,
              "model": model,
             }
    try: # Create a completion using the prompt
        response = openai.Completion.create(**params)
        return response.choices[0].text.strip()
    except Exception as e:
        logging.error(f"Error generating the answer: {e}")
        print(e)
        return ""

In [None]:
# Sample questions; each one is answered in turn and printed with its answer
questions = [
    "What is stable diffusion?",
    "What is a Unet?",
    "How do we find a value for bandwidth?",
]
for q in questions:
    a = answer_question(df, question=q)
    print(f"Question: {q}\n Answer: {a} \n")
             

In [None]:
# debug=True also prints the retrieved context; the answer is the cell's value
answer_question(
    df,
    question='How are latents used in Stable Diffusion?',
    debug=True,
)