In [None]:
'''
 * Copyright 2023 QuickAns
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 '''

Procedure:

0. Prerequisites: Import libraries, set API key
1. Collect: We load the text to embed from a local file (`ans_generator/data/ir_txt/paragraphs.txt`, one chunk per line)
2. Chunk: Documents are split into short, semi-self-contained sections to be embedded
3. Embed: Each section is embedded with the OpenAI API
4. Store: Embeddings are saved in a CSV file (for large datasets, use a vector database)

## 0. Prerequisites

### Import libraries

In [None]:
# imports
import mwclient  # for downloading example Wikipedia articles
import mwparserfromhell  # for splitting Wikipedia articles into sections
import openai  # for generating embeddings
import pandas as pd  # for DataFrames to store article sections and embeddings
import re  # for cutting <ref> links out of Wikipedia articles
import tiktoken  # for counting tokens

Install any missing libraries with `pip install` in your terminal. E.g.,

```zsh
pip install openai
```

(You can also do this in a notebook cell with `!pip install openai`.)

If you install any libraries, be sure to restart the notebook kernel.

### Set API key (if needed)

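Note that the OpenAI library will try to read your API key from the `OPENAI_API_KEY` environment variable. If you haven't already, set this environment variable by following [these instructions](https://help.openai.com/en/articles/5112595-best-practices-for-api-key-safety). The cell below instead sets the key explicitly from an `OPENAI_API_KEY` value defined in a local `keys` module; keep that module out of version control.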

In [None]:
# Import only the name this notebook needs. A wildcard import would copy every
# public name from the local `keys` module into the notebook namespace, where it
# could silently shadow names such as `openai` or `pd`.
from keys import OPENAI_API_KEY

# Hand the key to the openai client library explicitly.
openai.api_key = OPENAI_API_KEY

Next, we'll recursively split long sections into smaller sections.

There's no perfect recipe for splitting text into sections.

Some tradeoffs include:
- Longer sections may be better for questions that require more context
- Longer sections may be worse for retrieval, as they may have more topics muddled together
- Shorter sections are better for reducing costs (which are proportional to the number of tokens)
- Shorter sections allow more sections to be retrieved, which may help with recall
- Overlapping sections may help prevent answers from being cut by section boundaries

Here, we'll use a simple approach and limit sections to 1,600 tokens each, recursively halving any sections that are too long. To avoid cutting in the middle of useful sentences, we'll split along paragraph boundaries when possible.

In [None]:
GPT_MODEL = "gpt-3.5-turbo"  # only matters insofar as it selects which tokenizer to use


def num_tokens(text: str, model: str = GPT_MODEL) -> int:
    """Count the tokens `text` encodes to under the tiktoken encoding for `model`."""
    tokenizer = tiktoken.encoding_for_model(model)
    token_ids = tokenizer.encode(text)
    return len(token_ids)

## 3. Embed document chunks

Now that we've split our library into shorter self-contained strings, we can compute embeddings for each.

(For large embedding jobs, use a script like [api_request_parallel_processor.py](api_request_parallel_processor.py) to parallelize requests while throttling to stay under rate limits.)

In [None]:
in_file = "ans_generator/data/ir_txt/paragraphs.txt"

# Read the chunks to embed: the file is split on newlines, one chunk per line.
# The encoding is given explicitly so the result does not depend on the platform
# default. NOTE(review): assumes the file is UTF-8 -- adjust if it was saved otherwise.
with open(in_file, 'r', encoding="utf-8") as f:
    # Drop every blank (empty or whitespace-only) line. list.remove('') would
    # remove only the first empty string and raise ValueError when there is none,
    # leaving any other blank lines to be sent to the embeddings API.
    data_book = [line for line in f.read().split('\n') if line.strip()]

In [None]:
# num_tokens(data_book[0])
# len(data_book)

In [None]:
# calculate embeddings
EMBEDDING_MODEL = "text-embedding-ada-002"  # OpenAI's best embeddings as of Apr 2023
BATCH_SIZE = 1000  # you can submit up to 2048 embedding inputs per request

# Send the chunks to the embeddings endpoint BATCH_SIZE at a time and collect one
# embedding per chunk, in the same order as data_book.
embeddings = []
for batch_start in range(0, len(data_book), BATCH_SIZE):
    # Clamp to the data length so the progress line stays accurate for the
    # final, possibly shorter, batch.
    batch_end = min(batch_start + BATCH_SIZE, len(data_book))
    batch = data_book[batch_start:batch_end]
    print(f"Batch {batch_start} to {batch_end-1}")
    response = openai.Embedding.create(model=EMBEDDING_MODEL, input=batch)

    # Order the returned items by the input index they report, so each embedding
    # lines up with its chunk even if the response order differs from the request.
    ordered_items = sorted(response["data"], key=lambda item: item["index"])

    # Fail right away if any input is missing or duplicated; otherwise the
    # mismatch would only surface when the DataFrame is built after all batches.
    returned_indices = [item["index"] for item in ordered_items]
    if returned_indices != list(range(len(batch))):
        raise RuntimeError(
            f"Embedding response for batch starting at {batch_start} does not "
            f"cover its {len(batch)} inputs exactly once"
        )

    embeddings.extend(item["embedding"] for item in ordered_items)

# One row per chunk: the original text next to its embedding vector.
df = pd.DataFrame({"text": data_book, "embedding": embeddings})

## 4. Store document chunks and embeddings

Because this example only uses a few thousand strings, we'll store them in a CSV file.

(For larger datasets, use a vector database, which will be more performant.)

In [None]:
# Persist the text chunks together with their embeddings as a CSV file.

SAVE_PATH = "ans_generator/data/ir_book_embeddings.csv"

# index=False: write only the DataFrame's columns, not its row index.
df.to_csv(path_or_buf=SAVE_PATH, index=False)


In [None]:
# def halved_by_delimiter(string: str, delimiter: str = "\n") -> list[str, str]:
#     """Split a string in two, on a delimiter, trying to balance tokens on each side."""
#     chunks = string.split(delimiter)
#     if len(chunks) == 1:
#         return [string, ""]  # no delimiter found
#     elif len(chunks) == 2:
#         return chunks  # no need to search for halfway point
#     else:
#         total_tokens = num_tokens(string)
#         halfway = total_tokens // 2
#         best_diff = halfway
#         for i, chunk in enumerate(chunks):
#             left = delimiter.join(chunks[: i + 1])
#             left_tokens = num_tokens(left)
#             diff = abs(halfway - left_tokens)
#             if diff >= best_diff:
#                 break
#             else:
#                 best_diff = diff
#         left = delimiter.join(chunks[:i])
#         right = delimiter.join(chunks[i:])
#         return [left, right]


# def truncated_string(
#     string: str,
#     model: str,
#     max_tokens: int,
#     print_warning: bool = True,
# ) -> str:
#     """Truncate a string to a maximum number of tokens."""
#     encoding = tiktoken.encoding_for_model(model)
#     encoded_string = encoding.encode(string)
#     truncated_string = encoding.decode(encoded_string[:max_tokens])
#     if print_warning and len(encoded_string) > max_tokens:
#         print(f"Warning: Truncated string from {len(encoded_string)} tokens to {max_tokens} tokens.")
#     return truncated_string


# def split_strings_from_subsection(
#     subsection: tuple[list[str], str],
#     max_tokens: int = 1000,
#     model: str = GPT_MODEL,
#     max_recursion: int = 5,
# ) -> list[str]:
#     """
#     Split a subsection into a list of subsections, each with no more than max_tokens.
#     Each subsection is a tuple of parent titles [H1, H2, ...] and text (str).
#     """
#     titles, text = subsection
#     string = "\n\n".join(titles + [text])
#     num_tokens_in_string = num_tokens(string)
#     # if length is fine, return string
#     if num_tokens_in_string <= max_tokens:
#         return [string]
#     # if recursion hasn't found a split after X iterations, just truncate
#     elif max_recursion == 0:
#         return [truncated_string(string, model=model, max_tokens=max_tokens)]
#     # otherwise, split in half and recurse
#     else:
#         titles, text = subsection
#         for delimiter in ["\n\n", "\n", ". "]:
#             left, right = halved_by_delimiter(text, delimiter=delimiter)
#             if left == "" or right == "":
#                 # if either half is empty, retry with a more fine-grained delimiter
#                 continue
#             else:
#                 # recurse on each half
#                 results = []
#                 for half in [left, right]:
#                     half_subsection = (titles, half)
#                     half_strings = split_strings_from_subsection(
#                         half_subsection,
#                         max_tokens=max_tokens,
#                         model=model,
#                         max_recursion=max_recursion - 1,
#                     )
#                     results.extend(half_strings)
#                 return results
#     # otherwise no split was found, so just truncate (should be very rare)
#     return [truncated_string(string, model=model, max_tokens=max_tokens)]
