## Follow these steps to reload the vector index, currently running in DataStax
* Download files from ReadMe - go to configurations - project management and export docs.  You will get a download of ALL versions of Fiddler documentation - you only need 23.x (latest).  Copy this into /documentation_data/23.x
* Make sure you copy the ChangelogPosts (release notes) into the 23.x directory you plan to process
* you will have to clean up hidden docs that are in 23.x - this is done further down in this notebook
* you will need to add the caveats from old 23.x docs into new ones
* after preprocessing is done, you will need to run "loader_cassandra_vector_index.ipynb" to reload the vector index table.
* the "query_cassandra.ipynb" notebook has a cell to TRUNCATE the "fiddler_doc_snippets_openai" table before reloading it

### Notes and Hints

jupyter nbconvert *.ipynb --to markdown

The QuickStart .md files from ReadMe should have the Google Colab notebook content appended to the end of them in the /23.x/QuickStart Notebooks folder.  Make sure the new content from the Colab notebooks gets appended for each release.  You can use the code below to assist with this effort.

### Here is possible code to script the appending of the google colab markdown notebooks to the quickstart docs pages

```python
# Append each converted Colab notebook (.md) to the QuickStart page that links to it.
# NOTE: `ipynb_slug` is not defined in this snippet. It must be a regex with one
# capture group whose match is the notebook's file name (without ".md") under
# ./fiddler-2023-8-15/quickstart/ -- define it before running.
for root, dirs, files in os.walk("./fiddler-2023-8-15/v1.8/QuickStart Notebooks"):
    for name in files:
        path = os.path.join(root, name)
        if path[-3:] == '.md':
            with open(path, 'r') as f:
                file_str = f.read()
            ipynb_links = re.search(ipynb_slug, file_str)
            if ipynb_links:
                # Read the notebook markdown first, then reopen the page in append mode.
                with open("./fiddler-2023-8-15/quickstart/" + ipynb_links.group(1) + ".md") as notebook_md:
                    QS = notebook_md.read()
                with open(path, 'a') as f:
                    f.write(QS)
                # Print the matched notebook name so progress is visible.
                print(ipynb_links.group(1))
```

In [1]:
import os
import pandas as pd
import tiktoken
import openai
import re
from scipy import spatial 
import ast

In [2]:
# Model names and the release label shared by the cells below.
EMBEDDING_MODEL = "text-embedding-ada-002"  # OpenAI's best embeddings as of Apr 2023
GPT_MODEL = "gpt-3.5-turbo"  # default model for num_tokens() token counting
release_num = '23.7'  # interpolated into the documentation_data paths read and written below

In [3]:
def num_tokens(text: str, model: str = GPT_MODEL) -> int:
    """Count the tokens `text` encodes to under the tiktoken encoding for `model`."""
    token_ids = tiktoken.encoding_for_model(model).encode(text)
    return len(token_ids)

# def generate_embeddings(chunked_doc, tiktoken_encoding = "cl100k_base", token_limit = 8000 )
#     global EMBEDDING_MODEL = "text-embedding-ada-002"  
#     encoding = tiktoken.get_encoding(tiktoken_encoding)
#     embeddings=[]
#     for i in range(len(chunked_doc)):
#         fdl_doc_token_list = encoding.encode(chunked_doc[i])
#         if(len(fdl_doc_token_list)<token_limit):
#             response = openai.Embedding.create(model=EMBEDDING_MODEL, input=chunked_doc[i])
#             embeddings.append(response["data"][0]["embedding"])
#     return embeddings

def chunked_string(
    string: str,
    model: str = EMBEDDING_MODEL,
    max_tokens: int = 2000,
) -> list:
    """Split a string into consecutive pieces of at most `max_tokens` tokens.

    Args:
        string: Text to split.
        model: Model name used to look up the tiktoken encoding.
        max_tokens: Maximum number of tokens per piece; must be positive.

    Returns:
        A list of str, in original order. Each piece is the decoded text of
        one slice of the token sequence. An empty `string` gives an empty list.

    Raises:
        ValueError: If `max_tokens` is less than 1.
    """
    # A negative step would make range() yield nothing and silently drop the text.
    if max_tokens < 1:
        raise ValueError("max_tokens must be a positive integer")
    encoding = tiktoken.encoding_for_model(model)
    token_ids = encoding.encode(string)
    # NOTE(review): slices are cut on token boundaries, not character or word
    # boundaries -- confirm that is acceptable for the downstream index.
    return [
        encoding.decode(token_ids[start:start + max_tokens])
        for start in range(0, len(token_ids), max_tokens)
    ]

### creating embeddings for docs downloaded from readme

In [7]:
# Change the path to where your downloaded folder is; release_num selects the
# version of the docs to process.
# Every Markdown file found under that folder becomes one entry in chunked_doc.
chunked_doc = []
for folder, _subdirs, filenames in os.walk(f'documentation_data/v{release_num}'):
    for filename in filenames:
        md_path = os.path.join(folder, filename)
        if not md_path.endswith('.md'):
            continue
        with open(md_path, 'r') as handle:
            chunked_doc.append(handle.read())

len(chunked_doc)

241

In [8]:
# Find and remove hidden pages: a page is dropped when the first "hidden:" field
# found in its text has the value "true".
pattern = r'hidden:\s*(\w+)'

# Build a new list rather than calling chunked_doc.remove() inside a loop over
# chunked_doc. Removing during iteration shifts the remaining items left, so the
# page right after each removed one was never checked and could stay in the list.
visible_docs = []
for doc in chunked_doc:
    match = re.search(pattern, doc)
    if match and match.group(1) == "true":
        continue
    visible_docs.append(doc)
chunked_doc = visible_docs

len(chunked_doc)

223

In [9]:
# The page slug is prepended to every continuation chunk of a split page.
# Matches text such as slug: "client-setup"; group(0) is the whole `slug: "..."` text,
# group(1) the slug itself.
slug_pattern = r'slug:\s*"(.*?)"'

In [10]:
# Chunk docs to 750 tokens.
# Pages at or under the limit are kept whole. Longer pages are split, and every
# piece after the first is prefixed with the page's `slug: "..."` text so the
# piece can still be traced back to its page. The prefix is added after
# splitting, so a prefixed piece can be slightly over the limit.
chunk_token_limit = 750

token_lim_doc = []
for doc in chunked_doc:
    if num_tokens(doc) <= chunk_token_limit:
        token_lim_doc.append(doc)
        continue

    chunked_list = chunked_string(doc, max_tokens=chunk_token_limit)
    # A long page whose first piece has no slug used to crash here with
    # AttributeError (None has no .group); such pages are now kept unprefixed.
    slug_match = re.search(slug_pattern, chunked_list[0])
    if slug_match:
        chunked_doc_slug = slug_match.group(0)
        chunked_list = chunked_list[:1] + [
            chunked_doc_slug + ' ' + piece for piece in chunked_list[1:]
        ]

    token_lim_doc += chunked_list

In [11]:
# Read in old caveats from the previous version and output them to a new file.
old_df = pd.read_csv('documentation_data/latest_v_23-4_tk750_fixurls2.csv')
# Rows whose text does not contain "slug" (any case) are treated as caveats.
is_caveat = ~old_df['text'].str.contains('slug', case=False)
caveats_df = old_df[is_caveat].drop(columns=['embedding'])
# Written before the glossary rows below are added, so caveats.csv holds only
# the carried-over rows.
caveats_df.to_csv(f'documentation_data/v{release_num}/caveats.csv', index=False)

# Glossary-style caveats, each appended as a new row labelled with the current row count.
for caveat_text in (
    'LLM means large language model.  A large language model (LLM) is a type of artificial intelligence (AI) algorithm that uses deep learning techniques and massively large data sets to understand, summarize, generate and predict new content.',
    'The term generative AI, or GenAI, also is closely connected with LLMs, which are, in fact, a type of generative AI that has been specifically architected to help generate text-based content.',
    'FM, or FMs, means Foundation Models.  Foundation Models are the same as large language models.',
):
    caveats_df.loc[len(caveats_df.index)] = [caveat_text]

caveats_df

Unnamed: 0,text
279,package.py for R based models```python\nimport...
280,Once you have added a model on the Fiddler pla...
281,Custom metrics is an upcoming feature and it i...
282,Re-uploading in Fiddler essentially means havi...
320,"Currently, only the following fields in [fdl.M..."
321,AI has been in the limelight thanks to ‌recent...
6,LLM means large language model. A large langu...
7,"The term generative AI, or GenAI, also is clos..."
8,"FM, or FMs, means Foundation Models. Foundati..."


In [13]:
# Embeddings are not generated in this cell; the two lines below are kept for reference.
#embeddings = generate_embeddings(token_lim_doc)
#df = pd.DataFrame({"text": chunked_doc, "embedding": embeddings})
# Stack the doc chunks and the caveat rows into one frame with a fresh 0..n-1
# index, then write it out as the feed CSV for this release.
df = pd.concat(
    [pd.DataFrame({"text": token_lim_doc}), caveats_df],
    ignore_index=True,
)
df.to_csv(f'documentation_data/vector_index_feed_{release_num}.csv', index=False)
df

Unnamed: 0,text
0,"---\ntitle: ""fdl.FiddlerApi""\nslug: ""client-se..."
1,"slug: ""client-setup"" url=URL,\n org_id=..."
2,"---\ntitle: ""Customer Churn Prediction""\nslug:..."
3,"slug: ""customer-churn-prediction"" ""https://fi..."
4,"slug: ""customer-churn-prediction"" ca-1.png"",\n..."
...,...
347,"Currently, only the following fields in [fdl.M..."
348,AI has been in the limelight thanks to ‌recent...
349,LLM means large language model. A large langu...
350,"The term generative AI, or GenAI, also is clos..."


In [14]:
# Spot-check: display every row whose text mentions "release" (case-insensitive).
df[df['text'].str.contains('release', case=False)]

Unnamed: 0,text
70,"---\ntitle: ""Release 22.11 Notes""\nslug: ""rele..."
71,"---\ntitle: ""Release 23.2 Notes""\nslug: ""relea..."
72,"---\ntitle: ""Release 23.6 Notes""\nslug: ""relea..."
73,"---\ntitle: ""Release 23.3 Notes""\nslug: ""relea..."
74,"---\ntitle: ""Release 22.12 Notes""\nslug: ""rele..."
75,"---\ntitle: ""Release 23.4 Notes""\nslug: ""relea..."
76,"---\ntitle: ""Release 23.5 Notes""\nslug: ""relea..."
77,"---\ntitle: ""Release 23.1 Notes""\nslug: ""2023-..."
195,"---\ntitle: ""Embedding Visualization Chart Cre..."
348,AI has been in the limelight thanks to ‌recent...


### to clean html text [optional] 

In [1]:
#!pip install beautifulsoup4
from bs4 import BeautifulSoup

# Minimal example: parse an HTML fragment and print only its text, tags removed.
html_text = "<p>This is <b>HTML</b> text.</p>"
soup = BeautifulSoup(html_text, 'html.parser')
clean_text = soup.get_text()
print(clean_text)

This is HTML text.


### Example of adding a caveat to already existing docs

In [2]:
## example 1
# A caveat is a free-text snippet (Markdown is allowed) to be embedded on its own.
Caveats = """Currently, only the following fields in [fdl.ModelInfo()](ref:fdlmodelinfo) can be updated:
> 
> - `custom_explanation_names`
> - `preferred_explanation_method`
> - `display_name`
> - `description` """

# Wrap the caveat in a one-item list: generate_embeddings() iterates over a list of texts.
chunked_doc = [Caveats]

In [3]:
## example 2
# Running this cell replaces the Caveats and chunked_doc values set by example 1.
Caveats = "Once you have added a model on the Fiddler platform using a specific model info object, that is fdl.ModelInfo, you cannot modify aspects such as features, inputs, outputs, model task etc. specified in the model info object. Currently, if you want to change fundamental details about a modelinfo object, then it is advised to create/add a new model with a new modelinfo object."
chunked_doc = [Caveats]

In [4]:
def generate_embeddings(chunked_doc, tiktoken_encoding = "cl100k_base", token_limit = 8000, model = "text-embedding-ada-002"):
    """Request an OpenAI embedding for each text that is under the token limit.

    Args:
        chunked_doc: List of text strings to embed.
        tiktoken_encoding: Name of the tiktoken encoding used to count tokens.
        token_limit: Texts with this many tokens or more are skipped.
        model: Embedding model name passed to `openai.Embedding.create`.

    Returns:
        A list of embeddings, one per text that was embedded, in input order.
        Skipped texts leave no placeholder, so the result can be shorter than
        `chunked_doc`.
    """
    # The model name is a defaulted parameter, so this function does not depend
    # on a module-level constant being defined in the running session. The
    # original `global EMBEDDING_MODEL = ...` line was not valid Python.
    encoding = tiktoken.get_encoding(tiktoken_encoding)
    embeddings = []
    for doc in chunked_doc:
        if len(encoding.encode(doc)) < token_limit:
            response = openai.Embedding.create(model=model, input=doc)
            embeddings.append(response["data"][0]["embedding"])
    return embeddings

Unnamed: 0,text,embedding
0,Custom metrics is an upcoming feature and it i...,"[-0.017716489732265472, -0.0035160724073648453..."


In [None]:
# Embed the caveat text(s) and pair each text with its embedding.
# NOTE(review): generate_embeddings() skips texts at or over its token limit, and
# pd.DataFrame needs both lists to be the same length -- confirm no text here is that long.
embeddings = generate_embeddings(chunked_doc)
df = pd.DataFrame({"text": chunked_doc, "embedding": embeddings})

### finding urls

In [49]:
# Sample text ending in a Markdown link, used as input for the URL extraction below.
text = """Fiddler's role in the ML lifecycle is to monitor, explain, analyze, and improve ML deployments at enterprise scale.
It provides contextual insights at any stage of the ML lifecycle, helps improve predictions, increases transparency and fairness, 
and optimizes business revenue. 
Reference: [Fiddler Simple Monitoring Quick Start Guide](https://docs.fiddler.ai/docs/Fiddler_Quickstart_Simple_Monitoring)"""

In [50]:
# Collect every match of url_pattern in the sample text.
# NOTE(review): url_pattern is not defined in any cell shown here -- confirm it is
# defined before this cell runs.
urls = re.findall(url_pattern, text)

In [51]:
urls

['https://docs.fiddler.ai/docs/Fiddler_Quickstart_Simple_Monitoring']