In [1]:
import os
import pandas as pd
import tiktoken
import openai
import re
from scipy import spatial 
import ast

In [None]:
# Model identifiers shared by the helper functions defined in the next cell.
EMBEDDING_MODEL = "text-embedding-ada-002"  # OpenAI's best embeddings as of Apr 2023
GPT_MODEL = "gpt-3.5-turbo"  # default model whose tokenizer num_tokens() uses

In [None]:
def num_tokens(text: str, model: str = GPT_MODEL) -> int:
    """Count the tokens that `text` occupies under `model`'s tokenizer.

    Args:
        text: The string to measure.
        model: Model name handed to ``tiktoken.encoding_for_model`` to pick
            the tokenizer.

    Returns:
        The length of the token sequence produced by encoding `text`.
    """
    token_ids = tiktoken.encoding_for_model(model).encode(text)
    return len(token_ids)

def generate_embeddings(chunked_doc, tiktoken_encoding="cl100k_base", token_limit=8000):
    """Embed every text in ``chunked_doc`` with the OpenAI embeddings endpoint.

    Args:
        chunked_doc: Sequence of strings to embed.
        tiktoken_encoding: Name of the tiktoken encoding used to measure the
            token length of each text before it is sent.
        token_limit: A text is only sent to the API when it has fewer than
            this many tokens.

    Returns:
        A list with exactly one entry per input text, in input order: the
        embedding taken from ``response["data"][0]["embedding"]``, or ``None``
        for a text that was skipped because it reached ``token_limit``. The
        placeholder keeps the result aligned with ``chunked_doc`` so both can
        be placed side by side in a DataFrame.
    """
    # Function-scope import: only the skip path below needs it.
    import warnings

    # EMBEDDING_MODEL is the module-level constant; reading a global needs no
    # `global` statement (the original `global NAME = value` line was a syntax error).
    # NOTE(review): `openai.Embedding.create` looks like the pre-1.0 openai-python
    # interface — confirm against the installed package version.
    encoding = tiktoken.get_encoding(tiktoken_encoding)
    embeddings = []
    for index, text in enumerate(chunked_doc):
        n_tokens = len(encoding.encode(text))
        if n_tokens >= token_limit:
            # Too long to embed: keep a placeholder instead of silently dropping
            # the entry, otherwise every later embedding shifts onto the wrong text.
            warnings.warn(
                f"generate_embeddings: skipping item {index} "
                f"({n_tokens} tokens >= token_limit={token_limit})",
                stacklevel=2,
            )
            embeddings.append(None)
            continue
        response = openai.Embedding.create(model=EMBEDDING_MODEL, input=text)
        embeddings.append(response["data"][0]["embedding"])
    return embeddings

def chunked_string(
    string: str,
    model: str = EMBEDDING_MODEL,
    max_tokens: int = 2000,
) -> list:
    """Split a string into consecutive pieces of at most ``max_tokens`` tokens.

    The string is encoded with the tokenizer that tiktoken associates with
    ``model``, the token sequence is cut into back-to-back windows of
    ``max_tokens`` tokens (the last window may be shorter), and each window is
    decoded back to text.

    Args:
        string: The text to split.
        model: Model name handed to ``tiktoken.encoding_for_model``.
        max_tokens: Maximum number of tokens per chunk; must be positive.

    Returns:
        A list of strings, one per window, in document order. An empty input
        string produces an empty list.

    Raises:
        ValueError: If ``max_tokens`` is not a positive number.
    """
    if max_tokens <= 0:
        # range() rejects a zero step with a cryptic message and silently yields
        # nothing for a negative one; fail clearly in both cases.
        raise ValueError(f"max_tokens must be a positive integer, got {max_tokens!r}")

    encoding = tiktoken.encoding_for_model(model)
    token_ids = encoding.encode(string)
    # Windows are cut at token boundaries, so a chunk may begin or end mid-word.
    return [
        encoding.decode(token_ids[start:start + max_tokens])
        for start in range(0, len(token_ids), max_tokens)
    ]

### Creating embeddings for docs downloaded from readme

In [None]:
#change the path to where your downloaded folder is and choose the version of the docs you want to process
DOCS_DIR = "./fiddler-2023-10-16/v23.4"

# Read every Markdown file under DOCS_DIR; each list entry is one file's full text.
chunked_doc = []
for root, dirs, files in os.walk(DOCS_DIR):
    # os.walk honours in-place edits to `dirs`; sorting it (and the file names)
    # makes the order of chunked_doc reproducible across machines and runs.
    dirs.sort()
    for name in sorted(files):
        if not name.endswith('.md'):
            continue
        path = os.path.join(root, name)
        # Explicit encoding so the result does not depend on the platform default.
        # Assumes the downloaded docs are UTF-8 — TODO confirm.
        with open(path, 'r', encoding='utf-8') as f:
            file_str = f.read()
        chunked_doc.append(file_str)

In [None]:
#find and remove hidden pages
pattern = r'hidden:\s*(\w+)'


def _is_hidden(doc):
    """Return True when the first ``hidden: <word>`` found in `doc` has the word ``true``."""
    # NOTE(review): the search covers the whole page text, not only its header,
    # so a `hidden:` occurrence in the body is matched as well.
    match = re.search(pattern, doc)
    return match is not None and match.group(1) == "true"


# Build a filtered list instead of calling chunked_doc.remove() while iterating
# over chunked_doc: removing during iteration skips the element that follows each
# removed one, so two hidden pages in a row left the second one in place.
chunked_doc = [doc for doc in chunked_doc if not _is_hidden(doc)]

In [None]:
#we will prepend each page's slug to its continuation chunks (see the chunking cell below)
# Matches a `slug: "<value>"` entry; the chunking cell uses the whole match (group 0).
slug_pattern = r'slug:\s*"(.*?)"'

In [None]:
#chunking docs to 750 tokens
MAX_CHUNK_TOKENS = 750

# token_lim_doc receives short pages unchanged and long pages as several chunks.
token_lim_doc = []
for doc in chunked_doc:
    if num_tokens(doc) <= MAX_CHUNK_TOKENS:
        token_lim_doc.append(doc)
        continue

    chunked_list = chunked_string(doc, max_tokens=MAX_CHUNK_TOKENS)

    # The slug is looked up in the first chunk only. A page without one used to
    # crash with AttributeError on `None.group`; such pages are now chunked
    # without a prefix.
    slug_match = re.search(slug_pattern, chunked_list[0])
    if slug_match is not None:
        chunked_doc_slug = slug_match.group(0)
        # The first chunk already contains the slug; prefix only the later ones.
        # The prefix is added after splitting, so these chunks can end up
        # slightly longer than MAX_CHUNK_TOKENS.
        chunked_list[1:] = [
            chunked_doc_slug + ' ' + chunk for chunk in chunked_list[1:]
        ]

    token_lim_doc += chunked_list

In [None]:
# Embed the token-limited chunks.
embeddings = generate_embeddings(token_lim_doc)
# The text column must be the list that was actually embedded (token_lim_doc).
# Using the un-chunked chunked_doc pairs texts with the wrong vectors, or fails
# with a length mismatch as soon as any page was split into several chunks.
df = pd.DataFrame({"text": token_lim_doc, "embedding": embeddings})

### Cleaning HTML text [optional]

In [1]:
#!pip install beautifulsoup4
# Optional demo: strip HTML markup from a string with BeautifulSoup.
from bs4 import BeautifulSoup

html_text = "<p>This is <b>HTML</b> text.</p>"
soup = BeautifulSoup(html_text, 'html.parser')  # parse the markup with the 'html.parser' backend
clean_text = soup.get_text()  # text content with the tags removed
print(clean_text)

This is HTML text.


### Example of adding a caveat to already existing docs

In [2]:
## example 1
# Markdown-formatted caveat text to embed as a one-item document list.
Caveats = """Currently, only the following fields in [fdl.ModelInfo()](ref:fdlmodelinfo) can be updated:
> 
> - `custom_explanation_names`
> - `preferred_explanation_method`
> - `display_name`
> - `description` """

# generate_embeddings() iterates over a list of texts, so wrap the single string.
# NOTE: this rebinds chunked_doc, replacing the list of pages built earlier in the notebook.
chunked_doc = [Caveats]

In [3]:
## example 2
# Plain-text caveat to embed as a one-item document list.
# NOTE: running this cell after example 1 rebinds both Caveats and chunked_doc,
# so a later generate_embeddings(chunked_doc) call embeds only this text.
Caveats = "Once you have added a model on the Fiddler platform using a specific model info object, that is fdl.ModelInfo, you cannot modify aspects such as features, inputs, outputs, model task etc. specified in the model info object. Currently, if you want to change fundamental details about a modelinfo object, then it is advised to create/add a new model with a new modelinfo object."
chunked_doc = [Caveats]

In [4]:
def generate_embeddings(chunked_doc, tiktoken_encoding="cl100k_base", token_limit=8000):
    """Embed every text in ``chunked_doc`` with the OpenAI embeddings endpoint.

    NOTE(review): an earlier cell in this notebook defines a function with the
    same name; whichever cell runs last wins. Consider keeping one definition.

    Args:
        chunked_doc: Sequence of strings to embed.
        tiktoken_encoding: Name of the tiktoken encoding used to measure the
            token length of each text before it is sent.
        token_limit: A text is only sent to the API when it has fewer than
            this many tokens.

    Returns:
        A list with exactly one entry per input text, in input order: the
        embedding taken from ``response["data"][0]["embedding"]``, or ``None``
        for a text that was skipped because it reached ``token_limit``. The
        placeholder keeps the result aligned with ``chunked_doc`` so both can
        be placed side by side in a DataFrame.
    """
    # Function-scope import: only the skip path below needs it.
    import warnings

    # EMBEDDING_MODEL is the module-level constant; reading a global needs no
    # `global` statement (the original `global NAME = value` line was a syntax error).
    # NOTE(review): `openai.Embedding.create` looks like the pre-1.0 openai-python
    # interface — confirm against the installed package version.
    encoding = tiktoken.get_encoding(tiktoken_encoding)
    embeddings = []
    for index, text in enumerate(chunked_doc):
        n_tokens = len(encoding.encode(text))
        if n_tokens >= token_limit:
            # Too long to embed: keep a placeholder instead of silently dropping
            # the entry, otherwise every later embedding shifts onto the wrong text.
            warnings.warn(
                f"generate_embeddings: skipping item {index} "
                f"({n_tokens} tokens >= token_limit={token_limit})",
                stacklevel=2,
            )
            embeddings.append(None)
            continue
        response = openai.Embedding.create(model=EMBEDDING_MODEL, input=text)
        embeddings.append(response["data"][0]["embedding"])
    return embeddings

Unnamed: 0,text,embedding
0,Custom metrics is an upcoming feature and it i...,"[-0.017716489732265472, -0.0035160724073648453..."


In [None]:
# Embed the texts currently held in chunked_doc (the caveat example above).
embeddings = generate_embeddings(chunked_doc)
# One row per text, paired with the embedding returned for it.
df = pd.DataFrame({"text": chunked_doc, "embedding": embeddings})

### Finding URLs

In [49]:
# Sample text ending in a Markdown link; the next cell extracts URLs from it.
text = """Fiddler's role in the ML lifecycle is to monitor, explain, analyze, and improve ML deployments at enterprise scale.
It provides contextual insights at any stage of the ML lifecycle, helps improve predictions, increases transparency and fairness, 
and optimizes business revenue. 
Reference: [Fiddler Simple Monitoring Quick Start Guide](https://docs.fiddler.ai/docs/Fiddler_Quickstart_Simple_Monitoring)"""

In [50]:
# url_pattern was not defined anywhere in the notebook, so this cell raised
# NameError on a fresh kernel. The pattern takes an http(s) URL up to the next
# whitespace or ")", so a Markdown link [label](url) yields just the url.
url_pattern = r'https?://[^\s)]+'
urls = re.findall(url_pattern, text)

In [51]:
# Show the list of URLs found by re.findall in the previous cell.
urls

['https://docs.fiddler.ai/docs/Fiddler_Quickstart_Simple_Monitoring']