In [1]:
import os
import re
import pandas as pd
import openai
import tiktoken
from scipy import spatial 
import ast

openai.api_key = os.environ.get('OPENAI_API_KEY')


# models
EMBEDDING_MODEL = "text-embedding-ada-002"
GPT_MODEL_16k = "gpt-3.5-turbo-16k"
GPT_MODEL = "gpt-3.5-turbo"

In [2]:
def num_tokens(text: str, model: str = GPT_MODEL) -> int:
    """Return the number of tokens in a string."""
    encoding = tiktoken.encoding_for_model(model)
    return len(encoding.encode(text))


def chunked_string(
    string: str,
    model: str = EMBEDDING_MODEL,
    max_tokens: int = 2000,
) -> str:
    """Truncate a string to a maximum number of tokens."""
    encoding = tiktoken.encoding_for_model(model)
    encoded_string = encoding.encode(string)
    chunked_string = [encoding.decode(encoded_string[i:i+max_tokens]) for i in range(0, len(encoded_string), max_tokens)]
    return chunked_string



## Readme docs processing 

In [3]:
href_pattern = r'href="([^"]+\.ipynb)"'

In [4]:
ipynb_slug = r'/([^/.]+)\.ipynb'

In [64]:
# #appending content of quickstart notebooks to the docs quickstart page

# for root, dirs, files in os.walk("./fiddler-2023-8-15/v1.8/QuickStart Notebooks"):
#     for name in files:
#         path = os.path.join(root, name)
#         if path[-3:] == '.md':
#             with open(path,'r') as f:
#                 file_str = f.read()
#             ipynb_links = re.search(ipynb_slug, file_str)
#             if ipynb_links:
#                 with open("./fiddler-2023-8-15/quickstart/"+ipynb_links.group(1)+".md") as l:
#                     QS = l.read()
                
#                 with open(path, 'a') as f:
#                     f.write(QS)
                
                    
# #                 print(ipynb_links.group(1))
            


     

In [5]:
# chunked_doc = []
# for root, dirs, files in os.walk("./fiddler-2023-8-15/Changelog Posts"):
#     for name in files:
#         path = os.path.join(root, name)
#         if path[-3:] == '.md':
#             with open(path,'r') as f:
#                 file_str = f.read()
#                 chunked_doc.append(file_str)

In [6]:
# len(chunked_doc)

In [7]:
# #find and remove hidden pages
# pattern = r'hidden:\s*(\w+)'

# for doc in chunked_doc:
#     match = re.search(pattern, doc)
#     if match and match.group(1) == "true":
#         chunked_doc.remove(doc)

In [8]:
# len(chunked_doc)

In [9]:
# embeddings=[]
# for i in range(len(chunked_doc)):
#     response = openai.Embedding.create(model=EMBEDDING_MODEL, input=chunked_doc[i])
#     embeddings.append(response["data"][0]["embedding"])

# df = pd.DataFrame({"text": chunked_doc, "embedding": embeddings})

In [10]:
# df.to_csv("release_notes.csv",index=False)

In [11]:
chunked_doc = []
for root, dirs, files in os.walk("./fiddler-2023-10-9/v23.4"):
    for name in files:
        path = os.path.join(root, name)
        if path[-3:] == '.md':
            with open(path,'r') as f:
                file_str = f.read()
                chunked_doc.append(file_str)

In [12]:
len(chunked_doc)

208

In [13]:
#find and remove hidden pages
pattern = r'hidden:\s*(\w+)'

for doc in chunked_doc:
    match = re.search(pattern, doc)
    if match and match.group(1) == "true":
        chunked_doc.remove(doc)

In [14]:
len(chunked_doc)

191

In [15]:
slug_pattern = r'slug:\s*"(.*?)"'

In [16]:
# re.search(slug_pattern, """ slug: "abc" bla bla slug: "pqr" """).group(0)

In [17]:
token_lim_doc = []
for doc in chunked_doc:
    if num_tokens(doc) > 750:
        chunked_list = chunked_string(doc, max_tokens=750)
        chunked_doc_slug = re.search(slug_pattern, chunked_list[0]).group(0)
        for i in range(1, len(chunked_list)):
            chunked_list[i] = chunked_doc_slug + ' ' + chunked_list[i]
        
        token_lim_doc += chunked_list
    else:
        token_lim_doc.append(doc)
        
        

In [18]:
len(token_lim_doc)

274

In [19]:
# token_lim_doc
token_sizes = []
for doc in token_lim_doc:
    token_sizes.append(num_tokens(doc))

In [25]:
# token_sizes
# token_lim_doc[3]

In [163]:
from statistics import mean
mean(token_sizes)

498.3344370860927

In [26]:
embeddings=[]
for i in range(len(token_lim_doc)):
    response = openai.Embedding.create(model=EMBEDDING_MODEL, input=token_lim_doc[i])
    embeddings.append(response["data"][0]["embedding"])

df = pd.DataFrame({"text": token_lim_doc, "embedding": embeddings})

In [27]:
df

Unnamed: 0,text,embedding
0,"---\ntitle: ""fdl.FiddlerApi""\nslug: ""client-se...","[-0.014995344914495945, -0.0026680435985326767..."
1,"slug: ""client-setup"" _TOKEN\n)\n```\n```python...","[-0.021182812750339508, -0.004293339792639017,..."
2,"---\ntitle: ""Customer Churn Prediction""\nslug:...","[-0.00745741231366992, -0.010434732772409916, ..."
3,"slug: ""customer-churn-prediction"" /bb02793-chu...","[0.002079722471535206, -0.0047837127931416035,..."
4,"slug: ""customer-churn-prediction"" Churn-image5...","[0.0011718893656507134, 0.000443618802819401, ..."
...,...,...
269,"---\ntitle: ""Uploading a TensorFlow HDF5 Model...","[-0.0022432487457990646, 0.016934363171458244,..."
270,"---\ntitle: ""Uploading an XGBoost Model Artifa...","[0.0012045818148180842, 0.013162259012460709, ..."
271,"---\ntitle: ""Uploading a TensorFlow SavedModel...","[0.0018563421908766031, 0.0009264442487619817,..."
272,"---\ntitle: ""Uploading a scikit-learn Model Ar...","[0.0031662925612181425, 0.018320860341191292, ..."


In [28]:
df.to_csv("latest_v_23-4_tk750.csv", index=False)

In [19]:
import pandas as pd
import numpy as np
embeddings_path = "../documentation_data/chatbot_08_15_23_tk750.csv"
df_old = pd.read_csv(embeddings_path)

In [20]:
emb_arr = np.array(df_old.iloc[0]['embedding'].split(','))
len(emb_arr)

1536

In [40]:
df_old[307:]

Unnamed: 0,text,embedding
307,Once you have added a model on the Fiddler pla...,"[-0.022226542234420776, 0.011369146406650543, ..."
308,Custom metrics is an upcoming feature and it i...,"[-0.017716489732265472, -0.0035160724073648453..."
309,Re-uploading in Fiddler essentially means havi...,"[-0.01760503277182579, 0.011651406064629555, 0..."


In [52]:
chunked_doc = []
path = "/Users/murtuzashergadwala/fiddler-chatbot/fiddler-2023-10-9/Changelog Posts/release-233.md"
with open(path,'r') as f:
    file_str = f.read()
    chunked_doc.append(file_str)

In [53]:
chunked_doc

['---\ntitle: "Release 23.3 Notes"\nslug: "release-233"\ncreatedAt: "2023-08-15T18:03:45.797Z"\nhidden: false\n---\nThis page enumerates the new features and updates in Release 23.3 of the Fiddler platform.\n\n> 📘 Platform Release Version 23.3 & Doc v1.8 compatability note\n> \n> Note that the documentation version remains v1.8 with this release. The new and improved functionalities are added to their respective pages with the note regarding platform version 23.3 as a requirement.\n\n## Release of Fiddler platform version 23.3:\n\n- Support for added charting up to 6 metrics for one or multiple models \n\n- Ability to assign metrics to the left or right y-axis in monitoring charts\n\n- Addition of automatically created model monitoring dashboards\n\n- New Root Cause Analysis tab with data drift and data integrity information in monitoring charts \n\n## What\'s New and Improved:\n\n- **Multiple metric queries in monitoring charts**\n  - Flexibility to add up to 6 metrics queries to visu

In [54]:
embeddings=[]
response = openai.Embedding.create(model=EMBEDDING_MODEL, input=chunked_doc[0])
embeddings.append(response["data"][0]["embedding"])

df_release233 = pd.DataFrame({"text": chunked_doc, "embedding": embeddings})

In [55]:
df_release233

Unnamed: 0,text,embedding
0,"---\ntitle: ""Release 23.3 Notes""\nslug: ""relea...","[-0.005071498453617096, 0.0015428883489221334,..."


In [48]:
df2 = pd.concat([df, df_old[:5], df_release234,df_old[307:]], ignore_index=True)

In [51]:
df_old[:6]

Unnamed: 0,text,embedding
0,package.py for R based models```python\nimport...,"[-0.009950872510671616, -0.011770655401051044,..."
1,"---\ntitle: ""Release 22.11 Notes""\nslug: ""rele...","[-0.005742393434047699, 0.001501509454101324, ..."
2,"---\ntitle: ""Release 23.2 Notes""\nslug: ""relea...","[0.010167686268687248, -0.001216516480781138, ..."
3,"---\ntitle: ""Release 22.12 Notes""\nslug: ""rele...","[-0.010491670109331608, -0.0011956370435655117..."
4,"---\ntitle: ""Release 23.1 Notes""\nslug: ""2023-...","[-0.00474295811727643, 0.0016190075548365712, ..."
5,"---\ntitle: ""fdl.FiddlerApi""\nslug: ""client-se...","[-0.01489181537181139, -0.0028792002703994513,..."


In [56]:
df3 = pd.concat([df2,df_release233], ignore_index=True)
df3

Unnamed: 0,text,embedding
0,"---\ntitle: ""fdl.FiddlerApi""\nslug: ""client-se...","[-0.014995344914495945, -0.0026680435985326767..."
1,"slug: ""client-setup"" _TOKEN\n)\n```\n```python...","[-0.021182812750339508, -0.004293339792639017,..."
2,"---\ntitle: ""Customer Churn Prediction""\nslug:...","[-0.00745741231366992, -0.010434732772409916, ..."
3,"slug: ""customer-churn-prediction"" /bb02793-chu...","[0.002079722471535206, -0.0047837127931416035,..."
4,"slug: ""customer-churn-prediction"" Churn-image5...","[0.0011718893656507134, 0.000443618802819401, ..."
...,...,...
279,"---\ntitle: ""Release 23.4 Notes""\nslug: ""relea...","[-0.004607734736055136, -0.008832967840135098,..."
280,Once you have added a model on the Fiddler pla...,"[-0.022226542234420776, 0.011369146406650543, ..."
281,Custom metrics is an upcoming feature and it i...,"[-0.017716489732265472, -0.0035160724073648453..."
282,Re-uploading in Fiddler essentially means havi...,"[-0.01760503277182579, 0.011651406064629555, 0..."


In [60]:
mask = df_old['text'].str.contains('Quickstart', case=False)

# Filter the DataFrame based on the mask
result = df_old[mask]

In [66]:
result

Unnamed: 0,text,embedding
14,"slug: ""fraud-detection"" to Data Integrity Iss...","[-0.015683425590395927, -0.005404040217399597,..."
36,"---\ntitle: ""CV Monitoring""\nslug: ""cv-monitor...","[-0.030289800837635994, -0.002532508224248886,..."
37,"slug: ""cv-monitoring"" auth_token=AUTH_TOKE...","[-0.025598619133234024, -0.004655908793210983,..."
41,"slug: ""cv-monitoring"" _ID, 'monitor']))\n```\n...","[-0.02605891041457653, -0.009838425554335117, ..."
42,"---\ntitle: ""Explainability with Model Artifac...","[0.0029013229068368673, 0.017439918592572212, ..."
43,"slug: ""explainability-with-model-artifact-quic...","[0.005093783605843782, 0.004136959556490183, -..."
44,"slug: ""explainability-with-model-artifact-quic...","[-0.01352930348366499, 0.01176003273576498, -0..."
45,"slug: ""explainability-with-model-artifact-quic...","[-0.005808187648653984, -0.011234029196202755,..."
46,"slug: ""explainability-with-model-artifact-quic...","[-0.021635664626955986, 0.008698590099811554, ..."
47,"---\ntitle: ""Simple Monitoring""\nslug: ""quick-...","[-0.0051681166514754295, 0.012547116726636887,..."


In [67]:
df4 = pd.concat([df3,result], ignore_index=True)

In [68]:
df4

Unnamed: 0,text,embedding
0,"---\ntitle: ""fdl.FiddlerApi""\nslug: ""client-se...","[-0.014995344914495945, -0.0026680435985326767..."
1,"slug: ""client-setup"" _TOKEN\n)\n```\n```python...","[-0.021182812750339508, -0.004293339792639017,..."
2,"---\ntitle: ""Customer Churn Prediction""\nslug:...","[-0.00745741231366992, -0.010434732772409916, ..."
3,"slug: ""customer-churn-prediction"" /bb02793-chu...","[0.002079722471535206, -0.0047837127931416035,..."
4,"slug: ""customer-churn-prediction"" Churn-image5...","[0.0011718893656507134, 0.000443618802819401, ..."
...,...,...
307,"---\ntitle: ""Class Imbalance Monitoring Exampl...","[-0.01375657506287098, -0.00021721386292483658..."
308,"slug: ""class-imbalance-monitoring-example"" ler...","[0.0003084054624196142, -0.009778305888175964,..."
309,"slug: ""class-imbalance-monitoring-example"" # S...","[-0.019176218658685684, -0.015687033534049988,..."
310,"---\ntitle: ""Class-Imbalanced Data""\nslug: ""cl...","[-0.01668614149093628, 0.00771642941981554, -0..."


In [69]:
df4.to_csv("latest_v_23-4_tk750.csv", index=False)

In [70]:
new = pd.read_csv("latest_v_23-4_tk750.csv")
new

Unnamed: 0,text,embedding
0,"---\ntitle: ""fdl.FiddlerApi""\nslug: ""client-se...","[-0.014995344914495945, -0.0026680435985326767..."
1,"slug: ""client-setup"" _TOKEN\n)\n```\n```python...","[-0.021182812750339508, -0.004293339792639017,..."
2,"---\ntitle: ""Customer Churn Prediction""\nslug:...","[-0.00745741231366992, -0.010434732772409916, ..."
3,"slug: ""customer-churn-prediction"" /bb02793-chu...","[0.002079722471535206, -0.0047837127931416035,..."
4,"slug: ""customer-churn-prediction"" Churn-image5...","[0.0011718893656507134, 0.000443618802819401, ..."
...,...,...
307,"---\ntitle: ""Class Imbalance Monitoring Exampl...","[-0.01375657506287098, -0.00021721386292483658..."
308,"slug: ""class-imbalance-monitoring-example"" ler...","[0.0003084054624196142, -0.009778305888175964,..."
309,"slug: ""class-imbalance-monitoring-example"" # S...","[-0.019176218658685684, -0.015687033534049988,..."
310,"---\ntitle: ""Class-Imbalanced Data""\nslug: ""cl...","[-0.01668614149093628, 0.00771642941981554, -0..."


In [17]:
stray_doc[0] = "package.py for R based models" + stray_doc[0]

In [18]:
stray_doc[0]

'package.py for R based models```python\nimport fiddler as fdl\n```\n\n\n```python\nprint(fdl.__version__)\n```\n\n    1.6.2\n\n\n\n```python\nurl = \'\'\ntoken = \'\'\norg_id = \'\'\n\nclient = fdl.FiddlerApi(url=url, org_id=org_id, auth_token=token, version=2)\n```\n\n\n```python\nproject_id = \'test_r3\'\nmodel_id = \'iris\'\ndataset_id = \'iris\'\n```\n\n\n```python\n# client.create_project(project_id=project_id)\n```\n\n\n```python\nimport pandas as pd\nfrom pathlib import Path\nimport yaml\n```\n\n\n```python\ndf = pd.read_csv(\'test_R/data_r.csv\')\ndf.head()\n```\n\n\n\n\n<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border="1" class="dataframe">\n  <thead>\n    <tr style="text-align: right;">\n      <th></th>\n      <th>Sepal.Length</th>\n      <th>Sepal.Width</th>\n    

In [19]:
embeddings=[]
for i in range(len(stray_doc)):
    response = openai.Embedding.create(model=EMBEDDING_MODEL, input=stray_doc[i])
    embeddings.append(response["data"][0]["embedding"])

df = pd.DataFrame({"text": stray_doc, "embedding": embeddings})

In [20]:
df.to_csv("rmodel.csv", index=False)

### Assembling docs and caveats

In [170]:
# embeddings_path4 = "/Users/murtuzashergadwala/fiddler-chatbot/caveats.csv"
# df4 = pd.read_csv(embeddings_path4)

# embeddings_path5 = "/Users/murtuzashergadwala/fiddler-chatbot/caveats2.csv"
# df5 = pd.read_csv(embeddings_path5)

# embeddings_path6 = "/Users/murtuzashergadwala/fiddler-chatbot/caveats3.csv"
# df6 = pd.read_csv(embeddings_path6)

In [171]:
# df4['embedding'] = df4['embedding'].apply(ast.literal_eval)
# df5['embedding'] = df5['embedding'].apply(ast.literal_eval)
# df6['embedding'] = df6['embedding'].apply(ast.literal_eval)

In [172]:
# df3 = pd.concat([df, df4, df5, df6], ignore_index=True)

In [15]:
# df3

In [174]:
# df3.to_csv("chatbot_08_15_23_tk750.csv", index=False)

In [21]:
df

Unnamed: 0,text,embedding
0,package.py for R based models```python\nimport...,"[-0.009950872510671616, -0.011770655401051044,..."


In [22]:
embeddings_path2 = "/Users/murtuzashergadwala/fiddler-chatbot/chatbot_08_15_23_tk750.csv"
df2 = pd.read_csv(embeddings_path2)

In [24]:
df2.drop([0], inplace=True)
df2

Unnamed: 0,text,embedding
1,"---\ntitle: ""Release 22.11 Notes""\nslug: ""rele...","[-0.005742393434047699, 0.001501509454101324, ..."
2,"---\ntitle: ""Release 23.2 Notes""\nslug: ""relea...","[0.010167686268687248, -0.001216516480781138, ..."
3,"---\ntitle: ""Release 22.12 Notes""\nslug: ""rele...","[-0.010491670109331608, -0.0011956370435655117..."
4,"---\ntitle: ""Release 23.1 Notes""\nslug: ""2023-...","[-0.00474295811727643, 0.0016190075548365712, ..."
5,"---\ntitle: ""fdl.FiddlerApi""\nslug: ""client-se...","[-0.01489181537181139, -0.0028792002703994513,..."
...,...,...
305,"---\ntitle: ""Uploading a scikit-learn Model Ar...","[0.0031662925612181425, 0.018320860341191292, ..."
306,"---\ntitle: ""client.get_slice""\nslug: ""clientg...","[-0.0003570110129658133, -0.021801473572850227..."
307,Once you have added a model on the Fiddler pla...,"[-0.022226542234420776, 0.011369146406650543, ..."
308,Custom metrics is an upcoming feature and it i...,"[-0.017716489732265472, -0.0035160724073648453..."


In [25]:
df3 = pd.concat([df, df2], ignore_index=True)

In [26]:
df3

Unnamed: 0,text,embedding
0,package.py for R based models```python\nimport...,"[-0.009950872510671616, -0.011770655401051044,..."
1,"---\ntitle: ""Release 22.11 Notes""\nslug: ""rele...","[-0.005742393434047699, 0.001501509454101324, ..."
2,"---\ntitle: ""Release 23.2 Notes""\nslug: ""relea...","[0.010167686268687248, -0.001216516480781138, ..."
3,"---\ntitle: ""Release 22.12 Notes""\nslug: ""rele...","[-0.010491670109331608, -0.0011956370435655117..."
4,"---\ntitle: ""Release 23.1 Notes""\nslug: ""2023-...","[-0.00474295811727643, 0.0016190075548365712, ..."
...,...,...
305,"---\ntitle: ""Uploading a scikit-learn Model Ar...","[0.0031662925612181425, 0.018320860341191292, ..."
306,"---\ntitle: ""client.get_slice""\nslug: ""clientg...","[-0.0003570110129658133, -0.021801473572850227..."
307,Once you have added a model on the Fiddler pla...,"[-0.022226542234420776, 0.011369146406650543, ..."
308,Custom metrics is an upcoming feature and it i...,"[-0.017716489732265472, -0.0035160724073648453..."


In [27]:
df3.to_csv("chatbot_08_15_23_tk750.csv", index=False)

In [15]:
test = "/Users/murtuzashergadwala/fiddler-chatbot/chatbot_08_15_23_tk750.csv"
df_test = pd.read_csv(test)
df_test

Unnamed: 0,text,embedding
0,package.py for R based models```python\nimport...,"[-0.00931614637374878, -0.011871236376464367, ..."
1,"---\ntitle: ""Release 22.11 Notes""\nslug: ""rele...","[-0.005742393434047699, 0.001501509454101324, ..."
2,"---\ntitle: ""Release 23.2 Notes""\nslug: ""relea...","[0.010167686268687248, -0.001216516480781138, ..."
3,"---\ntitle: ""Release 22.12 Notes""\nslug: ""rele...","[-0.010491670109331608, -0.0011956370435655117..."
4,"---\ntitle: ""Release 23.1 Notes""\nslug: ""2023-...","[-0.00474295811727643, 0.0016190075548365712, ..."
...,...,...
305,"---\ntitle: ""Uploading a scikit-learn Model Ar...","[0.0031662925612181425, 0.018320860341191292, ..."
306,"---\ntitle: ""client.get_slice""\nslug: ""clientg...","[-0.0003570110129658133, -0.021801473572850227..."
307,Once you have added a model on the Fiddler pla...,"[-0.022226542234420776, 0.011369146406650543, ..."
308,Custom metrics is an upcoming feature and it i...,"[-0.017716489732265472, -0.0035160724073648453..."


## Testing Chatbot 

In [10]:
embeddings_path = "./chatbot_08_15_23_tk750.csv"
df = pd.read_csv(embeddings_path)
# convert embeddings from CSV str type back to list type
df['embedding'] = df['embedding'].apply(ast.literal_eval)


# search function
def strings_ranked_by_relatedness(
    query: str,
    df: pd.DataFrame,
    relatedness_fn=lambda x, y: 1 - spatial.distance.cosine(x, y),
    top_n: int = 100
):
    """Returns a list of strings and relatednesses, sorted from most related to least."""
    query_embedding_response = openai.Embedding.create(
        model=EMBEDDING_MODEL,
        input=query,
    )
    query_embedding = query_embedding_response["data"][0]["embedding"]
    strings_and_relatednesses = [
        (row["text"], relatedness_fn(query_embedding, row["embedding"]))
        for i, row in df.iterrows()
    ]
    strings_and_relatednesses.sort(key=lambda x: x[1], reverse=True)
    strings, relatednesses = zip(*strings_and_relatednesses)
    return strings[:top_n], relatednesses[:top_n], query_embedding


def query_message(
    query: str,
    df: pd.DataFrame,
    model: str,
    token_budget: int,
    introduction='You are a tool called Fiddler Chatbot and your purpose is to use the below documentation from the company Fiddler to answer the subsequent documentation questions. Also, if possible, give me the reference URLs according to the following instructions. The way to create the URLs is: if you are discussing a client method or an API reference add "https://docs.fiddler.ai/reference/" before the "slug" value of the document. If it is Guide documentation add "https://docs.fiddler.ai/docs/" before before the "slug" value of the document. Only use the value following "slug:" to create the URLs and do not use page titles for slugs. If you are using quickstart notebooks, do not generate references. Note that if a user asks about uploading events, it means the same as publishing events. If the answer cannot be found in the documentation, write "I could not find an answer."'

):
    """Return a message for GPT, with relevant source texts pulled from a dataframe."""
    strings, relatednesses, query_embed = strings_ranked_by_relatedness(query, df)
    question = f"\n\nQuestion: {query}"
    message = introduction
    for string in strings:
        next_article = string
        if (
            num_tokens(message + next_article + question, model=model)
            > token_budget
        ):
            break
        else:
            message += next_article
    return message + question, query_embed


def ask(
    query: str,
    df: pd.DataFrame = df,
    model: str = GPT_MODEL,
    token_budget: int = 4096 - 500,
    print_message: bool = False,
    temperature: int = 0,
    # chat_history=None,
    introduction='You are a tool called Fiddler Chatbot and your purpose is to use the below documentation from the company Fiddler to answer the subsequent documentation questions. Also, if possible, give me the reference URLs according to the following instructions. The way to create the URLs is: add "https://docs.fiddler.ai/docs/" before the "slug" value of the document. For any URL references that start with "doc:" or "ref:" use its value to create a URL by adding "https://docs.fiddler.ai/docs/" before that value. For reference URLs about release notes add "https://docs.fiddler.ai/changelog/" before the "slug" value of the document. Do not use page titles to create urls. Note that if a user asks about uploading events, it means the same as publishing events.  If the answer cannot be found in the documentation, write "I could not find an answer. Join our [Slack community](https://www.fiddler.ai/slackinvite) for further clarifications."'

):
    """Answers a query using GPT and a dataframe of relevant texts and embeddings."""
    
    message, query_embed = query_message(query, df=df, model=model, token_budget=token_budget, introduction = introduction)
    if print_message:
        print(message)
    messages = [
        {"role": "system", "content": "You answer questions about Fiddler documentation."},
        {"role": "user", "content": message},
    ]
    response = openai.ChatCompletion.create(
        model=model,
        messages=messages,
        temperature=temperature
    )
    response_message = response["choices"][0]["message"]["content"]
#     response_embedding_response = openai.Embedding.create(
#         model=EMBEDDING_MODEL,
#         input=response_message,
#     )
#     response_embed = response_embedding_response["data"][0]["embedding"]
   
    return response_message, message, query_embed


def ask2(
    query: str,
    df: pd.DataFrame = df,
    model: str = GPT_MODEL,
    token_budget: int = 4096 - 500,
    print_message: bool = False,
    temperature: int = 0,
    # chat_history=None,
    introduction='You are a tool called Fiddler Chatbot and your purpose is to use the below documentation from the company Fiddler to answer the subsequent documentation questions. Also, if possible, give me the reference URLs according to the following instructions. The way to create the URLs is: add "https://docs.fiddler.ai/docs/" before the "slug" value of the document. For any URL references that start with "doc:" or "ref:" use its value to create a URL by adding "https://docs.fiddler.ai/docs/" before that value. For reference URLs about release notes add "https://docs.fiddler.ai/changelog/" before the "slug" value of the document. Do not use page titles to create urls. Note that if a user asks about uploading events, it means the same as publishing events.  If the answer cannot be found in the provided context, write "I could not find an answer. Join our [Slack community](https://www.fiddler.ai/slackinvite) for further clarifications." Do not try to make up an answer if it is not present in the context.'

):
    """Answers a query using GPT and a dataframe of relevant texts and embeddings."""
    
    message, query_embed = query_message(query, df=df, model=model, token_budget=token_budget, introduction = introduction)
    if print_message:
        print(message)
    messages = [
        {"role": "system", "content": "You answer questions about Fiddler documentation."},
        {"role": "user", "content": message},
    ]
    response = openai.ChatCompletion.create(
        model=model,
        messages=messages,
        temperature=temperature
    )
    response_message = response["choices"][0]["message"]["content"]
#     response_embedding_response = openai.Embedding.create(
#         model=EMBEDDING_MODEL,
#         input=response_message,
#     )
#     response_embed = response_embedding_response["data"][0]["embedding"]
   
    return response_message, message, query_embed


In [5]:
s,r,emb = strings_ranked_by_relatedness("What all metrics do you support in LLMs", df)


In [6]:
s

('Custom metrics is an upcoming feature and it is currently not supported.',
 '---\ntitle: "Monitoring Charts"\nslug: "monitoring-charts-platform"\nhidden: false\ncreatedAt: "2023-02-23T22:56:27.756Z"\nupdatedAt: "2023-05-24T17:29:04.123Z"\n---\nFiddler AI’s monitoring charts allow you to easily track your models and ensure that they are performing optimally. For any of your models, monitoring charts for data drift, performance, data integrity, or traffic metrics can be displayed using Fiddler Dashboards.\n\n## Supported Metric Types\n\nMonitoring charts enable you to plot one of the following metric types for a given model:\n\n- [**Data Drift**](doc:data-drift-platform#what-is-being-tracked)\n  - Plot drift for up to 20 columns at once and track it using your choice of Jensen–Shannon distance (JSD) or Population Stability Index (PSI).\n- [**Performance**](doc:performance-tracking-platform#what-is-being-tracked)\n  - Available metrics are model dependent.\n- [**Data Integrity Violation

In [12]:
response_message, message, query_embed = ask("What all metrics do you support in LLMs")

In [8]:
message

'You are a tool called Fiddler Chatbot and your purpose is to use the below documentation from the company Fiddler to answer the subsequent documentation questions. Also, if possible, give me the reference URLs according to the following instructions. The way to create the URLs is: if you are discussing a client method or an API reference add "https://docs.fiddler.ai/reference/" before the "slug" value of the document. If it is Guide documentation add "https://docs.fiddler.ai/docs/" before before the "slug" value of the document. Only use the value following "slug:" to create the URLs and do not use page titles for slugs. If you are using quickstart notebooks, do not generate references. Note that if a user asks about uploading events, it means the same as publishing events. If the answer cannot be found in the documentation, write "I could not find an answer."Custom metrics is an upcoming feature and it is currently not supported.---\ntitle: "Monitoring Charts"\nslug: "monitoring-char

In [13]:
response_message

'Fiddler supports the following metrics for monitoring charts in LLMs:\n\n1. Data Drift: You can plot drift for up to 20 columns at once and track it using either Jensen–Shannon distance (JSD) or Population Stability Index (PSI). Reference: [Supported Metric Types - Data Drift](https://docs.fiddler.ai/reference/data-drift-platform#what-is-being-tracked)\n\n2. Performance: The available metrics depend on the model task. For binary classification models, supported metrics include Accuracy, True Positive Rate/Recall, False Positive Rate, Precision, F1 Score, AUROC, Binary Cross Entropy, Geometric Mean, Calibrated Threshold, Data Count, and Expected Calibration Error. Reference: [Supported Metric Types - Performance](https://docs.fiddler.ai/reference/performance-tracking-platform#what-is-being-tracked)\n\n3. Data Integrity Violations: You can plot data integrity violations for up to 20 columns and track one of the three violations at once. Reference: [Supported Metric Types - Data Integrit

In [25]:
s,r,emb = strings_ranked_by_relatedness("is my data safe?", df)
s

('slug: "data-integrity"  Rise of MLOps Monitoring_](https://www.fiddler.ai/blog/the-rise-of-mlops-monitoring)\n\n[^1]\\: _Join our [community Slack](https://www.fiddler.ai/slackinvite) to ask any questions_\n\n[block:html]\n{\n  "html": "<div class=\\"fiddler-cta\\">\\n<a class=\\"fiddler-cta-link\\" href=\\"https://www.fiddler.ai/trial?utm_source=fiddler_docs&utm_medium=referral\\"><img src=\\"https://files.readme.io/af83f1a-fiddler-docs-cta-trial.png\\" alt=\\"Fiddler Free Trial\\"></a>\\n</div>"\n}\n[/block]',
 'slug: "fraud-detection"  model\n3. Monitoring data integrity issues that could harm the model performance\n4. Investigating the features which have drifted/ compromised and analyzing them to mitigate the issue\n5. Performing a root cause analysis to identify the exact cause and fix it\n6. Diving into point explanations to identify how much the issue has an impact on a particular data point\n7. Setting up alerts to make sure the issue does not happen again\n\nWe discovered t

In [None]:
response_message, message, query_embed = ask("Why should I use Fiddler for monitoring my ML models?")

In [111]:
response_message, message, query_embed = ask("Why should I use Fiddler for monitoring my ML models?")

In [112]:
response_message

'Fiddler is the pioneer in enterprise Model Performance Management (MPM), offering a unified platform that enables Data Science, MLOps, Risk, Compliance, Analytics, and other LOB teams to monitor, explain, analyze, and improve ML deployments at enterprise scale. Fiddler allows you to obtain contextual insights at any stage of the ML lifecycle, improve predictions, increase transparency and fairness, and optimize business revenue. It provides features such as monitoring model performance, detecting data drift, explaining model behavior, analyzing model fairness, and more. By using Fiddler, you can ensure that your ML models are performing as expected and make informed decisions to improve their performance.\n\nReference URL: [https://docs.fiddler.ai/docs/](https://docs.fiddler.ai/docs/)'

In [113]:
response_message2, message2, query_embed2 = ask("How can I monitor fraud detection models?")

In [118]:
# message2

In [115]:
print(response_message2)

To monitor fraud detection models using Fiddler's AI Observability platform, you can utilize the following tools and features:

1. **Drift Detection**: Fiddler allows you to monitor drift in your fraud detection models. This includes handling class-imbalanced data, calculating feature impact, measuring feature drift, and determining prediction drift impact. You can find more information on these topics in the [Class-Imbalanced Data](https://docs.fiddler.ai/v1.3/docs/class-imbalanced-data) and [Data Drift](doc:data-drift-platform) documentation.

2. **Performance Metrics**: Fiddler provides performance metrics specifically tailored for fraud detection models. These metrics include recall (detection of non-fraudulent cases as fraud) and false positive rate (non-fraud cases labeled as fraud). Monitoring these metrics helps you assess the effectiveness of your model. 

3. **Data Integrity**: Fiddler allows you to monitor data integrity issues in your fraud detection models. This includes c

In [120]:
response_message2_2, message2_2, query_embed2_2 = ask2("How can I monitor fraud detection models?")

In [122]:
print(response_message2_2)

To monitor fraud detection models using Fiddler's AI Observability platform, you can use the following tools and features:

1. **Drift Detection**: Fiddler provides various metrics to detect drift in your fraud detection model, including class-imbalanced data, feature impact, feature drift, and prediction drift impact. You can find more information on these metrics in the [Class-Imbalanced Data](https://docs.fiddler.ai/docs/class-imbalanced-data) and [Data Drift](https://docs.fiddler.ai/docs/data-drift-platform) documentation.

2. **Performance Metrics**: Accuracy may not be the best measure of model performance for fraud detection. Fiddler recommends monitoring metrics like recall (detection of non-fraudulent cases as fraud) and false positive rate (non-fraud cases labeled as fraud). These metrics can be monitored in the Fiddler AI Observability platform. 

3. **Data Integrity**: Fiddler allows you to monitor data integrity issues in your fraud detection model. This includes checking 

In [123]:
response_message3, message3, query_embed3 = ask2("How can I upload events to Fiddler?")

In [124]:
print(response_message3)

To upload events to Fiddler, you can use the Fiddler Client's `publish_event` API. This API allows you to send traffic from your live deployed model to Fiddler in real-time. You can include inputs, outputs, target, decisions (categorical only), and metadata in the event.

Here is an example of how to use the `publish_event` API:

```python
client.publish_event(
    project_id=PROJECT_ID,
    model_id=MODEL_ID,
    event=event_data,
    event_id='event_001',
    event_timestamp=1637344470000
)
```

You need to replace `PROJECT_ID` and `MODEL_ID` with the actual IDs of your project and model. `event_data` should contain the data for the event you want to upload. The `event_id` is a unique identifier for the event, and `event_timestamp` is the timestamp for the event.

Please note that currently there isn't a way for users to directly delete events. If you need to delete events, please contact Fiddler personnel for assistance.

Reference: [Publishing Events](https://docs.fiddler.ai/docs/d

In [126]:
# message3

In [127]:
response_message4, message4, query_embed4 = ask2("What API to use to delete a project in Fiddler?")

In [129]:
print(response_message4)

To delete a project in Fiddler, you can use the `delete_project` API. This API allows you to delete a specific project by providing the project ID as a parameter. Here is an example of how to use the API:

```python
import fiddler as fdl

# Set up the Fiddler API client
client = fdl.FiddlerApi(url="https://api.fiddler.ai", org_id="your_org_id", auth_token="your_auth_token")

# Delete the project
client.delete_project(project_id="your_project_id")
```

Please note that deleting a project will permanently remove all associated datasets, models, and other project-related data. Make sure to use this API with caution as it cannot be undone.


In [134]:
df

Unnamed: 0,text,embedding
0,"---\ntitle: ""fdl.FiddlerApi""\nslug: ""client-se...","[-0.020456863567233086, -0.0040573012083768845..."
1,"---\ntitle: ""Customer Churn Prediction""\nslug:...","[-0.006977782119065523, -0.001356987631879747,..."
2,"slug: ""customer-churn-prediction"" analyze-rca-...","[-0.018147623166441917, -0.0034140129573643208..."
3,"---\ntitle: ""Fraud Detection""\nslug: ""fraud-de...","[-0.007915230467915535, -0.007179588079452515,..."
4,"slug: ""fraud-detection"" 3.png"",\n ""RCA3...","[-0.01609727181494236, -0.0010202372213825583,..."
...,...,...
200,"---\ntitle: ""Uploading a scikit-learn Model Ar...","[0.0031662925612181425, 0.018320860341191292, ..."
201,"---\ntitle: ""client.get_slice""\nslug: ""clientg...","[-0.0003327192389406264, -0.021783549338579178..."
202,Once you have added a model on the Fiddler pla...,"[-0.022226542234420776, 0.011369146406650543, ..."
203,Custom metrics is an upcoming feature and it i...,"[-0.017716489732265472, -0.0035160724073648453..."


In [130]:
message4

'You are a tool called Fiddler Chatbot and your purpose is to use the below documentation from the company Fiddler to answer the subsequent documentation questions. Also, if possible, give me the reference URLs according to the following instructions. The way to create the URLs is: add "https://docs.fiddler.ai/docs/" before the "slug" value of the document. For any URL references that start with "doc:" or "ref:" use its value to create a URL by adding "https://docs.fiddler.ai/docs/" before that value. Do not use page titles to create urls. Note that if a user asks about uploading events, it means the same as publishing events. If the answer cannot be found in the documentation, write "I could not find an answer."Re-uploading in Fiddler essentially means having to delete what was uploaded or ingested earlier, making the updates you want to make, and then following the same steps as before for the specific thing you are looking to upload. So for example, if you want make changes to the b

In [136]:
response_message5, message5, query_embed5 = ask2("How to delete a project in Fiddler?")

In [139]:
message5

'You are a tool called Fiddler Chatbot and your purpose is to use the below documentation from the company Fiddler to answer the subsequent documentation questions. Also, if possible, give me the reference URLs according to the following instructions. The way to create the URLs is: add "https://docs.fiddler.ai/docs/" before the "slug" value of the document. For any URL references that start with "doc:" or "ref:" use its value to create a URL by adding "https://docs.fiddler.ai/docs/" before that value. Do not use page titles to create urls. Note that if a user asks about uploading events, it means the same as publishing events. If the answer cannot be found in the documentation, write "I could not find an answer."Re-uploading in Fiddler essentially means having to delete what was uploaded or ingested earlier, making the updates you want to make, and then following the same steps as before for the specific thing you are looking to upload. So for example, if you want make changes to the b

In [138]:
print(response_message5)

To delete a project in Fiddler, follow these steps:

1. Go to the Fiddler UI and click on the "Projects" tab.
2. Find the project you want to delete and click on it to open the project details.
3. In the project details page, click on the three-dot menu icon in the top-right corner.
4. From the dropdown menu, select "Delete Project".
5. A confirmation dialog will appear asking you to confirm the deletion. Click "Delete" to proceed.

Please note that deleting a project will permanently remove all associated models, datasets, and other project-related data. Make sure to double-check before deleting a project as this action cannot be undone.

Reference URL: [https://docs.fiddler.ai/docs/project-structure#projects](https://docs.fiddler.ai/docs/project-structure#projects)


In [140]:
df

Unnamed: 0,text,embedding
0,"---\ntitle: ""fdl.FiddlerApi""\nslug: ""client-se...","[-0.020456863567233086, -0.0040573012083768845..."
1,"---\ntitle: ""Customer Churn Prediction""\nslug:...","[-0.006977782119065523, -0.001356987631879747,..."
2,"slug: ""customer-churn-prediction"" analyze-rca-...","[-0.018147623166441917, -0.0034140129573643208..."
3,"---\ntitle: ""Fraud Detection""\nslug: ""fraud-de...","[-0.007915230467915535, -0.007179588079452515,..."
4,"slug: ""fraud-detection"" 3.png"",\n ""RCA3...","[-0.01609727181494236, -0.0010202372213825583,..."
...,...,...
200,"---\ntitle: ""Uploading a scikit-learn Model Ar...","[0.0031662925612181425, 0.018320860341191292, ..."
201,"---\ntitle: ""client.get_slice""\nslug: ""clientg...","[-0.0003327192389406264, -0.021783549338579178..."
202,Once you have added a model on the Fiddler pla...,"[-0.022226542234420776, 0.011369146406650543, ..."
203,Custom metrics is an upcoming feature and it i...,"[-0.017716489732265472, -0.0035160724073648453..."


In [177]:
s,r,emb = strings_ranked_by_relatedness("How to delete a project in Fiddler?", df)

In [182]:
s,r,emb = strings_ranked_by_relatedness("How to use fiddler client to delete a project?", df)

In [183]:
s

("Re-uploading in Fiddler essentially means having to delete what was uploaded or ingested earlier, making the updates you want to make, and then following the same steps as before for the specific thing you are looking to upload. So for example, if you want make changes to the baseline dataset you uploaded, you will have to delete the dataset and then make modifications for the datasetinfo object or the dataset itself and then upload the dataset again. As for events, currently there isn't a way for the user to directly delete events. Please contact Fiddler personnell for the same. ",
 'slug: "authorizing-the-client" .ini\nclient = fdl.FiddlerApi()\n```',
 '---\ntitle: "About the Fiddler Client"\nslug: "about-the-fiddler-client"\nhidden: false\ncreatedAt: "2022-05-23T15:59:05.747Z"\nupdatedAt: "2022-05-23T15:59:05.747Z"\n---\nThe Fiddler Client contains many useful methods for sending and receiving data to and from the Fiddler platform.\n\nFiddler provides a Python Client that allows y

In [179]:
response_message5_2, message5_2, query_embed5_2 = ask2("How to delete a project in Fiddler?")

In [181]:
message5_2

'You are a tool called Fiddler Chatbot and your purpose is to use the below documentation from the company Fiddler to answer the subsequent documentation questions. Also, if possible, give me the reference URLs according to the following instructions. The way to create the URLs is: add "https://docs.fiddler.ai/docs/" before the "slug" value of the document. For any URL references that start with "doc:" or "ref:" use its value to create a URL by adding "https://docs.fiddler.ai/docs/" before that value. Do not use page titles to create urls. Note that if a user asks about uploading events, it means the same as publishing events.  If the answer cannot be found in the documentation, write "I could not find an answer."Re-uploading in Fiddler essentially means having to delete what was uploaded or ingested earlier, making the updates you want to make, and then following the same steps as before for the specific thing you are looking to upload. So for example, if you want make changes to the 

In [180]:
response_message5_2

'To delete a project in Fiddler, follow these steps:\n\n1. Go to the Projects page in Fiddler.\n2. Find the project you want to delete.\n3. Click on the three-dot menu icon (⋮) next to the project name.\n4. Select the "Delete" option from the menu.\n5. Confirm the deletion when prompted.\n\nPlease note that deleting a project will permanently remove all associated models, datasets, and other project-related data. Make sure to double-check before deleting a project as this action cannot be undone.\n\nReference URL: [https://docs.fiddler.ai/docs/project-structure#projects](https://docs.fiddler.ai/docs/project-structure#projects)'

In [201]:
df

Unnamed: 0,text,embedding
0,"---\ntitle: ""fdl.FiddlerApi""\nslug: ""client-se...","[-0.01489181537181139, -0.0028792002703994513,..."
1,"slug: ""client-setup"" _TOKEN\n)\n```\n```python...","[-0.021182812750339508, -0.004293339792639017,..."
2,"---\ntitle: ""Customer Churn Prediction""\nslug:...","[-0.007564195431768894, -0.01073946338146925, ..."
3,"slug: ""customer-churn-prediction"" /bb02793-chu...","[0.0021712256129831076, -0.004808966536074877,..."
4,"slug: ""customer-churn-prediction"" Churn-image5...","[0.001282965182326734, 6.48050699965097e-05, 0..."
...,...,...
300,"---\ntitle: ""Uploading a scikit-learn Model Ar...","[0.0031662925612181425, 0.018320860341191292, ..."
301,"---\ntitle: ""client.get_slice""\nslug: ""clientg...","[-0.0003570110129658133, -0.021801473572850227..."
302,Once you have added a model on the Fiddler pla...,"[-0.022226542234420776, 0.011369146406650543, ..."
303,Custom metrics is an upcoming feature and it i...,"[-0.017716489732265472, -0.0035160724073648453..."


In [199]:
response_message5_2, message5_2, query_embed5_2 = ask2("How to delete a project?")

In [200]:
response_message5_2

"To delete a project in Fiddler, you can use the `client.delete_project` method. Here is an example of how to use it:\n\n```python\nPROJECT_ID = 'example_project'\n\nclient.delete_project(\n    project_id=PROJECT_ID\n)\n```\n\nPlease note that you cannot delete a project without deleting the datasets and models associated with that project. For more information, you can refer to the [client.delete_project documentation](https://docs.fiddler.ai/docs/clientdelete_project)."

In [198]:
df

Unnamed: 0,text,embedding
0,"---\ntitle: ""fdl.FiddlerApi""\nslug: ""client-se...","[-0.01489181537181139, -0.0028792002703994513,..."
1,"slug: ""client-setup"" _TOKEN\n)\n```\n```python...","[-0.021182812750339508, -0.004293339792639017,..."
2,"---\ntitle: ""Customer Churn Prediction""\nslug:...","[-0.007564195431768894, -0.01073946338146925, ..."
3,"slug: ""customer-churn-prediction"" /bb02793-chu...","[0.0021712256129831076, -0.004808966536074877,..."
4,"slug: ""customer-churn-prediction"" Churn-image5...","[0.001282965182326734, 6.48050699965097e-05, 0..."
...,...,...
300,"---\ntitle: ""Uploading a scikit-learn Model Ar...","[0.0031662925612181425, 0.018320860341191292, ..."
301,"---\ntitle: ""client.get_slice""\nslug: ""clientg...","[-0.0003570110129658133, -0.021801473572850227..."
302,Once you have added a model on the Fiddler pla...,"[-0.022226542234420776, 0.011369146406650543, ..."
303,Custom metrics is an upcoming feature and it i...,"[-0.017716489732265472, -0.0035160724073648453..."


In [196]:
s,r,emb = strings_ranked_by_relatedness("How to delete project?", df)

In [202]:
# s

In [210]:
r = ask2("Does Fiddler validate models?")[0]
print(r)

Yes, Fiddler provides the capability to validate models. You can view your model's performance on its training and test sets for quick validation prior to deployment. For more details, you can refer to the [Evaluation](https://docs.fiddler.ai/docs/evaluation-ui) documentation.


In [206]:
# df

In [207]:
s,r,emb = strings_ranked_by_relatedness("When should I not use Fiddler?", df, top_n=8)

In [209]:
# s

### Testing if string chunking is lossless

In [None]:
test_string = df[2:3]["text"].values[0]
num_tokens(test_string)

In [None]:
chunks_test = chunked_string(test_string,EMBEDDING_MODEL,1000)
recovered = ''.join(chunks_test)

In [None]:
recovered == test_string

In [None]:
new_chunks = []
chunk_embedding = []
for i , row in df.iterrows():
    if num_tokens(row["text"]) > 2000:
        df.drop(index= i, inplace=True)
        chunked_list = chunked_string(row["text"])
        for chunk in chunked_list:
            response = openai.Embedding.create(model=EMBEDDING_MODEL, input=chunk)
            chunk_embedding.append(response["data"][0]["embedding"])
            new_chunks.append(chunk)
            
            
        
        

In [None]:
embeddings=[]
for i in range(len(modified_docs)):
    response = openai.Embedding.create(model=EMBEDDING_MODEL, input=modified_docs[i])
    embeddings.append(response["data"][0]["embedding"])