## Follow these steps to reload the vector index, currently running in DataStax
* Download files from readme - go to configurations - project management and export docs.  You will get a download of ALL versions of the Fiddler documentation - you only need 2x.x (latest).  Copy this into /documentation_data/2x.x
* Make sure you copy the ChangelogPosts (release notes) into the 2x.x directory you plan to process
* Copy the latest [quickstart notebooks](https://github.com/fiddler-labs/fiddler-examples) from the 2x.x folder
* Generate the markdown version of the .ipynb files and add it to the quickstart pages with the script below
* You will have to clean up hidden docs that are in 2x.x - this is done further down in this notebook
* You will need to add the caveats from old 23.x docs into new ones
* After this preprocessing is done, you will need to run "query_cassandra.ipynb" notebook which has a cell to TRUNCATE the "fiddler_doc_snippets_openai" table.  This needs to be done before reloading it.
* Reloading the vector index table is done via the "loader_cassandra_vector_index.ipynb" notebook.

### Imports

In [None]:
!pip -q install tiktoken

In [None]:
import os
import pandas as pd
import tiktoken
import openai 
import re
from scipy import spatial 
import ast
import feedparser
from bs4 import BeautifulSoup
import requests

### Set State

In [None]:
# Embedding model name: default tokenizer model for chunked_string() and the
# model passed to openai.Embedding.create() in generate_embeddings().
EMBEDDING_MODEL = "text-embedding-ada-002"  # OpenAI's best embeddings as of Apr 2023
# Chat model name: in this notebook it only selects the tiktoken encoding used
# by num_tokens().
GPT_MODEL = "gpt-3.5-turbo"
# Documentation release being processed; interpolated into file paths further down.
release_num = '24.4'

In [None]:
def num_tokens(text: str, model: str = GPT_MODEL) -> int:
    """Count the tokens `text` encodes to under the tiktoken encoding for `model`."""
    token_ids = tiktoken.encoding_for_model(model).encode(text)
    return len(token_ids)

def chunked_string(
    string: str,
    model: str = EMBEDDING_MODEL,
    max_tokens: int = 2000,
) -> "list[str]":
    """Split a string into consecutive pieces of at most `max_tokens` tokens.

    The string is tokenized with the tiktoken encoding for `model`, the token
    list is cut into slices of `max_tokens`, and every slice is decoded back
    to text. Nothing is dropped: the pieces cover the whole token sequence.

    Args:
        string: Text to split.
        model: Model name used to look up the tiktoken encoding.
        max_tokens: Maximum number of tokens per piece; must be positive.

    Returns:
        The decoded pieces in their original order (an empty list for an
        empty string).

    Raises:
        ValueError: If `max_tokens` is not a positive integer.
    """
    if max_tokens <= 0:
        raise ValueError(f"max_tokens must be positive, got {max_tokens}")

    encoding = tiktoken.encoding_for_model(model)
    token_ids = encoding.encode(string)
    # NOTE(review): slices are cut purely by token count, so a boundary may
    # fall in the middle of a word or multi-byte character - confirm this is
    # acceptable for the downstream index.
    return [
        encoding.decode(token_ids[start:start + max_tokens])
        for start in range(0, len(token_ids), max_tokens)
    ]

## Append Notebook markdowns to quick starts

- Download [Quickstart guides](https://github.com/fiddler-labs/fiddler-examples) v24.x into *documentation_data/notebooks-ipynb* folder: 
- Running the NBConvert in the next step will update the md-notebooks folder with markdown version of all the quickstarts from the *notebooks-ipynb* directory 

In [None]:
!jupyter nbconvert --output-dir='documentation_data/md-notebooks' documentation_data/notebooks-ipynb/*.ipynb --to markdown

The next cell takes the markdown version of each notebook and appends it to the matching quickstart page in the documentation directory.

In [None]:
# Append the nbconvert-generated markdown of each quickstart notebook to the
# documentation page that references it (via a "Fiddler_Quickstart_<name>" token).
quickstart_pages_dir = '/Users/anushrav/Projects/fiddler-chatbot/documentation_data/v24.4/QuickStart Notebooks'  # make sure this path points to the QuickStart pages (.md) of your docs export
md_notebooks_dir = '/Users/anushrav/Projects/fiddler-chatbot/documentation_data/md-notebooks'  # output directory of the nbconvert step

for root, dirs, files in os.walk(quickstart_pages_dir):
    for name in files:
        if not name.endswith('.md'):
            continue
        path = os.path.join(root, name)
        with open(path, 'r', encoding='utf-8') as f:
            file_str = f.read()

        # Only the first notebook reference on a page is used.
        ipynb_links = re.search(r'\bFiddler_Quickstart_\w+', file_str)
        print(ipynb_links)
        if not ipynb_links:
            continue

        notebook_md_path = os.path.join(md_notebooks_dir, ipynb_links.group() + '.md')
        if not os.path.exists(notebook_md_path):
            # A page can reference a notebook that was not downloaded/converted;
            # report it and keep going instead of aborting the whole run.
            print("Notebook markdown not found, skipping:", notebook_md_path)
            continue
        with open(notebook_md_path, encoding='utf-8') as l:
            QS = l.read()

        # Re-running this cell must not append the same notebook a second time.
        if QS in file_str:
            print("Already appended, skipping:", path)
            continue

        with open(path, 'a', encoding='utf-8') as f:
            f.write(QS)
        print(ipynb_links.group())

## Creating list of chunked_docs from downloaded documentation 

In [None]:
# Change the base path to where your downloaded folder is; the version folder
# follows release_num so it stays in sync with the output files written later.
docs_dir = f'documentation_data/v{release_num}'

# Collect the full text of every markdown page under docs_dir (one entry per file).
chunked_doc = []
for root, dirs, files in os.walk(docs_dir):
    for name in files:
        if name.endswith('.md'):
            path = os.path.join(root, name)
            # Explicit encoding so the result does not depend on the platform default.
            with open(path, 'r', encoding='utf-8') as f:
                chunked_doc.append(f.read())

len(chunked_doc)

In [None]:
# Find and remove hidden pages.
# A page is treated as hidden when the first "hidden: <word>" occurrence in its
# text reads "true".
pattern = r'hidden:\s*(\w+)'


def _is_hidden(doc):
    """Return True when the first `hidden:` marker found in `doc` is set to "true"."""
    match = re.search(pattern, doc)
    return match is not None and match.group(1) == "true"


# Build a new list instead of calling chunked_doc.remove() inside a loop over
# chunked_doc: removing while iterating shifts the remaining items, so the
# element right after each removed page was never checked.
chunked_doc = [doc for doc in chunked_doc if not _is_hidden(doc)]

len(chunked_doc)

### Crawl the blog and resources content and append it to chunked_doc list

In [None]:
def crawl_rss_feed(rss_url, timeout=30):
    """Crawl the blog posts listed in an RSS feed and append their text to chunked_doc.

    For every feed entry the linked page is downloaded, the
    `div.blog-post_content-wrapper` element is located, and the text of its
    `<p>` elements is joined into one string. The result is appended to the
    module-level `chunked_doc` list as "BlogLink:<link> Content: <text>".

    Args:
        rss_url: URL of the RSS feed to parse.
        timeout: Seconds to wait for each article request before giving up.

    Returns:
        None. Entries that cannot be fetched, or whose page lacks the expected
        div, are reported with a printed message and skipped.
    """
    # Parse the RSS feed
    feed = feedparser.parse(rss_url)

    print("Number of Blogs:", len(feed.entries))

    for entry in feed.entries:
        article_url = entry.link

        # Without a timeout a single stalled connection would hang the whole
        # crawl; a failed request skips that article instead of aborting.
        try:
            response = requests.get(article_url, timeout=timeout)
            response.raise_for_status()
        except requests.RequestException as err:
            print("Failed to fetch", article_url, "-", err)
            continue
        html_content = response.content.decode('utf-8', 'ignore')

        # Use BeautifulSoup to parse the HTML and extract the article body
        soup = BeautifulSoup(html_content, 'html.parser')
        div_content = soup.find('div', class_='blog-post_content-wrapper')  # You may need to adjust this based on the HTML structure

        if div_content is None:
            print("Div not found.")
            continue

        print("Title:", entry.title)
        print("Link:", entry.link)
        # Every paragraph is followed by a single space (including the last one).
        itemtext = ''.join(item.text + ' ' for item in div_content.select('p'))
        chunked_doc.append("BlogLink:" + entry.link + " Content: " + itemtext)

# RSS feed listing the blog posts to crawl
rss_feed_url = 'https://www.fiddler.ai/blog/rss.xml'
crawl_rss_feed(rss_feed_url)

In [None]:
def crawl_rss_feed(rss_url, timeout=30):
    """Crawl the resource pages listed in an RSS feed and append their text to chunked_doc.

    This redefines the blog crawler of the same name from the previous cell.
    For every feed entry the linked page is downloaded, the
    `div.resources-copy` element is located, and the text of its `<p>`
    elements is joined into one string. The result is appended to the
    module-level `chunked_doc` list as "ResourceLink:<link> Content: <text>".

    Args:
        rss_url: URL of the RSS feed to parse.
        timeout: Seconds to wait for each page request before giving up.

    Returns:
        None. Entries that cannot be fetched, or whose page lacks the expected
        div, are reported with a printed message and skipped.
    """
    # Parse the RSS feed
    feed = feedparser.parse(rss_url)

    print("Number of Resources:", len(feed.entries))

    for entry in feed.entries:
        article_url = entry.link

        # Without a timeout a single stalled connection would hang the whole
        # crawl; a failed request skips that page instead of aborting.
        try:
            response = requests.get(article_url, timeout=timeout)
            response.raise_for_status()
        except requests.RequestException as err:
            print("Failed to fetch", article_url, "-", err)
            continue
        html_content = response.content.decode('utf-8', 'ignore')

        # Use BeautifulSoup to parse the HTML and extract the page body
        soup = BeautifulSoup(html_content, 'html.parser')
        div_content = soup.find('div', class_='resources-copy')  # You may need to adjust this based on the HTML structure

        if div_content is None:
            print("Div not found.")
            continue

        print("Title:", entry.title)
        print("Link:", entry.link)
        # Every paragraph is followed by a single space (including the last one).
        itemtext = ''.join(item.text + ' ' for item in div_content.select('p'))
        chunked_doc.append("ResourceLink:" + entry.link + " Content: " + itemtext)

# RSS feed listing the resource pages to crawl. Uses https like the blog feed;
# resource_pattern further down also expects https:// links.
rss_feed_url = 'https://www.fiddler.ai/resources/rss.xml'
crawl_rss_feed(rss_feed_url)

In [None]:
len(chunked_doc)

In [None]:
# Patterns that identify where a document came from, so that the identifier can
# be prefixed onto the follow-up chunks of a split document:
# - slug_pattern matches a `slug: "<value>"` entry in a documentation page
# - blog_pattern matches the "BlogLink:" marker written by the blog crawler
# - resource_pattern matches the "ResourceLink:" marker written by the resource crawler
# Both link patterns only match https://www.fiddler.ai/... URLs.
slug_pattern = r'slug:\s*"(.*?)"'
blog_pattern = r'BlogLink:https://www\.fiddler\.ai/blog/([\w-]+)'
resource_pattern = r'ResourceLink:https://www\.fiddler\.ai/resources/([\w-]+)'

In [None]:
# Split every document into chunks of at most MAX_CHUNK_TOKENS tokens.
# Only the first chunk of a split document naturally contains its slug / link,
# so that identifier is copied to the front of every later chunk.
MAX_CHUNK_TOKENS = 750

# Checked in this order; resource links are handled like blog links so that
# crawled resource pages keep their source on every chunk too.
source_patterns = (slug_pattern, blog_pattern, resource_pattern)

token_lim_doc = []
for doc in chunked_doc:
    if num_tokens(doc) <= MAX_CHUNK_TOKENS:
        token_lim_doc.append(doc)
        continue

    chunked_list = chunked_string(doc, max_tokens=MAX_CHUNK_TOKENS)

    for source_pattern in source_patterns:
        source = re.search(source_pattern, chunked_list[0])
        if source:
            prefix = source.group(0)
            # NOTE(review): the prefix is added after splitting, so these
            # chunks can end up slightly above MAX_CHUNK_TOKENS.
            chunked_list[1:] = [prefix + ' ' + chunk for chunk in chunked_list[1:]]

    token_lim_doc += chunked_list

In [None]:
# Read in the caveats of the previous version, keep only the rows whose text
# does not mention "slug", and write them out for the current release.
old_df = pd.read_csv('documentation_data/v24.3/caveats.csv')
# reset_index gives the filtered frame a fresh 0..n-1 index (and a frame of its
# own rather than a slice of old_df).
caveats_df = old_df[~old_df['text'].str.contains('slug', case=False)].reset_index(drop=True)
#caveats_df = caveats_df.drop(columns=['embedding'])
# The file is written before the glossary caveats below are added, so those
# rows are not part of caveats.csv.
# NOTE(review): presumably intentional, so they are not duplicated when the next
# release reads this file - confirm.
caveats_df.to_csv(f'documentation_data/v{release_num}/caveats.csv', index=False)

# Glossary-style caveats that are added on every run.
extra_caveats = pd.DataFrame({'text': [
    'LLM means large language model.  A large language model (LLM) is a type of artificial intelligence (AI) algorithm that uses deep learning techniques and massively large data sets to understand, summarize, generate and predict new content.',
    'The term generative AI, or GenAI, also is closely connected with LLMs, which are, in fact, a type of generative AI that has been specifically architected to help generate text-based content.',
    'FM, or FMs, means Foundation Models.  Foundation Models are the same as large language models.',
]})
# Append with concat rather than `caveats_df.loc[len(caveats_df.index)] = ...`:
# on a filtered frame the label len(index) can already exist, in which case the
# assignment overwrites that row instead of adding a new one.
caveats_df = pd.concat([caveats_df, extra_caveats], ignore_index=True)

caveats_df

In [None]:
# Build the feed for the vector index: the token-limited document chunks
# followed by the caveats, written as a CSV for the current release.
# No embeddings are computed here; the frame carries the text rows only.
docs_df = pd.DataFrame({"text": token_lim_doc})
df = pd.concat([docs_df, caveats_df], ignore_index=True)
df.to_csv(f'documentation_data/vector_index_feed_{release_num}.csv', index=False)
df

### to clean html text [optional] 

In [None]:
#!pip install beautifulsoup4
from bs4 import BeautifulSoup

# Standalone example: parse a small HTML snippet and print its text content
# as returned by BeautifulSoup's get_text().
html_text = "<p>This is <b>HTML</b> text.</p>"
soup = BeautifulSoup(html_text, 'html.parser')
clean_text = soup.get_text()
print(clean_text)

### Example of adding a caveat to already existing docs

In [None]:
## example 1
Caveats = """Currently, only the following fields in [fdl.ModelInfo()](ref:fdlmodelinfo) can be updated:
> 
> - `custom_explanation_names`
> - `preferred_explanation_method`
> - `display_name`
> - `description` """

# NOTE: this rebinds chunked_doc to a one-element list holding only the caveat
# above, replacing whatever list earlier cells built under that name.
chunked_doc = [Caveats]

In [None]:
## example 2
# A single-sentence-style caveat about what cannot be changed on a model info object.
Caveats = "Once you have added a model on the Fiddler platform using a specific model info object, that is fdl.ModelInfo, you cannot modify aspects such as features, inputs, outputs, model task etc. specified in the model info object. Currently, if you want to change fundamental details about a modelinfo object, then it is advised to create/add a new model with a new modelinfo object."
# NOTE: this rebinds chunked_doc to a one-element list holding only this caveat,
# replacing whatever list earlier cells built under that name.
chunked_doc = [Caveats]

#### Generate Embeddings (You can skip this since we generate the embedding vectors in the loader_cassandra notebook)

In [None]:
# Prefer the OPENAI_API_KEY environment variable so the key is never saved in
# the notebook. The placeholder is only a fallback: set the variable (or replace
# the placeholder locally) before calling the API.
openai.api_key = os.environ.get('OPENAI_API_KEY', 'ADD YOUR OPENAI KEY')

In [None]:
def generate_embeddings(chunked_doc, tiktoken_encoding = "cl100k_base", token_limit = 8000):
    """Request an embedding for every document that fits within the token limit.

    Args:
        chunked_doc: Sequence of document strings to embed.
        tiktoken_encoding: Name of the tiktoken encoding used to count tokens.
        token_limit: Documents with this many tokens or more are not sent to
            the API.

    Returns:
        A list with exactly one entry per input document, in input order: the
        embedding returned by `openai.Embedding.create` for EMBEDDING_MODEL,
        or None for a document that was skipped for being too long. Keeping
        the placeholder keeps the result aligned with `chunked_doc`, so both
        can be used as columns of the same DataFrame.
    """
    encoding = tiktoken.get_encoding(tiktoken_encoding)
    embeddings = []
    for position, doc in enumerate(chunked_doc):
        token_count = len(encoding.encode(doc))
        if token_count >= token_limit:
            # Report the skip instead of silently dropping the document.
            print(f"Skipping document {position}: {token_count} tokens (limit {token_limit})")
            embeddings.append(None)
            continue
        response = openai.Embedding.create(model=EMBEDDING_MODEL, input=doc)
        embeddings.append(response["data"][0]["embedding"])
    return embeddings

In [None]:
# Embed the current contents of chunked_doc and pair each text with its vector.
# Both columns must have the same length for the DataFrame to be built, i.e.
# generate_embeddings has to return one item per entry of chunked_doc.
embeddings = generate_embeddings(chunked_doc)
df = pd.DataFrame({"text": chunked_doc, "embedding": embeddings})

In [None]:
df

### finding urls

In [None]:
text = """Fiddler's role in the ML lifecycle is to monitor, explain, analyze, and improve ML deployments at enterprise scale.
It provides contextual insights at any stage of the ML lifecycle, helps improve predictions, increases transparency and fairness, 
and optimizes business revenue. 
Reference: [Fiddler Simple Monitoring Quick Start Guide](https://docs.fiddler.ai/docs/Fiddler_Quickstart_Simple_Monitoring)"""

In [None]:
# url_pattern is not defined in any earlier cell of this notebook, so define it
# here. It matches an http(s) URL up to the next whitespace or closing
# bracket, so a markdown link "[label](url)" yields just the url.
url_pattern = r'https?://[^\s)\]>]+'
urls = re.findall(url_pattern, text)

In [None]:
urls