In [2]:
import os
from dotenv import load_dotenv

load_dotenv()

True

## Get content

Fetch the content of the VS Code docs and save it in a JSON format that can be used for RAG models.

In [28]:
# use the github api to recursively fetch all md files from a GH repo

import requests

REPO_OWNER = 'microsoft'
REPO_NAME = 'vscode-docs'

def get_markdown_files_from_github(path=''):
    """Recursively collect the paths of all `.md` files under `path` in the repo.

    Uses the GitHub contents API for REPO_OWNER/REPO_NAME and descends into
    every directory entry it finds.

    Args:
        path: Repository-relative directory (or file) to start from; '' is the repo root.

    Returns:
        A list of repository-relative paths of the Markdown files found.

    Raises:
        Exception: If the GitHub API answers with a non-200 status code.
    """
    url = f'https://api.github.com/repos/{REPO_OWNER}/{REPO_NAME}/contents/{path}'

    # Only send the Authorization header when a token is configured; otherwise the
    # header would literally read "token None".
    token = os.getenv("GH_TOKEN")
    headers = {'Authorization': f'token {token}'} if token else {}

    # A timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.get(url, headers=headers, timeout=30)

    if response.status_code != 200:
        raise Exception(f"Error fetching repo contents: {response.status_code}")

    entries = response.json()
    # The contents API returns a single object instead of a list when `path`
    # points at a file; normalise so the loop below handles both shapes.
    if isinstance(entries, dict):
        entries = [entries]

    markdown_files = []
    for entry in entries:
        if entry['type'] == 'dir':
            markdown_files.extend(get_markdown_files_from_github(entry['path']))
        elif entry['type'] == 'file' and entry['name'].endswith('.md'):
            markdown_files.append(entry['path'])

    return markdown_files

# test: list the Markdown files under docs/datascience (makes live GitHub API calls,
# one per directory visited) and print the resulting list of paths
ds_md_files = get_markdown_files_from_github("docs/datascience")
print(ds_md_files)

['docs/datascience/azure-machine-learning.md',
 'docs/datascience/data-science-tutorial.md',
 'docs/datascience/data-wrangler-quick-start.md',
 'docs/datascience/data-wrangler.md',
 'docs/datascience/jupyter-kernel-management.md',
 'docs/datascience/jupyter-notebooks.md',
 'docs/datascience/notebooks-web.md',
 'docs/datascience/overview.md',
 'docs/datascience/python-interactive.md',
 'docs/datascience/pytorch-support.md']

Each entry returned by the GitHub contents API has the following format:
```python
 {'name': 'data-science-tutorial.md',
  'path': 'docs/datascience/data-science-tutorial.md',
  'sha': '96f05758760fcad24265833c60d5537c0beec61c',
  'size': 20982,
  'url': 'https://api.github.com/repos/microsoft/vscode-docs/contents/docs/datascience/data-science-tutorial.md?ref=main',
  'html_url': 'https://github.com/microsoft/vscode-docs/blob/main/docs/datascience/data-science-tutorial.md',
  'git_url': 'https://api.github.com/repos/microsoft/vscode-docs/git/blobs/96f05758760fcad24265833c60d5537c0beec61c',
  'download_url': 'https://raw.githubusercontent.com/microsoft/vscode-docs/main/docs/datascience/data-science-tutorial.md',
  'type': 'file',
  '_links': {'self': 'https://api.github.com/repos/microsoft/vscode-docs/contents/docs/datascience/data-science-tutorial.md?ref=main',
   'git': 'https://api.github.com/repos/microsoft/vscode-docs/git/blobs/96f05758760fcad24265833c60d5537c0beec61c',
   'html': 'https://github.com/microsoft/vscode-docs/blob/main/docs/datascience/data-science-tutorial.md'}},
```

In [10]:
import base64

# use github api to fetch the content of a file
def get_file_content(file_path, include_body=False):
    """Fetch one Markdown file from the repo and extract its metadata.

    Args:
        file_path: Repository-relative path of the Markdown file.
        include_body: When False (default) 'content' holds only the
            MetaDescription value, which keeps embedding requests small
            (the free tier rate limit is easily exceeded). When True,
            'content' is the description followed by the page body.

    Returns:
        A dict with the keys 'title', 'content', 'url' and 'area'.

    Raises:
        Exception: If the GitHub API answers with a non-200 status code.
    """
    url = f'https://api.github.com/repos/{REPO_OWNER}/{REPO_NAME}/contents/{file_path}'

    # Only send the Authorization header when a token is configured.
    token = os.getenv("GH_TOKEN")
    headers = {'Authorization': f'token {token}'} if token else {}

    # A timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.get(url, headers=headers, timeout=30)

    if response.status_code != 200:
        raise Exception(f"Error fetching file content: {response.status_code}")

    # The contents API delivers the file body base64-encoded in the 'content' field.
    file_content = base64.b64decode(response.json()['content']).decode('utf-8')
    title, area, description, body = _parse_doc_markdown(file_content)

    if not title:
        # No heading in the file: fall back to the file name so callers always
        # get a non-empty title instead of a StopIteration.
        title = os.path.splitext(os.path.basename(file_path))[0]

    content = f"{description}\n\n{body}" if include_body else description

    return {
        'title': title,
        'content': content,
        'url': f'https://github.com/{REPO_OWNER}/{REPO_NAME}/blob/main/{file_path}',
        'area': area
    }


def _parse_doc_markdown(markdown_text):
    """Split raw Markdown text into (title, area, description, body).

    title is the text of the first line starting with '#', area/description are
    the values of the first 'Area:' / 'MetaDescription:' lines (case-insensitive,
    '' when absent), and body is everything after the title line.
    """
    lines = markdown_text.splitlines()

    def first_value(key):
        # Value of the first "key: value" line, or '' when no such line exists.
        prefix = f'{key}:'
        for line in lines:
            if line.lower().startswith(prefix):
                return line.split(':', 1)[1].strip()
        return ''

    title = ''
    body = markdown_text.strip()
    for index, line in enumerate(lines):
        if line.startswith('#'):
            # Strip only the leading heading markers so titles that themselves
            # contain '#' (e.g. "C# ...") stay intact.
            title = line.lstrip('#').strip()
            # Take the body by position rather than by searching for the title
            # text, which may also occur earlier in the file.
            body = '\n'.join(lines[index + 1:]).strip()
            break

    return title, first_value('area'), first_value('metadescription'), body
    
# test: fetch and parse a single doc page (makes a live GitHub API call);
# being the last expression of the cell, the returned dict is shown as the cell output
get_file_content("docs/datascience/data-science-tutorial.md")

{'title': 'Data Science in VS Code tutorial',
 'content': 'Python data science tutorial demonstrating the use of common data science and machine learning libraries with Visual Studio code Jupyter Notebook support.',
 'url': 'https://github.com/microsoft/vscode-docs/blob/main/docs/datascience/data-science-tutorial.md',
 'area': 'datascience'}

## Create embeddings and add them to the JSON

In [14]:
from azure.ai.inference import EmbeddingsClient
from azure.core.credentials import AzureKeyCredential

In [15]:
def generate_embedding(text):
    """Request embeddings for `text` from the `text-embedding-3-small` model.

    A new EmbeddingsClient is built on every call, authenticated with the
    AZURE_TOKEN environment variable.

    Returns:
        A list containing the `embedding` of every item in the response data.
    """
    credential = AzureKeyCredential(os.getenv("AZURE_TOKEN"))
    embeddings_client = EmbeddingsClient(
        endpoint="https://models.inference.ai.azure.com",
        credential=credential,
    )

    result = embeddings_client.embed(input=text, model="text-embedding-3-small")

    vectors = []
    for item in result.data:
        vectors.append(item.embedding)
    return vectors

In [38]:
import re

def chunk_text(text, max_token_length):
    """Group the sentences of `text` into chunks of at most `max_token_length` tokens.

    Tokens are estimated by whitespace-separated word count. Sentences are never
    split: a single sentence longer than the limit becomes its own (oversize) chunk.

    Args:
        text: The text to chunk.
        max_token_length: Maximum estimated token count per chunk.

    Returns:
        A list of non-empty chunk strings; [] when `text` is empty or whitespace-only.
    """
    sentences = re.split(r'(?<=[.!?])\s+', text)  # split the text by sentence boundary
    chunks = []
    current_chunk = []
    current_token_length = 0

    for sentence in sentences:
        sentence_length = len(sentence.split())  # simple token estimation by word count
        if sentence_length == 0:
            # Empty fragments (empty input, trailing whitespace) carry no content.
            continue

        if current_token_length + sentence_length <= max_token_length:
            current_chunk.append(sentence)
            current_token_length += sentence_length
        else:
            # Flush only when something was collected; otherwise an oversize
            # first sentence would emit an empty chunk.
            if current_chunk:
                chunks.append(" ".join(current_chunk))
            current_chunk = [sentence]
            current_token_length = sentence_length

    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks

def generate_embeddings_for_document(content, max_token_length):
    """Chunk `content` and embed every chunk.

    Returns:
        A list with one dict per chunk, in chunk order, holding the chunk's
        'text' and the 'embedding' returned by generate_embedding for it.
    """
    return [
        {'text': piece, 'embedding': generate_embedding(piece)}
        for piece in chunk_text(content, max_token_length)
    ]

# test: embed the 'content' of one doc page end to end
# (one GitHub API call, plus one embedding request per chunk)
document_content = get_file_content("docs/datascience/data-science-tutorial.md")['content']
# upper bound on the estimated (word-count based) tokens per chunk
max_tokens = 8000
print(generate_embeddings_for_document(document_content, max_tokens))

[{'text': 'Python data science tutorial demonstrating the use of common data science and machine learning libraries with Visual Studio code Jupyter Notebook support.', 'embedding': [[-0.056206148, -0.03270176, 0.0031168864, -0.012028116, 0.03282439, -0.014828204, -0.007843313, 0.0012026839, -0.0021115628, 0.0059476322, 0.033662375, -0.0017283391, -0.014480748, -0.048970886, 0.038015794, 0.0075776107, 0.012866098, 0.06757001, 0.0048567224, 0.05677843, 0.0075111855, -0.00876305, -0.0009606142, 0.049093515, 0.02217588, -0.04173562, 0.00474431, 0.055061586, -0.0053395843, 0.014061756, -0.028123513, -0.018568467, -0.014082195, 0.04247141, -0.019386012, 0.00444795, 0.008246975, -0.00067064154, 0.001898235, 0.030433076, -0.0003829044, -0.045210183, 0.011026625, 0.054652814, -0.017178643, 0.04057062, 0.009146273, -0.03750483, 0.015308511, 0.025630005, -0.033539742, -0.0062133344, 0.011404739, -0.033233162, -0.017147984, 0.03123018, -0.022543775, 0.047213163, 0.017096888, -0.04770369, 0.0544484

In [50]:
import json

def process_markdown_files(path='', max_token_length=8000):
    """Fetch every Markdown file under `path` and attach embeddings to it.

    Args:
        path: Repository-relative directory to process; '' is the repo root.
        max_token_length: Maximum estimated tokens per content chunk.

    Returns:
        A list with one dict per Markdown file: the output of get_file_content
        extended with 'title_embeddings' and 'content_embeddings'.
    """
    markdown_files = get_markdown_files_from_github(path)

    # Results go into their own list. Appending to `markdown_files` while
    # iterating over it would feed the processed dicts back into the loop.
    processed_files = []

    for markdown_file in markdown_files:
        markdown_content = get_file_content(markdown_file)

        # If the embedding requests fail with rate-limit errors, try throttling
        # the requests or retrying with exponential backoff.
        markdown_content['title_embeddings'] = generate_embedding(markdown_content['title'])
        markdown_content['content_embeddings'] = generate_embeddings_for_document(
            markdown_content['content'], max_token_length
        )

        processed_files.append(markdown_content)

    return processed_files

# test: run the whole pipeline on docs/datascience; the result is reused by the FAISS cell below.
# Every file triggers one embedding request for its title plus one per content chunk;
# the recorded output of this cell is a RateLimitReached error.
processed_md_ds_files = process_markdown_files("docs/datascience")
print(processed_md_ds_files)

HttpResponseError: (RateLimitReached) Rate limit of 15 per 60s exceeded for UserByModelByMinute. Please wait 2 seconds before retrying.
Code: RateLimitReached
Message: Rate limit of 15 per 60s exceeded for UserByModelByMinute. Please wait 2 seconds before retrying.

In [None]:
# Process every Markdown file under docs/ (metadata plus title/content embeddings)
processed_markdown_files = process_markdown_files("docs")

# Persist the result so later cells can reload it from data_with_embeddings.json
# (`json` is imported in an earlier cell)
with open("data_with_embeddings.json", "w") as f:
    json.dump(processed_markdown_files, f)

## Semantic search with FAISS

In [None]:
import faiss
import numpy as np

# Flatten the chunk embeddings of every processed document into one matrix,
# remembering which document each row belongs to. `processed_md_ds_files` is a
# list of dicts; each dict's 'content_embeddings' is a list of
# {'text', 'embedding'} entries, where 'embedding' is the list of vectors
# returned by generate_embedding for that chunk.
embedding_rows = []
row_to_doc = []  # row_to_doc[r] = position in processed_md_ds_files of FAISS row r
for doc_position, doc in enumerate(processed_md_ds_files):
    for chunk in doc['content_embeddings']:
        for vector in chunk['embedding']:
            embedding_rows.append(vector)
            row_to_doc.append(doc_position)

if not embedding_rows:
    raise ValueError("No content embeddings available to index")

# FAISS expects a 2-D float32 array of shape (n_vectors, d)
document_embeddings = np.array(embedding_rows, dtype='float32')

d = document_embeddings.shape[1]  # size of each embedding vector
faiss_index = faiss.IndexFlatL2(d)
faiss_index.add(document_embeddings)

# test: semantic search using FAISS
query = "notebooks capabilities in VS Code"
# generate_embedding returns a list of vectors, i.e. already 2-D: shape (1, d)
query_embedding = np.array(generate_embedding(query), dtype='float32')
k = min(5, faiss_index.ntotal)  # never ask for more neighbours than the index holds
D, I = faiss_index.search(query_embedding, k)

# I holds row numbers of the FAISS index (one row per chunk), D the L2 distances
print("Closest document indices:", I)
print("Corresponding distances:", D)

# Map the matching rows back to their documents, keeping each document once
matching_docs = []
seen_docs = set()
for row in I[0]:
    if row < 0:  # FAISS pads missing results with -1
        continue
    doc_position = row_to_doc[row]
    if doc_position not in seen_docs:
        seen_docs.add(doc_position)
        matching_docs.append(processed_md_ds_files[doc_position])

for doc in matching_docs:
    print(f"Title: {doc['title']}")
    print(f"Content: {doc['content']}")
    print(f"URL: {doc['url']}")


In [None]:
from openai import OpenAI

# Chat client pointed at the same endpoint URL that generate_embedding uses,
# authenticated with the AZURE_TOKEN environment variable
gpt_client = OpenAI(
    base_url="https://models.inference.ai.azure.com",
    api_key=os.getenv("AZURE_TOKEN")
)

# generate response

## Or, use Azure AI Search

After following the docs to [set up the Azure AI Search client](https://learn.microsoft.com/en-us/python/api/overview/azure/search-documents-readme?view=azure-python#getting-started)...

In [None]:
# upload the document

from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient

# Connection settings come from environment variables
# (load_dotenv() is called at the top of the notebook)
service_endpoint = os.getenv("AZURE_SEARCH_SERVICE_ENDPOINT")
index_name = os.getenv("AZURE_SEARCH_INDEX_NAME")
key = os.getenv("AZURE_SEARCH_API_KEY")

# NOTE(review): os.getenv returns None for unset variables — presumably all three
# must be set for the client to be usable; confirm before running
search_client = SearchClient(service_endpoint, index_name, AzureKeyCredential(key))

# Reload the documents written earlier to data_with_embeddings.json
# (`json` is imported in an earlier cell)
with open("data_with_embeddings.json", "r") as f:
    document = json.load(f)

# Upload everything that was loaded in a single call
result = search_client.upload_documents(document)

# Only the first upload result is inspected here, even when `document` holds many entries
print("Upload of new document succeeded: {}".format(result[0].succeeded))

In [None]:
# perform a weighted hybrid search (keyword + vector), making vector matches 2x more important than keyword matches, and print the top 5 results

from azure.search.documents.models import VectorizedQuery

# NOTE(review): name of the vector field in the search index — assumed here,
# confirm against the index schema before running
VECTOR_FIELD = "title_embeddings"

# `vector_queries` takes a list of vector query objects, not raw floats.
# generate_embedding returns a list of vectors; the query text yields one.
vector_query = VectorizedQuery(
    vector=generate_embedding(query)[0],
    k_nearest_neighbors=5,
    fields=VECTOR_FIELD,
    weight=2,  # NOTE(review): per-query weighting presumably needs a recent azure-search-documents release — confirm
)

results = search_client.search(
    search_text=query,
    vector_queries=[vector_query],
    top=5
)

sources = "\n\n".join([f"[{doc['title']}]: {doc['content']}\n" for doc in results])

print(sources)

## Evaluate

In [None]:
# create evaluation data (manually label relevant documents for queries)
# something like [{"query": "How to do data science in Visual Studio Code?", "relevant_docs": [1, 3, 7]}, ...]


In [None]:
# run evaluation using precision@k (proportion of the top `k` retrieved documents that are relevant) - could use other metrics like recall@k, mean average precision, NDCG@k, etc., but this would be good starting point

In [None]:
# precision@k calculation ((# of relevant docs in top k) / k)

## Deploy RAG model as an API & use from the app