# Create a FAISS based Vector Index for DBCopilot with AzureML
We'll walk through setting up an AzureML Pipeline which grounding a Database into a LangChain-compatible FAISS Vector Index and create the Prompt flow to consume this index to serve as a DBCopilot chatbot.

In [19]:
# %pip install azure-ai-ml
# %pip install -U 'azureml-rag[faiss]>=0.1.11'

In [20]:
# If `import win32file` fails with a DLL error then run the following and restart kernel:
# %pip uninstall -y pywin32
# %conda install -y --force-reinstall pywin32

In [21]:
# System imports
import os
import json

In [22]:
#
# retrieve and print environment variables
#
# load environment variables from .env file (which should be in .gitignore)
# this is a way to keep sensitive information out of the codebase
# the following code allows for .env file to be in same directory as script
# or you can specify the path relative to the notebook to the .env file
from os.path import join
from dotenv import load_dotenv
dotenv_path = join(os.getcwd(), '.env')
print(dotenv_path)
load_dotenv(dotenv_path)

c:\Users\davidtorres\Projects\Sample-Code\AzureML\DBCopilot\.env


True

In [23]:
# Execution Variables

# Tenent and Application Id settings
tenant_id = os.getenv('TENANT_ID')                                      # "<enter the tenent id>"                   << Used for EnvironmentCredential
application_client_id = os.getenv('CLIENT_ID')                          # "<enter application client id>"           << Used for EnvironmentCredential
application_client_secret = os.getenv('CLIENT_SECRET')                  # "<enter application client secret>"       << Used for EnvironmentCredential

# Azure Workspace settings
subscription_id = os.getenv('SUBSCRIPTION_ID')                          # "<enter the subscription id>"             << Used in workspace.json settings file
resource_group_name = os.getenv('RESOURCE_GROUP')                       # "<enter the resource group name>"         << Used in workspace.json settings file
workspace_name = os.getenv('WORKSPACE_NAME')                            # "<enter the azure ml workspace name>"     << Used in workspace.json settings file
default_compute=os.getenv('CLUSTER_NAME')                               # "<enter dedicate compute cluster name>"   << !! Only works with dedicate compute cluster
                                                                        # "serverless"                              << !! "serverless" and "named" compute instances are 
                                                                        #  NOTE >>>>                                << !! currently causing failure in generate_meta_embedding step

# Azure OpenAI settings
aoai_connection_name = "Default_AzureOpenAI"                            # "<default is Default_AzureOpenAI>"        << Use the Azure OpenAI resource connection name
aoai_embedding_model_name = "text-embedding-ada-002"                    # "<default to text-embedding-ada-002"      << Recommendation is text-embedding-ada-002
aoai_completion_model_name = "gpt-35-turbo"                             # "<default to gpt-35-turbo>"               << Recommendation is gpt-35-turbo

# Set vector index asset name
datastore_name = os.getenv('DATA_STORE_NAME')                           # "<enter the name of the Datastore>"       << The database registered in Data > Datastore to create embeddings
datastore_scope = "array"                                                 # "<enter the scope to be indexed>"       << "all" = all tables, anything else need a list of tables/views 
data_asset_name = f"{datastore_name}_{datastore_scope}_llm_index"       # "<enter the vector index suffix>"         << The index to be created in Data > Data Asset

If datastore_scope is not "all" then create a list of tables to be in the scope of the vector index

In [24]:
database_tables = None

if datastore_scope != "all":
    tables = []
    tables.append('[SalesLT].[Address]')
    tables.append('[SalesLT].[Customer]')
    tables.append('[SalesLT].[CustomerAddress]')
    # tables.append('[SalesLT].[Product]')
    # tables.append('[SalesLT].[ProductCategory]')
    # tables.append('[SalesLT].[ProductDescription]')
    # tables.append('[SalesLT].[ProductModel]')
    # tables.append('[SalesLT].[ProductModelProductDescription]')
    database_tables = str(tables)

print(f"Tables to Index: {database_tables}")

Tables to Index: ['[SalesLT].[Address]', '[SalesLT].[Customer]', '[SalesLT].[CustomerAddress]']


## Create client for AzureML Workspace

The workspace is the top-level resource for Azure Machine Learning, providing a centralized place to work with all the artifacts you create when you use Azure Machine Learning. In this section we will connect to the workspace in which the job will be run.



Get `credential` to create `MLClient`.

In [26]:
from azure.identity import DefaultAzureCredential, ClientSecretCredential, EnvironmentCredential, AzureCliCredential
from azure.ai.ml import MLClient
from azureml.core import Workspace

# Create ClientSecretCredential as default credential
# Service Principal (Application Client) must have Contributor role on the Azure ML Workspace 
try:
    ## Set expected environment variables
    os.environ['AZURE_TENANT_ID'] = tenant_id
    os.environ['AZURE_CLIENT_ID'] = application_client_id
    os.environ['AZURE_CLIENT_SECRET'] = application_client_secret
    os.environ['AZURE_AUTHORITY_HOST'] = 'https://login.microsoftonline.com'
    credential = ClientSecretCredential(tenant_id=tenant_id, client_id=application_client_id, client_secret=application_client_secret)
except Exception as ex:
    print(ex)
    print('Try DefaultAzureCredential creation')
    # Fall back to DefaultAzureCredential, if ClientSecretCredential does not work
    # DefaultAzureCredential will look for credientials sequentially see docs at
    # https://learn.microsoft.com/en-us/dotnet/api/azure.identity.defaultazurecredential
    try:
        credential = DefaultAzureCredential()
    except Exception as ex:
        print(ex)

credential.get_token("https://management.azure.com/.default")


AccessToken(token='eyJ0eXAiOiJKV1QiLCJhbGciOiJSUzI1NiIsIng1dCI6Ii1LSTNROW5OUjdiUm9meG1lWm9YcWJIWkdldyIsImtpZCI6Ii1LSTNROW5OUjdiUm9meG1lWm9YcWJIWkdldyJ9.eyJhdWQiOiJodHRwczovL21hbmFnZW1lbnQuYXp1cmUuY29tIiwiaXNzIjoiaHR0cHM6Ly9zdHMud2luZG93cy5uZXQvNzJmOTg4YmYtODZmMS00MWFmLTkxYWItMmQ3Y2QwMTFkYjQ3LyIsImlhdCI6MTY5MzU3ODg3NiwibmJmIjoxNjkzNTc4ODc2LCJleHAiOjE2OTM2NjU1NzYsImFpbyI6IkUyRmdZRGd5b1ZGbVZ3UFArNURuZjJTWDJpMTRCUUE9IiwiYXBwaWQiOiJiZDhlNDU3Ni0zYjA2LTQ5MTgtYmU3ZC0wYjZhODE3NTA5ZWYiLCJhcHBpZGFjciI6IjEiLCJpZHAiOiJodHRwczovL3N0cy53aW5kb3dzLm5ldC83MmY5ODhiZi04NmYxLTQxYWYtOTFhYi0yZDdjZDAxMWRiNDcvIiwiaWR0eXAiOiJhcHAiLCJvaWQiOiI4Y2Q2OWM1Yy04YzU4LTQ5MTgtOTk2Zi1mOTI0ZmFlN2M4NmMiLCJyaCI6IjAuQVJvQXY0ajVjdkdHcjBHUnF5MTgwQkhiUjBaSWYza0F1dGRQdWtQYXdmajJNQk1hQUFBLiIsInN1YiI6IjhjZDY5YzVjLThjNTgtNDkxOC05OTZmLWY5MjRmYWU3Yzg2YyIsInRpZCI6IjcyZjk4OGJmLTg2ZjEtNDFhZi05MWFiLTJkN2NkMDExZGI0NyIsInV0aSI6IjVpaWpTWWFJOUVHeGNOSmtUcjBDQUEiLCJ2ZXIiOiIxLjAiLCJ4bXNfdGNkdCI6MTI4OTI0MTU0N30.THuBZWAVX_mKPwYqyDwcfGldXLashgmSmOR5

Create `MLClient` to interact with AzureML

In [10]:
try:
    ml_client= MLClient(credential=credential, subscription_id=subscription_id, resource_group_name=resource_group_name, workspace_name=workspace_name)
except Exception as ex:
    raise Exception("Failed to create MLClient from config file. Please verify AzureML Workspace details and update Execution Variables above.") from ex

ws = Workspace(
    subscription_id=ml_client.subscription_id,
    resource_group=ml_client.resource_group_name,
    workspace_name=ml_client.workspace_name,
)
print(ml_client)

If you run your code in unattended mode, i.e., where you can't give a user input, then we recommend to use ServicePrincipalAuthentication or MsiAuthentication.
Please refer to aka.ms/aml-notebook-auth for different authentication mechanisms in azureml-sdk.


MLClient(credential=<azure.identity._credentials.client_secret.ClientSecretCredential object at 0x000001C0396B6EC0>,
         subscription_id=8878a446-3d3e-44c2-bae5-09fd1d17e6d6,
         resource_group_name=common-ai-dev,
         workspace_name=common-ai-dev-ml)


## Azure OpenAI

We recommend using gpt-35-turbo model to get good quality QAs. [Follow these instructions](https://learn.microsoft.com/en-us/azure/cognitive-services/openai/how-to/create-resource?pivots=web-portal) to setup an Azure OpenAI Instance and deploy the model. Once you have the model deployed in AOAI you can specify your Model name and Deployment name below.

We will use the automatically created `Default_AzureOpenAI` connection, change `aoai_connection_name` to use your own.

In [None]:
from azureml.rag.utils.connections import get_connection_by_name_v2

try:
    # Get the Azure OpenAI Connection
    aoai_connection = get_connection_by_name_v2(ws, aoai_connection_name)
    # Get the Azure OpenAI connection id
    aoai_connection_id = aoai_connection["id"]
    # Print Azure OpenAI connection info
    print(f"Azure OpenAI connection: \n{json.dumps(aoai_connection, indent=4)}")
    
except Exception as ex:
    print(f'Exception: {ex}')
    print(f'Unable to create a connection to the Azure OpenAI resource named: {aoai_connection_name}')


Now that the Workspace has a connection to Azure Open AI ensure the **embedding** model has been deployed (recommendation is `text-embedding-ada-002`)

This cell will fail if there is not deployment for the embeddings model, [follow these instructions](https://learn.microsoft.com/azure/cognitive-services/openai/how-to/create-resource?pivots=web-portal#deploy-a-model) to deploy a model with Azure OpenAI.

In [None]:
from azureml.rag.utils.deployment import infer_deployment

try:
    aoai_embedding_deployment_name = infer_deployment(aoai_connection, aoai_embedding_model_name)
    print(f"Deployment name in AOAI workspace for model '{aoai_embedding_model_name}' is '{aoai_embedding_deployment_name}'")
except Exception as ex:
    print(f"Exception: {ex}")
    print(f"Deployment name in AOAI workspace for model '{aoai_embedding_model_name}' is not found.")
    print(f"Please create a deployment for this model by following the deploy instructions on the resource page for '{aoai_connection['properties']['target']}' in Azure Portal.")

Now that the Workspace has a connection to Azure Open AI ensure the **completion** model has been deployed (recommendation is `gpt-35-turbo`)

The following cell will fail if a **completion** model is not deployed, [follow these instructions](https://learn.microsoft.com/azure/cognitive-services/openai/how-to/create-resource?pivots=web-portal#deploy-a-model) to deploy a **completion** model with Azure OpenAI.

In [None]:
from azureml.rag.utils.deployment import infer_deployment

try:
    aoai_completion_deployment_name = infer_deployment(aoai_connection, aoai_completion_model_name)
    print(f"Deployment name in AOAI workspace for model '{aoai_completion_model_name}' is '{aoai_completion_deployment_name}'")
except Exception as ex:
    print(f"Exception: {ex}")
    print(f"Deployment name in AOAI workspace for model '{aoai_completion_model_name}' is not found.")
    print(f"Please create a deployment for this model by following the deploy instructions on the resource page for '{aoai_connection['properties']['target']}' in Azure Portal.")

# Create LLM completion config in URI form which the AzureML embeddings components expect as input.
llm_completion_config = f'{{"type":"azure_open_ai","model_name":"{aoai_completion_model_name}","deployment_name":"{aoai_completion_deployment_name}","temperature":0,"max_tokens":"1500"}}'

### Setup Pipeline Job

The Components are published to a [Registry](https://learn.microsoft.com/azure/machine-learning/how-to-manage-registries?view=azureml-api-2&tabs=cli), `azureml`, which should have access to by default, it can be accessed from any Workspace.
In the below cell we get the Component Definitions from the `azureml` registry.

In [None]:
ml_registry = MLClient(credential=credential, registry_name="azureml")

db_copilot_component = ml_registry.components.get("llm_ingest_db_to_faiss", label="latest")

print(db_copilot_component)

In [None]:
# Create the pipeline
from azure.ai.ml.dsl import pipeline
@pipeline(
    name=f"db_copilot_vector_pipeline_faiss",
    default_compute=default_compute
)
def db_copilot_vector_pipeline_faiss(
    aoai_connection: str,
    db_datastore: str,
    embeddings_model: str,
    chat_aoai_deployment_name: str,
    embedding_aoai_deployment_name: str,
    mlindex_dataset_name: str,
    selected_tables: str = None,
    max_sampling_rows: int = 3,
):
    db_copilot_component(
        db_datastore=db_datastore,
        embeddings_model=embeddings_model,
        chat_aoai_deployment_name=chat_aoai_deployment_name,
        embedding_aoai_deployment_name=embedding_aoai_deployment_name,
        embeddings_dataset_name=mlindex_dataset_name,
        embedding_connection=aoai_connection,
        llm_connection=aoai_connection,
        selected_tables=selected_tables,
        max_sampling_rows=max_sampling_rows,
    )
    return {}


In [None]:
# Create pipeline job
pipeline_job = db_copilot_vector_pipeline_faiss(
    aoai_connection=aoai_connection_id,
    db_datastore=f"azureml://datastores/{datastore_name}",
    embeddings_model=f"azure_open_ai://deployment/{aoai_embedding_deployment_name}/model/{aoai_embedding_model_name}",
    chat_aoai_deployment_name=aoai_completion_deployment_name,
    embedding_aoai_deployment_name=aoai_embedding_deployment_name,
    mlindex_dataset_name=data_asset_name,
    selected_tables="[\"[SalesLT].[Product]\"]",
    max_sampling_rows=3,
)

In [None]:
# These are added so that in progress index generations can be listed in UI, this tagging is done automatically by UI.
pipeline_job.properties["azureml.mlIndexAssetName"] = data_asset_name
pipeline_job.properties["azureml.mlIndexAssetKind"] = "faiss"
pipeline_job.properties["azureml.mlIndexAssetSource"] = "Database"

In [None]:
# Submit pipeline job
running_pipeline_job = ml_client.jobs.create_or_update(
    pipeline_job, experiment_name=str.lower(f"{datastore_name}_{datastore_scope}_dbcopilot_pipeline")
)
running_pipeline_job

In [None]:
ml_client.jobs.stream(running_pipeline_job.name)

## Use DBCopilot with Promptflow
After the pipeline complete, it will create a promptflow which could be used to chat with the db.