# Creación y Almacenamiento de Embeddings en Chroma DB

En el presente Notebook se creará la documentación base del esquema reducido con el que trabajaremos, basada en el `information_schema` correspondiente, que luego se nutrirá por fuera con mayor información relevante para negocio.

Adicionalmente, por fuera del Notebook, se confeccionará documentación adicional, como reglas de negocio y few-shot examples, para, finalmente, obtener chunks de todos estos documentos y crear y almacenar sus embeddings, con su metadata asociada, en una base de datos vectorial de Chroma DB.

## Inicialización

### Librerías



In [4]:
import os
import sys
from pathlib import Path
import pandas as pd
from collections import defaultdict, OrderedDict
import yaml
import json
import uuid
from langchain_core.documents import Document
from langchain_openai import AzureOpenAIEmbeddings
from langchain_chroma import Chroma
import chromadb
import chromadb.utils.embedding_functions as ef
from dotenv import load_dotenv

# Load environment variables from the `.env` file one directory above the working directory.
load_dotenv('../.env')
# Do not truncate columns when pandas renders a DataFrame.
pd.set_option('display.max_columns', None)
# Teach PyYAML how to dump OrderedDict: represent it as a plain mapping built from its items.
# NOTE(review): passing `.items()` instead of the dict presumably keeps insertion order
# (PyYAML sorts real mappings by key by default) — confirm against the PyYAML version in use.
yaml.add_representer(OrderedDict, lambda dumper, data: dumper.represent_dict(data.items()))

# Append the parent of the current working directory to `sys.path`; this must happen
# before the `src` import below so that the package can be resolved.
notebook_dir = os.getcwd() 
project_root = os.path.abspath(os.path.join(notebook_dir, '..'))
sys.path.append(project_root)

from src.pg_sql import execute_query


### Constantes

In [5]:
# Keys of the MDL (model description) YAML document, shared by the code that writes
# the base file and by the chunking functions that read the enriched file.
DATABASE = 'database'
SCHEMAS = 'schemas'
TABLES = 'tables'
COLUMNS = 'columns'
DESCRIPTION = 'description'
NAME = 'name'
DATA_TYPE = 'data_type'
PRIMARY_KEY = 'is_primary_key'
FOREIGN_KEY = 'is_foreign_key'
REFERENCE = 'reference'
# Placeholder written into every description of the auto-generated base file.
TO_DO = '[To be completed ...]'

# Directory where the auto-generated base MDL file is written.
OUTPUT_PATH = '../data/embeddings/auxs'
# Enriched MDL file that is loaded and split into chunks.
MDL_PATH = '../data/embeddings/documents/MDL_adventure_works_dw.yaml'

# NOTE(review): presumably the persistence directory of the Chroma database and the
# names of its collections (one per chunking strategy) — their use is not visible in
# this part of the notebook; confirm against the cells that create the client.
CHROMA_DB_PATH = '../data/embeddings/chroma'
TABLES_COLLECTION_NAME = 'mdl_tables'
TABLES_SUMMARY_COLLECTION_NAME = 'mdl_tables_summary'
TABLES_DETAILS_COLLECTION_NAME = 'mdl_tables_details'
COLUMNS_COLLECTION_NAME = 'mdl_columns'
COLUMNS_REDUCED_COLLECTION_NAME = 'mdl_columns_reduced'

# Default model name (also used as default deployment id) for the Azure OpenAI embedding helpers.
AZURE_OPENAI_EMBEDDING_MODEL = 'text-embedding-3-small'

### Funciones aplicables a todos los documentos

In [8]:
def get_azure_openai_embedding(input: str, model: str = AZURE_OPENAI_EMBEDDING_MODEL) -> list[float]:
    """
    Generate an embedding vector for the given text using Azure OpenAI's embedding API.

    A new `AzureOpenAI` client is created on every call with no explicit arguments,
    so its endpoint, key and API version are expected to come from the environment
    (presumably the same variables used by `get_azure_openai_embedding_function`).

    Args:
        input (str): The text to be embedded.
        model (str, optional): The Azure OpenAI embedding model (deployment) to use.
            Defaults to `AZURE_OPENAI_EMBEDDING_MODEL`.

    Returns:
        list[float]: The embedding of `input`, taken from the first item of the response.

    Raises:
        Any exception raised by the Azure OpenAI client in case of configuration,
        connection or API errors.
    """

    # `AzureOpenAI` is not among the notebook's top-level imports, so the original
    # call failed with a NameError. It is imported here to keep the function
    # self-contained; `openai` is already required by `langchain_openai`.
    from openai import AzureOpenAI

    client = AzureOpenAI()

    response = client.embeddings.create(
        input= input,
        model= model
    )

    # A single input string yields a single embedding in the response.
    return response.data[0].embedding



def get_azure_openai_embedding_function(model_name: str = AZURE_OPENAI_EMBEDDING_MODEL, 
                                        deployment_id: str = AZURE_OPENAI_EMBEDDING_MODEL) -> ef.OpenAIEmbeddingFunction:
    """
    Build a Chroma `OpenAIEmbeddingFunction` that targets the Azure OpenAI service.

    Args:
        model_name (str): Name of the Azure OpenAI embedding model.
                          Defaults to AZURE_OPENAI_EMBEDDING_MODEL.
        deployment_id (str): Deployment id of the embedding model.
                             Defaults to AZURE_OPENAI_EMBEDDING_MODEL.

    Returns:
        OpenAIEmbeddingFunction: Embedding function configured with `api_type='azure'`
                                 and the credentials read from the environment.

    Raises:
        KeyError: If any of the required environment variables is not set.

    Environment Variables:
        - AZURE_OPENAI_API_KEY: API key used to authenticate.
        - AZURE_OPENAI_ENDPOINT: Base endpoint URL of the Azure OpenAI resource.
        - OPENAI_API_VERSION: API version used for the requests.
    """

    # Collect the settings first (environment lookups happen in this order),
    # then hand them over in a single call.
    azure_settings = {
        'api_key': os.environ['AZURE_OPENAI_API_KEY'],
        'api_base': os.environ['AZURE_OPENAI_ENDPOINT'],
        'api_type': 'azure',
        'api_version': os.environ['OPENAI_API_VERSION'],
        'model_name': model_name,
        'deployment_id': deployment_id,
    }

    return ef.OpenAIEmbeddingFunction(**azure_settings)



def setup_chromadb_collection(
        client:chromadb.Client, 
        collection_name: str, 
        clean_setup: bool = False,
        embedding_function: chromadb.EmbeddingFunction = ef.DefaultEmbeddingFunction(),
        space_metric: str = 'cosine'
    ) -> chromadb.Collection:
    
    """
    Set up a ChromaDB collection with specified parameters.

    The collection is retrieved if it already exists or created otherwise, using the
    given name, embedding function and space metric. With `clean_setup=True` any
    existing collection with the same name is deleted first.

    Args:
        client (chromadb.Client): The ChromaDB client instance to interact with the database.
        collection_name (str): The name of the collection to retrieve or create.
        clean_setup (bool, optional): If True, deletes any existing collection with the 
            same name before creating a new one. Defaults to False.
        embedding_function (chromadb.EmbeddingFunction, optional): The embedding function
            to use for the collection. Defaults to ef.DefaultEmbeddingFunction().
        space_metric (str, optional): The metric to use for similarity search. 
            Must be one of 'l2', 'ip', or 'cosine'. Defaults to 'cosine'.

    Raises:
        AssertionError: If `space_metric` is not one of the supported options.

    Returns:
        chromadb.Collection: The retrieved or newly created ChromaDB collection.
    """

    supported_metrics = ('l2', 'ip', 'cosine')
    if space_metric not in supported_metrics:
        # Raised explicitly instead of using `assert`, so the validation is not
        # stripped when Python runs with -O. AssertionError is kept because it is
        # the exception type this function documents.
        raise AssertionError(f'The `{space_metric}` is not supported by Chroma DB.')

    if clean_setup:
        # Best-effort delete: the expected failure is that the collection does not
        # exist yet, which is fine for a clean setup.
        # NOTE(review): the broad `except` also hides unrelated errors (e.g. a broken
        # client); narrow it once the exception raised by the installed chromadb
        # version for a missing collection is confirmed.
        try:
            client.delete_collection(name= collection_name)
        except Exception:
            pass

    collection = client.get_or_create_collection(
        name= collection_name, 
        embedding_function= embedding_function,
        configuration= {'hnsw': {'space': space_metric}}
    )
    return collection



def store_chunks_in_chorma_db(chunks: list[dict], collection: chromadb.Collection, use_collection_ef: bool = True) -> None:
    """
    Store a list of document chunks into a ChromaDB collection.

    Each chunk is a dictionary with a 'content' key (text of the chunk) and a
    'metadata' key (associated metadata). A random UUID4 string is generated as the
    id of every chunk and all chunks are added with a single `collection.add` call.

    Args:
        chunks (list[dict]): A list of dictionaries, each with keys:
                             - 'content' (str): The text content of the chunk.
                             - 'metadata' (dict): Metadata associated with the chunk.
                             An empty list is a no-op.
        collection (chromadb.Collection): The target ChromaDB collection to store the data in.
        use_collection_ef (bool, optional): If True, no embeddings are passed and the
                                            collection is left to compute them; if False,
                                            embeddings are computed here, one request per
                                            chunk, with `get_azure_openai_embedding`.
                                            Defaults to True.

    Returns:
        None

    Raises:
        KeyError: If a chunk lacks the 'content' or 'metadata' key.
    """

    # Nothing to store: avoid generating ids/embeddings and handing empty lists to
    # the collection (presumably rejected by Chroma — verify against its validation).
    if not chunks:
        return

    documents = [chunk['content'] for chunk in chunks]
    metadatas = [chunk['metadata'] for chunk in chunks]
    ids = [str(uuid.uuid4()) for _ in chunks]
    
    # `None` tells the collection that no precomputed embeddings are supplied.
    embeddings = None
    if not use_collection_ef:
        embeddings = [get_azure_openai_embedding(doc) for doc in documents]

    collection.add(
        ids= ids,
        embeddings= embeddings,
        documents= documents,
        metadatas= metadatas
    )



def add_docs_to_chroma_col(chunks: list[Document], collection: Chroma) -> list[str]:
    """
    Store a list of document chunks into a ChromaDB collection using langchain.

    A random UUID4 string is generated as the id of every chunk and all chunks are
    passed to `collection.add_documents` in a single call.

    Args:
        chunks (list[Document]): The documents to store, passed as the `documents`
                                 argument of `add_documents` (the chunking functions of
                                 this notebook produce langchain `Document` objects).
                                 An empty list is a no-op.
        collection (langchain_chroma.Chroma): The target ChromaDB collection by langchain to store the data in.

    Returns:
        list[str]: The generated ids, in the same order as `chunks`
                   (an empty list when there is nothing to store).
    """

    # Nothing to store: skip the call instead of sending empty lists to the vector
    # store (presumably rejected downstream — verify against the Chroma validation).
    if not chunks:
        return []

    ids = [str(uuid.uuid4()) for _ in chunks]

    collection.add_documents(
        ids = ids,
        documents= chunks
    )

    # The original fell off the end and returned None despite the declared return type.
    return ids

## Fichero YAML con MDL del esquema de interés

### Obtener fichero base desde `information_schema`

Primero obtendremos un fichero base construido utilizando la query `/data/embeddings/auxs/get_information_schema.sql`, sobre el que luego se añadirá metadata extra. Para esto, definimos algunas funciones que nos serán de utilidad:

In [3]:
def get_information_schema(query_path: str, db_names_list: list[str], schema_names_list: list[str]) -> str:
    """
    Read an SQL query from a file and fill in its database/schema name placeholders.

    Every occurrence of `[db_names_list]` and `[schema_names_list]` in the query is
    replaced by the corresponding names rendered as a comma-separated list of
    single-quoted SQL string literals, e.g. `'schema_a', 'schema_b'`.

    Single quotes inside a name are doubled so that a name such as `o'brien` yields
    a valid literal instead of broken SQL. This is literal escaping only, not a
    substitute for parameterized queries: do not feed untrusted input.

    Args:
        query_path (str): The file path to the SQL query (read as UTF-8). The query
                          should contain the placeholders `[db_names_list]` and
                          `[schema_names_list]`.
        db_names_list (list[str]): Database names to insert into the query.
        schema_names_list (list[str]): Schema names to insert into the query.
                                       An empty list renders as `''`.

    Returns:
        str: The complete SQL query with the placeholders replaced.

    Raises:
        FileNotFoundError: If the specified query_path does not exist.

    Example:
        With a file `my_query.sql` containing
        `SELECT * FROM information_schema.tables WHERE table_schema IN ([schema_names_list])`,
        `get_information_schema('my_query.sql', ['db1'], ['schema_a', 'schema_b'])` returns
        `SELECT * FROM information_schema.tables WHERE table_schema IN ('schema_a', 'schema_b')`.
    """

    def as_sql_literal_list(names: list[str]) -> str:
        # Double embedded single quotes (standard SQL escaping) before quoting.
        escaped_names = (name.replace("'", "''") for name in names)
        return "'" + "', '".join(escaped_names) + "'"

    # Explicit encoding: the result must not depend on the platform default.
    query = Path(query_path).read_text(encoding='utf-8')

    return (
        query
        .replace('[db_names_list]', as_sql_literal_list(db_names_list))
        .replace('[schema_names_list]', as_sql_literal_list(schema_names_list))
    )



def format_yaml(yaml_str: str) -> str:
    """
    Insert a blank line before list items of a YAML string to make it easier to read.

    A blank line is added before every line whose stripped text starts with `-`,
    unless the previous line is blank, ends with `:` (the key that opens the list)
    or there is no previous line at all. Every other line is kept as is, so the
    text never gains a leading blank line.

    Args:
        yaml_str (str): The YAML text to format.

    Returns:
        str: The formatted YAML text.
    """
    formatted_lines = []
    previous = None  # stripped text of the previous line; None before the first line

    for line in yaml_str.split('\n'):
        stripped = line.strip()

        needs_separator = (
            previous is not None
            and stripped.startswith('-')
            and previous != ''
            and not previous.endswith(':')
        )
        if needs_separator:
            formatted_lines.append('')

        formatted_lines.append(line)
        previous = stripped

    return '\n'.join(formatted_lines)

In [4]:
# SQL file expected to contain the `[db_names_list]` and `[schema_names_list]`
# placeholders that `get_information_schema` fills in.
GET_INFORMATION_SCHEMA_SQL = '../data/embeddings/auxs/get_information_schema.sql'
# Database and schema whose metadata is extracted.
DB_NAME = 'adventure_works_dw'
SCHEMA_NAME = 'sales'


# Render the query for the single database/schema above and run it through the
# project's `execute_query` helper.
information_schema_data = execute_query(get_information_schema(
    query_path= GET_INFORMATION_SCHEMA_SQL,
    db_names_list= [DB_NAME],
    schema_names_list= [SCHEMA_NAME]
))

Veamos el aspecto que tienen los resultados de nuestra query:

In [5]:
# Display the raw query result as the cell output.
information_schema_data

[{'db_name': 'adventure_works_dw',
  'schema_name': 'sales',
  'table_name': 'dim_customer',
  'column_name': 'customer_key',
  'column_type': 'INT4',
  'primary_key': True,
  'foreign_key': False,
  'target': None},
 {'db_name': 'adventure_works_dw',
  'schema_name': 'sales',
  'table_name': 'dim_customer',
  'column_name': 'geography_key',
  'column_type': 'INT4',
  'primary_key': False,
  'foreign_key': True,
  'target': 'sales.dim_geography.geography_key'},
 {'db_name': 'adventure_works_dw',
  'schema_name': 'sales',
  'table_name': 'dim_customer',
  'column_name': 'customer_full_name',
  'column_type': 'TEXT',
  'primary_key': False,
  'foreign_key': False,
  'target': None},
 {'db_name': 'adventure_works_dw',
  'schema_name': 'sales',
  'table_name': 'dim_customer',
  'column_name': 'birth_date',
  'column_type': 'DATE',
  'primary_key': False,
  'foreign_key': False,
  'target': None},
 {'db_name': 'adventure_works_dw',
  'schema_name': 'sales',
  'table_name': 'dim_customer',
 

Lo convertimos en un Data Frame de Pandas para que sea más vistoso:

In [6]:
# Render the same rows as a DataFrame (tabular view) as the cell output.
pd.DataFrame(information_schema_data)

Unnamed: 0,db_name,schema_name,table_name,column_name,column_type,primary_key,foreign_key,target
0,adventure_works_dw,sales,dim_customer,customer_key,INT4,True,False,
1,adventure_works_dw,sales,dim_customer,geography_key,INT4,False,True,sales.dim_geography.geography_key
2,adventure_works_dw,sales,dim_customer,customer_full_name,TEXT,False,False,
3,adventure_works_dw,sales,dim_customer,birth_date,DATE,False,False,
4,adventure_works_dw,sales,dim_customer,marital_status,BPCHAR(1),False,False,
...,...,...,...,...,...,...,...,...
107,adventure_works_dw,sales,fact_sales,freight,NUMERIC,False,False,
108,adventure_works_dw,sales,fact_sales,order_date,DATE,False,False,
109,adventure_works_dw,sales,fact_sales,due_date,DATE,False,False,
110,adventure_works_dw,sales,fact_sales,ship_date,DATE,False,False,


Ahora procederemos a crear el YAML base con el MDL de nuestro esquema, que podremos tomar como punto de partida para luego añadirle metadata extra manualmente:

In [7]:
# Nest the flat information_schema rows as: database -> schema -> table -> [column rows].
dbs_data = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))

for row in information_schema_data:
    db_name, schema_name, table_name = row.get('db_name'), row.get('schema_name'), row.get('table_name')
    dbs_data[db_name][schema_name][table_name].append(row)

# Write one base MDL YAML file per database. Each level is assembled bottom-up
# (columns, then table, then schema, then database); the OrderedDicts fix the
# key order of the resulting document and every description gets the placeholder.
for db_name, schemas_data in dbs_data.items():
    schemas = []

    for schema_name, tables_data in schemas_data.items():
        tables = []

        for table_name, columns_data in tables_data.items():
            columns = []

            for column_data in columns_data:
                column = OrderedDict([
                    (NAME, column_data.get('column_name')),
                    (DESCRIPTION, TO_DO),
                    (DATA_TYPE, column_data.get('column_type')),
                ])

                # Key flags are only written when they are set.
                if column_data.get('primary_key'):
                    column[PRIMARY_KEY] = True

                if column_data.get('foreign_key'):
                    column[FOREIGN_KEY] = True
                    column[REFERENCE] = column_data.get('target')

                columns.append(column)

            table = OrderedDict([
                (NAME, table_name),
                (DESCRIPTION, TO_DO),
                (COLUMNS, columns),
            ])
            tables.append(table)

        schema = OrderedDict([
            (NAME, schema_name),
            (DESCRIPTION, TO_DO),
            (TABLES, tables),
        ])
        schemas.append(schema)

    db = OrderedDict([
        (DATABASE, db_name),
        (DESCRIPTION, TO_DO),
        (SCHEMAS, schemas),
    ])

    mdl_file_path = f'{OUTPUT_PATH}/MDL_{db_name}.yaml'
    with open(mdl_file_path, 'w') as mdl:
        mdl.write(format_yaml(yaml.dump(db)))

    print(f'>  Fichero MDL base almacenado en {mdl_file_path}')

>  Fichero MDL base almacenado en ../data/embeddings/auxs/MDL_adventure_works_dw.yaml


### Creación de embeddings

Tomando como base el fichero obtenido en el apartado anterior, se ha creado un fichero con el MDL de nuestro esquema de interés, nutrido con metadata adicional, como descripciones para cada tabla y campo. Este fichero se encuentra en la ruta `/data/embeddings/documents/MDL_adventure_works_dw.yaml`.

Procederemos ahora a procesar este fichero para crear chunks de la información de cada una de las tablas y almacenar sus embeddings en nuestra base de datos vectorial de Chroma DB, persistida en la ruta `/data/embeddings/chroma/`.

Procedemos ahora a definir funciones que nos permitirán chunkear el documento MDL tanto desde un enfoque más integral, a nivel de tablas, como desde un enfoque más granular, a nivel de columnas:

In [9]:
def chunk_mdl_by_table(mdl_data: dict) -> list[Document]:
    """
    Build one full-detail Document per table of an MDL (model description) dictionary.

    The text of each Document lists the database and schema (name and description),
    the table (name and description) and one line per column with its data type,
    description and, when flagged, its primary/foreign key role.

    Args:
        mdl_data (dict): The MDL structure. Expected keys:
            - DATABASE: name of the database
            - DESCRIPTION: description of the database
            - SCHEMAS: list of schemas, each a dict with:
                - NAME, DESCRIPTION
                - TABLES: list of tables, each a dict with:
                    - NAME, DESCRIPTION
                    - COLUMNS: list of columns, each a dict with:
                        - NAME, DATA_TYPE, DESCRIPTION
                        - PRIMARY_KEY (optional): truthy if the column is a primary key
                        - FOREIGN_KEY (optional): truthy if the column is a foreign key
                        - REFERENCE: target of the foreign key (required when
                          FOREIGN_KEY is truthy)

    Returns:
        list[Document]: One Document per table, where:
            - page_content: the formatted table description
            - metadata: source, chunk_type ('table_full'), database/schema/table
                        names, number of columns and the primary keys, foreign keys
                        and column summaries as JSON-encoded strings.

    Raises:
        KeyError: If a required key is missing from the MDL structure.
    """

    db_name = mdl_data[DATABASE]
    db_description = mdl_data[DESCRIPTION]

    documents = []

    for schema in mdl_data[SCHEMAS]:
        schema_name, schema_description = schema[NAME], schema[DESCRIPTION]

        for table in schema[TABLES]:
            table_name, table_description, columns = table[NAME], table[DESCRIPTION], table[COLUMNS]

            lines = [
                f'Database: {db_name}',
                f'Database description: {db_description}',
                f'Schema: {schema_name}',
                f'Schema description: {schema_description}',
                f'Table: {table_name}',
                f'Table description: {table_description}',
                'Columns:',
            ]

            pk_columns, fk_columns, column_summaries = [], [], []

            for column in columns:
                name, data_type, description = column[NAME], column[DATA_TYPE], column[DESCRIPTION]
                column_summaries.append({'name': name, 'data_type': data_type})

                # Key markers are appended to the column line, primary key first.
                key_markers = ''

                if column.get(PRIMARY_KEY):
                    pk_columns.append(name)
                    key_markers += ' (PRIMARY KEY)'

                if column.get(FOREIGN_KEY):
                    reference = column[REFERENCE]
                    fk_columns.append({'column_name': name, 'reference': reference})
                    key_markers += f' (FOREIGN KEY, reference: {reference})'

                lines.append(f'- {name} ({data_type}): {description}{key_markers}')

            # List-valued metadata is stored as JSON text.
            metadata = {
                'source': 'MDL Document',
                'chunk_type': 'table_full',
                'database_name': db_name,
                'schema_name': schema_name,
                'table_name': table_name,
                'num_columns': len(columns),
                'table_primary_key': json.dumps(pk_columns),
                'table_foreign_keys': json.dumps(fk_columns),
                'columns_info': json.dumps(column_summaries),
            }

            documents.append(Document(page_content= '\n'.join(lines), metadata= metadata))

    return documents



def chunk_mdl_by_table_summary(mdl_data: dict) -> list[Document]:
    """
    Build one summary Document per table of an MDL (model description) dictionary.

    The text of each Document contains the database, schema and table names, the
    table description and, when present, the table's primary key columns and its
    foreign keys as (column name, reference) pairs. Non-key columns are not listed.

    Args:
        mdl_data (dict): The MDL structure, with the same layout expected by
            `chunk_mdl_by_table`.

    Returns:
        list[Document]: One Document per table; its metadata holds the database,
            schema and table names.

    Raises:
        KeyError: If a required key is missing from the MDL structure.
    """
    db_name = mdl_data[DATABASE]
    documents = []

    for schema in mdl_data[SCHEMAS]:
        schema_name = schema[NAME]

        for table in schema[TABLES]:
            table_name, table_description, columns = table[NAME], table[DESCRIPTION], table[COLUMNS]

            lines = [
                f'Database: {db_name}',
                f'Schema: {schema_name}',
                f'Table: {table_name}',
                f'Table description: {table_description}',
            ]

            pk_names = [column[NAME] for column in columns if column.get(PRIMARY_KEY)]
            if pk_names:
                lines.append('Table PRIMARY KEY:')
                lines.extend(f'- {pk_name}' for pk_name in pk_names)

            fk_pairs = [(column[NAME], column[REFERENCE]) for column in columns if column.get(FOREIGN_KEY)]
            if fk_pairs:
                lines.append('Table FOREIGN KEYS (Column name, Reference):')
                lines.extend(f'- ({fk_column}, {fk_reference})' for fk_column, fk_reference in fk_pairs)

            documents.append(Document(
                page_content= '\n'.join(lines),
                metadata= {
                    'database_name': db_name,
                    'schema_name': schema_name,
                    'table_name': table_name,
                },
            ))

    return documents



def chunk_mdl_by_table_details(mdl_data: dict) -> list[Document]:
    """
    Build one column-detail Document per table of an MDL (model description) dictionary.

    The text of each Document contains the database, schema and table names followed
    by one line per column with its data type, description and, when flagged, its
    primary/foreign key role. Database, schema and table descriptions are left out.

    Args:
        mdl_data (dict): The MDL structure, with the same layout expected by
            `chunk_mdl_by_table`.

    Returns:
        list[Document]: One Document per table; its metadata holds the database,
            schema and table names.

    Raises:
        KeyError: If a required key is missing from the MDL structure.
    """
    db_name = mdl_data[DATABASE]
    documents = []

    for schema in mdl_data[SCHEMAS]:
        schema_name = schema[NAME]

        for table in schema[TABLES]:
            table_name, columns = table[NAME], table[COLUMNS]

            lines = [
                f'Database: {db_name}',
                f'Schema: {schema_name}',
                f'Table: {table_name}',
                'Columns:',
            ]

            for column in columns:
                name, data_type, description = column[NAME], column[DATA_TYPE], column[DESCRIPTION]

                # Key markers are appended to the column line, primary key first.
                key_markers = ''
                if column.get(PRIMARY_KEY):
                    key_markers += ' (PRIMARY KEY)'
                if column.get(FOREIGN_KEY):
                    key_markers += f' (FOREIGN KEY, reference: {column[REFERENCE]})'

                lines.append(f'- {name} ({data_type}): {description}{key_markers}')

            documents.append(Document(
                page_content= '\n'.join(lines),
                metadata= {
                    'database_name': db_name,
                    'schema_name': schema_name,
                    'table_name': table_name,
                },
            ))

    return documents



def chunk_mdl_by_column(mdl_data: dict) -> list[Document]:
    """
    Build one Document per non-key column of an MDL (model description) dictionary.

    Columns flagged as primary or foreign key do not get a Document of their own.
    The text of each Document starts with the table context (database, schema and
    table names plus the table's primary key columns and foreign key pairs, when
    present) and ends with the column's name, data type and description.

    Args:
        mdl_data (dict): The MDL structure, with the same layout expected by
            `chunk_mdl_by_table`.

    Returns:
        list[Document]: One Document per non-key column, where:
            - page_content: table context followed by the column details
            - metadata: source, chunk_type ('column_slice'), database, schema,
                        table and column names, and the column data type.

    Raises:
        KeyError: If a required key is missing from the MDL structure.
    """

    db_name = mdl_data[DATABASE]
    documents = []

    for schema in mdl_data[SCHEMAS]:
        schema_name = schema[NAME]

        for table in schema[TABLES]:
            table_name, columns = table[NAME], table[COLUMNS]

            # Context shared by every column Document of this table.
            table_context = [
                f'Database: {db_name}',
                f'Schema: {schema_name}',
                f'Table: {table_name}',
            ]

            pk_names = [column[NAME] for column in columns if column.get(PRIMARY_KEY)]
            if pk_names:
                table_context.append('Table PRIMARY KEY:')
                table_context.extend(f'- {pk_name}' for pk_name in pk_names)

            fk_pairs = [(column[NAME], column[REFERENCE]) for column in columns if column.get(FOREIGN_KEY)]
            if fk_pairs:
                table_context.append('Table FOREIGN KEYS (Column name, Reference):')
                table_context.extend(f'- ({fk_column}, {fk_reference})' for fk_column, fk_reference in fk_pairs)

            for column in columns:
                # Key columns are already described by the shared table context.
                if column.get(PRIMARY_KEY) or column.get(FOREIGN_KEY):
                    continue

                name, data_type, description = column[NAME], column[DATA_TYPE], column[DESCRIPTION]

                lines = [
                    *table_context,
                    f'Column name: {name}',
                    f'Column data type: {data_type}',
                    f'Column description: {description}',
                ]

                documents.append(Document(
                    page_content= '\n'.join(lines),
                    metadata= {
                        'source': 'MDL Document',
                        'chunk_type': 'column_slice',
                        'database_name': db_name,
                        'schema_name': schema_name,
                        'table_name': table_name,
                        'column_name': name,
                        'column_data_type': data_type,
                    },
                ))

    return documents



def chunk_mdl_by_column_reduced(mdl_data: dict) -> list[Document]:
    """Build one minimal chunk per non-key column of the MDL document.

    Columns flagged as primary key or foreign key are skipped. For every
    remaining column the chunk text holds only the column name, data type
    and description; the database, schema and table names are stored in the
    chunk metadata instead of the text.

    Args:
        mdl_data: Parsed MDL document (database -> schemas -> tables -> columns).

    Returns:
        The generated ``Document`` chunks, in document order.

    Raises:
        KeyError: If a required key is missing from the MDL structure.
    """
    db_name = mdl_data[DATABASE]
    documents = []

    for schema_entry in mdl_data[SCHEMAS]:
        schema_name = schema_entry[NAME]

        for table_entry in schema_entry[TABLES]:
            table_name = table_entry[NAME]

            for column in table_entry[COLUMNS]:
                # Key columns get no chunk of their own in this collection.
                is_key_column = column.get(PRIMARY_KEY) or column.get(FOREIGN_KEY)
                if is_key_column:
                    continue

                name = column[NAME]
                data_type = column[DATA_TYPE]
                description = column[DESCRIPTION]

                text = (
                    f'Column name: {name}\n'
                    f'Column data type: {data_type}\n'
                    f'Column description: {description}'
                )

                documents.append(
                    Document(
                        page_content=text,
                        metadata={
                            'database_name': db_name,
                            'schema_name': schema_name,
                            'table_name': table_name,
                            'column_name': name,
                            'column_data_type': data_type,
                        },
                    )
                )

    return documents


Veamos ahora el aspecto que tiene, por ejemplo, el chunk de mayor tamaño para cada uno de los tipos:

In [10]:
# Load the MDL document from MDL_PATH.
with open(MDL_PATH, 'r', encoding='utf-8') as mdl_file:
    mdl_data = yaml.safe_load(mdl_file)

chunks = chunk_mdl_by_table(mdl_data)

# max() returns the first chunk among those tied for the greatest length.
longest_chunk = max(chunks, key=lambda chunk: len(chunk.page_content))
max_len = len(longest_chunk.page_content)

# The variable name matters here: the `=` specifier prints it as "max_len=<value>".
print(f'{max_len=}')
print()
print(longest_chunk.page_content)
print()
print(longest_chunk.metadata)

max_len=5145

Database: adventure_works_dw
Database description: AdventureWorks es una base de datos de ejemplo facilitada por Microsoft. Esta base de datos admite escenarios estándar de procesamiento de transacciones en línea para un fabricante de bicicletas ficticio: Adventure Works Cycles.
Schema: sales
Schema description: Se trata de un esquema de ventas con espíritu de simplificar la información del módulo de ventas de la BBDD original. Contamos con información de ventas (tanto por internet como de retail), y las dimensiones asociadas (promociones, clientes, distribuidores, vendedores, porductos, etc.).
Table: fact_sales
Table description: Tabla de hechos que contiene el detalle de las ordenes de ventas que ya han sido entregadas, con una granularidad a nivel de línea, mostrando siempre la última versión de cada pedido. SIEMPRE que se soliciten datos de ventas, como cantidades vendidas, importe de ventas, costes de ventas, impuestos, costes de envío, deberán ser obtenidos de esta 

In [11]:
# Load the MDL document from MDL_PATH.
with open(MDL_PATH, 'r', encoding='utf-8') as mdl_file:
    mdl_data = yaml.safe_load(mdl_file)

chunks = chunk_mdl_by_column(mdl_data)

# max() returns the first chunk among those tied for the greatest length.
longest_chunk = max(chunks, key=lambda chunk: len(chunk.page_content))
max_len = len(longest_chunk.page_content)

# The variable name matters here: the `=` specifier prints it as "max_len=<value>".
print(f'{max_len=}')
print()
print(longest_chunk.page_content)
print()
print(longest_chunk.metadata)

max_len=762

Database: adventure_works_dw
Schema: sales
Table: fact_sales
Table PRIMARY KEY:
- sales_order_number
- sales_order_line_number
Table FOREIGN KEYS (Column name, Reference):
- (product_key, sales.dim_product.product_key)
- (reseller_key, sales.dim_reseller.reseller_key)
- (employee_key, sales.dim_sales_person.employee_key)
- (customer_key, sales.dim_customer.customer_key)
- (promotion_key, sales.dim_promotion.promotion_key)
- (sales_territory_key, sales.dim_sales_territory.sales_territory_key)
Column name: sale_source
Column data type: TEXT
Column description: Indicador de la fuente por la que ha sido realizado el pedido. reseller_sales=Pedido realizado por un vendedor para una tienda/distribuidor, internet_sales=Pedido realizado en línea por un cliente.

{'source': 'MDL Document', 'chunk_type': 'column_slice', 'database_name': 'adventure_works_dw', 'schema_name': 'sales', 'table_name': 'fact_sales', 'column_name': 'sale_source', 'column_data_type': 'TEXT'}


In [12]:
# Load the MDL document from MDL_PATH.
with open(MDL_PATH, 'r', encoding='utf-8') as mdl_file:
    mdl_data = yaml.safe_load(mdl_file)

chunks = chunk_mdl_by_table_summary(mdl_data)

# max() returns the first chunk among those tied for the greatest length.
longest_chunk = max(chunks, key=lambda chunk: len(chunk.page_content))
max_len = len(longest_chunk.page_content)

# The variable name matters here: the `=` specifier prints it as "max_len=<value>".
print(f'{max_len=}')
print()
print(longest_chunk.page_content)
print()
print(longest_chunk.metadata)

max_len=1020

Database: adventure_works_dw
Schema: sales
Table: fact_sales
Table description: Tabla de hechos que contiene el detalle de las ordenes de ventas que ya han sido entregadas, con una granularidad a nivel de línea, mostrando siempre la última versión de cada pedido. SIEMPRE que se soliciten datos de ventas, como cantidades vendidas, importe de ventas, costes de ventas, impuestos, costes de envío, deberán ser obtenidos de esta tabla. Permite hacer desgloses a nivel de productos, clientes, tiendas/distribuidores, división territorial, promociones aplicadas y vendedores involucrados.
Table PRIMARY KEY:
- sales_order_number
- sales_order_line_number
Table FOREIGN KEYS (Column name, Reference):
- (product_key, sales.dim_product.product_key)
- (reseller_key, sales.dim_reseller.reseller_key)
- (employee_key, sales.dim_sales_person.employee_key)
- (customer_key, sales.dim_customer.customer_key)
- (promotion_key, sales.dim_promotion.promotion_key)
- (sales_territory_key, sales.dim_sa

In [13]:
# Load the MDL document from MDL_PATH.
with open(MDL_PATH, 'r', encoding='utf-8') as mdl_file:
    mdl_data = yaml.safe_load(mdl_file)

chunks = chunk_mdl_by_table_details(mdl_data)

# max() returns the first chunk among those tied for the greatest length.
longest_chunk = max(chunks, key=lambda chunk: len(chunk.page_content))
max_len = len(longest_chunk.page_content)

# The variable name matters here: the `=` specifier prints it as "max_len=<value>".
print(f'{max_len=}')
print()
print(longest_chunk.page_content)
print()
print(longest_chunk.metadata)

max_len=4062

Database: adventure_works_dw
Schema: sales
Table: fact_sales
Columns:
- product_key (INT4): Clave foránea que apunta al producto que ha sido vendido en la tabla `dim_product`. (FOREIGN KEY, reference: sales.dim_product.product_key)
- reseller_key (INT4): Clave foránea que apunta a la tienda/distribuidor al que un vendedor le ha vendido el producto en la tabla `dim_reseller`. Tendrá un valor indicado únicamente para aquellos pedidos realizados por un vendedor para una tienda/distribuidor, indicadas en la columna `sales.fact_sales.sale_source` con el valor "reseller_sales", por el contrario, vendrá a NULL. (FOREIGN KEY, reference: sales.dim_reseller.reseller_key)
- employee_key (INT4): Clave foránea que apunta al vendedor que ha vendido el producto a una tienda/distribuidor en la tabla `dim_sales_person`. Tendrá un valor indicado únicamente para aquellos pedidos realizados por un vendedor para una tienda/distribuidor, indicadas en la columna `sales.fact_sales.sale_source` c

In [14]:
# Load the MDL document from MDL_PATH.
with open(MDL_PATH, 'r', encoding='utf-8') as mdl_file:
    mdl_data = yaml.safe_load(mdl_file)

chunks = chunk_mdl_by_column_reduced(mdl_data)

# max() returns the first chunk among those tied for the greatest length.
longest_chunk = max(chunks, key=lambda chunk: len(chunk.page_content))
max_len = len(longest_chunk.page_content)

# The variable name matters here: the `=` specifier prints it as "max_len=<value>".
print(f'{max_len=}')
print()
print(longest_chunk.page_content)
print()
print(longest_chunk.metadata)

max_len=265

Column name: sale_source
Column data type: TEXT
Column description: Indicador de la fuente por la que ha sido realizado el pedido. reseller_sales=Pedido realizado por un vendedor para una tienda/distribuidor, internet_sales=Pedido realizado en línea por un cliente.

{'database_name': 'adventure_works_dw', 'schema_name': 'sales', 'table_name': 'fact_sales', 'column_name': 'sale_source', 'column_data_type': 'TEXT'}


Ahora procedemos a crear las colecciones de Chroma DB y a almacenar en cada una de ellas los embeddings de los chunks obtenidos:

In [15]:
### CURRENT
# Rebuild the three collections used by the current retrieval flow:
# table summaries, table details and reduced column chunks.
chroma_client = chromadb.PersistentClient(path= CHROMA_DB_PATH)
embeddings = AzureOpenAIEmbeddings(model=AZURE_OPENAI_EMBEDDING_MODEL)

with open(MDL_PATH, 'r', encoding='utf-8') as mdl_file:
    mdl_data = yaml.safe_load(mdl_file)


def _recreate_chroma_collection(client, collection_name, embedding_function):
    """Drop ``collection_name`` if possible and return a fresh LangChain ``Chroma`` wrapper.

    Args:
        client: Chroma client that owns the collection.
        collection_name: Name of the collection to drop and create again.
        embedding_function: Embeddings object handed to the ``Chroma`` wrapper.

    Returns:
        A ``Chroma`` vector store bound to ``collection_name`` and configured
        with the cosine HNSW space.
    """
    try:
        client.delete_collection(collection_name)
    except Exception as exc:
        # Best effort: the usual cause is that the collection does not exist
        # yet. The exception type raised for that case depends on the chromadb
        # version, so the reason is reported rather than silently discarded.
        # Unlike a bare `except:`, this does not swallow KeyboardInterrupt or
        # SystemExit.
        print(f'Collection {collection_name!r} was not deleted ({exc!r}); continuing.')

    return Chroma(
        client=client,
        collection_name=collection_name,
        embedding_function=embedding_function,
        collection_configuration={'hnsw': {'space': 'cosine'}},
        create_collection_if_not_exists=True,
    )


# Each collection is only (re)built when its chunker produced something, so an
# empty MDL document leaves any existing collection untouched.
table_summary_chunks = chunk_mdl_by_table_summary(mdl_data)
if table_summary_chunks:
    table_summary_collection = _recreate_chroma_collection(
        chroma_client, TABLES_SUMMARY_COLLECTION_NAME, embeddings
    )
    add_docs_to_chroma_col(table_summary_chunks, table_summary_collection)


table_detail_chunks = chunk_mdl_by_table_details(mdl_data)
if table_detail_chunks:
    table_details_collection = _recreate_chroma_collection(
        chroma_client, TABLES_DETAILS_COLLECTION_NAME, embeddings
    )
    add_docs_to_chroma_col(table_detail_chunks, table_details_collection)


column_reduced_chunks = chunk_mdl_by_column_reduced(mdl_data)
if column_reduced_chunks:
    column_reduced_collection = _recreate_chroma_collection(
        chroma_client, COLUMNS_REDUCED_COLLECTION_NAME, embeddings
    )
    add_docs_to_chroma_col(column_reduced_chunks, column_reduced_collection)


In [None]:
### LEGACY
# Builds the two legacy collections (full-table chunks and per-column chunks)
# directly through the chromadb client.
chroma_client = chromadb.PersistentClient(path= CHROMA_DB_PATH)
openai_ef = get_azure_openai_embedding_function()

with open(MDL_PATH, 'r', encoding='utf-8') as mdl_file:
    mdl_data = yaml.safe_load(mdl_file)

# Keyword options shared by both legacy collections.
legacy_collection_options = {
    'clean_setup': True,
    'embedding_function': openai_ef,
    'space_metric': 'cosine',
}

table_chunks = chunk_mdl_by_table(mdl_data)
if table_chunks:
    tables_collection = setup_chromadb_collection(
        chroma_client,
        TABLES_COLLECTION_NAME,
        **legacy_collection_options
    )
    store_chunks_in_chorma_db(table_chunks, tables_collection, use_collection_ef=True)

columns_chunk = chunk_mdl_by_column(mdl_data)
if columns_chunk:
    columns_collection = setup_chromadb_collection(
        chroma_client,
        COLUMNS_COLLECTION_NAME,
        **legacy_collection_options
    )
    store_chunks_in_chorma_db(columns_chunk, columns_collection, use_collection_ef=True)



Lanzamos ahora una consulta a cada una de las colecciones creadas:

In [None]:
# Natural-language question (in Spanish) used as the search query against the
# collections and as the user question for the summarizer.
QUERY = 'Los 10 artículos más comprados.'

In [174]:
# Stage 1: retrieve the table summaries most relevant to the query.
table_summary_results = table_summary_collection.similarity_search_with_relevance_scores(
    query= QUERY,
    k= 5
)

# Stage 2: for each retrieved table, search the reduced column chunks that
# belong to that same table (matched on database, schema and table name).
relevant_columns = []
for table_doc, table_score in table_summary_results:
    table_metadata = table_doc.metadata

    table_filter = {
        "$and": [
            {"database_name": {"$eq": table_metadata['database_name']}},
            {"schema_name": {"$eq": table_metadata['schema_name']}},
            {"table_name": {"$eq": table_metadata['table_name']}}
        ]
    }

    column_reduced_results = column_reduced_collection.similarity_search_with_relevance_scores(
        query= QUERY,
        k=20,
        filter=table_filter
    )

    relevant_columns.append({
        'table_summary': {
            'content': table_doc.page_content,
            'relevance_score': table_score
        },
        'columns': [
            {'content': column_doc.page_content, 'relevance_score': column_score}
            for column_doc, column_score in column_reduced_results
        ]
    })


# ensure_ascii=False keeps accented characters readable in the preview instead
# of escaping them as \uXXXX sequences.
print(json.dumps(relevant_columns, indent=2, ensure_ascii=False))

[
  {
    "table_summary": {
      "content": "Database: adventure_works_dw\nSchema: sales\nTable: fact_sales\nTable description: Tabla de hechos que contiene el detalle de las ordenes de ventas que ya han sido entregadas, con una granularidad a nivel de l\u00ednea, mostrando siempre la \u00faltima versi\u00f3n de cada pedido. SIEMPRE que se soliciten datos de ventas, como cantidades vendidas, importe de ventas, costes de ventas, impuestos, costes de env\u00edo, deber\u00e1n ser obtenidos de esta tabla. Permite hacer desgloses a nivel de productos, clientes, tiendas/distribuidores, divisi\u00f3n territorial, promociones aplicadas y vendedores involucrados.\nTable PRIMARY KEY:\n- sales_order_number\n- sales_order_line_number\nTable FOREIGN KEYS (Column name, Reference):\n- (product_key, sales.dim_product.product_key)\n- (reseller_key, sales.dim_reseller.reseller_key)\n- (employee_key, sales.dim_sales_person.employee_key)\n- (customer_key, sales.dim_customer.customer_key)\n- (promotion_k

In [176]:
# Show the plain Python repr of the retrieval results held in `relevant_columns`.
print(relevant_columns)

[{'table_summary': {'content': 'Database: adventure_works_dw\nSchema: sales\nTable: fact_sales\nTable description: Tabla de hechos que contiene el detalle de las ordenes de ventas que ya han sido entregadas, con una granularidad a nivel de línea, mostrando siempre la última versión de cada pedido. SIEMPRE que se soliciten datos de ventas, como cantidades vendidas, importe de ventas, costes de ventas, impuestos, costes de envío, deberán ser obtenidos de esta tabla. Permite hacer desgloses a nivel de productos, clientes, tiendas/distribuidores, división territorial, promociones aplicadas y vendedores involucrados.\nTable PRIMARY KEY:\n- sales_order_number\n- sales_order_line_number\nTable FOREIGN KEYS (Column name, Reference):\n- (product_key, sales.dim_product.product_key)\n- (reseller_key, sales.dim_reseller.reseller_key)\n- (employee_key, sales.dim_sales_person.employee_key)\n- (customer_key, sales.dim_customer.customer_key)\n- (promotion_key, sales.dim_promotion.promotion_key)\n- (sa

In [178]:
# ChatPromptTemplate and StrOutputParser live in `langchain_core`; the
# `langchain.prompts` / `langchain.schema` paths are legacy re-exports.
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import AzureChatOpenAI

# LLM model
llm = AzureChatOpenAI(model='gpt-4o-mini', temperature=0)

# Prompt used to summarize the retrieved context. The template text is sent to
# the model as-is, so it stays in Spanish.
summarizer_prompt = ChatPromptTemplate.from_template("""
Eres un asistente experto en bases de datos y SQL.
Tu tarea es generar un resumen conciso y relevante de la información dada sobre tablas y columnas,
teniendo en cuenta la pregunta del usuario.

# Pregunta del usuario:
{user_query}

# Datos recuperados:
{retrieved_docs}

Instrucciones para el resumen:
- Incluye solo las tablas y columnas necesarias para responder la pregunta.
- Mantén descripciones útiles para entender qué almacena cada columna.
- Conserva información clave sobre relaciones entre tablas si es necesaria.
- Elimina cualquier detalle irrelevante.
- Sé claro y breve para que otro agente pueda usar este resumen para crear una consulta SQL.
- Responde en español.
""")

# Final chain: prompt -> chat model -> plain string output
summarizer_chain = summarizer_prompt | llm | StrOutputParser()

# Usage example
contexto_resumido = summarizer_chain.invoke({
    'user_query': QUERY,
    'retrieved_docs': relevant_columns  # the full list of retrieval results is passed here
})

print(contexto_resumido)

Para obtener los 10 artículos más comprados, se utilizarán las siguientes tablas y columnas de la base de datos `adventure_works_dw`:

### Tablas relevantes:

1. **Tabla: fact_sales**
   - **Descripción**: Contiene el detalle de las órdenes de ventas entregadas, con información a nivel de línea.
   - **Columnas relevantes**:
     - **order_quantity** (INT2): Cantidad vendida de cada producto.
     - **product_key** (FK): Clave del producto, necesaria para relacionar con la tabla de productos.

2. **Tabla: dim_product**
   - **Descripción**: Almacena información detallada de cada producto vendido.
   - **Columnas relevantes**:
     - **product_key** (PK): Clave del producto.
     - **english_product_name** (VARCHAR(50)): Nombre del producto en inglés.
     - **list_price** (NUMERIC): Precio de venta del producto en USD.

### Relación entre tablas:
- La columna `product_key` en `fact_sales` se relaciona con `product_key` en `dim_product` para obtener el nombre y precio de los productos v

In [89]:
# Query the legacy full-table collection and print every hit.
tables_results = tables_collection.query(
    query_texts= 'Los 10 productos más vendidos.',
    n_results= 3,
    include= ['documents', 'metadatas', 'distances']
)

# Each field is indexed with [0] to take the hits of the single query text.
table_hit_ids = tables_results.get('ids', [[]])[0]
table_hit_documents = tables_results.get('documents', [[]])[0]
table_hit_metadatas = tables_results.get('metadatas', [[]])[0]
table_hit_distances = tables_results.get('distances', [[]])[0]

for _, hit_document, hit_metadata, hit_distance in zip(
    table_hit_ids, table_hit_documents, table_hit_metadatas, table_hit_distances
):
    print(f'Documento:\n{hit_document}\n')

    print('Metadata:')
    display(hit_metadata)
    print()

    print(f'distance:\n{hit_distance}\n')

    print('#'*50 + '\n')


Documento:
Database: adventure_works_dw
Database description: AdventureWorks es una base de datos de ejemplo facilitada por Microsoft. Esta base de datos admite escenarios estándar de procesamiento de transacciones en línea para un fabricante de bicicletas ficticio: Adventure Works Cycles.
Schema: sales
Schema description: Se trata de un esquema de ventas con espíritu de simplificar la información del módulo de ventas de la BBDD original. Contamos con información de ventas (tanto por internet como de retail), y las dimensiones asociadas (promociones, clientes, distribuidores, vendedores, porductos, etc.).
Table: dim_product
Table description: Tabla de dimensión que almacena la información detallada de cada producto que alguna vez haya sido vendido.
Columns:
- product_key (INT4): Clave primaria para identificar de forma única a cada producto. (PRIMARY KEY)
- weight_unit_measure_code (BPCHAR(3)): Unidad de medida de peso. Algunos artículos no lo tienen configurado y viene a NULL.
- size_

{'schema_name': 'sales',
 'num_columns': 28,
 'table_name': 'dim_product',
 'columns_info': '[{"name": "product_key", "data_type": "INT4"}, {"name": "weight_unit_measure_code", "data_type": "BPCHAR(3)"}, {"name": "size_unit_measure_code", "data_type": "BPCHAR(3)"}, {"name": "english_product_name", "data_type": "VARCHAR(50)"}, {"name": "spanish_product_name", "data_type": "VARCHAR(50)"}, {"name": "standard_cost", "data_type": "NUMERIC"}, {"name": "finished_goods_flag", "data_type": "BOOL"}, {"name": "color", "data_type": "VARCHAR(15)"}, {"name": "safety_stock_level", "data_type": "INT2"}, {"name": "reorder_point", "data_type": "INT2"}, {"name": "list_price", "data_type": "NUMERIC"}, {"name": "size", "data_type": "VARCHAR(50)"}, {"name": "size_range", "data_type": "VARCHAR(50)"}, {"name": "weight", "data_type": "FLOAT8"}, {"name": "days_to_manufacture", "data_type": "INT4"}, {"name": "product_line", "data_type": "BPCHAR(2)"}, {"name": "dealer_price", "data_type": "NUMERIC"}, {"name": "cl


distance:
0.6944389939308167

##################################################

Documento:
Database: adventure_works_dw
Database description: AdventureWorks es una base de datos de ejemplo facilitada por Microsoft. Esta base de datos admite escenarios estándar de procesamiento de transacciones en línea para un fabricante de bicicletas ficticio: Adventure Works Cycles.
Schema: sales
Schema description: Se trata de un esquema de ventas con espíritu de simplificar la información del módulo de ventas de la BBDD original. Contamos con información de ventas (tanto por internet como de retail), y las dimensiones asociadas (promociones, clientes, distribuidores, vendedores, porductos, etc.).
Table: fact_sales
Table description: Tabla de hechos que contiene el detalle de las ordenes de ventas que ya han sido entregadas, con una granularidad a nivel de línea, mostrando siempre la última versión de cada pedido.
Columns:
- product_key (INT4): Clave foránea que apunta al producto que ha sido ven

{'table_foreign_keys': '[{"column_name": "product_key", "reference": "sales.dim_product.product_key"}, {"column_name": "reseller_key", "reference": "sales.dim_reseller.reseller_key"}, {"column_name": "employee_key", "reference": "sales.dim_sales_person.employee_key"}, {"column_name": "customer_key", "reference": "sales.dim_customer.customer_key"}, {"column_name": "promotion_key", "reference": "sales.dim_promotion.promotion_key"}, {"column_name": "sales_territory_key", "reference": "sales.dim_sales_territory.sales_territory_key"}]',
 'chunk_type': 'table_full',
 'table_name': 'fact_sales',
 'schema_name': 'sales',
 'num_columns': 22,
 'database_name': 'adventure_works_dw',
 'source': 'MDL Document',
 'table_primary_key': '["sales_order_number", "sales_order_line_number"]',
 'columns_info': '[{"name": "product_key", "data_type": "INT4"}, {"name": "reseller_key", "data_type": "INT4"}, {"name": "employee_key", "data_type": "INT4"}, {"name": "customer_key", "data_type": "INT4"}, {"name": "p


distance:
0.7193505764007568

##################################################

Documento:
Database: adventure_works_dw
Database description: AdventureWorks es una base de datos de ejemplo facilitada por Microsoft. Esta base de datos admite escenarios estándar de procesamiento de transacciones en línea para un fabricante de bicicletas ficticio: Adventure Works Cycles.
Schema: sales
Schema description: Se trata de un esquema de ventas con espíritu de simplificar la información del módulo de ventas de la BBDD original. Contamos con información de ventas (tanto por internet como de retail), y las dimensiones asociadas (promociones, clientes, distribuidores, vendedores, porductos, etc.).
Table: dim_promotion
Table description: Tabla de dimensión que almacena la información detallada de las diferentes promociones / descuentos que pueden aplicarse a las ventas.
Columns:
- promotion_key (INT4): Clave primaria para identificar de forma única a cada promoción / descuento. (PRIMARY KEY)
- eng

{'schema_name': 'sales',
 'columns_info': '[{"name": "promotion_key", "data_type": "INT4"}, {"name": "english_promotion_name", "data_type": "VARCHAR(255)"}, {"name": "spanish_promotion_name", "data_type": "VARCHAR(255)"}, {"name": "discount_pct", "data_type": "FLOAT8"}, {"name": "english_promotion_type", "data_type": "VARCHAR(50)"}, {"name": "spanish_promotion_type", "data_type": "VARCHAR(50)"}, {"name": "english_promotion_category", "data_type": "VARCHAR(50)"}, {"name": "spanish_promotion_category", "data_type": "VARCHAR(50)"}, {"name": "start_date", "data_type": "DATE"}, {"name": "end_date", "data_type": "DATE"}, {"name": "min_qty", "data_type": "INT4"}, {"name": "max_qty", "data_type": "INT4"}]',
 'num_columns': 12,
 'table_foreign_keys': '[]',
 'chunk_type': 'table_full',
 'table_primary_key': '["promotion_key"]',
 'table_name': 'dim_promotion',
 'source': 'MDL Document',
 'database_name': 'adventure_works_dw'}


distance:
0.7506245374679565

##################################################



In [90]:
# Query the legacy per-column collection and print every hit.
columns_results = columns_collection.query(
    query_texts= 'Los 10 productos más vendidos.',
    n_results= 10,
    include= ['documents', 'metadatas', 'distances']
)

# Each field is indexed with [0] to take the hits of the single query text.
column_hit_ids = columns_results.get('ids', [[]])[0]
column_hit_documents = columns_results.get('documents', [[]])[0]
column_hit_metadatas = columns_results.get('metadatas', [[]])[0]
column_hit_distances = columns_results.get('distances', [[]])[0]

for _, hit_document, hit_metadata, hit_distance in zip(
    column_hit_ids, column_hit_documents, column_hit_metadatas, column_hit_distances
):
    print(f'Documento:\n{hit_document}\n')

    print('Metadata:')
    display(hit_metadata)
    print()

    print(f'distance:\n{hit_distance}\n')

    print('#'*50 + '\n')


Documento:
Database: adventure_works_dw
Schema: sales
Table: dim_product
Table PRIMARY KEY:
- product_key
Column name: active
Column data type: BOOL
Column description: Indicador de si el producto sigue disponible para la venta.

Metadata:


{'column_data_type': 'BOOL',
 'chunk_type': 'column_slice',
 'source': 'MDL Document',
 'schema_name': 'sales',
 'column_name': 'active',
 'database_name': 'adventure_works_dw',
 'table_name': 'dim_product'}


distance:
0.7770865559577942

##################################################

Documento:
Database: adventure_works_dw
Schema: sales
Table: dim_product
Table PRIMARY KEY:
- product_key
Column name: list_price
Column data type: NUMERIC
Column description: Precio de venta del producto en USD.

Metadata:


{'column_data_type': 'NUMERIC',
 'schema_name': 'sales',
 'database_name': 'adventure_works_dw',
 'column_name': 'list_price',
 'chunk_type': 'column_slice',
 'source': 'MDL Document',
 'table_name': 'dim_product'}


distance:
0.7777111530303955

##################################################

Documento:
Database: adventure_works_dw
Schema: sales
Table: dim_product
Table PRIMARY KEY:
- product_key
Column name: reorder_point
Column data type: INT2
Column description: Nivel de stock que dispara una orden de compra o una orden de trabajo.

Metadata:


{'source': 'MDL Document',
 'schema_name': 'sales',
 'database_name': 'adventure_works_dw',
 'column_data_type': 'INT2',
 'column_name': 'reorder_point',
 'chunk_type': 'column_slice',
 'table_name': 'dim_product'}


distance:
0.7785109877586365

##################################################

Documento:
Database: adventure_works_dw
Schema: sales
Table: fact_sales
Table PRIMARY KEY:
- sales_order_number
- sales_order_line_number
Table FOREIGN KEYS (Column name, Reference):
- (product_key, sales.dim_product.product_key)
- (reseller_key, sales.dim_reseller.reseller_key)
- (employee_key, sales.dim_sales_person.employee_key)
- (customer_key, sales.dim_customer.customer_key)
- (promotion_key, sales.dim_promotion.promotion_key)
- (sales_territory_key, sales.dim_sales_territory.sales_territory_key)
Column name: total_product_cost
Column data type: NUMERIC
Column description: Coste estándar en USD de todas las unidades de producto. Se calcula como `order_quantity * product_standard_cost`.

Metadata:


{'table_name': 'fact_sales',
 'column_data_type': 'NUMERIC',
 'chunk_type': 'column_slice',
 'source': 'MDL Document',
 'database_name': 'adventure_works_dw',
 'schema_name': 'sales',
 'column_name': 'total_product_cost'}


distance:
0.7804881930351257

##################################################

Documento:
Database: adventure_works_dw
Schema: sales
Table: dim_product
Table PRIMARY KEY:
- product_key
Column name: safety_stock_level
Column data type: INT2
Column description: Nivel de stock mínimo del producto.

Metadata:


{'column_data_type': 'INT2',
 'source': 'MDL Document',
 'chunk_type': 'column_slice',
 'table_name': 'dim_product',
 'schema_name': 'sales',
 'column_name': 'safety_stock_level',
 'database_name': 'adventure_works_dw'}


distance:
0.7871631383895874

##################################################

Documento:
Database: adventure_works_dw
Schema: sales
Table: fact_sales
Table PRIMARY KEY:
- sales_order_number
- sales_order_line_number
Table FOREIGN KEYS (Column name, Reference):
- (product_key, sales.dim_product.product_key)
- (reseller_key, sales.dim_reseller.reseller_key)
- (employee_key, sales.dim_sales_person.employee_key)
- (customer_key, sales.dim_customer.customer_key)
- (promotion_key, sales.dim_promotion.promotion_key)
- (sales_territory_key, sales.dim_sales_territory.sales_territory_key)
Column name: product_standard_cost
Column data type: NUMERIC
Column description: Coste estándar en USD de cada unidad de producto.

Metadata:


{'column_name': 'product_standard_cost',
 'table_name': 'fact_sales',
 'source': 'MDL Document',
 'schema_name': 'sales',
 'chunk_type': 'column_slice',
 'column_data_type': 'NUMERIC',
 'database_name': 'adventure_works_dw'}


distance:
0.7875667810440063

##################################################

Documento:
Database: adventure_works_dw
Schema: sales
Table: fact_sales
Table PRIMARY KEY:
- sales_order_number
- sales_order_line_number
Table FOREIGN KEYS (Column name, Reference):
- (product_key, sales.dim_product.product_key)
- (reseller_key, sales.dim_reseller.reseller_key)
- (employee_key, sales.dim_sales_person.employee_key)
- (customer_key, sales.dim_customer.customer_key)
- (promotion_key, sales.dim_promotion.promotion_key)
- (sales_territory_key, sales.dim_sales_territory.sales_territory_key)
Column name: order_quantity
Column data type: INT2
Column description: Cantidad vendida de cada producto.

Metadata:


{'source': 'MDL Document',
 'table_name': 'fact_sales',
 'schema_name': 'sales',
 'chunk_type': 'column_slice',
 'column_name': 'order_quantity',
 'database_name': 'adventure_works_dw',
 'column_data_type': 'INT2'}


distance:
0.7875723838806152

##################################################

Documento:
Database: adventure_works_dw
Schema: sales
Table: fact_sales
Table PRIMARY KEY:
- sales_order_number
- sales_order_line_number
Table FOREIGN KEYS (Column name, Reference):
- (product_key, sales.dim_product.product_key)
- (reseller_key, sales.dim_reseller.reseller_key)
- (employee_key, sales.dim_sales_person.employee_key)
- (customer_key, sales.dim_customer.customer_key)
- (promotion_key, sales.dim_promotion.promotion_key)
- (sales_territory_key, sales.dim_sales_territory.sales_territory_key)
Column name: tax_amt
Column data type: NUMERIC
Column description: Importe de impuestos en USD aplicable a la línea de orden de venta.

Metadata:


{'source': 'MDL Document',
 'chunk_type': 'column_slice',
 'database_name': 'adventure_works_dw',
 'column_name': 'tax_amt',
 'table_name': 'fact_sales',
 'column_data_type': 'NUMERIC',
 'schema_name': 'sales'}


distance:
0.7937028408050537

##################################################

Documento:
Database: adventure_works_dw
Schema: sales
Table: dim_product
Table PRIMARY KEY:
- product_key
Column name: standard_cost
Column data type: NUMERIC
Column description: Coste estándar del producto en USD.

Metadata:


{'column_name': 'standard_cost',
 'column_data_type': 'NUMERIC',
 'database_name': 'adventure_works_dw',
 'source': 'MDL Document',
 'table_name': 'dim_product',
 'schema_name': 'sales',
 'chunk_type': 'column_slice'}


distance:
0.7940261960029602

##################################################

Documento:
Database: adventure_works_dw
Schema: sales
Table: dim_product
Table PRIMARY KEY:
- product_key
Column name: finished_goods_flag
Column data type: BOOL
Column description: Indicador de si se trata de un producto final vendible. 0 = El producto no es un artículo vendible, 1 = El producto es vendible.

Metadata:


{'chunk_type': 'column_slice',
 'column_data_type': 'BOOL',
 'column_name': 'finished_goods_flag',
 'schema_name': 'sales',
 'database_name': 'adventure_works_dw',
 'source': 'MDL Document',
 'table_name': 'dim_product'}


distance:
0.7962996959686279

##################################################

