In [1]:
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
import os
from langchain.prompts.chat import ChatPromptTemplate
from langchain.llms.openai import OpenAI
from langchain.schema.runnable import RunnablePassthrough
from langchain.agents.agent_toolkits import SQLDatabaseToolkit
from langchain.sql_database import SQLDatabase
from langchain.agents.mrkl.base import ZeroShotAgent
from langchain.agents.mrkl.prompt import FORMAT_INSTRUCTIONS
from langchain.memory import VectorStoreRetrieverMemory
from langchain.agents.agent import AgentExecutor
from langchain.chains.llm import LLMChain

from langchain.agents.agent_toolkits.sql.prompt import (
    SQL_FUNCTIONS_SUFFIX,
    SQL_PREFIX,
    SQL_SUFFIX,
)

# 1. Prepare Documents

**Technical documents**

In [50]:
# Reference note for the TD_VectorDistance function (overview, syntax, worked
# example). The variable is spelled "vectordistinct", but the text documents
# TD_VectorDistance. It is one of the texts embedded into the vector store.
td_vectordistinct = """
Overview : 
    The TD_VectorDistance function accepts a table of target vectors and a table of reference vectors and returns a table that contains the distance between 
    target-reference pairs. The function computes the distance between the target pair and the reference pair from the same table if you provide only one 
    table as the input. You must have the same column order in the TargetFeatureColumns argument and the RefFeatureColumns argument. 
    The function ignores the feature values during distance computation if the value is either NULL, NAN, or INF.

Syntax :
    SELECT target_id, reference_id, distancetype, cast(distance as decimal(36,8)) as distance 
    FROM TD_VECTORDISTANCE (

    ON table | view | (query)  AS TARGETTABLE PARTITION BY ANY
    [ ON  table | view | (query)  AS REFERENCETABLE DIMENSION ]
    USING
    TargetIDColumn ('target_id_column')
    TargetFeatureColumns ( 'target_feature_column' | target_feature_Column_range [,...])
    [ RefIDColumn ('ref_id_column') ]
    [ RefFeatureColumns ( 'ref_feature_column' | ref_feature_Column_range [,...]) ]
    [ DistanceMeasure ('Cosine' | 'Euclidean' | 'Manhattan'[,...])]
    [ TopK(integer_value)]          
    )
    
Example : Calculate distances between vectors in a target table (target_mobile_data_dense) and a reference table (ref_mobile_data_dense) based on feature1, feature2 and feature 3
    SELECT target_id, reference_id, distancetype, cast(distance as decimal(36,8)) as distance 
    FROM TD_VECTORDISTANCE (
        ON target_mobile_data_dense AS TargetTable
        ON ref_mobile_data_dense AS ReferenceTable DIMENSION
        USING
            TargetIDColumn('id_column')
            TargetFeatureColumns('feature1','feature2','feature3')
            RefIDColumn('id_column')
            RefFeatureColumns('feature1','feature2','feature3')
            DistanceMeasure('cosine')
            topk(2)
    ) AS dt order by 3,1,2,4;
"""

# Reference note for the MovingAverage function (overview, syntax, argument
# descriptions, worked example). It is one of the texts embedded into the
# vector store, so every argument description carries its own label: the
# StartRows and IncludeFirst paragraphs previously had none and read as
# continuations of Alpha.
moving_average = """
Overview : 
    The MovingAverage function computes average values in a series, using the specified moving average type.
    Moving Average (MA) is a widely used technical analysis indicator that is used to smooth out fluctuations in a data series and to identify trends. It is calculated by taking the average of a predetermined number of periods or time intervals.

Syntax :
    MovingAverage (
      ON  table | view | (query) 
         [ PARTITION BY partition_column [,...] ]
         [ ORDER BY order_column [,...] ]
      [ USING
        [ MAvgType ({ 'C' | 'E' | 'M' | 'S' | 'T' | 'W' }) ]
        [ TargetColumns ('target_column'| 'target_column_range'[,...])]
        [ Alpha (alpha) ]
        [ StartRows (n) ]
        [ IncludeFirst ('true'|'t'|'yes'|'y'|'1'|'false'|'f'|'no'|'n'|'0') ]
        [ WindowSize (window_size) ]
      ]
    )

    MAvgType
    [Optional] Specify one of the following moving average types:
    'C' (Default)	Cumulative moving average.
    'E'	Exponential moving average.
    'M'	Modified moving average.
    'S'	Simple moving average.
    'T'	Triangular moving average.
    'W'	Weighted moving average.
    TargetColumns : [Optional] Specify the input column names for which to compute the moving average.
    Default behavior: The function copies every input column to the output table but does not compute any moving averages.
    Alpha : [Optional with MAvgType E, otherwise ignored.] Specify the damping factor, a value in the range [0, 1], which represents a percentage in the range [0, 100]. For example, if alpha is 0.2, the damping factor is 20%. A higher alpha discounts older observations faster.
    StartRows : [Optional with MAvgType E, otherwise ignored.] Specify the number of rows to skip before calculating the exponential moving average. The function uses the arithmetic average of these rows as the initial value of the exponential moving average. The value n must be an integer.
    IncludeFirst : [Ignored with MAvgType C, otherwise optional.] Specify whether to include the starting rows in the output table. If you specify 'true', the output columns for the starting rows contain NULL, because their moving average is undefined.
    WindowSize:     [Optional with MAvgType M, S, T, and W; otherwise ignored.] Specify the number of previous values to consider when computing the new moving average. The data type of window_size must be BYTEINT, SMALLINT, or INTEGER.
    Minimum value: 3
    Default: '10'

Example : 
    SELECT * FROM MovingAverage (
      ON company1_stock PARTITION BY name ORDER BY period
      USING
      MAvgType ('S')
      TargetColumns ('stockprice')
      WindowSize (10)
      IncludeFirst ('true')
    ) AS dt ORDER BY id;
"""

# Reference note for the TD_BincodeFit function (overview, syntax, argument
# descriptions, worked example). It is one of the texts embedded into the
# vector store.
# NOTE(review): the Syntax section only lists MethodType ('equal-width'),
# while the Example uses 'Variable-Width' together with MinValueColumn,
# MaxValueCOlumn, LabelColumn and TargetColNames, which the Syntax section
# does not describe. Confirm against the vendor documentation.
bin_code_fit = """
Overview : 
    Bin Coding, also known as binning or bucketing, is a data preprocessing technique used in statistics and machine learning to transform continuous data into categorical or discrete data. In binning, a range of continuous values is divided into smaller intervals or bins, and the data values are assigned to the appropriate bin. This allows the data to be analyzed more easily by grouping similar values into categories.
    
    Binning is used to:
    Reduce noise in the data by aggregating values into groups.
    Identify trends and patterns in the data by grouping similar values together.
    Simplify complex data by reducing the number of unique values.

Syntax : 
    TD_BincodeFit (
    ON { table | view | (query) } AS InputTable
    [ OUT [ PERMANENT | VOLATILE ] TABLE OutputTable (output_table) ]
    USING
    TargetColumns ({ 'target_column' | target_column_range }[,...])
    MethodType ('equal-width')
    NBins ('single bin value' | 'separate bin values')
    [ LabelPrefix ('single prefix' | 'separate prefix values') ]
  )
    - ON clause : Accept the InputTable clause for equal-width bins with generated labels. Accept the InputTable and FitInput clauses for variable-width bins with specified labels.
    - TargetColumns : Specify the names of the InputTable columns to bin-code. The maximum number of target columns is 2018.
    - MethodType : Specify the bin-coding method:Method Type	Description equal-width	Bins have fixed width with auto-generated labels.
        Function finds minimum and maximum values of target columns (min and max) and computes bin width, w, with this formula:
        w = (max - min)/k
        k, the number of bins, is determined by NBins.
    - variable-width	Bins have specified variable widths and specified labels, provided by FitInput table. Maximum number of bins is 3000.
    - NBins : [MethodType ('equal-width') only.] Specify either a single bin value for all the target columns or separate bin values for each of the target columns.
Example : 
    SELECT * FROM TD_BincodeFit (
    ON bin_titanic_train AS InputTable
    ON FitInputTable AS FitInput DIMENSION
    USING
    TargetColumns ('age')
    MethodType ('Variable-Width')
    MinValueColumn ('MinValue')
    MaxValueCOlumn ('MaxValue')
    LabelColumn ('Label')
    TargetColNames ('ColumnName')
  ) AS dt
"""

# 2. Create Vector DB

In [51]:
# Each reference note is indexed as a single, unsplit text.
technical_list = [td_vectordistinct, moving_average, bin_code_fit]

# The API key is read from the OPENAI_API environment variable; os.getenv
# yields None when that variable is not set.
embedding_function = OpenAIEmbeddings(openai_api_key=os.getenv("OPENAI_API"))

# load it into FAISS
# `db` is the vector store the similarity search and retriever cells below use.
db = FAISS.from_texts(technical_list, embedding_function)

In [53]:
# Smoke test: show the top hit for a free-text query. The recorded output is
# the TD_VectorDistance note.
db.similarity_search("Calculate similarity")[0]

Document(page_content="\nOverview : \n    The TD_VectorDistance function accepts a table of target vectors and a table of reference vectors and returns a table that contains the distance between \n    target-reference pairs. The function computes the distance between the target pair and the reference pair from the same table if you provide only one \n    table as the input. You must have the same column order in the TargetFeatureColumns argument and the RefFeatureColumns argument. \n    The function ignores the feature values during distance computation if the value is either NULL, NAN, or INF.\n\nSyntax :\n    SELECT target_id, reference_id, distancetype, cast(distance as decimal(36,8)) as distance \n    FROM TD_VECTORDISTANCE (\n\n    ON table | view | (query)  AS TARGETTABLE PARTITION BY ANY\n    [ ON  table | view | (query)  AS REFERENCETABLE DIMENSION ]\n    USING\n    TargetIDColumn ('target_id_column')\n    TargetFeatureColumns ( 'target_feature_column' | target_feature_Column_r

# 3. Create Teradata Search Tool

In [54]:
from langchain.tools import BaseTool
from typing import Optional, Type

from langchain.callbacks.manager import (
    AsyncCallbackManagerForToolRun,
    CallbackManagerForToolRun,
)

from langchain.agents import create_sql_agent
from langchain.agents.types import AgentType

In [None]:
# Retriever view over the FAISS store; TeradataSearchTool._run reads this
# notebook-level name, so this cell must have run before the tool is invoked.
retriever = db.as_retriever()

In [55]:
class TeradataSearchTool(BaseTool):
    """Agent tool that looks up Teradata syntax notes by keyword.

    The lookup is delegated to the notebook-level ``retriever``; the text of
    the first retrieved document is returned to the agent as the observation.
    """

    name = "teradata_search_tool"
    description = "Input to this tool is a keyword such as binning or bucketing, similarity, moving averate. Output is instruction of how to using Teradata Syntax with examples to make better query"

    def _run(
        self, query: str, run_manager: Optional[CallbackManagerForToolRun] = None
    ) -> str:
        """Return the best-matching syntax note for ``query``.

        Args:
            query: Keyword supplied by the agent.
            run_manager: Optional callback manager; not used here.

        Returns:
            The ``page_content`` of the first retrieved document, or a fixed
            fallback sentence when the query is empty or nothing is retrieved.
        """
        no_match = "There are no Syntax of Teradata that need to be used in this scenario"

        # An empty query cannot match anything meaningful, so skip the lookup.
        if not query:
            return no_match

        relevant_docs = retriever.get_relevant_documents(query)

        # The two emptiness checks are kept separate on purpose. The earlier
        # `len(docs) == 0 | len(query) == 0` parsed as a chained comparison
        # (`|` binds tighter than `==`), so an empty result with a non-empty
        # query fell through to `docs[0]` and raised IndexError.
        if not relevant_docs:
            return no_match

        return relevant_docs[0].page_content

    async def _arun(
        self, query: str, run_manager: Optional[AsyncCallbackManagerForToolRun] = None
    ) -> str:
        """Async entry point; not supported.

        Raises:
            NotImplementedError: Always.
        """
        raise NotImplementedError("teradata_search_tool does not support async")


teradata_search_tool = TeradataSearchTool()

# 4. Redefine SQL Agent

In [57]:
# Step 1. Define LLM model
# The API key is read from the OPENAI_API environment variable.
model = OpenAI(temperature=0, openai_api_key=os.getenv("OPENAI_API"))

# Step 2. Setting connection to db and SQL Toolkits
# The connection URI can be supplied through the TERADATA_URI environment
# variable so credentials need not live in the notebook; the fallback keeps
# the original local demo target.
# The connection is bound to `sql_db` rather than `db`, so the FAISS store
# created earlier is not shadowed and the retriever cell stays safe to re-run.
sql_db = SQLDatabase.from_uri(
    os.getenv("TERADATA_URI", "teradatasql://dbc:dbc@192.168.11.7:1025/Sales")
)
toolkit = SQLDatabaseToolkit(db=sql_db, llm=model)
tools = toolkit.get_tools()

In [58]:
# Step 3. Define Prefix
# System-style instructions placed ahead of the agent prompt; passed to
# create_sql_agent as `prefix` below.
# The text contains a `{dialect}` placeholder, presumably filled in by
# create_sql_agent from the toolkit (confirm against the installed LangChain
# version).

prefix = """
You are an agent designed to interact with a Teradata database.
Given an input question, create a syntactically correct {dialect} query to run, then look at the results of the query and return the answer.
You can order the results by a relevant column to return the most interesting examples in the database.
Never query for all the columns from a specific table, only ask for the relevant columns given the question.
You have access to tools for interacting with the database.
Only use the below tools. Only use the information returned by the below tools to construct your final answer.
You MUST double check your query before executing it. If you get an error while executing a query, rewrite the query and try again.
Note you must always use double quotation marks (" ") for column names in SQL queries for avoid syntax error

If the question does not seem related to the database, just return "I don't know" as the answer.
"""

In [59]:
# Tail of the agent prompt; passed to create_sql_agent as `suffix` below.
# `{input}` and `{agent_scratchpad}` are template placeholders. The
# pre-written "Thought:" line steers the agent toward calling
# teradata_search_tool first (the recorded run does start with that tool).
suffix = """Begin!

Question: {input}
Thought: I should used teradata_search_tool to finding relevant syntax first.
{agent_scratchpad}"""

In [60]:
# Step 4. Create Agent Executor
# Combine the stock SQL toolkit with the Teradata syntax lookup tool and the
# custom prompt pieces defined above into one zero-shot ReAct agent.
agent_executor = create_sql_agent(
    # model and database tooling
    llm=model,
    toolkit=toolkit,
    extra_tools=[teradata_search_tool],
    # agent flavour and prompt
    agent_type=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
    prefix=prefix,
    suffix=suffix,
    # print the intermediate steps while running
    verbose=True,
)

# 5. Test

In [63]:
# End-to-end check: a natural-language request that should lead the agent to
# the TD_VectorDistance note and then to a query over the two named tables.
agent_executor.run("Identify user similarities by analyzing the 'UserHistory' table using 'UserHistoryReference' as the reference table, focusing on attributes CallDuration, DataCounter, and SMS")



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mAction: teradata_search_tool
Action Input: similarity[0m
Observation: [33;1m[1;3m
Overview : 
    The TD_VectorDistance function accepts a table of target vectors and a table of reference vectors and returns a table that contains the distance between 
    target-reference pairs. The function computes the distance between the target pair and the reference pair from the same table if you provide only one 
    table as the input. You must have the same column order in the TargetFeatureColumns argument and the RefFeatureColumns argument. 
    The function ignores the feature values during distance computation if the value is either NULL, NAN, or INF.

Syntax :
    SELECT target_id, reference_id, distancetype, cast(distance as decimal(36,8)) as distance 
    FROM TD_VECTORDISTANCE (

    ON table | view | (query)  AS TARGETTABLE PARTITION BY ANY
    [ ON  table | view | (query)  AS REFERENCETABLE DIMENSION ]
    USING
    Targe

"The user similarities can be identified by analyzing the 'UserHistory' table using 'UserHistoryReference' as the reference table, focusing on attributes CallDuration, DataCounter, and SMS, using the TD_VectorDistance function to calculate the distance between the target pair and the reference pair from the same table. The results of the query show that users 1 and 5 have a cosine distance of 0.45486518, users 1 and 7 have a cosine distance of 0.32604815, and so on."