# Generate configuration file

## Get your [OpenAI API Key](https://platform.openai.com/account/api-keys)

In [None]:
#!pip install python-dotenv
#!pip install openai
#!pip install --upgrade langchain

In [11]:
import os
import openai
import sys
sys.path.append('../..')

from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv()) # read local .env file

openai.api_type = os.environ['OPENAI_API_TYPE']
openai.api_version = os.environ['OPENAI_API_VERSION']
openai.api_base = os.environ['OPENAI_API_BASE']
openai.api_key = os.environ['OPENAI_API_KEY']

llm_model = os.environ['OPENAI_ENGINE']
print(llm_model)

gpt-35-turbo


## Get the Model

In [12]:
from langchain.chat_models import ChatOpenAI

In [13]:
# To control the randomness and creativity of the generated
# text by an LLM, use temperature = 0.0
chat = ChatOpenAI(temperature=0.0, model=llm_model)
chat

ChatOpenAI(cache=None, verbose=False, callbacks=None, callback_manager=None, tags=None, metadata=None, client=<class 'openai.api_resources.chat_completion.ChatCompletion'>, model_name='gpt-35-turbo', temperature=0.0, model_kwargs={}, openai_api_key='681df770318d493192c149e6e7390108', openai_api_base='https://garyoai.openai.azure.com/', openai_organization='', openai_proxy='', request_timeout=None, max_retries=6, streaming=False, n=1, max_tokens=None, tiktoken_model_name=None)

## Output Parsers

Let's start with defining how we would like the LLM output to look like:

In [14]:
{
  "gift": False,
  "delivery_days": 5,
  "price_value": "pretty affordable!"
}

{'gift': False, 'delivery_days': 5, 'price_value': 'pretty affordable!'}

In [22]:
config_desc = """\
- name: my_data_pipeline
- staging: the list of staging operations which includes the following information:
  - name: the name of the staging operation
  - input: the input information which includes the following information:
    - type: the type of the input data source
    - format: the format of the input data source
    - path: the path of the input data source
    - read-type: the read type of the input data source
  - output: the output information which includes the following information:
    - target: the target of the output data source, use stg_sales
    - type: the type of the output data source which can be either file or view, use view
  - schema: the schema of the output data source which includes the following information:
    - fields: the list of fields (ID, string; name string) of the output data source, each field includes the following information:
      - metadata: the metadata of the field
      - name: the name of the field
      - nullable: the nullable of the field
      - type: the type of the field
  - sampleData: here provide 5-row sample data of the output data source
- standard: the list of standard tables, each standard table includes the following information:
  - name: the name of the standard table
  - type: the type of the standard table which can be either batch or streaming
  - code: the code of the standard table which includes the following information:
    - lang: the language of the code, use SQL
    - sql: the SQL code to select all data from the staging table named stg_sales
  - output: the output information which includes the following information:
    - target: the target of the output data source
    - type: the type of the output data source which can be either file or view
- serving: the list of serving tables
"""

config_desc1 = """\
{
  "name": "fruit_batch_data_app",
  "staging": [
    {
      "name": "sales_ingestion",
      "input": {
        "type": "filestore",
        "format": "csv",
        "path": "/FileStore/cddp_apps/fruit_batch_data_app/landing/sales_ingestion/",
        "read-type": "batch"
      },
      "output": {
        "target": "stg_sales",
        "type": [
          "file",
          "view"
        ]
      },
      "schema": {
        "fields": [
          {
            "metadata": {},
            "name": "ID",
            "nullable": true,
            "type": "integer"
          }
        ],
        "type": "struct"
      },
      "sampleData": [
        {
          "ID": 1,
          "Amount": 12,
          "TS": "2022-01-10T00:00:00.000+08:00"
        }
      ]
    }
  ],
  "standard": [
    {
      "name": "fruit_sales_transform",
      "type": "batch",
      "code": {
        "lang": "sql",
        "sql": "select price.fruit, price.id, sales.amount, price.price, sales.ts from stg_sales sales left outer join stg_price price on sales.id = price.id and sales.ts >= price.start_ts and sales.ts < price.end_ts"
      },
      "output": {
        "target": "std_fruit_sales",
        "type": [
          "file",
          "view"
        ]
      }
    }
  ],
  "serving": [
    {
      "name": "fruit_sales_total_curation",
      "type": "batch",
      "code": {
        "lang": "sql",
        "sql": "select id, fruit, sum(amount*price) as total from std_fruit_sales group by id, fruit order by total desc"
      },
      "output": {
        "target": "srv_fruit_sales_total",
        "type": [
          "table",
          "file"
        ]
      }
    }
  ]
}
"""

config_template = """\
You are data engineer to build a data pipeline based on CDDP. \
Refer to https://github.com/Azure/config-driven-data-pipeline for the detailed information about CDDP. \
You are required to build a data pipeline to ingest data from a source system to a target system. \
In order to do that, you need to create a JSON configuration file for the data pipeline. \

The JSON configuration file should contain the following information:
{{
  "name": "fruit_batch_data_app",
  "staging": [],
  "standard": [],
  "serving": []
}}

Refer to https://github.com/Azure/config-driven-data-pipeline/blob/main/example/pipeline_fruit_batch.json for a sample JSON configuration file. \

Please create a JSON configuration file for the CDDP data pipeline \
by specifying the following information:
{config_desc}

Format the output as JSON with the following keys:
name
staging
standard
serving
"""

In [23]:
from langchain.prompts import ChatPromptTemplate

prompt_template = ChatPromptTemplate.from_template(config_template)
print(prompt_template)

input_variables=['config_desc'] output_parser=None partial_variables={} messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['config_desc'], output_parser=None, partial_variables={}, template='You are data engineer to build a data pipeline based on CDDP. Refer to https://github.com/Azure/config-driven-data-pipeline for the detailed information about CDDP. You are required to build a data pipeline to ingest data from a source system to a target system. In order to do that, you need to create a JSON configuration file for the data pipeline. \nThe JSON configuration file should contain the following information:\n{{\n  "name": "fruit_batch_data_app",\n  "staging": [],\n  "standard": [],\n  "serving": []\n}}\n\nRefer to https://github.com/Azure/config-driven-data-pipeline/blob/main/example/pipeline_fruit_batch.json for a sample JSON configuration file. \nPlease create a JSON configuration file for the CDDP data pipeline by specifying the following information:\n{conf

In [24]:
messages = prompt_template.format_messages(config_desc=config_desc)
chat = ChatOpenAI(temperature=0.0, engine=llm_model)
response = chat(messages=messages)
print(response.content)

                    engine was transferred to model_kwargs.
                    Please confirm that engine is what you intended.


{
  "name": "my_data_pipeline",
  "staging": [
    {
      "name": "staging_operation_1",
      "input": {
        "type": "input_data_type",
        "format": "input_data_format",
        "path": "input_data_path",
        "read-type": "input_data_read_type"
      },
      "output": {
        "target": "stg_sales",
        "type": "view"
      },
      "schema": {
        "fields": [
          {
            "metadata": "metadata_1",
            "name": "ID",
            "nullable": true,
            "type": "string"
          },
          {
            "metadata": "metadata_2",
            "name": "name",
            "nullable": true,
            "type": "string"
          }
        ]
      },
      "sampleData": "sample_data"
    }
  ],
  "standard": [
    {
      "name": "standard_table_1",
      "type": "batch",
      "code": {
        "lang": "SQL",
        "sql": "SELECT * FROM stg_sales"
      },
      "output": {
        "target": "output_data_source",
        "type": "file"
  

In [9]:
type(response.content)

str

### Parse the LLM output string into a Python dictionary

In [36]:
from langchain.output_parsers import ResponseSchema
from langchain.output_parsers import StructuredOutputParser

In [37]:
gift_schema = ResponseSchema(name="gift",
                             description="Was the item purchased\
                             as a gift for someone else? \
                             Answer True if yes,\
                             False if not or unknown.")
delivery_days_schema = ResponseSchema(name="delivery_days",
                                      description="How many days\
                                      did it take for the product\
                                      to arrive? If this \
                                      information is not found,\
                                      output -1.")
price_value_schema = ResponseSchema(name="price_value",
                                    description="Extract any\
                                    sentences about the value or \
                                    price, and output them as a \
                                    comma separated Python list.")

response_schemas = [gift_schema, 
                    delivery_days_schema,
                    price_value_schema]

In [38]:
output_parser = StructuredOutputParser.from_response_schemas(response_schemas)

In [39]:
format_instructions = output_parser.get_format_instructions()

In [40]:
print(format_instructions)

The output should be a markdown code snippet formatted in the following schema, including the leading and trailing "```json" and "```":

```json
{
	"gift": string  // Was the item purchased                             as a gift for someone else?                              Answer True if yes,                             False if not or unknown.
	"delivery_days": string  // How many days                                      did it take for the product                                      to arrive? If this                                       information is not found,                                      output -1.
	"price_value": string  // Extract any                                    sentences about the value or                                     price, and output them as a                                     comma separated Python list.
}
```


In [41]:
review_template_2 = """\
For the following text, extract the following information:

gift: Was the item purchased as a gift for someone else? \
Answer True if yes, False if not or unknown.

delivery_days: How many days did it take for the product\
to arrive? If this information is not found, output -1.

price_value: Extract any sentences about the value or price,\
and output them as a comma separated Python list.

text: {text}

{format_instructions}
"""

prompt = ChatPromptTemplate.from_template(template=review_template_2)

messages = prompt.format_messages(text=customer_review, 
                                format_instructions=format_instructions)

In [42]:
print(messages[0].content)

For the following text, extract the following information:

gift: Was the item purchased as a gift for someone else? Answer True if yes, False if not or unknown.

delivery_days: How many days did it take for the productto arrive? If this information is not found, output -1.

price_value: Extract any sentences about the value or price,and output them as a comma separated Python list.

text: This leaf blower is pretty amazing.  It has four settings:candle blower, gentle breeze, windy city, and tornado. It arrived in two days, just in time for my wife's anniversary present. I think my wife liked it so much she was speechless. So far I've been the only one using it, and I've been using it every other morning to clear the leaves on our lawn. It's slightly more expensive than the other leaf blowers out there, but I think it's worth it for the extra features.


The output should be a markdown code snippet formatted in the following schema, including the leading and trailing "```json" and "```

In [43]:
response = chat(messages)

In [44]:
print(response.content)

```json
{
	"gift": false,
	"delivery_days": "2",
	"price_value": "It's slightly more expensive than the other leaf blowers out there, but I think it's worth it for the extra features."
}
```


In [45]:
output_dict = output_parser.parse(response.content)

In [46]:
output_dict

{'gift': False,
 'delivery_days': '2',
 'price_value': "It's slightly more expensive than the other leaf blowers out there, but I think it's worth it for the extra features."}

In [47]:
type(output_dict)

dict

In [48]:
output_dict.get('delivery_days')

'2'

Reminder: Download your notebook to you local computer to save your work.