<a href="https://colab.research.google.com/github/cooolbabu/GoogleGemini101/blob/main/AzureDatabricks/BookstoreMedallion_BookstoreDB_OpenAI_T1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Package installs

In [1]:
%pip install openai
%pip install PyGithub

Collecting openai
  Downloading openai-1.14.1-py3-none-any.whl (257 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m257.5/257.5 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
Collecting httpx<1,>=0.23.0 (from openai)
  Downloading httpx-0.27.0-py3-none-any.whl (75 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.6/75.6 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
Collecting httpcore==1.* (from httpx<1,>=0.23.0->openai)
  Downloading httpcore-1.0.4-py3-none-any.whl (77 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.8/77.8 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting h11<0.15,>=0.13 (from httpcore==1.*->httpx<1,>=0.23.0->openai)
  Downloading h11-0.14.0-py3-none-any.whl (58 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.3/58.3 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: h11, httpcore, httpx, openai
Successfully installed h11-0.14.0 httpcore-1.0.4 ht

### Imports, get keys, get llm client and set model to variable MODEL_NAME

In [3]:

import json
import re
from openai import OpenAI
from pprint import pprint
from google.colab import userdata
from github import Github

# Get the OpenAI API key from Colab secrets
github_token=userdata.get('Github_Token')
openai_api_key=userdata.get('OPENAI_API_KEY')
# Initialize a GitHub instance
g = Github(github_token)

client = OpenAI(api_key=openai_api_key)
MODEL_NAME = "gpt-3.5-turbo-1106"

### Github helper functions
* read_file_as_string()
* check_in_file(repo_name, file_path, file_content, content_tag, branch)

In [4]:
def read_file_as_string(file_path):
    """
        Reads the file and return a string representation of the file contents

        Parameters:
            file_path (str): Filename including filepath
    """
    try:
        with open(file_path, 'r') as file:
            file_contents = file.read()
        return file_contents
    except FileNotFoundError:
        print(f"File '{file_path}' not found.")
        return None
    except Exception as e:
        print(f"An error occurred: {e}")
        return None

def check_in_file(repo_name, file_path, file_content, content_tag, branch):
    """
        Checks if a specific file exists in a GitHub repository and updates it with new content if it does.
        If the file does not exist, it creates a new file with the provided content.

        This function operates on a specific branch named 'test'. If updating, it will commit the changes with a given content tag as the commit message.
        In case the file needs to be created, it will also use the content tag as the commit message for the new file.

        Parameters:
        - repo_name (str): The name of the repository, formatted as 'username/repository'.
        - file_path (str): The path to the file within the repository. This should include the file name and its extension.
        - file_content (str): The content to be written to the file. This is used both for updating and creating the file.
        - content_tag (str): A message associated with the commit used for updating or creating the file.
        - branch (str): Github branch for the code

        Behavior:
        - If the file exists at the specified path, it updates the file with `file_content`, using `content_tag` as the commit message.
        - If the file does not exist, it creates a new file at the specified path with `file_content`, also using `content_tag` as the commit message for creation.
        - Upon successful update or creation, prints a success message indicating the action taken.
    """

    # Get the repository
    repo = g.get_repo(repo_name)

    try:
        # Get the contents of the file if it exists
        file = repo.get_contents(file_path, ref=branch)

        # Update the file
        repo.update_file(file_path, content_tag, file_content, file.sha, branch=branch)
        print(f"File '{file_path}' updated successfully.")
    except:
        # If the file doesn't exist, create it
        print(f"{file_path}/{file_content} does not exist")
        repo.create_file(file_path, content_tag, file_content, branch=branch)
        print(f"File '{file_path}' created successfully.")

def create_notebook(response, system_message, instructions, filename):
    # Extract summary, code, and explanation from the response JSON
    summary = response["summary"]
    code = response["code"]
    explanation = response["explanation"]

    # Create the notebook content
    notebook_content = f"""# Databricks notebook source

# MAGIC %md
# MAGIC # Summary
# MAGIC {summary}

# COMMAND ----------

# MAGIC %md
# MAGIC # Code (use Databricks workspace formatter to format the code)

# COMMAND ----------

{code} U+0004

# COMMAND ----------

# MAGIC %md
# MAGIC # Explanation
# MAGIC {explanation}

# COMMAND ----------

# MAGIC %md
# MAGIC # GenAI Instructions
# MAGIC * ## AI Role
# MAGIC {system_message}

# COMMAND ----------
# MAGIC %md
# MAGIC * ## Instructions (Try edit mode for visualizing table structure)
# MAGIC {instructions}
"""

    # Write the notebook content to a file
    with open(filename, "w") as f:
        f.write(notebook_content)

    print(f"Notebook '{filename}' has been created.")

    return notebook_content

In [5]:
import ast

def convert_str_to_dict(s):
    try:
        d = ast.literal_eval(s)
        if isinstance(d, dict):
            return d
        else:
            raise ValueError("Input is not a valid dictionary string")
    except (ValueError, SyntaxError):
        raise ValueError("Input is not a valid dictionary string")

import string

def strip_control_characters_old(s):
    # Create a translation table that maps all control characters to None
    control_chars = dict.fromkeys(range(0x00, 0x20), ' ')
    control_chars.update(dict.fromkeys(range(0x7f, 0xa0), ' '))

    # Translate the string using the translation table
    cleaned_str = s.translate(dict.fromkeys(control_chars, ' '))

    return cleaned_str

def strip_control_characters(s):
    # Create a translation table that maps all control characters and special characters to a space ' '
    control_chars = dict.fromkeys(range(0x00, 0x09), ' ')  # Exclude \n, \r, \f
    control_chars.update(dict.fromkeys(range(0x0B, 0x0C), ' '))
    control_chars.update(dict.fromkeys(range(0x0E, 0x20), ' '))
    control_chars.update(dict.fromkeys(range(0x7f, 0xa0), ' '))
    special_chars = dict.fromkeys(map(ord, string.punctuation.replace('\n', '').replace('\r', '').replace('\f', '')), ' ')
    control_chars.update(special_chars)

    # Translate the string using the translation table
    cleaned_str = s.translate(control_chars)

# Setup
1.   System Message
2.   User Message



In [6]:
system_message = """
You are  Azure Databricks data engineer.
    - You will be given tasks and asked to write pyspark code.
    - You will use best practices for writing code.
    - Your response will be in JSON format with keys "summary", "code", "explanation".
  """.strip()

user_message_content = read_file_as_string("./BookstorePrompt.txt")
print(user_message_content)



Please build a pyspark program for Azure Databricks using Medallion framework.
I will give you the table schema. I will provide general instructions and instructions for each step. 

The schema for the tables is as follows

customers table schema
root
 |-- customer_id: string (nullable = true)
 |-- email: string (nullable = true)
 |-- profile: string (nullable = true)
 |-- updated: string (nullable = true)

books table schema
root
 |-- book_id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- author: string (nullable = true)
 |-- category: string (nullable = true)
 |-- price: double (nullable = true)
 
orders_bronze table schema
root
 |-- order_id: string (nullable = true)
 |-- order_timestamp: long (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- quantity: long (nullable = true)
 |-- total: integer (nullable = true)
 |-- books: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- book_id: string (nullable = true)
 

# Make the call to LLMs

In [8]:
# Create the message with variables

response = client.chat.completions.create(
    model=MODEL_NAME,
    response_format = {"type" : "json_object"},
    messages=[
        {"role": "system", "content": system_message},
        {"role": "user", "content": user_message_content}]
)

# Assuming you have a client setup for interaction. Ensure to configure your OpenAI client appropriately.



In [43]:
print(f"\nResponse id: {response.id}\nCreated: {response.created}\nModel: {response.model}\nCompletion Tokens: {response.usage.completion_tokens}\nPrompt Tokens: {response.usage.prompt_tokens}\nTotal tokens: {response.usage.total_tokens}")
#print(response.created)



Response id: chatcmpl-93UWtqKT3jxXQMlW8R7aRLUDwQZII
Created: 1710619367
Model: gpt-3.5-turbo-0125
Completion Tokens: 931
Prompt Tokens: 1026
Total tokens: 1957


In [42]:
import pprint
pprint.pprint(response.dict())

{'choices': [{'finish_reason': 'stop',
              'index': 0,
              'logprobs': None,
              'message': {'content': '{\n'
                                     '    "summary": "Created a Pyspark '
                                     'program using Medallion framework for '
                                     'Azure Databricks following the '
                                     'instructions provided.",\n'
                                     '    "code": {\n'
                                     '        "part1": {\n'
                                     '            "ingest_orders_bronze": {\n'
                                     '                "code": "spark = '
                                     "SparkSession.builder.appName('OrderBronzeIngest').getOrCreate()\\n\\norders_bronze_df "
                                     '= '
                                     "spark.readStream.format('cloudFiles')\\n    "
                                     ".option('cloud

In [15]:
print(f"Response id: {response.id}")
json_str = json.dumps(response.json(), indent=4, sort_keys=True)
print(json_str)

Response id: chatcmpl-93UWtqKT3jxXQMlW8R7aRLUDwQZII
"{\"id\":\"chatcmpl-93UWtqKT3jxXQMlW8R7aRLUDwQZII\",\"choices\":[{\"finish_reason\":\"stop\",\"index\":0,\"logprobs\":null,\"message\":{\"content\":\"{\\n    \\\"summary\\\": \\\"Created a Pyspark program using Medallion framework for Azure Databricks following the instructions provided.\\\",\\n    \\\"code\\\": {\\n        \\\"part1\\\": {\\n            \\\"ingest_orders_bronze\\\": {\\n                \\\"code\\\": \\\"spark = SparkSession.builder.appName('OrderBronzeIngest').getOrCreate()\\\\n\\\\norders_bronze_df = spark.readStream.format('cloudFiles')\\\\n    .option('cloudFiles.format', 'parquet')\\\\n    .option('cloudFiles.includeExistingFiles', 'true')\\\\n    .option('cloudFiles.useNotifications', 'true')\\\\n    .option('cloudFiles.url', 'dbfs:/mnt/bookstore/orders-raw')\\\\n    .option('cloudFiles.format', 'parquet')\\\\n    .load()\\\\n\\\\norders_bronze_df = orders_bronze_df.withColumn('file_name', input_file_name())\\\\

In [21]:
pprint(response.choices[0])

Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='{\n    "summary": "Created a Pyspark program using Medallion framework for Azure Databricks following the instructions provided.",\n    "code": {\n        "part1": {\n            "ingest_orders_bronze": {\n                "code": "spark = SparkSession.builder.appName(\'OrderBronzeIngest\').getOrCreate()\\n\\norders_bronze_df = spark.readStream.format(\'cloudFiles\')\\n    .option(\'cloudFiles.format\', \'parquet\')\\n    .option(\'cloudFiles.includeExistingFiles\', \'true\')\\n    .option(\'cloudFiles.useNotifications\', \'true\')\\n    .option(\'cloudFiles.url\', \'dbfs:/mnt/bookstore/orders-raw\')\\n    .option(\'cloudFiles.format\', \'parquet\')\\n    .load()\\n\\norders_bronze_df = orders_bronze_df.withColumn(\'file_name\', input_file_name())\\n    .withColumn(\'processed_timestamp\', current_timestamp())\\n\\ncheckpoint_location = \'dbfs:/mnt/bookstore/checkpoints/orders_bronze\'\\n\\nquery

In [23]:
pprint(response.choices[0].message)

ChatCompletionMessage(content='{\n    "summary": "Created a Pyspark program using Medallion framework for Azure Databricks following the instructions provided.",\n    "code": {\n        "part1": {\n            "ingest_orders_bronze": {\n                "code": "spark = SparkSession.builder.appName(\'OrderBronzeIngest\').getOrCreate()\\n\\norders_bronze_df = spark.readStream.format(\'cloudFiles\')\\n    .option(\'cloudFiles.format\', \'parquet\')\\n    .option(\'cloudFiles.includeExistingFiles\', \'true\')\\n    .option(\'cloudFiles.useNotifications\', \'true\')\\n    .option(\'cloudFiles.url\', \'dbfs:/mnt/bookstore/orders-raw\')\\n    .option(\'cloudFiles.format\', \'parquet\')\\n    .load()\\n\\norders_bronze_df = orders_bronze_df.withColumn(\'file_name\', input_file_name())\\n    .withColumn(\'processed_timestamp\', current_timestamp())\\n\\ncheckpoint_location = \'dbfs:/mnt/bookstore/checkpoints/orders_bronze\'\\n\\nquery = orders_bronze_df.writeStream.format(\'delta\')\\n    .outp

In [40]:
content_dict = json.loads(response.choices[0].message.content)
print(content_dict["summary"])
print(content_dict["code"])
print(content_dict["code"]["part1"]["ingest_orders_bronze"]["code"])
print(content_dict["code"]["part1"]["ingest_orders_bronze"]["explanation"])
print(content_dict["code"]["part2"])
print(content_dict["code"]["part3"])

Created a Pyspark program using Medallion framework for Azure Databricks following the instructions provided.
{'part1': {'ingest_orders_bronze': {'code': "spark = SparkSession.builder.appName('OrderBronzeIngest').getOrCreate()\n\norders_bronze_df = spark.readStream.format('cloudFiles')\n    .option('cloudFiles.format', 'parquet')\n    .option('cloudFiles.includeExistingFiles', 'true')\n    .option('cloudFiles.useNotifications', 'true')\n    .option('cloudFiles.url', 'dbfs:/mnt/bookstore/orders-raw')\n    .option('cloudFiles.format', 'parquet')\n    .load()\n\norders_bronze_df = orders_bronze_df.withColumn('file_name', input_file_name())\n    .withColumn('processed_timestamp', current_timestamp())\n\ncheckpoint_location = 'dbfs:/mnt/bookstore/checkpoints/orders_bronze'\n\nquery = orders_bronze_df.writeStream.format('delta')\n    .outputMode('append')\n    .option('checkpointLocation', checkpoint_location)\n    .trigger(availableNow=True)\n    .table('orders_bronze')", 'explanation': 'Re

# Validate response from LLM

In [None]:

print(message)


In [None]:
file_contents = create_notebook(json.loads(stripped_message), system_message, user_message_content, "orders_bronze_notebook-t2.py")

In [None]:
print(file_contents)

# Check into Github
*   repository : "cooolbabu/GoogleGemini101"
*   filename : "AzureDatabricks/filename" - specify actual filename
*   filecontent: Contents of the file to check in
*   tag_name: give a comment. It will show in Github
* branch: branch name to check into. Ensure that branch already exists
          Future TODO: if branch doesn't exist (notify, ask, process)



In [None]:


check_in_file(repo_name="cooolbabu/GoogleGemini101",
              file_path="AzureDatabricks/ConfigureDB/create_order_table-t1.py",
              file_content=file_contents,
              content_tag='creating orders table added control characters t5',
              branch="pyspark-genai-t2")