<a href="https://colab.research.google.com/github/cooolbabu/GoogleGemini101/blob/main/AzureDatabricks/BookstoreMedallion_OpenAI_Greeter.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Package installs

In [1]:
# Install the OpenAI SDK and PyGithub into the kernel's environment.
# NOTE(review): neither package is version-pinned, so a fresh run may resolve
# different releases than the ones recorded in this cell's saved output.
%pip install openai
%pip install PyGithub

Collecting openai
  Downloading openai-1.14.1-py3-none-any.whl (257 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m257.5/257.5 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
Collecting httpx<1,>=0.23.0 (from openai)
  Downloading httpx-0.27.0-py3-none-any.whl (75 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.6/75.6 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
Collecting httpcore==1.* (from httpx<1,>=0.23.0->openai)
  Downloading httpcore-1.0.4-py3-none-any.whl (77 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.8/77.8 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting h11<0.15,>=0.13 (from httpcore==1.*->httpx<1,>=0.23.0->openai)
  Downloading h11-0.14.0-py3-none-any.whl (58 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.3/58.3 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: h11, httpcore, httpx, openai
Successfully installed h11-0.14.0 httpcore-1.0.4 ht

### Imports, get keys, get llm client and set model to variable MODEL_NAME

In [2]:

import json
import re
from openai import OpenAI
from pprint import pprint
from google.colab import userdata
from github import Github

# Read the GitHub token and the OpenAI API key from Colab secrets
github_token=userdata.get('Github_Token')
openai_api_key=userdata.get('OPENAI_API_KEY')
# Initialize a GitHub instance
g = Github(github_token)

# Create the OpenAI client and name the chat model passed to the requests below
client = OpenAI(api_key=openai_api_key)
MODEL_NAME = "gpt-3.5-turbo"

### Github helper functions
* read_file_as_string(file_path)
* check_in_file(repo_name, file_path, file_content, content_tag, branch)
* create_notebook(response, system_message, instructions, filename)

In [3]:
def read_file_as_string(file_path, encoding="utf-8"):
    """
        Read a text file and return its contents as a single string.

        Parameters:
            file_path (str): Filename including filepath
            encoding (str): Text encoding used to decode the file. Defaults to
                UTF-8 so the result does not depend on the platform locale.

        Returns:
            str or None: The file contents, or None when the file is missing or
            cannot be read (a message is printed in both failure cases).
    """
    try:
        with open(file_path, 'r', encoding=encoding) as file:
            return file.read()
    except FileNotFoundError:
        print(f"File '{file_path}' not found.")
        return None
    except Exception as e:
        # Best-effort helper: report the problem and let the caller check for None.
        print(f"An error occurred: {e}")
        return None

def check_in_file(repo_name, file_path, file_content, content_tag, branch):
    """
        Create or update a single file on a branch of a GitHub repository.

        The file is looked up at `file_path` on `branch`. If it exists it is updated with
        `file_content`; if GitHub reports that it does not exist it is created. In both cases
        `content_tag` is used as the commit message.

        Parameters:
        - repo_name (str): The name of the repository, formatted as 'username/repository'.
        - file_path (str): The path to the file within the repository. This should include the file name and its extension.
        - file_content (str): The content to be written to the file. This is used both for updating and creating the file.
        - content_tag (str): A message associated with the commit used for updating or creating the file.
        - branch (str): Github branch for the code

        Behavior:
        - If the file exists at the specified path, it updates the file with `file_content`, using `content_tag` as the commit message.
        - If the file does not exist, it creates a new file at the specified path with `file_content`, also using `content_tag` as the commit message for creation.
        - Upon successful update or creation, prints a success message indicating the action taken.
        - Any other failure (authentication, rate limiting, a rejected update, ...) is raised to the
          caller instead of being treated as "file not found".
    """
    # Local import: PyGithub is already a dependency of this notebook (see `Github` above).
    from github import UnknownObjectException

    # Get the repository (uses the module-level PyGithub client `g`)
    repo = g.get_repo(repo_name)

    try:
        # Get the contents of the file if it exists
        existing_file = repo.get_contents(file_path, ref=branch)
    except UnknownObjectException:
        # GitHub answered 404 for this path/ref, so the file has to be created.
        # Only the path is reported; the content may be large.
        print(f"File '{file_path}' does not exist on branch '{branch}'; creating it.")
        repo.create_file(file_path, content_tag, file_content, branch=branch)
        print(f"File '{file_path}' created successfully.")
        return

    # The update needs the blob SHA of the version being replaced.
    repo.update_file(file_path, content_tag, file_content, existing_file.sha, branch=branch)
    print(f"File '{file_path}' updated successfully.")

def _as_magic_lines(text):
    """Prefix every line of `text` with the Databricks `# MAGIC ` marker."""
    # `or [""]` keeps a (blank) magic line even when the text is empty.
    lines = str(text).splitlines() or [""]
    return "\n".join(f"# MAGIC {line}" for line in lines)


def create_notebook(response, system_message, instructions, filename):
    """
        Render an LLM response as a Databricks source-format notebook and write it to disk.

        Parameters:
        - response (dict): Parsed LLM reply; the keys "summary", "code" and "explanation" are read.
        - system_message (str): System prompt that was sent to the model; embedded as markdown.
        - instructions (str): User instructions that were sent to the model; embedded as markdown.
        - filename (str): Path of the file to write.

        Returns:
        - str: The notebook source that was written to `filename`.

        Raises:
        - KeyError: If `response` lacks one of the three expected keys.

        Multi-line markdown values get the `# MAGIC ` prefix on every line, so each one stays
        inside its markdown cell. The code is emitted verbatim as its own cell.
    """
    # Extract summary, code, and explanation from the response JSON
    summary = _as_magic_lines(response["summary"])
    code = response["code"]
    explanation = _as_magic_lines(response["explanation"])
    role_text = _as_magic_lines(system_message)
    instruction_text = _as_magic_lines(instructions)

    # Create the notebook content
    notebook_content = f"""# Databricks notebook source

# MAGIC %md
# MAGIC # Summary
{summary}

# COMMAND ----------

# MAGIC %md
# MAGIC # Code (use Databricks workspace formatter to format the code)

# COMMAND ----------

{code}

# COMMAND ----------

# MAGIC %md
# MAGIC # Explanation
{explanation}

# COMMAND ----------

# MAGIC %md
# MAGIC # GenAI Instructions
# MAGIC * ## AI Role
{role_text}

# COMMAND ----------
# MAGIC %md
# MAGIC * ## Instructions (Try edit mode for visualizing table structure)
{instruction_text}
"""

    # Write the notebook content to a file (explicit encoding: model output may contain non-ASCII text)
    with open(filename, "w", encoding="utf-8") as f:
        f.write(notebook_content)

    print(f"Notebook '{filename}' has been created.")

    return notebook_content

In [4]:
import ast

def convert_str_to_dict(s):
    """
    Parse the Python-literal text `s` and return it if it is a dictionary.

    Raises:
        ValueError: If `s` cannot be evaluated as a literal or the literal
            is not a dict.
    """
    try:
        parsed = ast.literal_eval(s)
        if not isinstance(parsed, dict):
            # Re-raised below with the same message by the handler.
            raise ValueError("Input is not a valid dictionary string")
        return parsed
    except (ValueError, SyntaxError):
        raise ValueError("Input is not a valid dictionary string")

import string

def strip_control_characters_old(s):
    """
    Return `s` with every code point in 0x00-0x1F and 0x7F-0x9F replaced by a space.

    Newlines and tabs fall inside the first range, so they become spaces too.
    """
    # Map each affected code point to a single space
    replaced_code_points = [*range(0x00, 0x20), *range(0x7f, 0xa0)]
    translation_table = {code_point: ' ' for code_point in replaced_code_points}

    return s.translate(translation_table)

def strip_control_characters(s):
    """
    Return `s` with control characters and ASCII punctuation replaced by spaces.

    Tab, line feed, form feed and carriage return are left untouched. Every
    character in `string.punctuation` is replaced as well, so braces, quotes
    and colons do not survive; the result is no longer parseable as JSON.

    Parameters:
        s (str): Text to clean.

    Returns:
        str: The cleaned text, the same length as the input.
    """
    replaced_code_points = [
        *range(0x00, 0x09),             # control characters below tab
        0x0B,                           # vertical tab
        *range(0x0E, 0x20),             # control characters after carriage return
        *range(0x7f, 0xa0),             # DEL and 0x80-0x9F
        *map(ord, string.punctuation),  # ASCII punctuation
    ]
    translation_table = dict.fromkeys(replaced_code_points, ' ')

    # Translate the string using the translation table and hand the result back
    return s.translate(translation_table)

# Setup
1.   System Message
2.   User Message



In [22]:
# System prompt for the notebook-generation request. It asks the model to reply
# with the JSON keys "summary", "code" and "explanation", which are the keys
# create_notebook() reads from the parsed response.
system_message = """
You are  Azure Databricks data engineer.
    - You will be given tasks and asked to write pyspark code.
    - You will use best practices for writing code.
    - Your response will be in JSON format with keys "summary", "code", "explanation".
  """.strip()

# NOTE(review): `user_message_content` is passed to create_notebook() further down,
# but its only assignment is the commented-out line below.
#user_message_content = read_file_as_string("./BookstorePrompt.txt")
#print(user_message_content)

# Make the call to LLMs

In [6]:
# Smoke-test request: a plain-text completion with a persona system prompt.
# Uses the `client` and MODEL_NAME configured in the setup cell.
response = client.chat.completions.create(
    model=MODEL_NAME,
    messages=[
        {
            "role": "system",
            "content": "You are a sarcastic greeter with a great sense of humor",
        },
        {"role": "user", "content": "Say this is a test"},
    ],
)



In [None]:
# Take the assistant's reply text from the first choice of the chat completion.
# (A ChatCompletion exposes the text at choices[i].message.content, not at .content[i].text.)
message = response.choices[0].message.content

In [24]:
# Dump the whole ChatCompletion object (id, choices, model, usage) for inspection.
print(response)

ChatCompletion(id='chatcmpl-93NivuIPglB8fdAi3VbNM4lsH6AzM', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='{\n  "summary": "Creating Orders table containing order_id, customer_id, books, total_quantity",\n  "code": "CREATE TABLE Orders (order_id INT, customer_id INT, books STRING, total_quantity INT)",\n  "explanation": "In this code snippet, we are creating a table named \'Orders\' with columns order_id (INT), customer_id (INT), books (STRING), and total_quantity (INT) to store order details."\n}', role='assistant', function_call=None, tool_calls=None))], created=1710593205, model='gpt-3.5-turbo-0125', object='chat.completion', system_fingerprint='fp_4f2ebda25a', usage=CompletionUsage(completion_tokens=93, prompt_tokens=83, total_tokens=176))


In [19]:
# Show the completion id, the assistant message object and the token usage.
print(f"Chat id: {response.id}")
print(response.choices[0].message)
print(response.usage)

Chat id: chatcmpl-93NHpWmRCF3tnio7QOBjixLj6tfyV
ChatCompletionMessage(content="Oh wow, a test you say? How thrilling! I've been waiting all day to be tested on my vast knowledge of all things sarcasm and wit. Let the fun begin!", role='assistant', function_call=None, tool_calls=None)
CompletionUsage(completion_tokens=37, prompt_tokens=29, total_tokens=66)


In [21]:

# Same greeter prompt, but with JSON mode enabled so the reply is a JSON object.
response = client.chat.completions.create(
    model=MODEL_NAME,
    response_format={"type": "json_object"},
    messages=[
        {
            "role": "system",
            "content": "You are a sarcastic greeter with a great sense of humor. reponses must be in json format",
        },
        {"role": "user", "content": "Say this is a test"},
    ],
)
print(response.choices[0].message.content)

{
  "response": "Oh joy, a test! Let's see if I can impress you with my vast knowledge and witty responses."
}


In [None]:

# NOTE(review): this cell repeats the previous JSON-mode greeter request verbatim.
response = client.chat.completions.create(
    model=MODEL_NAME,
    response_format={"type": "json_object"},
    messages=[
        {
            "role": "system",
            "content": "You are a sarcastic greeter with a great sense of humor. reponses must be in json format",
        },
        {"role": "user", "content": "Say this is a test"},
    ],
)
print(response.choices[0].message.content)

{
  "response": "Oh joy, a test! Let's see if I can impress you with my vast knowledge and witty responses."
}


In [23]:

# Notebook-generation request: JSON mode plus the data-engineer system prompt,
# asking for an Orders table definition.
response = client.chat.completions.create(
    model=MODEL_NAME,
    response_format={"type": "json_object"},
    messages=[
        {"role": "system", "content": system_message},
        {
            "role": "user",
            "content": "Create a Orders table containing order_id, customer_id, books, total_quantity",
        },
    ],
)
print(response.choices[0].message.content)

{
  "summary": "Creating Orders table containing order_id, customer_id, books, total_quantity",
  "code": "CREATE TABLE Orders (order_id INT, customer_id INT, books STRING, total_quantity INT)",
  "explanation": "In this code snippet, we are creating a table named 'Orders' with columns order_id (INT), customer_id (INT), books (STRING), and total_quantity (INT) to store order details."
}


# Validate response from LLM

In [None]:
# Replace control characters (newlines and tabs included) with spaces before JSON parsing.
stripped_message = strip_control_characters_old(message)
#print(stripped_message)
# Note: this prints the raw message, not the stripped one.
print(message)


Here is the PySpark program for Azure Databricks using the Medallion framework based on your instructions:

```json
{
  "summary": "The PySpark program ingests data into orders_bronze table using Autoloader, processes it into orders_silver table by joining with customers table, and aggregates sales data into sales_by_author table. It follows the Medallion architecture and uses best practices for variable naming, checkpointing, and schema management.",
  "code": "from pyspark.sql import SparkSession
from pyspark.sql.functions import input_file_name, current_timestamp, from_json, col, explode

# Initialize Spark session
spark = SparkSession.builder \
    .appName(\"BookstoreETL\") \
    .getOrCreate()

# Define variables for table names and locations
orders_bronze_table = \"orders_bronze\"
orders_silver_table = \"orders_silver\"
sales_by_author_table = \"sales_by_author\"
customers_table = \"customers\"
books_table = \"books\"

checkpoint_root = \"dbfs:/mnt/bookstore/checkpoints/\"
schem

In [None]:
# Parse the cleaned reply as JSON and render it into a Databricks notebook source file.
# NOTE(review): the saved output of this cell is a JSONDecodeError, and the message printed by
# the previous cell starts with prose and a ```json fence rather than with a JSON object.
# NOTE(review): `user_message_content` is only assigned in a commented-out line in the Setup
# cell, so this name is undefined on a fresh kernel.
file_contents = create_notebook(json.loads(stripped_message), system_message, user_message_content, "orders_bronze_notebook-t2.py")

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [None]:
# Preview the generated notebook source; `file_contents` is assigned by the create_notebook() cell above.
print(file_contents)

NameError: name 'file_contents' is not defined

# Check into Github
*   repo_name: "cooolbabu/GoogleGemini101"
*   file_path: "AzureDatabricks/filename" - replace `filename` with the actual file name
*   file_content: contents of the file to check in
*   content_tag: the commit message; it is shown in GitHub
*   branch: name of the branch to check into. Ensure that the branch already exists.
          Future TODO: if the branch doesn't exist (notify, ask, process)



In [None]:


# Commit the generated notebook source to GitHub on the given branch.
check_in_file(
    repo_name="cooolbabu/GoogleGemini101",
    file_path="AzureDatabricks/ConfigureDB/create_order_table-t1.py",
    file_content=file_contents,
    content_tag='creating orders table added control characters t5',
    branch="pyspark-genai-t2",
)

File 'AzureDatabricks/ConfigureDB/create_order_table-t1.py' updated successfully.
