<a href="https://colab.research.google.com/github/cooolbabu/GoogleGemini101/blob/pyspark-genai-t1/AzureDatabricks/Boiler_code_for_Github.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Package installs

In [1]:
%pip install anthropic
%pip install PyGithub

Collecting anthropic
  Downloading anthropic-0.20.0-py3-none-any.whl (850 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/850.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m71.7/850.5 kB[0m [31m2.1 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━[0m [32m675.8/850.5 kB[0m [31m10.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m850.5/850.5 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
Collecting httpx<1,>=0.23.0 (from anthropic)
  Downloading httpx-0.27.0-py3-none-any.whl (75 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.6/75.6 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
Collecting httpcore==1.* (from httpx<1,>=0.23.0->anthropic)
  Downloading httpcore-1.0.4-py3-none-any.whl (77 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.8/77.8 

### Imports, get keys, get llm client and set model to variable MODEL_NAME

In [2]:
from anthropic import Anthropic
import json
import re
from pprint import pprint
from google.colab import userdata
from github import Github

# Get the OpenAI API key from Colab secrets
github_token=userdata.get('Github_Token')
claude_api_key=userdata.get('Claude_api_key')
# Initialize a GitHub instance
g = Github(github_token)

client = Anthropic(api_key=claude_api_key)
MODEL_NAME = "claude-3-opus-20240229"

### Github helper functions
* read_file_as_string()
* check_in_file(repo_name, file_path, file_content, content_tag, branch)

In [3]:
def read_file_as_string(file_path):
    """
        Reads the file and return a string representation of the file contents

        Parameters:
            file_path (str): Filename including filepath
    """
    try:
        with open(file_path, 'r') as file:
            file_contents = file.read()
        return file_contents
    except FileNotFoundError:
        print(f"File '{file_path}' not found.")
        return None
    except Exception as e:
        print(f"An error occurred: {e}")
        return None

def check_in_file(repo_name, file_path, file_content, content_tag, branch):
    """
        Checks if a specific file exists in a GitHub repository and updates it with new content if it does.
        If the file does not exist, it creates a new file with the provided content.

        This function operates on a specific branch named 'test'. If updating, it will commit the changes with a given content tag as the commit message.
        In case the file needs to be created, it will also use the content tag as the commit message for the new file.

        Parameters:
        - repo_name (str): The name of the repository, formatted as 'username/repository'.
        - file_path (str): The path to the file within the repository. This should include the file name and its extension.
        - file_content (str): The content to be written to the file. This is used both for updating and creating the file.
        - content_tag (str): A message associated with the commit used for updating or creating the file.
        - branch (str): Github branch for the code

        Behavior:
        - If the file exists at the specified path, it updates the file with `file_content`, using `content_tag` as the commit message.
        - If the file does not exist, it creates a new file at the specified path with `file_content`, also using `content_tag` as the commit message for creation.
        - Upon successful update or creation, prints a success message indicating the action taken.
    """

    # Get the repository
    repo = g.get_repo(repo_name)

    try:
        # Get the contents of the file if it exists
        file = repo.get_contents(file_path, ref=branch)

        # Update the file
        repo.update_file(file_path, content_tag, file_content, file.sha, branch=branch)
        print(f"File '{file_path}' updated successfully.")
    except:
        # If the file doesn't exist, create it
        print(f"{file_path}/{file_content} does not exist")
        repo.create_file(file_path, content_tag, file_content, branch=branch)
        print(f"File '{file_path}' created successfully.")

def create_notebook(response, system_message, instructions, filename):
    # Extract summary, code, and explanation from the response JSON
    summary = response["summary"]
    code = response["code"]
    explanation = response["explanation"]

    # Create the notebook content
    notebook_content = f"""# Databricks notebook source

# MAGIC %md
# MAGIC # Summary
# MAGIC {summary}

# COMMAND ----------

# MAGIC %md
# MAGIC # Code

# COMMAND ----------

{code} U+0004

# COMMAND ----------

# MAGIC %md
# MAGIC # Explanation
# MAGIC {explanation}

# COMMAND ----------

# MAGIC %md
# MAGIC # GenAI Instructions
# MAGIC * ## AI Role
# MAGIC {system_message}

# COMMAND ----------
# MAGIC %md
# MAGIC * ## Instructions
# MAGIC {instructions}
"""

    # Write the notebook content to a file
    with open(filename, "w") as f:
        f.write(notebook_content)

    print(f"Notebook '{filename}' has been created.")

    return notebook_content

In [4]:
import ast

def convert_str_to_dict(s):
    try:
        d = ast.literal_eval(s)
        if isinstance(d, dict):
            return d
        else:
            raise ValueError("Input is not a valid dictionary string")
    except (ValueError, SyntaxError):
        raise ValueError("Input is not a valid dictionary string")

import string

def strip_control_characters(s):
    # Create a translation table that maps all control characters to None
    control_chars = dict.fromkeys(range(0x00, 0x20), ' ')
    control_chars.update(dict.fromkeys(range(0x7f, 0xa0), ' '))

    # Translate the string using the translation table
    cleaned_str = s.translate(dict.fromkeys(control_chars, ' '))

    return cleaned_str

# Setup
1.   System Message
2.   User Message



In [5]:
system_message = """
You are  Azure Databricks data engineer.
    You will be given tasks and asked to write pyspark code.
    You will use best practices for writing code.
    Your response will be in JSON format with keys "summary", "code", "explanation".
    Do not include introductory line the respoonse.
  """.strip()


user_message_content = """
  I will give you schema for a table. Your task is to provide pyspark code to create the table.
  orders_bronze table schema
  root
  |-- order_id: string (nullable = true)
  |-- order_timestamp: long (nullable = true)
  |-- customer_id: string (nullable = true)
  |-- quantity: long (nullable = true)
  |-- total: integer (nullable = true)
  |-- books: array (nullable = true)
  |    |-- element: struct (containsNull = true)
  |    |    |-- book_id: string (nullable = true)
  |    |    |-- quantity: integer (nullable = true)
  |    |    |-- subtotal: long (nullable = true)
  |-- _rescued_data: string (nullable = true)
  |-- file_name: string (nullable = true)
  |-- processed_timestamp: timestamp (nullable = true)
  """.strip()

# Make the call to LLMs

In [6]:
# Create the message with variables
response = client.messages.create(
    model="claude-3-opus-20240229",
    max_tokens=1000,
    temperature=0,
    system=system_message,
    messages=[
        {"role": "user", "content": user_message_content}
    ]
)

# Assuming you have a client setup for interaction. Ensure to configure your OpenAI client appropriately.

message = response.content[0].text

# Validate response from LLM

In [7]:
stripped_message = strip_control_characters(message)


In [8]:
file_contents = create_notebook(json.loads(stripped_message), system_message, user_message_content, "orders_bronze_notebook-t2.py")

Notebook 'orders_bronze_notebook-t2.py' has been created.


In [9]:
print(file_contents)

# Databricks notebook source

# MAGIC %md
# MAGIC # Summary
# MAGIC The code creates a schema for the orders_bronze table using StructType and StructField classes from pyspark.sql.types, and then creates the table using createOrReplaceTempView method.

# COMMAND ----------

# MAGIC %md
# MAGIC # Code

# COMMAND ----------

from pyspark.sql.types import StructType, StructField, StringType, LongType, IntegerType, ArrayType, TimestampType  schema = StructType([     StructField("order_id", StringType(), True),     StructField("order_timestamp", LongType(), True),     StructField("customer_id", StringType(), True),     StructField("quantity", LongType(), True),     StructField("total", IntegerType(), True),     StructField("books", ArrayType(         StructType([             StructField("book_id", StringType(), True),             StructField("quantity", IntegerType(), True),             StructField("subtotal", LongType(), True)         ])     ), True),     StructField("_rescued_data", Strin

# Check into Github
*   repository : "cooolbabu/GoogleGemini101"
*   filename : "AzureDatabricks/filename" - specify actual filename
*   filecontent: Contents of the file to check in
*   tag_name: give a comment. It will show in Github
* branch: branch name to check into. Ensure that branch already exists
          Future TODO: if branch doesn't exist (notify, ask, process)



In [10]:


check_in_file(repo_name="cooolbabu/GoogleGemini101",
              file_path="AzureDatabricks/ConfigureDB/create_order_table-t5.py",
              file_content=file_contents,
              content_tag='creating orders table added control characters t5',
              branch="pyspark-genai-t1")

AzureDatabricks/ConfigureDB/create_order_table-t5.py/# Databricks notebook source

# MAGIC %md
# MAGIC # Summary
# MAGIC The code creates a schema for the orders_bronze table using StructType and StructField classes from pyspark.sql.types, and then creates the table using createOrReplaceTempView method.

# COMMAND ----------

# MAGIC %md
# MAGIC # Code

# COMMAND ----------

from pyspark.sql.types import StructType, StructField, StringType, LongType, IntegerType, ArrayType, TimestampType  schema = StructType([     StructField("order_id", StringType(), True),     StructField("order_timestamp", LongType(), True),     StructField("customer_id", StringType(), True),     StructField("quantity", LongType(), True),     StructField("total", IntegerType(), True),     StructField("books", ArrayType(         StructType([             StructField("book_id", StringType(), True),             StructField("quantity", IntegerType(), True),             StructField("subtotal", LongType(), True)         ])