<a href="https://colab.research.google.com/github/deep-diver/auto-data-fountain/blob/main/notebooks/pilot_modular.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install google-generativeai
!pip install pyyaml



In [2]:
import os

# Prefer the GEMINI_API_KEY environment variable so a real key never has to be
# saved inside the notebook; fall back to the "..." placeholder otherwise.
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY", "...")

In [3]:
import json

def find_json_code_snippet(raw_code_snippet):
	"""Extract and parse the outermost ``{...}`` span of a string as JSON.

	The span runs from the first ``{`` to the last ``}`` found in
	``raw_code_snippet``; any text before or after it is discarded.

	Args:
		raw_code_snippet: Text expected to contain a JSON object.

	Returns:
		The value produced by ``json.loads`` for the extracted span.

	Raises:
		ValueError: If the string contains no ``{`` or no ``}``, or if the
			extracted span cannot be parsed as JSON. In the latter case the
			original parsing error is attached as ``__cause__``.
	"""
	json_start_index = raw_code_snippet.find('{')
	json_end_index = raw_code_snippet.rfind('}')

	if json_start_index < 0 or json_end_index < 0:
		raise ValueError('No JSON code snippet found in string.')

	json_code_snippet = raw_code_snippet[json_start_index:json_end_index+1]
	try:
		# strict=False allows control characters (such as raw newlines)
		# inside JSON string values.
		return json.loads(json_code_snippet, strict=False)
	except Exception as err:
		# Catch Exception rather than using a bare except so that
		# KeyboardInterrupt/SystemExit are not converted into ValueError.
		raise ValueError('failed to parse string into JSON format') from err

def parse_first_json_code_snippet(code_snippet):
	"""Parse the first JSON object found in a string or in a list of strings.

	Args:
		code_snippet: Either a single string, or a list of strings that are
			tried in order.

	Returns:
		The parsed JSON value. For a list input this is the result for the
		first element that parses; ``None`` is returned when no element
		parses (including an empty list).

	Raises:
		ValueError: For a non-list input from which no JSON could be parsed.
			The message and ``__cause__`` carry the underlying failure.
	"""
	if isinstance(code_snippet, list):
		for code_snippet_piece in code_snippet:
			try:
				return find_json_code_snippet(code_snippet_piece)
			except Exception:
				# Best effort: skip pieces without parseable JSON and try
				# the next one.
				continue
		return None

	try:
		return find_json_code_snippet(code_snippet)
	except Exception as e:
		# Keep printing the reason: a caller that swallows the exception
		# would otherwise hide why parsing failed.
		print(e)
		raise ValueError(str(e)) from e

In [4]:
def determine_model_name(given_image=None):
  """Choose the Gemini model name based on whether an image is supplied.

  Args:
    given_image: Image data, or None for a text-only request.

  Returns:
    "gemini-pro" when ``given_image`` is None, otherwise "gemini-pro-vision".
  """
  return "gemini-pro" if given_image is None else "gemini-pro-vision"

def construct_image_part(given_image):
  """Wrap image data in a prompt-part dict tagged with the JPEG MIME type.

  Args:
    given_image: The image data to place under the "data" key.

  Returns:
    A dict with "mime_type" set to "image/jpeg" and "data" set to
    ``given_image``.
  """
  image_part = dict(mime_type="image/jpeg", data=given_image)
  return image_part

def call_gemini(prompt="", API_KEY=None, given_text=None, given_image=None, generation_config=None, safety_settings=None):
  """Send a prompt, optionally with an image, to Gemini and return the reply text.

  Args:
    prompt: Text placed first in the list of prompt parts.
    API_KEY: Key passed to ``genai.configure``.
    given_text: Accepted for interface compatibility; not used by this function.
    given_image: Optional image data. When provided, it is appended to the
      prompt parts via ``construct_image_part`` and also influences the model
      name chosen by ``determine_model_name``.
    generation_config: Generation settings; when None, a default with
      temperature 0.9, top_p 1, top_k 32 and max_output_tokens 8192 is used.
    safety_settings: Safety settings; when None, all four harm categories
      listed below are set to "BLOCK_ONLY_HIGH".

  Returns:
    The ``text`` attribute of the response from ``generate_content``.
  """
  import google.generativeai as genai
  genai.configure(api_key=API_KEY)

  if generation_config is None:
    generation_config = dict(
      temperature=0.9,
      top_p=1,
      top_k=32,
      max_output_tokens=8192,
    )

  if safety_settings is None:
    harm_categories = (
      "HARM_CATEGORY_HARASSMENT",
      "HARM_CATEGORY_HATE_SPEECH",
      "HARM_CATEGORY_SEXUALLY_EXPLICIT",
      "HARM_CATEGORY_DANGEROUS_CONTENT",
    )
    safety_settings = [
      {"category": harm_category, "threshold": "BLOCK_ONLY_HIGH"}
      for harm_category in harm_categories
    ]

  gemini_model = genai.GenerativeModel(
    model_name=determine_model_name(given_image),
    generation_config=generation_config,
    safety_settings=safety_settings,
  )

  # The text prompt always comes first; the image part follows when present.
  if given_image is None:
    parts = [prompt]
  else:
    parts = [prompt, construct_image_part(given_image)]

  return gemini_model.generate_content(parts).text

In [5]:
!mkdir counsel

In [7]:
%%writefile counsel/diagram.mermaid

erDiagram
    COUNSELOR ||--|{ COUNSELEE : "provides counseling to"

    %% Comments for relationship attributes
    %% Start date: 2024-02-14
    %% Frequency: Weekly
    %% Topic: marriage guidance

Overwriting counsel/diagram.mermaid


In [19]:
%%writefile counsel/setup.yaml

initial_prompt: |
  %s # erDiagram
  %s # delimiter
  The above erDiagram describes the basic setup of a certain scene.

  Generate possible first few conversations between a user and an assistant based on the erDiagram.
  The conversations should sound natural and logical.
  The direction or the style of the conversations should be "%s".
  The conversations should occur without exposing the underlying information of the erDiagram.

  The user should play the role of "%s" appeared in the erDiagram. The user should focus on the given role.
  The assistant should play the role of "%s" appeared in the erDiagram. The assistant should focus on the given role.
  Based on the words that the user say, the assistant gives appropriate, detailed, and long answers.

derivational_prompt: |
  %s # erDiagram
  %s # delimiter
  The above erDiagram describes the basic setup of a certain scene.

  Generate possible follow-up conversations between a user and an assistant based on the erDiagram and the first few conversations: %s # first few conversation in JSON str.
  The conversations should sound natural and logical.
  The direction or the style of the conversations should be "%s".
  The conversations should occur without exposing the underlying information of the erDiagram.

  The user should play the role of "%s" appeared in the erDiagram. The user should focus on the given role.
  The assistant should play the role of "%s" appeared in the erDiagram. The assistant should focus on the given role.
  Based on the words that the user say, the assistant gives appropriate, detailed, and long answers.

output_format: |
  The generated conversations are recorded in a valid JSON as
  {"conversations":[{"user": text, "assistant": text},...]}.

delimiter: "------------------------------"

user_role: COUNSELEE
assistant_role: COUNSELOR

seed_evolving_directions:
  - general
  - diverse

derivational_evolving_directions:
  - general
  - in-depth

# er_diagram: |
#    COUNSELOR ||--|{ COUNSELEE : "provides counseling to"
#
#    %% Comments for relationship attributes
#    %% Start date: 2024-02-14
#    %% Frequency: Weekly
#    %% Topic: marriage guidance

# er_diagram_path: diagram.mermaid

Overwriting counsel/setup.yaml


In [9]:
import os
import yaml

def get_setup_and_mermaid(folder_path):
    """Load ``setup.yaml`` from a folder together with its ER diagram text.

    The diagram text is taken from, in order of precedence:
      1. the ``er_diagram`` key of the setup (inline text);
      2. the file named by the ``er_diagram_path`` key, relative to
         ``folder_path``;
      3. ``diagram.mermaid`` inside ``folder_path``.

    Args:
        folder_path: Directory containing ``setup.yaml``.

    Returns:
        A ``(setup, mermaid)`` tuple: the parsed setup mapping and the
        diagram text with surrounding whitespace stripped.

    Raises:
        FileNotFoundError: If ``setup.yaml`` or the diagram file is missing.
        ValueError: If ``setup.yaml`` does not contain a mapping (for
            example, when the file is empty).
    """
    mermaid = None
    setup_yaml_path = os.path.join(folder_path, 'setup.yaml')
    mermaid_path = os.path.join(folder_path, 'diagram.mermaid')

    if not os.path.isfile(setup_yaml_path):
        raise FileNotFoundError(f"setup.yaml not found in {folder_path}")

    with open(setup_yaml_path, 'r') as file:
        setup = yaml.safe_load(file)

    # safe_load yields None for an empty document; fail with a clear message
    # instead of a TypeError on the membership tests below.
    if not isinstance(setup, dict):
        raise ValueError(f"{setup_yaml_path} must contain a YAML mapping")

    if 'er_diagram' in setup:
        mermaid = setup['er_diagram']
    elif 'er_diagram_path' in setup:
        mermaid_path = os.path.join(folder_path, setup['er_diagram_path'])

    if mermaid is None:
        if not os.path.isfile(mermaid_path):
            # Report the path actually looked up, which may come from
            # er_diagram_path rather than the default file name.
            raise FileNotFoundError(f"ER diagram file not found: {mermaid_path}")

        with open(mermaid_path, 'r') as file:
            mermaid = file.read()

    return setup, mermaid.strip()

In [22]:
# Folder that holds setup.yaml and, by default, diagram.mermaid.
folder = "counsel"

# Load the prompt configuration and the ER diagram text from that folder.
setup, mermaid = get_setup_and_mermaid(folder)

In [27]:
def gen_data(prompt, retry_num):
  """Call Gemini with ``prompt`` and parse JSON from the reply, retrying on failure.

  Up to ``retry_num + 1`` attempts are made (one initial attempt plus
  ``retry_num`` retries). An attempt fails when the API call or the JSON
  parsing raises, or when parsing yields None.

  Args:
    prompt: Prompt text forwarded to ``call_gemini``.
    retry_num: Number of retries allowed after the first attempt.

  Returns:
    The parsed JSON data from the first successful attempt, or None when
    every attempt fails.
  """
  # A bounded for-loop guarantees termination even if an attempt produces
  # None without raising.
  for _ in range(retry_num + 1):
    try:
      data_json = call_gemini(
        prompt=prompt,
        API_KEY=GEMINI_API_KEY
      )
      data = parse_first_json_code_snippet(data_json)
    except Exception:
      # Best effort: try again. Exception (not a bare except) keeps
      # KeyboardInterrupt able to stop a long generation run.
      continue

    if data_json is not None and data is not None:
      return data

  return None

In [28]:
def gen_seeds(setup, mermaid, retry_num=4):
  """Generate one seed conversation set per seed evolving direction.

  For each entry of ``setup["seed_evolving_directions"]`` the initial prompt
  template is filled with the diagram, delimiter, direction and the two
  roles, the output-format instructions are appended on a new line, and the
  result is passed to ``gen_data``.

  Args:
    setup: Mapping providing 'initial_prompt', 'output_format', 'delimiter',
      'user_role', 'assistant_role' and 'seed_evolving_directions'.
    mermaid: ER diagram text substituted into the prompt.
    retry_num: Retry count forwarded to ``gen_data``.

  Returns:
    List of the non-None results of ``gen_data``, in direction order.
  """
  template = setup['initial_prompt']
  format_instructions = setup['output_format']
  separator = setup['delimiter']

  user = setup['user_role']
  assistant = setup['assistant_role']

  directions = setup["seed_evolving_directions"]

  def build_prompt(direction):
    body = template % (mermaid, separator, direction, user, assistant)
    return "{}\n{}".format(body, format_instructions)

  # Lazily generate one candidate per direction; failed generations (None)
  # are dropped.
  candidates = (gen_data(build_prompt(direction), retry_num) for direction in directions)
  return [seed for seed in candidates if seed is not None]

In [29]:
# Generate seed conversations, one attempt set per entry in
# setup["seed_evolving_directions"]; failed generations are left out.
seeds = gen_seeds(setup, mermaid)

failed to parse string into JSON format


In [30]:
# Number of seed conversation sets that were generated.
len(seeds)

2

In [34]:
import json

# Pretty-print the seed conversations as JSON with 2-space indentation.
print(json.dumps(seeds, indent=2))

[
  {
    "conversations": [
      {
        "user": "Hi, I'd like to make an appointment with a counselor.",
        "assistant": "Certainly, I can help you with that. Can I get your name and contact information, please?"
      },
      {
        "user": "My name is John Doe. My number is 555-555-5555.",
        "assistant": "Thank you, Mr. Doe. What day and time would you like to come in?"
      },
      {
        "user": "I'm available on Mondays at 2pm or Thursdays at 10am.",
        "assistant": "Okay, let me check our availability. We have an opening on Thursday at 10am. Does that work for you?"
      },
      {
        "user": "That works perfectly. Thank you.",
        "assistant": "You're welcome. I'll go ahead and book your appointment. You'll receive a confirmation email shortly."
      }
    ]
  },
  {
    "conversations": [
      {
        "user": "Hi, I'm interested in getting some counseling.",
        "assistant": "I'm happy to help. Can you tell me a little bit about w

In [37]:
def gen_derivations(setup, mermaid, seed_conversations, retry_num=4, d_factor=4):
  """Generate follow-up conversations from growing prefixes of each seed.

  For every seed, the turns are added one at a time to a running prefix.
  After each added turn, and for every entry of
  ``setup["derivational_evolving_directions"]``, the derivational prompt is
  filled in (with the prefix serialized as JSON) and ``gen_data`` is called
  ``d_factor`` times with that same prompt.

  Args:
    setup: Mapping providing 'derivational_prompt', 'output_format',
      'delimiter', 'user_role', 'assistant_role' and
      'derivational_evolving_directions'.
    mermaid: ER diagram text substituted into the prompt.
    seed_conversations: Iterable of dicts, each with a 'conversations' list.
    retry_num: Retry count forwarded to ``gen_data``.
    d_factor: Number of generations requested per (prefix, direction) pair.

  Returns:
    List of the non-None results of ``gen_data``, in generation order.
  """
  derivational_prompt = setup['derivational_prompt']
  output_format = setup['output_format']
  delimiter = setup['delimiter']

  user_role = setup['user_role']
  assistant_role = setup['assistant_role']

  outputs = []
  derivational_evolving_directions = setup["derivational_evolving_directions"]

  for seed_conversation in seed_conversations:
    # Running prefix of the seed: grows by one turn per inner iteration.
    base_conversation = {'conversations': []}

    for conversation in seed_conversation['conversations']:
      base_conversation['conversations'].append(conversation)

      for evolving_direction in derivational_evolving_directions:
        prompt = derivational_prompt % (mermaid, delimiter, json.dumps(base_conversation), evolving_direction, user_role, assistant_role)
        prompt = f"{prompt}\n{output_format}"

        for _ in range(d_factor):
          output = gen_data(prompt, retry_num)

          if output is not None:
            outputs.append(output)

  return outputs

In [38]:
# Derive follow-up conversations from the seeds using the default
# retry_num and d_factor of gen_derivations.
outputs = gen_derivations(setup, mermaid, seeds)

In [39]:
# Number of derived conversation sets that were generated.
len(outputs)

72

In [40]:
import json

# Pretty-print the derived conversations as JSON with 2-space indentation.
print(json.dumps(outputs, indent=2))

[
  {
    "conversations": [
      {
        "user": "Hello, I'd like to book an appointment with a therapist.",
        "assistant": "I'd be happy to schedule one for you. Can I get your name and contact details?"
      },
      {
        "user": "Thanks, my name is Tim. My number is 555-123-4567.",
        "assistant": "Okay, Tim. Unfortunately, I'm unavailable this week, but I have some openings next week. Does that work for you?"
      },
      {
        "user": "Yes, that works. What day works best for you?",
        "assistant": "I'm available on Monday, Wednesday, or Friday. Which day would you prefer?"
      },
      {
        "user": "I'll take Monday.",
        "assistant": "Okay, Monday it is. I'll send you a confirmation email with the details."
      },
      {
        "user": "Perfect. Thank you.",
        "assistant": "You're welcome, Tim. I look forward to working with you."
      }
    ]
  },
  {
    "conversations": [
      {
        "user": "Hi, I'm looking for a cou