In [None]:
!pip install cohere --upgrade
!python -m pip install cohere --upgrade
!pip install --upgrade cohere retrying ratelimit

In [None]:
!pip install transformers

# Code Generation for chart-creation

In [3]:
from dataclasses import dataclass
from typing import List
import pandas as pd
import time
import json
import cohere
from pdb import set_trace
from tqdm.contrib.concurrent import thread_map
from functools import partial
from tqdm.auto import tqdm
from retrying import retry
from ratelimit import limits
from cohere.base_client import ApiError
import httpx
from getpass import getpass
import os
from google.colab import drive
import random
import pickle

In [5]:
# Mount the Google Drive at /content/drive
drive.mount('/content/drive/', force_remount=True)

# Verify by listing the files in the drive
#!ls "/content/drive/MyDrive/"

Mounted at /content/drive/


In [47]:
os.environ["CO_API_KEY"] = getpass(" Enter Cohere Key ")

 Enter Cohere Key ··········


# CSV as Input: Data Loading

In [66]:
dataframe = pd.read_csv("/content/drive/MyDrive/Code_Based_Synthetic_Data_Gen/dataset.csv")
dataframe.head()
dataframe.shape

(10000, 21)

In [68]:


# Prompt template asking the model to emit `generate_plot` scripts, each in its
# own ```python fenced block. It is filled with `str.format`, which substitutes
# three placeholders: {topic}, {figure_type} and {data}.
# NOTE(review): the text asks for 80 functions that all share the name
# `generate_plot`, one per script — the fenced blocks are what the extraction
# cell later splits on.
GENERATE_CHART_CODE_MATPLOTLIB_PROMPT = """You are an expert in data analysis and good at writing code (Python `matplotlib`) to generate plots.
I have some data about {topic} which can be used to generate a {figure_type}. Your task is to generate and output all 80 python functions.

Here is the data (CSV format, already loaded as a pd.DataFrame):
<data>
{data}
</data>

Please define 80 different Python function (using `matplotlib`) called `generate_plot` that generates a {figure_type} using the data provided, using various columns. However, for every chart, ensure that the data being represented is restricted to 10 to 15 values, for making it easier to read the chart, and prevent overlapping of labels. Every function should be in a seperate script. Here are the requirements:
1. **Code Requirements**: create a function called `generate_plot` that generates the chart using `matplotlib`.
    (1) The data, which is loaded in as a `pd.DataFrame` is taken as an argument. The function has no other arguments. The data needs to be loaded as a pd.DataFrame. You may need to adjust the data format or hard code some data to fit the `matplotlib` specification.
    (2) Remember to import necessary libraries (e.g., `import numpy as np`, `import matplotlib.pyplot as plt`) at the beginning of the script. Also import (`import pandas as pd`).
    (3) The function should always return a `PIL.Image.Image` object.
    (4) Ensure that the plot is properly labeled, with all labels visible and has a title.
    (5) The `generate_plot` function should save the plot to a BytesIO and then return the plot as a PIL Image object. **Do not close the BytesIO object.**
    (6) Use `bbox_inches='tight'` argument in `savefig` or `plt.tight_layout()`, ensuring the plot is saved with all the elements (title, labels, etc) visible.
    (7) Only define the function and do not call it. Do not show the plot. Save the plot with appropriate resolution. No need to show example usage.

2. **Output Requirements**:
    Put ```python at the beginning and ``` at the end of the script to separate the code from the text. This will help me easily extract the code.

Please don't answer with any additional text in the script or at the end, your whole response should be the Python code which can be directly executed. Ensure that all 80 python functions are produced as output."""



# System message sent with every chat request.
SYS_PROMPT = """
You are a helpful AI assistant.
"""

# The prompt promises the data "in CSV format". Interpolating the DataFrame
# itself inserts its truncated repr (rows and columns elided with "..."), so
# the model cannot see every column name. Send a real CSV sample instead.
PROMPT_SAMPLE_ROWS = 20  # rows of the dataset shown to the model
data_csv_sample = dataframe.head(PROMPT_SAMPLE_ROWS).to_csv(index=False)

# One chat (system + user message) per chart type.
formatted_prompts_updated = []
chart_types = ["bar", "line", "pie", "scatter"]
for chart_type in chart_types:
    messages = [
        {"role": "system", "content": SYS_PROMPT},
        {
            "role": "user",
            "content": GENERATE_CHART_CODE_MATPLOTLIB_PROMPT.format(
                topic="world_happiness_report",
                figure_type=chart_type,
                data=data_csv_sample,
            ),
        },
    ]
    formatted_prompts_updated.append(messages)

class CohereClientWrapper:
    """Wrapper around a Cohere v2 chat client that retries transient failures.

    Args:
        wait_exponential_multiplier_ms: Multiplier (milliseconds) passed to
            ``retrying.retry`` as ``wait_exponential_multiplier``.
        wait_exponential_max_ms: Upper bound (milliseconds) passed as
            ``wait_exponential_max``.
        stop_max_attempt_number: Maximum number of attempts per prompt.
        client: Optional pre-built client exposing ``chat(...)``. When omitted
            a ``cohere.ClientV2()`` is created.
    """

    # HTTP status codes below 500 that are still worth retrying
    # (request timeout, too many requests).
    _RETRYABLE_CLIENT_STATUS_CODES = frozenset({408, 429})

    def __init__(self, wait_exponential_multiplier_ms, wait_exponential_max_ms,
                 stop_max_attempt_number, client=None):
        # No trailing comma here: a tuple multiplier breaks the backoff
        # arithmetic inside `retrying`.
        self.wait_exponential_multiplier_ms = wait_exponential_multiplier_ms
        self.wait_exponential_max_ms = wait_exponential_max_ms
        self.stop_max_attempt_number = stop_max_attempt_number
        # Set by `get_parrallel_completions`; defined here so that a missing
        # model produces a clear error instead of an AttributeError.
        self.model_name = None
        self.client = cohere.ClientV2() if client is None else client

    def get_completion(self, prompt, **kwargs) -> str:
        """Send one chat request and return the text of the first content item.

        Args:
            prompt: Value forwarded as ``messages`` to ``client.chat``.
            **kwargs: Extra keyword arguments forwarded to ``client.chat``.

        Returns:
            ``response.message.content[0].text`` of the chat response.

        Raises:
            ValueError: If ``self.model_name`` has not been set.
        """
        if self.model_name is None:
            raise ValueError(
                "model_name is not set; call get_parrallel_completions(...) "
                "or assign `model_name` before requesting completions."
            )

        retryable_statuses = self._RETRYABLE_CLIENT_STATUS_CODES

        def should_retry_on_exception(e: Exception):
            # Rate limiting and network-level failures are always retried.
            if isinstance(e, (cohere.TooManyRequestsError, httpx.ReadTimeout,
                              httpx.RemoteProtocolError, httpx.ConnectError)):
                return True
            if isinstance(e, ApiError):
                status = e.status_code
                if status is None:
                    return False
                # Retry server-side errors and the few retryable 4xx codes;
                # other 4xx responses will not succeed on a retry.
                return status >= 500 or status in retryable_statuses
            return False

        @retry(wait_exponential_multiplier=self.wait_exponential_multiplier_ms,
               wait_exponential_max=self.wait_exponential_max_ms,
               stop_max_attempt_number=self.stop_max_attempt_number,
               retry_on_exception=should_retry_on_exception)
        def _run(prompt):
            response = self.client.chat(
                messages=prompt,
                **kwargs,
                model=self.model_name,
                temperature=0.4,
                response_format={"type": "text"},
            )
            # Fixed pause after each successful call to throttle the request rate.
            time.sleep(2)

            return response.message.content[0].text

        return _run(prompt)

    def get_parrallel_completions(self, prompts: List[str], model_name, **kwargs) -> List[str]:
        """Request a completion for every prompt and return them in order.

        Args:
            prompts: Prompts to send; each item is passed to ``get_completion``.
            model_name: Model identifier stored on the instance and used for
                every request.
            **kwargs: ``desc`` (optional) labels the progress bar; all other
                keyword arguments are forwarded to ``get_completion``.

        Returns:
            List of completion texts, one per prompt.
        """
        self.model_name = model_name
        desc = kwargs.pop("desc", None)
        _get_completion_fn = partial(self.get_completion, **kwargs)
        # max_workers=1: requests are issued one at a time.
        responses = thread_map(_get_completion_fn, prompts,
                               max_workers=1, desc=desc)
        return list(responses)

# Positional arguments are (wait_exponential_multiplier_ms,
# wait_exponential_max_ms, stop_max_attempt_number): multiplier and cap are
# both 10000 ms, with at most 60 attempts per prompt.
test_exec = CohereClientWrapper(10000,10000,60)
# One response per entry of `formatted_prompts_updated`, in the same order.
responses_all = test_exec.get_parrallel_completions(formatted_prompts_updated, model_name="command-a-03-2025")


# Script extraction and image generation

In [None]:
import re
import subprocess
import json

# Placeholders read by `extract_and_run_python_code_from_json_list`; fill them
# in before running this cell.
figtype = ""    # used in the scripts_*/images_* directory and file names
datatype = ""   # second component of those directory and file names
data_path = ""  # passed to every generated script as its first CLI argument

def extract_and_run_python_code_from_json_list(responses, fig_type=None, data_type=None,
                                               csv_path=None, base_dir="."):
    """Extract ```python blocks from model responses, save them and run them.

    Every fenced block is written to
    ``<base_dir>/scripts_<fig>_<data>/extracted_code_<fig>_<n>.py`` with a
    ``__main__`` footer appended, then executed as
    ``python script.py <csv_path> <image_path>`` so that the script stores its
    chart at ``<base_dir>/images_<fig>_<data>/<fig>_<data>_<n>.png``.

    Args:
        responses: Iterable of response texts containing ```python fences.
        fig_type: Chart type used in names; defaults to the global ``figtype``.
        data_type: Dataset tag used in names; defaults to the global ``datatype``.
        csv_path: CSV file handed to each script; defaults to the global
            ``data_path``.
        base_dir: Directory under which the script and image folders live.

    Returns:
        None. The captured stdout/stderr of every script is printed.
    """
    import os
    import sys

    fig_type = figtype if fig_type is None else fig_type
    data_type = datatype if data_type is None else data_type
    csv_path = data_path if csv_path is None else csv_path

    # Footer appended to each snippet: load the CSV named by argv[1], call
    # generate_plot and save the returned image to argv[2].
    runner_footer = (
        '\nif __name__ == "__main__":\n'
        "    import sys\n"
        "    df = pd.read_csv(sys.argv[1])\n"
        "    result = generate_plot(df)\n"
        "    result.save(sys.argv[2])\n"
    )
    code_pattern = re.compile(r'```python(.*?)```', re.DOTALL)

    scripts_dir = os.path.join(base_dir, f"scripts_{fig_type}_{data_type}")
    images_dir = os.path.join(base_dir, f"images_{fig_type}_{data_type}")
    # Without these the first open()/save() fails on a fresh runtime.
    os.makedirs(scripts_dir, exist_ok=True)
    os.makedirs(images_dir, exist_ok=True)

    # Numbering continues across responses so that the scripts of a later
    # response do not overwrite those of an earlier one.
    script_number = 0
    for response_text in responses:
        matches = code_pattern.findall(response_text)
        print(len(matches))

        for code in matches:
            script_number += 1
            filename = os.path.join(scripts_dir, f"extracted_code_{fig_type}_{script_number}.py")
            # The "_" before the number matches the image path recorded by the
            # QA-writing cell (`{figtype}_{datatype}_{n}.png`).
            outfile_path = os.path.join(images_dir, f"{fig_type}_{data_type}_{script_number}.png")

            with open(filename, 'w', encoding='utf-8') as file:
                file.write(code + runner_footer)

            # sys.executable runs the script with the kernel's interpreter, so
            # it sees the same installed packages as this notebook.
            result = subprocess.run([sys.executable, filename, csv_path, outfile_path],
                                    capture_output=True, text=True)
            print("Output:", result.stdout)
            print("Errors:", result.stderr)

extract_and_run_python_code_from_json_list(responses_all)


# Question-Answer Generation

In [76]:

# Question/answer prompt for bar charts. Filled with `str.format`, which
# substitutes {code} (the plotting script) and {data} (the dataset). The model
# is asked to reply with a ```-fenced CSV of questions and answers.
GENERATE_QUESTION_ANSWER_PAIRS_BAR = """
You are an expert in data analysis and good at generating questions and answers based on data visualizations.
I have some code that generates a bar chart using `matplotlib`. And a corresponding dataframe.

Here is the code:
<code>
{code}
</code>

Here is the data (CSV format, already loaded as a pd.DataFrame):
<data>
{data}
</data>


Please generate a set of questions and answers based on the bar chart generated by this code. Use the following templates for the questions:
1. What is the total value for [category]?
2. What is the average value across all categories?
3. How many categories have a value greater than [threshold]?
4. Which category has the highest total value?
5. Which category has the lowest total value?
6. What is the combined total value of [category1] and [category2]?
7. What is the difference in total value between [category1] and [category2]?
8. What percentage of the total value does [category] represent?
9. How many categories have values less than [threshold]?
10. What is the median value across all categories?


Please provide the questions and their corresponding answers as a csv. Mark with ``` where the csv begins and ends. Output no other text at the end.
"""


# Question/answer prompt for line charts. Filled with `str.format`, which
# substitutes {code} (the plotting script) and {data} (the dataset).
# Fixes two copy-paste slips from the bar-chart prompt: the text now says
# "line chart" throughout, and the placeholder is spelled "[specific point]"
# to match the question templates below it.
GENERATE_QUESTION_ANSWER_PAIRS_LINE = """
You are an expert in data analysis and good at generating questions and answers based on data visualizations.
I have some code that generates a line chart using `matplotlib`. And a corresponding dataframe.

Here is the code:
<code>
{code}
</code>

Here is the data (CSV format, already loaded as a pd.DataFrame):
<data>
{data}
</data>


Please generate a set of questions and answers based on the line chart generated by this code. For reference for [category], you can use the dataframe provided w.r.t line chart, and for [specific point] you can use the code for the line chart and dataframe as reference. Only specifically generate questions for the provided line chart, and nothing else:
Specify the answer as answer as 1 or two words/ numerical value and not entire sentences. Use the following templates for the questions:
1. What is the value for [category] at [specific point]?
2. What is the trend shown in the line chart?
3. Identify any peaks or troughs in the chart and explain their significance.
4. How many categories have values greater than [threshold] at [specific point]?
5. What is the difference in value between [category1] and [category2] at [specific point]?
6. What is the average value across all categories at [specific point]?
7. Describe the overall trend for [category].
8. What is the combined value of [category1] and [category2] at [specific point]?
9. What is the maximum value observed in the chart?
10. What is the minimum value observed in the chart?


Please provide the questions and their corresponding answers as a csv. Mark with ``` where the csv begins and ends. Output no other text at the end.
"""

# Question/answer prompt for scatter plots. Filled with `str.format`, which
# substitutes {code} (the plotting script) and {data} (the dataset).
# Fixes copy-paste slips from the bar-chart prompt: every reference to a
# "bar chart" now names the scatter plot, so the model is not told to write
# questions about a chart type that the code does not draw.
GENERATE_QUESTION_ANSWER_PAIRS_SCATTER = """
You are an expert in data analysis and good at generating questions and answers based on data visualizations.
I have some code that generates a scatter plot using `matplotlib`. For reference for [category], you can use the dataframe provided w.r.t scatter chart. Only specifically generate questions for the provided scatter chart, and nothing else:
Specify the answer as answer as 1 or two words/ numerical value and not entire sentences. And a corresponding dataframe.

Here is the code:
<code>
{code}
</code>

Here is the data (CSV format, already loaded as a pd.DataFrame):
<data>
{data}
</data>



Please generate a set of questions and answers based on the scatter plot generated by this code. For reference for [category] and [variable], you can use the dataframe provided w.r.t scatter plot. Only specifically generate questions for the provided scatter plot, and nothing else:
Specify the answer as answer as 1 or two words/ numerical value and not entire sentences. Use the following templates for the questions:
1. What is the value of [variable] for  [category]?
2. What is the trend shown in the scatter plot?
3. Identify any clusters in the scatter plot and explain their significance.
4. How many points have a [variable] value greater than [threshold]?
5. What is the difference in [variable]value between  [category1] and  [category2]?
6. What is the average [variable] value across all points?
7. Describe the overall distribution of [variable].
8. What is the combined [variable]value of [category1] and [category2]?
9. What is the maximum [variable] value observed in the scatter plot?
10. What is the minimum [variable] value observed in the scatter plot?


Please provide the questions and their corresponding answers as a csv. Mark with ``` where the csv begins and ends. Output no other text at the end.
"""

# Question/answer prompt for pie charts. Filled with `str.format`, which
# substitutes {code} (the plotting script) and {data} (the dataset). The model
# is asked to reply with a ```-fenced CSV of questions and answers.
GENERATE_QUESTION_ANSWER_PAIRS_PIE = """You are an expert in data analysis and good at generating questions and answers based on data visualizations.
I have some code that generates a pie chart using `matplotlib`.

Here is the code:
<code>
{code}
</code>

Here is the data (CSV format, already loaded as a pd.DataFrame):
<data>
{data}
</data>


Please generate a set of questions and answers based on the pie chart generated by this code. For reference for [category], you can use the dataframe provided w.r.t pie chart. Only specifically generate questions for the provided pie chart, and nothing else:
Specify the answer as answer as 1 or two words/ numerical value and not entire sentences. Use the following templates for the questions:
1. What percentage of the total value does [category1]represent?
2. Which category has the largest value?
3. Which category has the smallest value?
4. What is the combined percentage of the total value for [category1] and [category2]?
5. What is the difference in percentage between [category1]and [category2]?
6. How many categories have a value greater than [threshold]?
7. What is the combined value of [category1] and [category2]?
8. What is the average value across all categories?
9. What is the total value represented in the pie chart?
10. How many categories have values less than [threshold]?


Please provide the questions and their corresponding answers as a csv. Mark with ``` where the csv begins and ends. Output no other text at the end."""



# System message placed first in every question/answer chat.
SYS_PROMPT = "\nYou are a helpful AI assistant.\n"

# Chats (system + user message) collected for the question/answer requests.
formatted_prompts = list()

In [None]:
formatted_prompts = []
responses_all=[]
# Placeholders: set these to the chart type / dataset tag used when the
# scripts were extracted.
figtype = ""
datatype = ""

MAX_SCRIPTS = 40     # upper bound on the number of scripts turned into prompts
QA_PROMPT_ROWS = 20  # rows of the dataset shown to the model

# Pick the template that matches the chart type; unknown or empty values fall
# back to the bar-chart template, which the original cell always used.
QA_PROMPT_BY_FIGTYPE = {
    "bar": GENERATE_QUESTION_ANSWER_PAIRS_BAR,
    "line": GENERATE_QUESTION_ANSWER_PAIRS_LINE,
    "scatter": GENERATE_QUESTION_ANSWER_PAIRS_SCATTER,
    "pie": GENERATE_QUESTION_ANSWER_PAIRS_PIE,
}
qa_prompt_template = QA_PROMPT_BY_FIGTYPE.get(figtype, GENERATE_QUESTION_ANSWER_PAIRS_BAR)

# The templates promise CSV; formatting the DataFrame itself would insert its
# truncated repr, so send a CSV sample with every column instead.
data_csv_sample = dataframe.head(QA_PROMPT_ROWS).to_csv(index=False)

for i in range(1, MAX_SCRIPTS + 1):
    script_file = f"scripts_{figtype}_{datatype}/extracted_code_{figtype}_{i}.py"
    # Scripts are numbered contiguously from 1. Stopping at the first gap keeps
    # response index k aligned with script k+1 for the CSV-writing cell.
    if not os.path.exists(script_file):
        print(f"Stopping before script {i}: {script_file} not found")
        break
    with open(script_file) as f:
        code = f.read()
    code = "```python" + code + "```"

    messages = [
        {"role": "system", "content": SYS_PROMPT},
        {"role": "user", "content": qa_prompt_template.format(code=code, data=data_csv_sample)},
    ]
    formatted_prompts.append(messages)

responses = test_exec.get_parrallel_completions(formatted_prompts, model_name="command-a-03-2025")

# Write CSVs of QA pairs

In [None]:
import csv
import os
import re

import pandas as pd

DATA_DOMAIN = "diabetes"
QA_OUTPUT_DIR = f"question_answer_label_{figtype}_{DATA_DOMAIN}"
# Contents of the first ``` fenced block (optionally tagged, e.g. ```csv).
FENCED_BLOCK_PATTERN = re.compile(r"```[a-zA-Z]*[ \t]*\r?\n(.*?)```", re.DOTALL)


def parse_qa_csv(response_text):
    """Turn one model response into a two-column Question/Answer DataFrame.

    The CSV is taken from the first ``` fenced block, or from the whole text
    when no fence is present. It is parsed with the ``csv`` module, so quoted
    fields may contain commas. A header row whose cells start with
    "question" / "answer" is dropped, and rows with fewer than two cells
    (blank lines, stray fence lines) are ignored.

    Args:
        response_text: Raw text returned by the model.

    Returns:
        DataFrame with columns ``Question`` and ``Answer``; empty when no
        usable row was found.
    """
    fenced = FENCED_BLOCK_PATTERN.search(response_text)
    payload = fenced.group(1) if fenced else response_text

    rows = []
    for cells in csv.reader(payload.strip().splitlines()):
        cells = [cell.strip() for cell in cells]
        if len(cells) < 2:
            continue
        is_header = (not rows
                     and cells[0].lower().startswith("question")
                     and cells[1].lower().startswith("answer"))
        if is_header:
            continue
        rows.append(cells[:2])
    return pd.DataFrame(rows, columns=["Question", "Answer"])


# to_csv does not create missing directories.
os.makedirs(QA_OUTPUT_DIR, exist_ok=True)

for i, csv_data in enumerate(responses):
    # Scripts and images are numbered from 1, responses from 0.
    script_number = i + 1
    image_path = f"images_{figtype}_{datatype}/{figtype}_{datatype}_{script_number}.png"
    script_path = f"scripts_{figtype}_{datatype}/extracted_code_{figtype}_{script_number}.py"
    data_domain = DATA_DOMAIN

    df = parse_qa_csv(csv_data)
    if df.empty:
        print(f"Response {i}: no question/answer rows found, skipping")
        continue

    # Record which chart and script the questions belong to.
    df["image_path"] = image_path
    df["script_path"] = script_path
    df["data_domain"] = data_domain

    print(df.head())

    # The file number now matches the script/image number (the original used
    # the 0-based response index here).
    df.to_csv(f"{QA_OUTPUT_DIR}/{figtype}_{datatype}_{script_number}.csv")


# Save Generated Data

In [None]:
!cp -r ./image/ '/content/drive/My Drive/Code_Based_Synthetic_Data_Gen//'

In [None]:
!cp -r ./scripts/ '/content/drive/My Drive/Code_Based_Synthetic_Data_Gen/'

In [None]:
!cp -r ./question/ '/content/drive/My Drive/Code_Based_Synthetic_Data_Gen/'