In [2]:
%load_ext autoreload
%autoreload 2
import re
import json
from src.fnUtils import render_markdown
from src.genAIClient import GenerativeAIClient
from utils.file_ops import save_to_text

# Shared client instance; `complete` routes every model call in this notebook through it.
# NOTE(review): `show_info_log=False` presumably silences the client's info-level
# logging — confirm against src.genAIClient.
client = GenerativeAIClient(show_info_log=False)

In [6]:
def complete(prompt):
    """Send ``prompt`` to the shared generative-AI client and return its reply.

    Args:
        prompt: Prompt text, forwarded as the ``prompt`` keyword of
            ``client.invoke``.

    Returns:
        Whatever ``client.invoke`` returns (a plain string in the runs saved
        in this notebook).
    """
    return client.invoke(prompt=prompt)

# Smoke test: check that a trivial prompt makes the round trip through the client.
complete("Tell me an inspiring quote.")

'"The future belongs to those who believe in the beauty of their dreams." - Eleanor Roosevelt'

In [36]:
# Topic of the article to outline; the later JSON and YAML prompt cells
# interpolate this same variable.
article = "What is data engineering?"
# Build the outline prompt: the f-string injects the title, and the
# digital-marketing outline shows the model the numbered / lettered layout
# it should imitate.
base_prompt = f"""
Write a numbered, hierarchical outline for an article on "{article}"

Here is an example, of the structure:

1. Introduction
    a. Definition of digital marketing
2. Types of Digital Marketing
    a. Search Engine Optimization
    b. Social Media Marketing
    c. Content Marketing
    d. Pay-Per-Click Advertising
    e. Email Marketing
3. Benefits of Digital Marketing
    a. Cost-Effective
    b. Targeted Audience
    c. Measurable Results
    d. Increased Reach
"""
# Print the rendered prompt to verify the interpolation before sending it.
print(base_prompt)

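# Combine the two regex patterns (top-level sections and their sub-sections) so
# that the outline is parsed into a dictionary that maps each top-level heading
# to the list of its sub-headings (see `extract_sections`), for example:
# {
#     "1. Introduction": ["a. Definition of digital marketing"],
#     "3. Benefits of Digital Marketing": [
#         "a. Cost-Effective",
#         "b. Targeted Audience",
#         "c. Measurable Results",
#         "d. Increased Reach",
#     ],
# }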


Write a numbered, hierarchical outline for an article on "What is data engineering?"

Here is an example, of the structure:

1. Introduction
    a. Definition of digital marketing
2. Types of Digital Marketing
    a. Search Engine Optimization
    b. Social Media Marketing
    c. Content Marketing
    d. Pay-Per-Click Advertising
    e. Email Marketing
3. Benefits of Digital Marketing
    a. Cost-Effective
    b. Targeted Audience
    c. Measurable Results
    d. Increased Reach



In [37]:
# Request the outline from the model and print the raw text reply; `result`
# is the input that `extract_sections` parses afterwards.
result = complete(base_prompt)
print(result)

1. Definition of Data Engineering
2. Key Components of Data Engineering
    a. Data Integration
    b. Data Warehousing
    c. Data Analytics
3. Data Engineering Process
    a. Data Collection
    b. Data Cleaning
    c. Data Transformation
    d. Data Analysis
4. Benefits of Data Engineering
    a. Improved Data-Driven Decision-Making
    b. Enhanced Data Security
    c. Increased Data Accessibility
5. Data Engineering Tools and Technologies
    a. Data Management Platforms
    b. Data Warehousing Platforms
    c. Data Analytics Tools
6. Challenges in Data Engineering
    a. Data Inconsistency
    b. Data Complexity
    c. Resource Constraints
7. Conclusion


In [12]:
def extract_sections(outline_text):
    """Parse a numbered, hierarchical outline into a dictionary.

    The outline is scanned line by line:

    * a line that starts at column 0 with ``<digits>.`` (for example
      ``1. Introduction``) opens a new top-level section;
    * a line that starts, after optional indentation, with
      ``<lowercase letter>.`` (for example ``    a. Definition``) is a
      sub-heading of the most recently opened section.

    Matching whole lines (instead of searching anywhere in the text) keeps
    abbreviations inside a heading such as ``"1. Tools, e.g. Spark"`` from
    being reported as sub-headings, and keeps numbers inside preamble text
    such as ``"Outline v2.0"`` from being reported as sections.

    Args:
        outline_text: The outline as a single string.

    Returns:
        A dict that maps each top-level heading (whitespace-stripped) to the
        list of its sub-heading lines (whitespace-stripped, in document
        order). Lines before the first top-level heading and lines that match
        neither pattern are ignored. If the same heading text occurs twice,
        its sub-headings are collected under a single key. An outline without
        any top-level heading yields an empty dict.
    """
    top_level = re.compile(r'\d+\.\s*\S')
    sub_level = re.compile(r'\s*[a-z]\.\s*\S')

    sections = {}
    current_title = None
    for line in outline_text.splitlines():
        if top_level.match(line):
            current_title = line.strip()
            # setdefault keeps sub-headings already collected for a repeated title.
            sections.setdefault(current_title, [])
        elif current_title is not None and sub_level.match(line):
            sections[current_title].append(line.strip())
    return sections

# Parse the outline text into {heading: [sub-headings]} and show the result.
# NOTE(review): the output saved with this cell does not correspond to the outline
# printed by the previous cell (different headings, and the execution counts are out
# of order) — re-run with Restart Kernel -> Run All to refresh it.
print(extract_sections(result))

{'1. Definition of Data Engineering': ['a. Key concepts and principles'], '2. Core Responsibilities of Data Engineers': ['a. Data integration and transformation', 'b. Data warehousing and management', 'c. Data quality and governance'], '3. Data Engineering Process': ['a. Data collection and preparation', 'b. Data analysis and modeling', 'c. Data visualization and reporting'], '4. Tools and Technologies Used in Data Engineering': ['a. Data integration tools', 'b. Data warehousing platforms', 'c. Data analysis and visualization tools'], '5. Career Path and Skills Required for Data Engineers': ['a. Education and training requirements', 'b. Technical and analytical skills', 'c. Industry certifications'], '6. Importance of Data Engineering': ['a. Role in decision-making and business intelligence', 'b. Facilitating real-time data insights', 'c. Enabling innovation and competitive advantage']}


In [23]:
# Ask for the outline as JSON so the reply can be parsed with `json.loads`.
# The format example uses double-quoted strings and no trailing comma, so it
# follows JSON syntax itself instead of contradicting the "parsable JSON"
# requirement. Doubled braces are literal braces inside the f-string.
prompt = f"""Produce an article outline for "{article}" as JSON.

**Output format**:
{{
"top heading one": ["subheading_one", "subheading_two", ...],
"top heading two": ["subheading_one", "subheading_two", ...],
...
"top heading n": ["subheading_one", "subheading_two", ...]
}}

Remember that the output must be parsable JSON.
"""

# Raw model reply; it may still be wrapped in a Markdown code fence.
json_string = complete(prompt)

In [24]:
# Inspect the raw reply before parsing; the saved run shows it wrapped in a
# Markdown ```json code fence.
print(json_string)

```json
{
  "Introduction to Data Engineering": ["Definition", "Key Concepts", "Role in Data Management"],
  "Data Engineering Principles": ["Data Integration", "Data Transformation", "Data Quality Management"],
  "Data Engineering Tools and Technologies": ["Big Data Platforms (Hadoop, Spark, etc.)", "Data Warehousing Tools", "Data Integration Tools"],
  "Data Engineering Practices": ["Agile Data Engineering", "DevOps for Data Engineering", "Data Governance"],
  "Benefits of Data Engineering": ["Improved Data Quality", "Enhanced Data Accessibility", "Reduced Data Silos"],
  "Challenges in Data Engineering": ["Data Volume and Complexity", "Lack of Skilled Professionals", "Data Security Concerns"],
  "Future Trends in Data Engineering": ["Cloud-Based Data Engineering", "Machine Learning for Data Engineering", "Real-Time Data Engineering"],
  "Conclusion": ["Importance of Data Engineering", "Career Opportunities in Data Engineering", "Call to Action"]
}
```


In [26]:
# Remove the optional Markdown code fence (```json ... ```) around the reply.
# `str.strip(chars)` treats its argument as a *set of characters* to trim from
# both ends, not as a literal prefix/suffix, so the fence is matched explicitly.
# A reply without a fence is only trimmed of surrounding whitespace, which also
# makes this cell safe to re-run.
fence_match = re.fullmatch(
    r"\s*```(?:json)?\s*(.*?)\s*```\s*",
    json_string,
    re.DOTALL | re.IGNORECASE,
)
json_string = fence_match.group(1) if fence_match else json_string.strip()

# Parse the cleaned text into Python objects (raises json.JSONDecodeError if the
# model did not return valid JSON).
json_object = json.loads(json_string)

# Show the parsed structure
print(json_object)

{'Introduction to Data Engineering': ['Definition', 'Key Concepts', 'Role in Data Management'], 'Data Engineering Principles': ['Data Integration', 'Data Transformation', 'Data Quality Management'], 'Data Engineering Tools and Technologies': ['Big Data Platforms (Hadoop, Spark, etc.)', 'Data Warehousing Tools', 'Data Integration Tools'], 'Data Engineering Practices': ['Agile Data Engineering', 'DevOps for Data Engineering', 'Data Governance'], 'Benefits of Data Engineering': ['Improved Data Quality', 'Enhanced Data Accessibility', 'Reduced Data Silos'], 'Challenges in Data Engineering': ['Data Volume and Complexity', 'Lack of Skilled Professionals', 'Data Security Concerns'], 'Future Trends in Data Engineering': ['Cloud-Based Data Engineering', 'Machine Learning for Data Engineering', 'Real-Time Data Engineering'], 'Conclusion': ['Importance of Data Engineering', 'Career Opportunities in Data Engineering', 'Call to Action']}


In [28]:
!pip install pyyaml

import yaml



In [32]:
# Ask for the outline as YAML; the embedded example shows the expected keys
# (name, description, and a list of sections with title / content).
# NOTE(review): this reassigns `prompt`, the name the JSON prompt cell also uses —
# a distinct name would make out-of-order re-runs safer.
prompt = f"""Produce an article outline as a .yml file for {article}.

Always return valid YML.

**Output format**:
- name: Example YAML File
  description: This is an example YAML file.
  sections:
    - title: Introduction
      content: |
        This is the introduction.
    - title: Conclusion
      content: |
        This is the conclusion.
"""

# Send the prompt; the bare `text` expression displays the raw reply (as a
# Python string repr) as the cell output.
text = complete(prompt)
text

'name: What is data engineering?\ndescription: Data engineering is the process of building, maintaining, and optimizing data systems to meet the needs of an organization. This involves collecting, cleaning, and transforming data into a format that is usable for analysis and decision-making.\nsections:\n  - title: Introduction\n    content: |\n      Data engineering is a critical part of the modern data-driven organization. By building and maintaining efficient and reliable data systems, data engineers ensure that the organization has the data it needs to make informed decisions.\n  - title: The role of data engineers\n    content: |\n      Data engineers play a variety of roles in an organization, including:\n      - Collecting data from a variety of sources\n      - Cleaning and transforming data to make it usable for analysis\n      - Building and maintaining data pipelines\n      - Monitoring data quality\n      - Providing support to data analysts and other users of data\n  - title

In [33]:
# print() renders the newlines that the repr above shows as "\n", which makes
# the YAML indentation readable.
print(text)

name: What is data engineering?
description: Data engineering is the process of building, maintaining, and optimizing data systems to meet the needs of an organization. This involves collecting, cleaning, and transforming data into a format that is usable for analysis and decision-making.
sections:
  - title: Introduction
    content: |
      Data engineering is a critical part of the modern data-driven organization. By building and maintaining efficient and reliable data systems, data engineers ensure that the organization has the data it needs to make informed decisions.
  - title: The role of data engineers
    content: |
      Data engineers play a variety of roles in an organization, including:
      - Collecting data from a variety of sources
      - Cleaning and transforming data to make it usable for analysis
      - Building and maintaining data pipelines
      - Monitoring data quality
      - Providing support to data analysts and other users of data
  - title: The benefits 

In [34]:
# Parse the YAML reply into plain Python objects (dicts, lists, strings).
# SECURITY: `text` is model-generated, i.e. untrusted input, so it is parsed with
# `yaml.safe_load`, which only builds standard YAML types. This replaces the
# previous `yaml.load(text, Loader=yaml.FullLoader)`; the result is the same for
# plain outline data.
data = yaml.safe_load(text)

In [35]:
# Show the parsed structure to confirm the YAML loaded into the expected
# name / description / sections layout.
print(data)

{'name': 'What is data engineering?', 'description': 'Data engineering is the process of building, maintaining, and optimizing data systems to meet the needs of an organization. This involves collecting, cleaning, and transforming data into a format that is usable for analysis and decision-making.', 'sections': [{'title': 'Introduction', 'content': 'Data engineering is a critical part of the modern data-driven organization. By building and maintaining efficient and reliable data systems, data engineers ensure that the organization has the data it needs to make informed decisions.\n'}, {'title': 'The role of data engineers', 'content': 'Data engineers play a variety of roles in an organization, including:\n- Collecting data from a variety of sources\n- Cleaning and transforming data to make it usable for analysis\n- Building and maintaining data pipelines\n- Monitoring data quality\n- Providing support to data analysts and other users of data\n'}, {'title': 'The benefits of data enginee