In [1]:
import openai
import os
from util import get_api_key
# The key is supplied by the project's `util.get_api_key` helper, so no secret literal appears in this notebook.
openai.api_key = get_api_key()

In [2]:
# Directory (relative to this notebook) that holds the schema files to document.
SCHEMA_DIR = '../schemas'
# Names only (no directory prefix); os.listdir returns them in arbitrary order.
schema_files = os.listdir(SCHEMA_DIR)

In [3]:
# Maps schema file name -> full text of that file; filled in by the loading loop.
all_schemas = dict()

In [4]:
# Read every schema file into memory, keyed by its bare file name
# (e.g. 'players.sql'), so later cells can look schemas up by name.
for schema_name in schema_files:
    schema_path = os.path.join('../schemas', schema_name)
    # The context manager closes the handle as soon as the text is read,
    # even if read() raises; the previous bare open() leaked one handle per file.
    with open(schema_path, 'r') as schema_file:
        all_schemas[schema_name] = schema_file.read()

In [5]:
# System message: sets the persona the model should adopt for this task.
system_prompt = (
    "You are a data engineer looking to create documentation "
    "and example queries for your data sets"
)

In [6]:
# User message: embeds the text of two loaded schemas -- 'players.sql' (the
# cumulative target table) and 'player_seasons.sql' (the source it is built
# from) -- followed by the documentation requirements.
# A KeyError here means one of those files was not found in all_schemas.
# NOTE: the indentation and line breaks inside the literal are part of the
# prompt text that is sent to the model, so do not reformat it casually.
user_prompt = f"""Using cumulative table input schema {all_schemas['players.sql']}
                 Generate a pipeline documentation in markdown 
                    that shows how this is generated from 
                {all_schemas['player_seasons.sql']}
                make sure to include example queries that use the season stats array
                make sure to document all columns with column comments
                make sure to document all created types as well
            """

In [7]:
# Echo both prompts, one after the other, so the exact text used in the
# request can be inspected in the notebook output.
print(system_prompt, user_prompt, sep="\n")

You are a data engineer looking to create documentation and example queries for your data sets
Using cumulative table input schema  CREATE TYPE season_stats AS (
                         season Integer,
                         pts REAL,
                         ast REAL,
                         reb REAL,
                         weight INTEGER
                       );
 CREATE TYPE scorer_class AS
     ENUM ('bad', 'average', 'good', 'star');


 CREATE TABLE players (
     player_name TEXT,
     height TEXT,
     college TEXT,
     country TEXT,
     draft_year TEXT,
     draft_round TEXT,
     draft_number TEXT,
     seasons season_stats[],
     scoring_class scorer_class,
     is_active BOOLEAN,
     current_season INTEGER,
     PRIMARY KEY (player_name, current_season)
 );




                 Generate a pipeline documentation in markdown 
                    that shows how this is generated from 
                CREATE TABLE public.player_seasons (
    player_name text NOT NULL,


In [8]:
# Assemble the conversation first, then submit it in a single request.
chat_messages = [
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": user_prompt},
]
response = openai.chat.completions.create(
    model="gpt-4",
    messages=chat_messages,
    temperature=0,  # lowest sampling temperature, to keep the output as repeatable as possible
)
# Keep only the text of the first returned choice.
answer = response.choices[0].message.content

In [9]:
# print() renders the markdown's line breaks; a bare `answer` expression would display the escaped repr instead.
print(answer)

# Pipeline Documentation

This pipeline is designed to transform data from the `public.player_seasons` table into the `players` table. The `players` table is a more complex structure that includes custom types and arrays to better organize and represent the data.

## Source Table

The source table `public.player_seasons` has the following schema:

| Column Name | Data Type | Description |
|-------------|-----------|-------------|
| player_name | text | The name of the player |
| age | integer | The age of the player |
| height | text | The player's height |
| weight | integer | The player's weight |
| college | text | The college the player attended |
| country | text | The country the player is from |
| draft_year | text | The year the player was drafted |
| draft_round | text | The round the player was drafted in |
| draft_number | text | The number the player was drafted at |
| gp | real | Games played by the player |
| pts | real | Points scored by the player |
| reb | real | Rebou

In [10]:
# Persist the generated documentation so it can be read outside the notebook.
# makedirs(exist_ok=True) creates the folder if needed and is a no-op when it
# already exists, replacing the racy "check, then mkdir" pair.
os.makedirs('output', exist_ok=True)
# Explicit UTF-8: the model's text may contain non-ASCII characters, which the
# platform default encoding (e.g. cp1252 on Windows) could fail to encode.
# Mode 'w' overwrites any documentation left by a previous run.
with open('output/documentation.md', 'w', encoding='utf-8') as doc_file:
    doc_file.write(answer)