# Quick Start

In [1]:
import os

# When the kernel starts inside `notebooks/`, move up to the project root
started_in_notebooks = os.getcwd().endswith("notebooks")
if started_in_notebooks:
    os.chdir(os.pardir)

print(os.getcwd())

/home/cmcouto-silva/Projects/sqldeps


## API

In [2]:
import sqldeps

# Report which sqldeps release this notebook was executed with
version = sqldeps.__version__
print("sqldeps version:", version)

sqldeps version: 0.1.1


### Exploring SQL Extractor & SQLProfile

In [3]:
from sqldeps.llm_parsers import create_extractor

In [4]:
# `create_extractor` is a wrapper to instantiate the target SQL extractor
# Every extractor is derived from the base class `BaseSQLExtractor`
# The default extractor is `LiteLLMExtractor`, which allows specifying models from multiple providers

create_extractor?

[0;31mSignature:[0m
[0mcreate_extractor[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mframework[0m[0;34m:[0m [0mstr[0m [0;34m=[0m [0;34m'litellm'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmodel[0m[0;34m:[0m [0mstr[0m [0;34m|[0m [0;32mNone[0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mparams[0m[0;34m:[0m [0mdict[0m [0;34m|[0m [0;32mNone[0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mprompt_path[0m[0;34m:[0m [0mpathlib[0m[0;34m.[0m[0m_local[0m[0;34m.[0m[0mPath[0m [0;34m|[0m [0;32mNone[0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m [0;34m->[0m [0msqldeps[0m[0;34m.[0m[0mllm_parsers[0m[0;34m.[0m[0mbase[0m[0;34m.[0m[0mBaseSQLExtractor[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Create an appropriate SQL extractor based on the specified framework.

Args:
    framework: The LLM framework to use ("litellm", "groq", "openai", or "deepseek

In [5]:
# Build an extractor backed by OpenAI's GPT-4.1-mini
# (the OPENAI_API_KEY environment variable is expected to be set)
extractor = create_extractor(
    model="openai/gpt-4.1-mini",
)

In [6]:
# Extract dependencies and outputs from a SQL query.
# The query builds a CTE (`user_orders`) from `orders` joined to `users`,
# then materializes it as `transactions.user_order_summary`.
# NOTE(review): a CTE placed before `CREATE TABLE ... AS` is rejected by several
# SQL dialects (e.g. PostgreSQL expects `CREATE TABLE ... AS WITH ... SELECT`);
# confirm whether this example is meant to be executable SQL.
sql_query = """
WITH user_orders AS (
    SELECT o.user_id, COUNT(*) AS order_count
    FROM orders o
    JOIN users u ON o.user_id = u.id
    WHERE u.status = 'active'
    GROUP BY o.user_id
)

CREATE TABLE transactions.user_order_summary AS
SELECT * FROM user_orders;
"""
# `extract_from_query` takes the raw SQL text and returns a SQLProfile
sql_profile = extractor.extract_from_query(sql_query)

# Print the extracted dependencies and outputs
print(sql_profile)

SQLProfile(dependencies={'orders': ['user_id'], 'users': ['id', 'status']}, outputs={'transactions.user_order_summary': ['*']})


In [7]:
# The profile exposes its dependencies and outputs directly, and can also be
# converted to a plain dictionary
for label, value in (
    ("Dependencies:", sql_profile.dependencies),
    ("Outputs:", sql_profile.outputs),
    ("As dictionary:", sql_profile.to_dict()),
):
    print(label, value)

# Tabular view of the same information
print("As DataFrame:")
print(sql_profile.to_dataframe())

Dependencies: {'orders': ['user_id'], 'users': ['id', 'status']}
Outputs: {'transactions.user_order_summary': ['*']}
As dictionary: {'dependencies': {'orders': ['user_id'], 'users': ['id', 'status']}, 'outputs': {'transactions.user_order_summary': ['*']}}
As DataFrame:
         type        schema               table   column
0  dependency          None              orders  user_id
1  dependency          None               users       id
2  dependency          None               users   status
3     outcome  transactions  user_order_summary        *


### Extracting SQL profile from a file

In [8]:
# Show the signature and docstring of `extract_from_file`
extractor.extract_from_file?

[0;31mSignature:[0m [0mextractor[0m[0;34m.[0m[0mextract_from_file[0m[0;34m([0m[0mfile_path[0m[0;34m:[0m [0mstr[0m [0;34m|[0m [0mpathlib[0m[0;34m.[0m[0m_local[0m[0;34m.[0m[0mPath[0m[0;34m)[0m [0;34m->[0m [0msqldeps[0m[0;34m.[0m[0mmodels[0m[0;34m.[0m[0mSQLProfile[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Extract dependencies from a SQL file.

Args:
    file_path: Path to SQL file

Returns:
    SQLProfile object containing dependencies and outputs

Raises:
    FileNotFoundError: If file does not exist
[0;31mFile:[0m      ~/Projects/sqldeps/sqldeps/llm_parsers/base.py
[0;31mType:[0m      method

In [9]:
# Extract the SQL profile (dependencies and outputs) from a single SQL file
extractor.extract_from_file("data/examples/example.sql")

SQLProfile(dependencies={'orders': ['user_id'], 'users': ['id', 'status']}, outputs={'order_summary': [], 'transactions.user_order_summary': ['*']})

### Extracting SQL profile from a folder

In [10]:
# Show the signature and docstring of `extract_from_folder`
extractor.extract_from_folder?

[0;31mSignature:[0m
[0mextractor[0m[0;34m.[0m[0mextract_from_folder[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mfolder_path[0m[0;34m:[0m [0mstr[0m [0;34m|[0m [0mpathlib[0m[0;34m.[0m[0m_local[0m[0;34m.[0m[0mPath[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mrecursive[0m[0;34m:[0m [0mbool[0m [0;34m=[0m [0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmerge_sql_profiles[0m[0;34m:[0m [0mbool[0m [0;34m=[0m [0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mvalid_extensions[0m[0;34m:[0m [0mset[0m[0;34m[[0m[0mstr[0m[0;34m][0m [0;34m|[0m [0;32mNone[0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mn_workers[0m[0;34m:[0m [0mint[0m [0;34m=[0m [0;36m1[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mrpm[0m[0;34m:[0m [0mint[0m [0;34m=[0m [0;36m100[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0muse_cache[0m[0;34m:[0m [0mbool[0m [0;34m=[0m [0;32mTrue[0m[0;34m,[0m[0;34m[0m
[0;34m

In [11]:
# Folder extraction using every default option
extractor.extract_from_folder(
    folder_path="data/examples/folders_with_sql_files",
)

[32m2025-05-06 17:55:02.431[0m | [1mINFO    [0m | [36msqldeps.llm_parsers.base[0m:[36m_process_files_sequentially[0m:[36m180[0m - [1mCache usage: enabled[0m
[32m2025-05-06 17:55:02.432[0m | [1mINFO    [0m | [36msqldeps.llm_parsers.base[0m:[36m_process_files_sequentially[0m:[36m181[0m - [1mProcessing 2 SQL files sequentially with RPM: 100[0m
Processing SQL files:   0%|          | 0/2 [00:00<?, ?it/s][32m2025-05-06 17:55:02.434[0m | [1mINFO    [0m | [36msqldeps.cache[0m:[36mload_from_cache[0m:[36m99[0m - [1mLoading from cache: data/examples/folders_with_sql_files/example1.sql[0m
[32m2025-05-06 17:55:02.434[0m | [1mINFO    [0m | [36msqldeps.cache[0m:[36mload_from_cache[0m:[36m99[0m - [1mLoading from cache: data/examples/folders_with_sql_files/example2.sql[0m
Processing SQL files: 100%|██████████| 2/2 [00:00<00:00, 1973.33it/s]


{'data/examples/folders_with_sql_files/example1.sql': SQLProfile(dependencies={'users': ['id', 'name']}, outputs={}),
 'data/examples/folders_with_sql_files/example2.sql': SQLProfile(dependencies={'users': ['*']}, outputs={})}

In [12]:
# Extract from folder and merge the per-file results into a single SQLProfile
extractor.extract_from_folder(
    folder_path = "data/examples/folders_with_sql_files",
    merge_sql_profiles = True
)

[32m2025-05-06 17:55:02.441[0m | [1mINFO    [0m | [36msqldeps.llm_parsers.base[0m:[36m_process_files_sequentially[0m:[36m180[0m - [1mCache usage: enabled[0m
[32m2025-05-06 17:55:02.442[0m | [1mINFO    [0m | [36msqldeps.llm_parsers.base[0m:[36m_process_files_sequentially[0m:[36m181[0m - [1mProcessing 2 SQL files sequentially with RPM: 100[0m
Processing SQL files:   0%|          | 0/2 [00:00<?, ?it/s][32m2025-05-06 17:55:02.444[0m | [1mINFO    [0m | [36msqldeps.cache[0m:[36mload_from_cache[0m:[36m99[0m - [1mLoading from cache: data/examples/folders_with_sql_files/example1.sql[0m
[32m2025-05-06 17:55:02.444[0m | [1mINFO    [0m | [36msqldeps.cache[0m:[36mload_from_cache[0m:[36m99[0m - [1mLoading from cache: data/examples/folders_with_sql_files/example2.sql[0m
Processing SQL files: 100%|██████████| 2/2 [00:00<00:00, 1301.77it/s]


SQLProfile(dependencies={'users': ['*']}, outputs={})

In [13]:
# Clean up cache so the next extraction does not reuse previously cached results
from sqldeps.cache import cleanup_cache

cleanup_cache()

[32m2025-05-06 17:55:02.454[0m | [1mINFO    [0m | [36msqldeps.cache[0m:[36mcleanup_cache[0m:[36m126[0m - [1mRemoved cache directory: .sqldeps_cache[0m


True

In [14]:
# Recursive folder extraction using multiprocessing
extractor.extract_from_folder(
    folder_path="data/examples/folders_with_sql_files/",
    recursive=True,
    merge_sql_profiles=False,
    valid_extensions=None,
    n_workers=-1,  # -1 -> use all available cores
    rpm=100,  # at most 100 requests per minute
    use_cache=True,
    clear_cache=False,
)

[32m2025-05-06 17:55:02.472[0m | [1mINFO    [0m | [36msqldeps.parallel[0m:[36mprocess_files_in_parallel[0m:[36m168[0m - [1mProcessing 5 SQL files[0m
[32m2025-05-06 17:55:02.473[0m | [1mINFO    [0m | [36msqldeps.parallel[0m:[36mprocess_files_in_parallel[0m:[36m169[0m - [1mUsing 20 workers with global rate limit of 100 requests per minute[0m
[32m2025-05-06 17:55:02.473[0m | [1mINFO    [0m | [36msqldeps.parallel[0m:[36mprocess_files_in_parallel[0m:[36m172[0m - [1mCache usage: enabled[0m
[32m2025-05-06 17:55:02.536[0m | [34m[1mDEBUG   [0m | [36msqldeps.parallel[0m:[36mextract_with_rate_limit[0m:[36m85[0m - [34m[1mExtracting from file: data/examples/folders_with_sql_files/example2.sql[0m
[32m2025-05-06 17:55:02.537[0m | [34m[1mDEBUG   [0m | [36msqldeps.parallel[0m:[36mextract_with_rate_limit[0m:[36m85[0m - [34m[1mExtracting from file: data/examples/folders_with_sql_files/example1.sql[0m
[32m2025-05-06 17:55:02.541[0m | [34m

{'data/examples/folders_with_sql_files/example2.sql': SQLProfile(dependencies={'users': ['*']}, outputs={}),
 'data/examples/folders_with_sql_files/subfolder1/example3.sql': SQLProfile(dependencies={'my_db.users': ['id', 'name'], 'orders': ['order_id', 'user_id']}, outputs={}),
 'data/examples/folders_with_sql_files/subfolder2/subfolder2_1/example5.sql': SQLProfile(dependencies={'orders': ['user_id'], 'users': ['id', 'name']}, outputs={}),
 'data/examples/folders_with_sql_files/subfolder2/subfolder2_1/example4.sql': SQLProfile(dependencies={'my_db.users': ['email', 'id', 'name', 'status'], 'orders': ['order_date', 'order_id', 'order_type', 'priority_level', 'shipping_status', 'total_amount', 'user_id']}, outputs={}),
 'data/examples/folders_with_sql_files/example1.sql': SQLProfile(dependencies={'users': ['id', 'name']}, outputs={})}

In [15]:
# Recursive folder extraction using multiprocessing, this time with the
# files already present in the cache
extractor.extract_from_folder(
    folder_path="data/examples/folders_with_sql_files/",
    recursive=True,
    merge_sql_profiles=False,
    valid_extensions=None,
    n_workers=-1,  # -1 -> use all available cores
    rpm=100,  # at most 100 requests per minute
    use_cache=True,
    clear_cache=False,
)

[32m2025-05-06 17:55:05.238[0m | [1mINFO    [0m | [36msqldeps.parallel[0m:[36mprocess_files_in_parallel[0m:[36m168[0m - [1mProcessing 5 SQL files[0m
[32m2025-05-06 17:55:05.238[0m | [1mINFO    [0m | [36msqldeps.parallel[0m:[36mprocess_files_in_parallel[0m:[36m169[0m - [1mUsing 20 workers with global rate limit of 100 requests per minute[0m
[32m2025-05-06 17:55:05.239[0m | [1mINFO    [0m | [36msqldeps.parallel[0m:[36mprocess_files_in_parallel[0m:[36m172[0m - [1mCache usage: enabled[0m
[32m2025-05-06 17:55:05.303[0m | [1mINFO    [0m | [36msqldeps.cache[0m:[36mload_from_cache[0m:[36m99[0m - [1mLoading from cache: data/examples/folders_with_sql_files/example2.sql[0m
[32m2025-05-06 17:55:05.304[0m | [1mINFO    [0m | [36msqldeps.cache[0m:[36mload_from_cache[0m:[36m99[0m - [1mLoading from cache: data/examples/folders_with_sql_files/subfolder1/example3.sql[0m
[32m2025-05-06 17:55:05.308[0m | [1mINFO    [0m | [36msqldeps.parallel

{'data/examples/folders_with_sql_files/subfolder1/example3.sql': SQLProfile(dependencies={'my_db.users': ['id', 'name'], 'orders': ['order_id', 'user_id']}, outputs={}),
 'data/examples/folders_with_sql_files/example2.sql': SQLProfile(dependencies={'users': ['*']}, outputs={}),
 'data/examples/folders_with_sql_files/example1.sql': SQLProfile(dependencies={'users': ['id', 'name']}, outputs={}),
 'data/examples/folders_with_sql_files/subfolder2/subfolder2_1/example5.sql': SQLProfile(dependencies={'orders': ['user_id'], 'users': ['id', 'name']}, outputs={}),
 'data/examples/folders_with_sql_files/subfolder2/subfolder2_1/example4.sql': SQLProfile(dependencies={'my_db.users': ['email', 'id', 'name', 'status'], 'orders': ['order_date', 'order_id', 'order_type', 'priority_level', 'shipping_status', 'total_amount', 'user_id']}, outputs={})}

In [16]:
# Clean up cache again so the following rate-limited run starts without cached results
cleanup_cache()

[32m2025-05-06 17:55:05.359[0m | [1mINFO    [0m | [36msqldeps.cache[0m:[36mcleanup_cache[0m:[36m126[0m - [1mRemoved cache directory: .sqldeps_cache[0m


True

In [17]:
# Extract from folder recursively with multiprocessing, limited to max 3 requests per minute.
# Caching is enabled, but the cache was cleared in the previous cell, so the files are extracted again.
extractor.extract_from_folder(
    folder_path = "data/examples/folders_with_sql_files/",
    recursive = True,
    merge_sql_profiles = False,
    valid_extensions = None,
    n_workers = -1,   # use all available cores
    rpm = 3,        # Max 3 requests per minute
    use_cache = True,
    clear_cache = False,
)

[32m2025-05-06 17:55:05.369[0m | [1mINFO    [0m | [36msqldeps.parallel[0m:[36mprocess_files_in_parallel[0m:[36m168[0m - [1mProcessing 5 SQL files[0m
[32m2025-05-06 17:55:05.370[0m | [1mINFO    [0m | [36msqldeps.parallel[0m:[36mprocess_files_in_parallel[0m:[36m169[0m - [1mUsing 20 workers with global rate limit of 3 requests per minute[0m
[32m2025-05-06 17:55:05.370[0m | [1mINFO    [0m | [36msqldeps.parallel[0m:[36mprocess_files_in_parallel[0m:[36m172[0m - [1mCache usage: enabled[0m
[32m2025-05-06 17:55:05.449[0m | [34m[1mDEBUG   [0m | [36msqldeps.parallel[0m:[36mextract_with_rate_limit[0m:[36m85[0m - [34m[1mExtracting from file: data/examples/folders_with_sql_files/example2.sql[0m
[32m2025-05-06 17:55:05.452[0m | [34m[1mDEBUG   [0m | [36msqldeps.parallel[0m:[36mextract_with_rate_limit[0m:[36m85[0m - [34m[1mExtracting from file: data/examples/folders_with_sql_files/example1.sql[0m
[32m2025-05-06 17:55:05.453[0m | [34m[1

{'data/examples/folders_with_sql_files/example1.sql': SQLProfile(dependencies={'users': ['id', 'name']}, outputs={}),
 'data/examples/folders_with_sql_files/subfolder2/subfolder2_1/example4.sql': SQLProfile(dependencies={'my_db.users': ['email', 'id', 'name', 'status'], 'orders': ['order_date', 'order_id', 'order_type', 'priority_level', 'shipping_status', 'total_amount', 'user_id']}, outputs={}),
 'data/examples/folders_with_sql_files/example2.sql': SQLProfile(dependencies={'users': ['*']}, outputs={}),
 'data/examples/folders_with_sql_files/subfolder2/subfolder2_1/example5.sql': SQLProfile(dependencies={'orders': ['user_id'], 'users': ['id', 'name']}, outputs={}),
 'data/examples/folders_with_sql_files/subfolder1/example3.sql': SQLProfile(dependencies={'my_db.users': ['id', 'name'], 'orders': ['order_id', 'user_id']}, outputs={})}

### SQL Profile visualization

In [18]:
from sqldeps.visualization import visualize_sql_dependencies

In [19]:
# Gather the SQL profiles of every file under the examples folder (recursive scan)
sql_profiles = extractor.extract_from_folder(
    "data/examples/folders_with_sql_files/",
    recursive=True,
)

# Build the dependency figure and set its height to 800
fig = visualize_sql_dependencies(sql_profiles)
fig.update_layout(height=800)

[32m2025-05-06 17:56:07.081[0m | [1mINFO    [0m | [36msqldeps.llm_parsers.base[0m:[36m_process_files_sequentially[0m:[36m180[0m - [1mCache usage: enabled[0m
[32m2025-05-06 17:56:07.082[0m | [1mINFO    [0m | [36msqldeps.llm_parsers.base[0m:[36m_process_files_sequentially[0m:[36m181[0m - [1mProcessing 5 SQL files sequentially with RPM: 100[0m
Processing SQL files:   0%|          | 0/5 [00:00<?, ?it/s][32m2025-05-06 17:56:07.084[0m | [1mINFO    [0m | [36msqldeps.cache[0m:[36mload_from_cache[0m:[36m99[0m - [1mLoading from cache: data/examples/folders_with_sql_files/example1.sql[0m
[32m2025-05-06 17:56:07.084[0m | [1mINFO    [0m | [36msqldeps.cache[0m:[36mload_from_cache[0m:[36m99[0m - [1mLoading from cache: data/examples/folders_with_sql_files/example2.sql[0m
[32m2025-05-06 17:56:07.084[0m | [1mINFO    [0m | [36msqldeps.cache[0m:[36mload_from_cache[0m:[36m99[0m - [1mLoading from cache: data/examples/folders_with_sql_files/subfolder

## CLI

In [20]:
%%bash

# Show the installed SQLDeps CLI version
sqldeps --version

SQLDeps version: 0.1.1


In [21]:
%%bash

# Show the top-level CLI usage, options and commands
sqldeps --help

[1m                                                                                [0m
[1m [0m[1;33mUsage: [0m[1msqldeps [OPTIONS] COMMAND [ARGS]...[0m[1m                                    [0m[1m [0m
[1m                                                                                [0m
 SQL Dependency Extractor - Analyze SQL files to extract table and column       
 dependencies                                                                   
                                                                                
                                                                                
[2m╭─[0m[2m Options [0m[2m─────────────────────��─────────────────────────────────────────────[0m[2m─╮[0m
[2m│[0m [1;36m-[0m[1;36m-version[0m                     Show the version and exit.                     [2m│[0m
[2m│[0m [1;36m-[0m[1;36m-install[0m[1;36m-completion[0m          Install completion for the current shell.      [2m│[0m
[2m│[0m 

In [22]:
%%bash

# Show the usage, arguments and options of the `extract` command
sqldeps extract --help

[1m                                                                                [0m
[1m [0m[1;33mUsage: [0m[1msqldeps extract [OPTIONS] FPATH[0m[1m                                        [0m[1m [0m
[1m                                                                                [0m
 Extract SQL dependencies from file or folder.                                  
                                                                                
 [2mThis tool analyzes SQL files to identify table and column dependencies, [0m       
 [2moptionally validating them against a real database schema.[0m                     
                                                                                
[2m╭─[0m[2m Arguments [0m[2m───────────────────────────────────────────────���─────────────────[0m[2m─╮[0m
[2m│[0m [31m*[0m    fpath      [1;33mPATH[0m  SQL file or directory path [2m[default: None][0m [2;31m[required][0m  [2m│[0m
[2m╰─────────────────────

In [23]:
%%bash

# Extract from file (no output path is given; the log below reports the result saved to dependencies.json)
sqldeps extract data/examples/example.sql

2025-05-06 17:56:14.368 | INFO     | sqldeps.cli:extract_dependencies:74 - Extracting dependencies from file: /home/cmcouto-silva/Projects/sqldeps/data/examples/example.sql
2025-05-06 17:56:15.077 | SUCCESS  | sqldeps.cli:save_output:183 - Saved to JSON: dependencies.json


In [24]:
%%bash

# Extract from folder recursively & return CSV (accepts JSON and CSV as output formats)
sqldeps extract data/examples/folders_with_sql_files --recursive -o project_sql_profiles.csv

2025-05-06 17:56:16.979 | INFO     | sqldeps.cli:extract_dependencies:74 - Extracting dependencies from folder: /home/cmcouto-silva/Projects/sqldeps/data/examples/folders_with_sql_files
2025-05-06 17:56:16.979 | INFO     | sqldeps.llm_parsers.base:_process_files_sequentially:180 - Cache usage: enabled
2025-05-06 17:56:16.979 | INFO     | sqldeps.llm_parsers.base:_process_files_sequentially:181 - Processing 5 SQL files sequentially with RPM: 100
Processing SQL files:   0%|          | 0/5 [00:00<?, ?it/s]2025-05-06 17:56:16.980 | INFO     | sqldeps.cache:load_from_cache:99 - Loading from cache: /home/cmcouto-silva/Projects/sqldeps/data/examples/folders_with_sql_files/example1.sql
2025-05-06 17:56:16.980 | INFO     | sqldeps.cache:load_from_cache:99 - Loading from cache: /home/cmcouto-silva/Projects/sqldeps/data/examples/folders_with_sql_files/example2.sql
2025-05-06 17:56:16.980 | INFO     | sqldeps.cache:load_from_cache:99 - Loading from cache: /home/cmcouto-silva/Projects/sqldeps/data/

In [25]:
%%bash

# Clear the SQLDeps cache (the log below reports removal of the .sqldeps_cache directory)
sqldeps cache clear

2025-05-06 17:56:18.748 | INFO     | sqldeps.cli:cache_clear:379 - Clearing SQLDeps cache...
2025-05-06 17:56:18.748 | INFO     | sqldeps.cache:cleanup_cache:126 - Removed cache directory: .sqldeps_cache
2025-05-06 17:56:18.748 | SUCCESS  | sqldeps.cli:cache_clear:382 - Cache cleared successfully
