In [1]:
# 1. Import all the relevant dbt packages

# Question: what's the difference between
# from dbt import project
# import dbt.project
# Apart from the fact that you reference them differently?

import dbt
import dbt.project
import dbt.config
import dbt.parser
import dbt.compilation


In [2]:
# 2. Generate a list of model fqns to be checked against 
project = dbt.project.read_project("dbt_project.yml")

model1 = dbt.parser.get_fqn('models/adwords/adwords_ads.sql', project)
model2 = dbt.parser.get_fqn('models/snowplow/base/snowplow_events.sql', project)

model_fqns = [model1, model2]
model_fqns

# To do: Build a get_nodes function to generate a list of nodes, and then the fqns
# This should also consider external packages
# Note: I couldn't get this to work - see below

[['my_package_name', 'models', 'adwords', 'adwords_ads'],
 ['my_package_name', 'models', 'snowplow', 'base', 'snowplow_events']]

In [13]:
# 3. Generate a list of PQNs (partially qualified names?) for each model path specified in the project yml file

ModelConfigKeys = [
        'schema',
        'enabled',
        'materialized',
        'dist',
        'sort',
        'sql_where',
        'unique_key',
        'sort_type',
        'bind',
        'vars',
        'pre-hook',
        'post-hook'
    ]
# Is this list exhaustive? Based on these lists: https://github.com/fishtown-analytics/dbt/blob/a5d17a30c774e9b967293eef2d94d755018dc57e/dbt/model.py#L14

# model_config_pqns = []

# 

# General approach:
# a. Traverse each path in the project_yml['models'] file
# b. Stop when you hit one of the above keywords
# c. And add the list of preceding to the model_config_pqns list

def get_model_config_pqn( config_models, pqn = None ,model_config_pqns = None):
    if pqn is None:
        pqn = []
    # could also write `pqn = pqn or []`
    if model_config_pqns is None:
        model_config_pqns = []
    # why can't I have model_config_pqns = [] here?
    for k,v in config_models.items():
        # If the next level is a dictionary
        if isinstance(v,dict):
            # If the key is a config key, add the list of keys to the model_config_pqns list
            # base case - when you get to a model config key
            if k in ModelConfigKeys: 
                if pqn not in model_config_pqns and pqn:
                    model_config_pqns.append(pqn)
            # Else, keep iterating
            # recursive case
            else:
                get_model_config_pqn( v, pqn + [k], model_config_pqns)
        # If you've reached the end of the path, add the path
        # base case - when you reach the end of the dictionary
        else:
            if pqn not in model_config_pqns and pqn:
                model_config_pqns.append(pqn)

    return model_config_pqns
            
model_config_pqns = get_model_config_pqn(project['models']) 
model_config_pqns


[['my_package_name', 'adwords'],
 ['my_package_name', 'adwords', 'adwords_ads'],
 ['my_package_name', 'snowplow', 'base', 'snowplow_events']]

In [11]:
def get_model_config_pqn( config_models, pqn = None ,model_config_pqns = None):
    if pqn is None:
        pqn = []
    # could also write `pqn = pqn or []`
    if model_config_pqns is None:
        model_config_pqns = []
    
    if not isinstance(config_models, dict):
        return model_config_pqns
        
    for k,v in config_models.items():
        # If the next level is a dictionary
        # If the key is a config key, add the list of keys to the model_config_pqns list
        # base case - when you get to a model config key
        if k in ModelConfigKeys: 
            if pqn not in model_config_pqns and pqn:
                pass
#                 model_config_pqns.append(pqn)
        # Else, keep iterating
        # recursive case
        else:
            return model_config_pqns + get_model_config_pqn( v, pqn + [k], model_config_pqns)bra
#             get_model_config_pqn( v, pqn + [k], model_config_pqns)
    # If you've reached the end of the path, add the path
        # base case - when you reach the end of the dictionary

    return model_config_pqns
            
model_config_pqns = get_model_config_pqn(project['models']) 
model_config_pqns

True

In [6]:
# 4. Build a function that checks where a pqn exists in a pqn (even if it has other items in between)
# Note: I'm not confident that this is the best way to write this loop

def is_pqn_in_fqn(pqn, fqn):
    for item in pqn:
        # there's a better word than "item" here...
        # check that the current directory exists in the fqn
        if item in fqn:
            # if it does, then update the fqn to that it now only contains item after that item
            fqn = fqn[fqn.index(item)+1:]
        else:
            # if it doesn't then return false and exit the loop
            return False
            break
    # if the loop doesn't get broken, turn True
    return True
        
# these should return true
print(is_pqn_in_fqn(['a', 'b'], ['a', 'b', 'c', 'd']))
print(is_pqn_in_fqn(['a', 'c'], ['a', 'b', 'c', 'd']))
print(is_pqn_in_fqn(['b', 'd'], ['a', 'b', 'c', 'd']))

# these should return false
print(is_pqn_in_fqn(['a', 'b', 'e'], ['a', 'b', 'c', 'd']))
print(is_pqn_in_fqn(['a', 'a', 'b'], ['a', 'b', 'c', 'd']))
print(is_pqn_in_fqn(['b', 'a'], ['a', 'b', 'c', 'd']))

True
True
True
False
False
False


In [7]:
# 5. Check if each pqn is valid, by checking whether it matches at least one fqn (using the above function)
# Note: What if it matches two? Is that ok?

def check_config_pqns(model_config_pqns, model_fqns):
    for pqn in model_config_pqns:
        for fqn in model_fqns:
            if is_pqn_in_fqn(pqn, fqn):
                print(":WOO: Your config " + str(pqn) + " is valid")
                break
            print("Your config " + str(pqn) + " doesn't point to a model")
check_config_pqns(model_config_pqns, model_fqns)

:WOO: Your config ['my_package_name', 'adwords'] is valid
:WOO: Your config ['my_package_name', 'adwords', 'adwords_ads'] is valid
Your config ['my_package_name', 'snowplow', 'base', 'snowplow_events'] doesn't point to a model
:WOO: Your config ['my_package_name', 'snowplow', 'base', 'snowplow_events'] is valid


In [None]:
# This part was me mucking around to try and generate the model_fqns by first returning the nodes... didn't quite get there.

results = dbt.compilation.Compiler(project)

# for each node, get_fqn
def get_nodes(package_name, root_project, all_projects, root_dir,
                       relative_dirs, resource_type):
    extension = "[!.#~]*.sql"

    if dbt.flags.STRICT_MODE:
        dbt.contracts.project.validate_list(all_projects)

    file_matches = dbt.clients.system.find_matching(
        root_dir,
        relative_dirs,
        extension)

    result = []

    for file_match in file_matches:
        file_contents = dbt.clients.system.load_file_contents(
            file_match.get('absolute_path'))

        parts = dbt.utils.split_path(file_match.get('relative_path', ''))
        name, _ = os.path.splitext(parts[-1])

        if resource_type == NodeType.Test:
            path = dbt.utils.get_pseudo_test_path(
                name, file_match.get('relative_path'), 'data_test')
        elif resource_type == NodeType.Analysis:
            path = os.path.join('analysis', file_match.get('relative_path'))
        else:
            path = file_match.get('relative_path')

        original_file_path = os.path.join(
            file_match.get('searched_path'),
            path)

        result.append({
            'name': name,
            'root_path': root_dir,
            'resource_type': resource_type,
            'path': path,
            'original_file_path': original_file_path,
            'package_name': package_name,
            'raw_sql': file_contents
        })

package_name='my_package_name',
root_project=project,
all_projects=results.get_all_projects(),
root_dir=project.get('project-root'),
relative_dirs=project.get('source-paths', []),
resource_type=NodeType.Model

nodes = get_nodes(package_name, 
                  root_project, 
                  all_projects, 
                  root_dir,
                  relative_dirs,
                  resource_type)
# for node in nodes:
#     print(node)