Skip to content

Commit

Permalink
project and profile scaffolding with clean, compile, and run tasks
Browse files Browse the repository at this point in the history
  • Loading branch information
Christopher Merrick committed Mar 16, 2016
1 parent fd7567f commit b849af2
Show file tree
Hide file tree
Showing 15 changed files with 374 additions and 0 deletions.
22 changes: 22 additions & 0 deletions README.md
@@ -0,0 +1,22 @@
### dbt

A data build tool

#### installation

From the root directory of this repository,

```bash
› python setup.py develop
```

#### use

Create a `dbt_project.yml` file in the root of your source tree
following sample.dbt_project.yml.

`dbt compile` to generate runnable SQL from model files

`dbt run` to run model files on the current `run-target` database

`dbt clean` to clear compiled files
Empty file added dbt/__init__.py
Empty file.
33 changes: 33 additions & 0 deletions dbt/main.py
@@ -0,0 +1,33 @@
import argparse
import os.path
import dbt.project as project
import dbt.task.run as run_task
import dbt.task.compile as compile_task
import dbt.task.debug as debug_task
import dbt.task.clean as clean_task


def main(args):
if os.path.isfile('dbt_project.yml'):
proj = project.read_project('dbt_project.yml')
else:
proj = project.default_project()

p = argparse.ArgumentParser(prog='dbt: data build tool')
subs = p.add_subparsers()

sub = subs.add_parser('clean')
sub.set_defaults(cls=clean_task.CleanTask)

sub = subs.add_parser('run')
sub.set_defaults(cls=run_task.RunTask)

sub = subs.add_parser('compile')
sub.set_defaults(cls=compile_task.CompileTask)

sub = subs.add_parser('debug')
sub.set_defaults(cls=debug_task.DebugTask)

parsed = p.parse_args(args)

parsed.cls(args=parsed, project=proj).run()
84 changes: 84 additions & 0 deletions dbt/project.py
@@ -0,0 +1,84 @@
import os.path
import yaml
import pprint
import copy

default_cfg = {
'source-paths': ['model'],
'test-paths': ['test'],
'target-path': 'target',
'clean-targets': ['target'],
'outputs': {'default': {}},
'run-target': 'default',
}

default_profiles = ['user']


class Project:

def __init__(self, cfg, profiles, active_profile_names=[]):
self.cfg = default_cfg.copy()
self.cfg.update(cfg)
self.profiles = profiles
self.active_profile_names = active_profile_names

for profile_name in active_profile_names:
self.cfg.update(self.profiles[profile_name])

def __str__(self):
return pprint.pformat({'project': self.cfg, 'profiles': self.profiles})

def __repr__(self):
return self.__str__()

def __getitem__(self, key):
return self.cfg.__getitem__(key)

def __setitem__(self, key, value):
return self.cfg.__setitem__(key, value)

def run_environment(self):
target_name = self.cfg['run-target']
return self.cfg['outputs'][target_name]

def context(self):
target_cfg = self.run_environment()
filtered_target = copy.deepcopy(target_cfg)
filtered_target.pop('pass')
return {'env': target_cfg}

def with_profiles(self, profiles=[]):
return Project(
copy.deepcopy(self.cfg),
copy.deepcopy(self.profiles),
profiles)


def read_profiles():
profiles = {}
paths = [
os.path.join(os.path.expanduser('~'), '.dbt/profiles.yml')
]
for path in paths:
if os.path.isfile(path):
with open(path, 'r') as f:
m = yaml.safe_load(f)
profiles.update(m)

return profiles


def init_project(project_cfg):
profiles = read_profiles()
return Project(project_cfg, profiles, default_profiles)


def read_project(filename):
with open(filename, 'r') as f:
cfg = yaml.safe_load(f)
return init_project(cfg)


def default_project():
return init_project(default_cfg)
Empty file added dbt/task/__init__.py
Empty file.
28 changes: 28 additions & 0 deletions dbt/task/clean.py
@@ -0,0 +1,28 @@
import os.path
import os
import shutil


class CleanTask:

def __init__(self, args, project):
self.args = args
self.project = project

def __is_project_path(self, path):
proj_path = os.path.abspath('.')
return not os.path.commonprefix(
[proj_path, os.path.abspath(path)]
) == proj_path

def __is_protected_path(self, path):
abs_path = os.path.abspath(path)
protected_paths = self.project['source-paths'] + self.project['test-paths'] + ['.']

protected_abs_paths = [os.path.abspath for p in protected_paths]
return abs_path in set(protected_abs_paths) or self.__is_project_path(abs_path)

def run(self):
for path in self.project['clean-targets']:
if not self.__is_protected_path(path):
shutil.rmtree(path, True)
47 changes: 47 additions & 0 deletions dbt/task/compile.py
@@ -0,0 +1,47 @@
import pprint
import os
import fnmatch
import jinja2


class CompileTask:
def __init__(self, args, project):
self.args = args
self.project = project

def __src_index(self):
"""returns: {'model': ['pardot/model.sql', 'segment/model.sql']}
"""
indexed_files = {}

for source_path in self.project['source-paths']:
for root, dirs, files in os.walk(source_path):
for filename in files:
if fnmatch.fnmatch(filename, "*.sql"):
abs_path = os.path.join(root, filename)
rel_path = os.path.relpath(abs_path, source_path)
indexed_files.setdefault(source_path, []).append(rel_path)

return indexed_files

def __write(self, path, payload):
target_path = os.path.join(self.project['target-path'], path)

if not os.path.exists(os.path.dirname(target_path)):
os.makedirs(os.path.dirname(target_path))
elif os.path.exists(target_path):
print "Compiler overwrite of {}".format(target_path)

with open(target_path, 'w') as f:
f.write(payload)

def __compile(self, src_index):
for src_path, files in src_index.iteritems():
jinja = jinja2.Environment(loader=jinja2.FileSystemLoader(searchpath=src_path))
for f in files:
template = jinja.get_template(f)
self.__write(f, template.render(self.project.context()))

def run(self):
src_index = self.__src_index()
self.__compile(src_index)
12 changes: 12 additions & 0 deletions dbt/task/debug.py
@@ -0,0 +1,12 @@
import pprint


class DebugTask:
def __init__(self, args, project):
self.args = args
self.project = project

def run(self):
print "args: {}".format(self.args)
print "project: "
pprint.pprint(self.project)
73 changes: 73 additions & 0 deletions dbt/task/run.py
@@ -0,0 +1,73 @@
import pprint
import psycopg2
import os
import fnmatch


class RedshiftTarget:
def __init__(self, cfg):
assert cfg['type'] == 'redshift'
self.host = cfg['host']
self.user = cfg['user']
self.password = cfg['pass']
self.port = cfg['port']
self.dbname = cfg['dbname']
self.schema = cfg['schema']

def __get_spec(self):
return "dbname='{}' user='{}' host='{}' password='{}' port='{}'".format(
self.dbname,
self.user,
self.host,
self.password,
self.port
)

def get_handle(self):
return psycopg2.connect(self.__get_spec())


class RunTask:
def __init__(self, args, project):
self.args = args
self.project = project

def __compiled_files(self):
compiled_files = []
sql_path = self.project['target-path']

for root, dirs, files in os.walk(sql_path):
for filename in files:
if fnmatch.fnmatch(filename, "*.sql"):
abs_path = os.path.join(root, filename)
rel_path = os.path.relpath(abs_path, sql_path)
compiled_files.append(rel_path)

return compiled_files

def __get_target(self):
target_cfg = self.project.run_environment()
if target_cfg['type'] == 'redshift':
return RedshiftTarget(target_cfg)
else:
raise NotImplementedError("Unknown target type '{}'".format(target_cfg['type']))

def __create_schema(self):
target_cfg = self.project.run_environment()
target = self.__get_target()
with target.get_handle() as handle:
with handle.cursor() as cursor:
cursor.execute('create schema if not exists "{}"'.format(target_cfg['schema']))

def __execute_models(self):
target = self.__get_target()
with target.get_handle() as handle:
with handle.cursor() as cursor:
for f in self.__compiled_files():
with open(os.path.join(self.project['target-path'], f), 'r') as fh:
cursor.execute(fh.read())
print " {}".format(cursor.statusmessage)

def run(self):
self.__create_schema()
self.__execute_models()
4 changes: 4 additions & 0 deletions dev_requirements.txt
@@ -0,0 +1,4 @@
nose>=1.3.7
nosy>=1.1.2
mock>=1.3.0
pep8>=1.6.2
4 changes: 4 additions & 0 deletions requirements.txt
@@ -0,0 +1,4 @@
argparse
Jinja2>=2.8
PyYAML>=3.11
psycopg2==2.6.1
26 changes: 26 additions & 0 deletions sample.dbt_project.yml
@@ -0,0 +1,26 @@
# This is an annotated sample project configuration for reference.
# It attempts to show all possible configuration options.

# Compile configuration
source-paths: ["model"] # paths with source code to compile
target-path: "target" # path for compiled code
clean-targets: ["target"] # directories removed by the clean task

# Run configuration
# output environments
outputs:
my-redshift: # uniquely named
type: redshift # only type supported
host: localhost # any IP or fqdn
port: 5439
user: my_user
# best practice: specify outputs in your user profile
# located in ~/.dbt/profiles.yml to ensure credentials
# don't get checked in alongside your model code
password: password
dbname: dev
schema: my_model_schema
run-target: my_redshift

# Test configuration
test-paths: ["test"]
8 changes: 8 additions & 0 deletions scripts/dbt
@@ -0,0 +1,8 @@
#!/usr/bin/env python
import sys
import dbt.main
import logging

if __name__ == "__main__":
logging.basicConfig(level=logging.INFO)
dbt.main.main(sys.argv[1:])
21 changes: 21 additions & 0 deletions setup.py
@@ -0,0 +1,21 @@
#!/usr/bin/env python
from setuptools import setup, find_packages
import os.path

package_name = "dbt"
package_version = "0.1.0-SNAPSHOT"

setup(
name=package_name,
version=package_version,
packages=find_packages(),
scripts=[
'scripts/dbt',
],
install_requires=[
'argparse>=1.2.1',
'Jinja2>=2.8',
'PyYAML>=3.11',
'psycopg2==2.6.1',
],
)
12 changes: 12 additions & 0 deletions test/pep8_test.py
@@ -0,0 +1,12 @@
import unittest
import pep8


class TestCodeFormat(unittest.TestCase):

def test_pep8_conformance(self):
"""Test that we conform to PEP8."""
pep8style = pep8.StyleGuide(quiet=False)
pep8style.options.ignore = pep8style.options.ignore + ("E501",)
result = pep8style.check_files(['dbt', 'test'])
self.assertEqual(result.total_errors, 0, 'Found code style errors (and warnings).')

0 comments on commit b849af2

Please sign in to comment.