project and profile scaffolding with clean, compile, and run tasks

dbt-labs · Mar 16, 2016 · b849af2 · b849af2
1 parent fd7567f
commit b849af2
Show file tree

Hide file tree

Showing 15 changed files with 374 additions and 0 deletions.
diff --git a/README.md b/README.md
@@ -0,0 +1,22 @@
+### dbt
+
+A data build tool
+
+#### installation
+
+From the root directory of this repository,
+
+```bash
+› python setup.py develop
+```
+
+#### use
+
+Create a `dbt_project.yml` file in the root of your source tree
+following sample.dbt_project.yml.
+
+`dbt compile` to generate runnable SQL from model files
+
+`dbt run` to run model files on the current `run-target` database
+
+`dbt clean` to clear compiled files
diff --git a/dbt/__init__.py b/dbt/__init__.py
diff --git a/dbt/main.py b/dbt/main.py
@@ -0,0 +1,33 @@
+import argparse
+import os.path
+import dbt.project as project
+import dbt.task.run as run_task
+import dbt.task.compile as compile_task
+import dbt.task.debug as debug_task
+import dbt.task.clean as clean_task
+
+
+def main(args):
+    if os.path.isfile('dbt_project.yml'):
+        proj = project.read_project('dbt_project.yml')
+    else:
+        proj = project.default_project()
+
+    p = argparse.ArgumentParser(prog='dbt: data build tool')
+    subs = p.add_subparsers()
+
+    sub = subs.add_parser('clean')
+    sub.set_defaults(cls=clean_task.CleanTask)
+
+    sub = subs.add_parser('run')
+    sub.set_defaults(cls=run_task.RunTask)
+
+    sub = subs.add_parser('compile')
+    sub.set_defaults(cls=compile_task.CompileTask)
+
+    sub = subs.add_parser('debug')
+    sub.set_defaults(cls=debug_task.DebugTask)
+
+    parsed = p.parse_args(args)
+
+    parsed.cls(args=parsed, project=proj).run()
diff --git a/dbt/project.py b/dbt/project.py
@@ -0,0 +1,84 @@
+import os.path
+import yaml
+import pprint
+import copy
+
+default_cfg = {
+    'source-paths': ['model'],
+    'test-paths': ['test'],
+    'target-path': 'target',
+    'clean-targets': ['target'],
+    'outputs': {'default': {}},
+    'run-target': 'default',
+}
+
+default_profiles = ['user']
+
+
+class Project:
+
+    def __init__(self, cfg, profiles, active_profile_names=[]):
+        self.cfg = default_cfg.copy()
+        self.cfg.update(cfg)
+        self.profiles = profiles
+        self.active_profile_names = active_profile_names
+
+        for profile_name in active_profile_names:
+            self.cfg.update(self.profiles[profile_name])
+
+    def __str__(self):
+        return pprint.pformat({'project': self.cfg, 'profiles': self.profiles})
+
+    def __repr__(self):
+        return self.__str__()
+
+    def __getitem__(self, key):
+        return self.cfg.__getitem__(key)
+
+    def __setitem__(self, key, value):
+        return self.cfg.__setitem__(key, value)
+
+    def run_environment(self):
+        target_name = self.cfg['run-target']
+        return self.cfg['outputs'][target_name]
+
+    def context(self):
+        target_cfg = self.run_environment()
+        filtered_target = copy.deepcopy(target_cfg)
+        filtered_target.pop('pass')
+        return {'env': target_cfg}
+
+    def with_profiles(self, profiles=[]):
+        return Project(
+            copy.deepcopy(self.cfg),
+            copy.deepcopy(self.profiles),
+            profiles)
+
+
+def read_profiles():
+    profiles = {}
+    paths = [
+        os.path.join(os.path.expanduser('~'), '.dbt/profiles.yml')
+    ]
+    for path in paths:
+        if os.path.isfile(path):
+            with open(path, 'r') as f:
+                m = yaml.safe_load(f)
+                profiles.update(m)
+
+    return profiles
+
+
+def init_project(project_cfg):
+    profiles = read_profiles()
+    return Project(project_cfg, profiles, default_profiles)
+
+
+def read_project(filename):
+    with open(filename, 'r') as f:
+        cfg = yaml.safe_load(f)
+        return init_project(cfg)
+
+
+def default_project():
+    return init_project(default_cfg)
diff --git a/dbt/task/__init__.py b/dbt/task/__init__.py
diff --git a/dbt/task/clean.py b/dbt/task/clean.py
@@ -0,0 +1,28 @@
+import os.path
+import os
+import shutil
+
+
+class CleanTask:
+
+    def __init__(self, args, project):
+        self.args = args
+        self.project = project
+
+    def __is_project_path(self, path):
+        proj_path = os.path.abspath('.')
+        return not os.path.commonprefix(
+            [proj_path, os.path.abspath(path)]
+        ) == proj_path
+
+    def __is_protected_path(self, path):
+        abs_path = os.path.abspath(path)
+        protected_paths = self.project['source-paths'] + self.project['test-paths'] + ['.']
+
+        protected_abs_paths = [os.path.abspath for p in protected_paths]
+        return abs_path in set(protected_abs_paths) or self.__is_project_path(abs_path)
+
+    def run(self):
+        for path in self.project['clean-targets']:
+            if not self.__is_protected_path(path):
+                shutil.rmtree(path, True)
diff --git a/dbt/task/compile.py b/dbt/task/compile.py
@@ -0,0 +1,47 @@
+import pprint
+import os
+import fnmatch
+import jinja2
+
+
+class CompileTask:
+    def __init__(self, args, project):
+        self.args = args
+        self.project = project
+
+    def __src_index(self):
+        """returns: {'model': ['pardot/model.sql', 'segment/model.sql']}
+        """
+        indexed_files = {}
+
+        for source_path in self.project['source-paths']:
+            for root, dirs, files in os.walk(source_path):
+                for filename in files:
+                    if fnmatch.fnmatch(filename, "*.sql"):
+                        abs_path = os.path.join(root, filename)
+                        rel_path = os.path.relpath(abs_path, source_path)
+                        indexed_files.setdefault(source_path, []).append(rel_path)
+
+        return indexed_files
+
+    def __write(self, path, payload):
+        target_path = os.path.join(self.project['target-path'], path)
+
+        if not os.path.exists(os.path.dirname(target_path)):
+            os.makedirs(os.path.dirname(target_path))
+        elif os.path.exists(target_path):
+            print "Compiler overwrite of {}".format(target_path)
+
+        with open(target_path, 'w') as f:
+            f.write(payload)
+
+    def __compile(self, src_index):
+        for src_path, files in src_index.iteritems():
+            jinja = jinja2.Environment(loader=jinja2.FileSystemLoader(searchpath=src_path))
+            for f in files:
+                template = jinja.get_template(f)
+                self.__write(f, template.render(self.project.context()))
+
+    def run(self):
+        src_index = self.__src_index()
+        self.__compile(src_index)
diff --git a/dbt/task/debug.py b/dbt/task/debug.py
@@ -0,0 +1,12 @@
+import pprint
+
+
+class DebugTask:
+    def __init__(self, args, project):
+        self.args = args
+        self.project = project
+
+    def run(self):
+        print "args: {}".format(self.args)
+        print "project: "
+        pprint.pprint(self.project)
diff --git a/dbt/task/run.py b/dbt/task/run.py
@@ -0,0 +1,73 @@
+import pprint
+import psycopg2
+import os
+import fnmatch
+
+
+class RedshiftTarget:
+    def __init__(self, cfg):
+        assert cfg['type'] == 'redshift'
+        self.host = cfg['host']
+        self.user = cfg['user']
+        self.password = cfg['pass']
+        self.port = cfg['port']
+        self.dbname = cfg['dbname']
+        self.schema = cfg['schema']
+
+    def __get_spec(self):
+        return "dbname='{}' user='{}' host='{}' password='{}' port='{}'".format(
+            self.dbname,
+            self.user,
+            self.host,
+            self.password,
+            self.port
+        )
+
+    def get_handle(self):
+        return psycopg2.connect(self.__get_spec())
+
+
+class RunTask:
+    def __init__(self, args, project):
+        self.args = args
+        self.project = project
+
+    def __compiled_files(self):
+        compiled_files = []
+        sql_path = self.project['target-path']
+
+        for root, dirs, files in os.walk(sql_path):
+            for filename in files:
+                if fnmatch.fnmatch(filename, "*.sql"):
+                    abs_path = os.path.join(root, filename)
+                    rel_path = os.path.relpath(abs_path, sql_path)
+                    compiled_files.append(rel_path)
+
+        return compiled_files
+
+    def __get_target(self):
+        target_cfg = self.project.run_environment()
+        if target_cfg['type'] == 'redshift':
+            return RedshiftTarget(target_cfg)
+        else:
+            raise NotImplementedError("Unknown target type '{}'".format(target_cfg['type']))
+
+    def __create_schema(self):
+        target_cfg = self.project.run_environment()
+        target = self.__get_target()
+        with target.get_handle() as handle:
+            with handle.cursor() as cursor:
+                cursor.execute('create schema if not exists "{}"'.format(target_cfg['schema']))
+
+    def __execute_models(self):
+        target = self.__get_target()
+        with target.get_handle() as handle:
+            with handle.cursor() as cursor:
+                for f in self.__compiled_files():
+                    with open(os.path.join(self.project['target-path'], f), 'r') as fh:
+                        cursor.execute(fh.read())
+                    print "         {}".format(cursor.statusmessage)
+
+    def run(self):
+        self.__create_schema()
+        self.__execute_models()
diff --git a/dev_requirements.txt b/dev_requirements.txt
@@ -0,0 +1,4 @@
+nose>=1.3.7
+nosy>=1.1.2
+mock>=1.3.0
+pep8>=1.6.2
diff --git a/requirements.txt b/requirements.txt
@@ -0,0 +1,4 @@
+argparse
+Jinja2>=2.8
+PyYAML>=3.11
+psycopg2==2.6.1
diff --git a/sample.dbt_project.yml b/sample.dbt_project.yml
@@ -0,0 +1,26 @@
+# This is an annotated sample project configuration for reference.
+# It attempts to show all possible configuration options.
+
+# Compile configuration
+source-paths: ["model"]   # paths with source code to compile
+target-path: "target"     # path for compiled code
+clean-targets: ["target"] # directories removed by the clean task
+
+# Run configuration
+# output environments
+outputs:
+  my-redshift: # uniquely named
+    type: redshift # only type supported
+    host: localhost # any IP or fqdn
+    port: 5439
+    user: my_user
+    # best practice: specify outputs in your user profile
+    # located in ~/.dbt/profiles.yml to ensure credentials
+    # don't get checked in alongside your model code
+    password: password
+    dbname: dev
+    schema: my_model_schema
+run-target: my_redshift
+
+# Test configuration
+test-paths: ["test"]
diff --git a/scripts/dbt b/scripts/dbt
@@ -0,0 +1,8 @@
+#!/usr/bin/env python
+import sys
+import dbt.main
+import logging
+
+if __name__ == "__main__":
+    logging.basicConfig(level=logging.INFO)
+    dbt.main.main(sys.argv[1:])
diff --git a/setup.py b/setup.py
@@ -0,0 +1,21 @@
+#!/usr/bin/env python
+from setuptools import setup, find_packages
+import os.path
+
+package_name = "dbt"
+package_version = "0.1.0-SNAPSHOT"
+
+setup(
+  name=package_name,
+  version=package_version,
+  packages=find_packages(),
+  scripts=[
+    'scripts/dbt',
+  ],
+  install_requires=[
+    'argparse>=1.2.1',
+    'Jinja2>=2.8',
+    'PyYAML>=3.11',
+    'psycopg2==2.6.1',
+  ],
+)
diff --git a/test/pep8_test.py b/test/pep8_test.py
@@ -0,0 +1,12 @@
+import unittest
+import pep8
+
+
+class TestCodeFormat(unittest.TestCase):
+
+    def test_pep8_conformance(self):
+        """Test that we conform to PEP8."""
+        pep8style = pep8.StyleGuide(quiet=False)
+        pep8style.options.ignore = pep8style.options.ignore + ("E501",)
+        result = pep8style.check_files(['dbt', 'test'])
+        self.assertEqual(result.total_errors, 0, 'Found code style errors (and warnings).')