
Initial Commit

0 parents · commit 9231db4789095646b2db592d23354ca49343590e · sb2nov committed Oct 30, 2014
Showing with 14,933 additions and 0 deletions.
  1. +12 −0 .gitignore
  2. +14 −0 CHANGES.md
  3. +49 −0 CONTRIBUTING.md
  4. +13 −0 LICENSE.md
  5. +51 −0 MANIFEST
  6. +7 −0 MANIFEST.in
  7. +26 −0 README.rst
  8. +1 −0 bin/README.md
  9. +4 −0 dataduct/__init__.py
  10. +1 −0 dataduct/config/__init__.py
  11. +29 −0 dataduct/config/config.py
  12. +47 −0 dataduct/config/example_config
  13. +72 −0 dataduct/definition_parser.py
  14. +672 −0 dataduct/etl_pipeline.py
  15. 0 dataduct/pipeline/__init__.py
  16. +63 −0 dataduct/pipeline/activity.py
  17. +63 −0 dataduct/pipeline/copy_activity.py
  18. +116 −0 dataduct/pipeline/data_pipeline.py
  19. +43 −0 dataduct/pipeline/default_object.py
  20. +68 −0 dataduct/pipeline/ec2_resource.py
  21. +62 −0 dataduct/pipeline/emr_activity.py
  22. +103 −0 dataduct/pipeline/emr_resource.py
  23. +68 −0 dataduct/pipeline/mysql_node.py
  24. +145 −0 dataduct/pipeline/pipeline_object.py
  25. +35 −0 dataduct/pipeline/precondition.py
  26. +79 −0 dataduct/pipeline/redshift_copy_activity.py
  27. +43 −0 dataduct/pipeline/redshift_database.py
  28. +60 −0 dataduct/pipeline/redshift_node.py
  29. +83 −0 dataduct/pipeline/s3_node.py
  30. +80 −0 dataduct/pipeline/schedule.py
  31. +80 −0 dataduct/pipeline/shell_command_activity.py
  32. +50 −0 dataduct/pipeline/sns_alarm.py
  33. +71 −0 dataduct/pipeline/sql_activity.py
  34. +154 −0 dataduct/pipeline/utils.py
  35. 0 dataduct/s3/__init__.py
  36. +47 −0 dataduct/s3/s3_directory.py
  37. +107 −0 dataduct/s3/s3_file.py
  38. +38 −0 dataduct/s3/s3_log_path.py
  39. +115 −0 dataduct/s3/s3_path.py
  40. +160 −0 dataduct/s3/utils.py
  41. 0 dataduct/steps/__init__.py
  42. +126 −0 dataduct/steps/emr_streaming.py
  43. +343 −0 dataduct/steps/etl_step.py
  44. +20 −0 dataduct/steps/extract_local.py
  45. +118 −0 dataduct/steps/extract_rds.py
  46. +55 −0 dataduct/steps/extract_redshift.py
  47. +20 −0 dataduct/steps/extract_s3.py
  48. +65 −0 dataduct/steps/load_redshift.py
  49. +57 −0 dataduct/steps/sql_command.py
  50. +70 −0 dataduct/steps/transform.py
  51. 0 dataduct/tests/__init__.py
  52. +19 −0 dataduct/tests/test_definition_parser.py
  53. 0 dataduct/utils/__init__.py
  54. +20 −0 dataduct/utils/exceptions.py
  55. +11 −0 dataduct/utils/helpers.py
  56. +177 −0 docs/Makefile
  57. +6 −0 docs/README.md
  58. +277 −0 docs/conf.py
  59. +224 −0 docs/creating_an_etl.rst
  60. +163 −0 docs/dataduct.pipeline.rst
  61. +29 −0 docs/dataduct.rst
  62. +54 −0 docs/dataduct.s3.rst
  63. +86 −0 docs/dataduct.steps.rst
  64. +22 −0 docs/dataduct.tests.rst
  65. +30 −0 docs/dataduct.utils.rst
  66. +7 −0 docs/etl_pipeline.rst
  67. +29 −0 docs/index.rst
  68. +78 −0 docs/installation.rst
  69. +1 −0 examples/README.md
  70. +25 −0 examples/double_input.yaml
  71. +41 −0 examples/double_output.yaml
  72. +23 −0 examples/emr_streaming.yaml
  73. +10 −0 examples/extract_local.yaml
  74. +19 −0 examples/extract_rds.yaml
  75. +11 −0 examples/extract_redshift.yaml
  76. +10 −0 examples/extract_s3.yaml
  77. +13 −0 examples/load_redshift.yaml
  78. +3 −0 examples/resources/test_table1.tsv
  79. +1 −0 examples/resources/test_table2.tsv
  80. +9,488 −0 examples/resources/word_data.txt
  81. +116 −0 examples/scripts/s3_profiler.py
  82. +30 −0 examples/scripts/word_mapper.py
  83. +34 −0 examples/scripts/word_reducer.py
  84. +9 −0 examples/sql_command.yaml
  85. +15 −0 examples/transform.yaml
  86. +1 −0 resources/README.md
  87. +1 −0 scripts/README.md
  88. +45 −0 setup.py
@@ -0,0 +1,12 @@
+# Compiled python modules.
+*.pyc
+
+# Setuptools distribution folder.
+/dist/
+
+# Docs build folder
+/docs/_build
+
+# Python egg metadata, regenerated from source files by setuptools.
+/*.egg-info
+/*.egg
@@ -0,0 +1,14 @@
+# Changes in ETL_Lib
+
+### 0.1.0
+- Initial version of the dataduct library released
+- Support for the following steps:
+    - emr_streaming
+    - extract-local
+    - extract-s3
+    - extract-rds
+    - extract-redshift
+    - load-redshift
+    - sql-command
+    - transform
+- Examples and documentation added for all the steps
@@ -0,0 +1,49 @@
+# How to contribute
+
+We really appreciate any help in making dataduct a successful project.
+There are a few guidelines that we need contributors to follow so that we
+can keep on top of things.
+
+## Getting Started
+
+* Make sure you have a [GitHub account](https://github.com/signup/free)
+* Create an issue for the bug, assuming one does not already exist.
+  * Clearly describe the issue including steps to reproduce when it is a bug.
+  * Make sure you fill in the earliest version that you know has the issue.
+* Fork the repository on GitHub
+
+## Making Changes
+
+* Create a topic branch from where you want to base your work.
+  * This is usually the master branch.
+  * Only target release branches if you are certain your fix must be on that
+    branch.
+  * To quickly create a topic branch based on master: `git checkout -b
+    fix/master/my_contribution master`. Please avoid working directly on the
+    `master` branch.
+* Make commits of logical units.
+* Check for unnecessary whitespace with `git diff --check` before committing.
+* Make sure your commit messages are in the proper format.
+* Make sure you have added the necessary tests for your changes.
+* Run _all_ the tests to assure nothing else was accidentally broken.
+* Make sure all the code follows PEP8.
+
+## Making Trivial Changes
+
+### Documentation
+
+For changes of a trivial nature to comments and documentation, it is not
+always necessary to create a new issue. In this case, it is
+appropriate to start the first line of a commit with '(doc)' instead of
+a ticket number.
+
+## Submitting Changes
+
+* Push your changes to a topic branch in your fork of the repository.
+* Submit a pull request to the repository in the coursera organization.
+* Reference the issue you created in the pull request.
+
+## Additional Resources
+
+* [General GitHub documentation](http://help.github.com/)
+* [GitHub pull request documentation](http://help.github.com/send-pull-requests/)
@@ -0,0 +1,13 @@
+Copyright [2014] [Coursera]
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
@@ -0,0 +1,51 @@
+# file GENERATED by distutils, do NOT edit
+CHANGES.md
+CONTRIBUTING.md
+LICENSE.md
+README.rst
+setup.py
+bin/README.md
+dataduct/__init__.py
+dataduct/constants.py
+dataduct/definition_parser.py
+dataduct/etl_pipeline.py
+dataduct/pipeline/__init__.py
+dataduct/pipeline/activity.py
+dataduct/pipeline/copy_activity.py
+dataduct/pipeline/data_pipeline.py
+dataduct/pipeline/default_object.py
+dataduct/pipeline/ec2_resource.py
+dataduct/pipeline/emr_activity.py
+dataduct/pipeline/emr_resource.py
+dataduct/pipeline/mysql_node.py
+dataduct/pipeline/pipeline_object.py
+dataduct/pipeline/precondition.py
+dataduct/pipeline/redshift_copy_activity.py
+dataduct/pipeline/redshift_database.py
+dataduct/pipeline/redshift_node.py
+dataduct/pipeline/s3_node.py
+dataduct/pipeline/schedule.py
+dataduct/pipeline/shell_command_activity.py
+dataduct/pipeline/sns_alarm.py
+dataduct/pipeline/sql_activity.py
+dataduct/pipeline/utils.py
+dataduct/s3/__init__.py
+dataduct/s3/s3_directory.py
+dataduct/s3/s3_file.py
+dataduct/s3/s3_log_path.py
+dataduct/s3/s3_path.py
+dataduct/s3/utils.py
+dataduct/steps/__init__.py
+dataduct/steps/emr_streaming.py
+dataduct/steps/etl_step.py
+dataduct/steps/extract_local.py
+dataduct/steps/extract_rds.py
+dataduct/steps/extract_redshift.py
+dataduct/steps/extract_s3.py
+dataduct/steps/load_redshift.py
+dataduct/steps/sql_command.py
+dataduct/steps/transform.py
+dataduct/utils/__init__.py
+dataduct/utils/exceptions.py
+dataduct/utils/helpers.py
+scripts/README.md
@@ -0,0 +1,7 @@
+include *.txt
+include *.md
+include *.rst
+include *.sh
+include *.py
+recursive-include bin *
+recursive-include scripts *
@@ -0,0 +1,26 @@
+Dataduct
+----------
+Dataduct is a wrapper built on top of AWS Data Pipeline that makes it easy to
+create ETL jobs. All jobs can be specified as a series of steps in a YAML file
+and are automatically translated into Data Pipeline definitions with the
+appropriate pipeline objects.
+
+**Documentation and Details**
+
+Documentation and more details can be found at https://github.com/coursera/dataduct
+
+**License**
+
+Copyright [2014] [Coursera]
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
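
To make the YAML-driven workflow concrete, here is a minimal sketch (not part of this commit) of a pipeline definition and how it parses. The step fields mirror the bootstrap entry in example_config further down; the top-level keys `name` and `description` are illustrative assumptions, since the real schema is whatever ETLPipeline(**definition) accepts.

# A minimal sketch of a pipeline definition; top-level keys are assumptions.
import yaml

DEFINITION = """
name: example_pipeline
description: removed by the parser before the pipeline is built
steps:
  - type: transform
    input_node: []
    command: whoami >> ${OUTPUT1_STAGING_DIR}/output.txt
    name: example_transform
"""

definition = yaml.load(DEFINITION)
for step in definition['steps']:
    print step['type']   # -> transform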
@@ -0,0 +1 @@
+#### THIS IS THE BIN FOLDER
@@ -0,0 +1,4 @@
+"""Welcome to DataDuct
+"""
+__version__ = '0.1.0'
+
@@ -0,0 +1 @@
+from config import Config
@@ -0,0 +1,29 @@
+import os
+import yaml
+
+# We look at (in order of precedence):
+# /etc/.dataduct and ~/.dataduct for configuration constants
+
+DataductConfigPath = '/etc/.dataduct'
+DataductUserConfigPath = os.path.join(os.path.expanduser('~/.dataduct'))
+DataductConfigFiles = [DataductConfigPath, DataductUserConfigPath]
+
+# Check DATADUCT_PATH env variable for other configuration locations
+if 'DATADUCT_PATH' in os.environ:
+    for path in os.environ['DATADUCT_PATH'].split(":"):
+        DataductConfigFiles.append(os.path.expanduser(path))
+
+
+def load_yaml(configFiles):
+    for configFile in configFiles:
+        try:
+            return yaml.load(open(configFile, 'r'))
+        except (OSError, IOError):
+            continue
+
+
+class Config(object):
+    _shared_config = load_yaml(DataductConfigFiles)
+
+    def __init__(self):
+        self.__dict__ = self._shared_config
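
A usage sketch, not part of this commit: because Config copies the shared dictionary into __dict__, the top-level YAML keys become attributes and every instance sees the same state. The `etl` key used below comes from the example_config shown next, not from Config itself.

# Assumes one of /etc/.dataduct, ~/.dataduct or a DATADUCT_PATH entry holds
# YAML like the example_config below; the `etl` key is from that example.
from dataduct.config import Config

config = Config()
print config.etl['ETL_BUCKET']   # top-level YAML keys become attributes

# Every instance assigns the same dict to __dict__, so state is shared:
other = Config()
assert other.etl is config.etl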
@@ -0,0 +1,47 @@
+# Constants that are used across the dataduct library
+
+ec2:
+  DEFAULT_ROLE: FILL_ME_IN
+  DEFAULT_RESOURCE_ROLE: FILL_ME_IN
+  DEFAULT_EC2_INSTANCE_TYPE: m1.large
+  ETL_AMI: FILL_ME_IN
+  KEY_PAIR: FILL_ME_IN
+  SECURITY_GROUP: FILL_ME_IN
+
+emr:
+  DEFAULT_NUM_CORE_INSTANCES: 3
+  DEFAULT_CORE_INSTANCE_TYPE: m1.large
+  DEFAULT_TASK_INSTANCE_BID_PRICE: null # null if we want it to be None
+  DEFAULT_TASK_INSTANCE_TYPE: m1.large
+  DEFAULT_MASTER_INSTANCE_TYPE: m1.large
+  DEFAULT_CLUSTER_TIMEOUT: 6 Hours
+  DEFAULT_HADOOP_VERSION: null
+  DEFAULT_HIVE_VERSION: null
+  DEFAULT_PIG_VERSION: null
+  DEFAULT_CLUSTER_AMI: 2.4.7
+
+redshift:
+  REDSHIFT_DATABASE_NAME: FILL_ME_IN
+  REDSHIFT_CLUSTER_ID: FILL_ME_IN
+  REDSHIFT_USERNAME: FILL_ME_IN
+  REDSHIFT_PASSWORD: FILL_ME_IN
+
+mysql:
+  DATABASE_KEY:
+    HOST: FILL_ME_IN
+    USERNAME: FILL_ME_IN
+    PASSWORD: FILL_ME_IN
+
+etl:
+  RETRY_DELAY: 10 Minutes
+  DEFAULT_MAX_RETRIES: 0
+  ETL_BUCKET: FILL_ME_IN
+  DATA_PIPELINE_TOPIC_ARN: FILL_ME_IN
+  DAILY_LOAD_TIME: 1 # run at 1AM UTC
+
+bootstrap:
+  - type: transform
+    input_node: []
+    command: whoami >> ${OUTPUT1_STAGING_DIR}/output.txt
+    resource: FILL_ME_IN
+    name: bootstrap_transform
@@ -0,0 +1,72 @@
+"""
+Script that parses the pipeline definition from the yaml schema
+"""
+import yaml
+
+from .etl_pipeline import ETLPipeline
+from .utils.exceptions import ETLInputError
+
+
+def read_pipeline_definition(file_path):
+ """Function reads the yaml pipeline definitions.
+
+ Function reads the yaml pipeline definitions. We also remove the variables
+ key as that was only used for yaml placeholders.
+
+ Args:
+ file_path (str): Path to the pipeline definition.
+
+ Returns:
+ dict: parsed yaml definition as dictionary.
+
+ Raises:
+ ETLInputError: If `file_path` extention is not yaml
+ """
+ extention = file_path.split('.').pop()
+ if extention != 'yaml':
+ raise ETLInputError('Pipeline definition should have a yaml extention')
+ with open(file_path) as f:
+ definition = yaml.load(f.read())
+
+ # remove the variables key from the pipeline definition
+ # http://stackoverflow.com/questions/4150782/using-yaml-with-variables
+ definition.pop('variables', None)
+ definition.pop('description', None)
+
+ return definition
+
+def create_pipeline(definition):
+ """Creates the pipeline and add the steps specified to the pipeline
+
+ Args:
+ definition(dict): YAML definition parsed from the datapipeline
+ """
+ steps = definition.pop('steps')
+ etl = ETLPipeline(**definition)
+
+ # Add the steps to the pipeline object
+ etl.create_steps(steps)
+ print 'Created pipeline. Name: %s' % etl.name
+
+ return etl
+
+def validate_pipeline(etl, force_overwrite=False):
+ """Validates the pipeline that was created
+
+ Args:
+ etl(EtlPipeline): pipeline object that needs to be validated
+ force_overwrite(bool): delete if a pipeline of same name exists
+ """
+ if force_overwrite:
+ etl.delete_if_exists()
+ etl.validate()
+ print 'Validated pipeline. Id: %s' % etl.pipeline.id
+
+def activate_pipeline(etl):
+ """Activate the pipeline that was created
+
+ Args:
+ etl(EtlPipeline): pipeline object that needs to be activated
+ """
+ etl.activate()
+ print 'Activated pipeline. Id: %s' % etl.pipeline.id
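
A hypothetical driver (not in this commit) showing how the four functions above compose into an end-to-end run; the command-line handling and the `--activate` flag are made up for illustration, while the imported functions are the ones defined in definition_parser.py.

# Hypothetical driver script; only the four imported functions are real.
import sys

from dataduct.definition_parser import (
    read_pipeline_definition,
    create_pipeline,
    validate_pipeline,
    activate_pipeline,
)


def run(file_path, activate=False):
    definition = read_pipeline_definition(file_path)  # parse the .yaml file
    etl = create_pipeline(definition)                 # build pipeline + steps
    validate_pipeline(etl, force_overwrite=True)      # replace same-name pipeline
    if activate:
        activate_pipeline(etl)                        # start it on AWS Data Pipeline


if __name__ == '__main__':
    run(sys.argv[1], activate='--activate' in sys.argv)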