[WIP] New version with cleaner options #162

Merged — 29 commits, Mar 20, 2021
Changes from 1 commit
Commits
29 commits
a5b1f9a
WIP - New version with cleaner options
pjbull Mar 22, 2019
9d3cb0f
Fix find-replace error (#177)
jamesmyatt Aug 5, 2019
af9fb20
Remove unnecessary .gitkeep
pjbull Feb 10, 2020
142dbe0
Remove unused tox.ini
pjbull Feb 10, 2020
e255c1f
Split reqs into dev/non-dev
pjbull Feb 13, 2020
9ebe237
Add basic packages support
pjbull Feb 13, 2020
481812a
Add tests for testing environment creation and requirements
pjbull Feb 13, 2020
10639e7
Set up CI with Azure Pipelines (#194)
drivendata Jul 14, 2020
28d07cb
Merge branch 'master' into new-cli
pjbull Jul 14, 2020
0c794d3
More graceful deprecation
pjbull Nov 6, 2020
fc41bec
Make tests pass locally
pjbull Nov 6, 2020
ed0340c
test version match installed version
pjbull Nov 7, 2020
f489372
Remove unused imports
pjbull Nov 7, 2020
ddc11ad
Unremove used import
pjbull Nov 7, 2020
d1bf5c0
Move to GH Actions
pjbull Nov 8, 2020
782e42e
Fix typo
pjbull Nov 8, 2020
4f5516b
Test non-windows
pjbull Nov 11, 2020
3b371aa
Add netlify configs
r-b-g-b Dec 4, 2020
5f0ef35
Update suggestion to keep using deprecated cookiecutter template (#231)
r-b-g-b Dec 4, 2020
f5ddcaa
Add mkdocs requirements file to docs directory
r-b-g-b Dec 4, 2020
39c4bff
Try setting python version in runtime txt for netlify
r-b-g-b Dec 4, 2020
a648308
Trigger build
r-b-g-b Dec 4, 2020
f273c5f
Python 3.8 netlify
r-b-g-b Dec 4, 2020
50d7634
Python 3.6 netlify
r-b-g-b Dec 4, 2020
898d7d3
Do not specify python runtime for netlify
r-b-g-b Dec 4, 2020
7fdf857
Use 3.7
r-b-g-b Dec 4, 2020
8154bb8
Merge pull request #1 from r-b-g-b/netlify-docs
r-b-g-b Dec 4, 2020
4c82643
Merge pull request #232 from r-b-g-b/new-cli-netlify-docs
drivendata Dec 4, 2020
f1167e9
Merge branch 'v2' into new-cli
pjbull Mar 20, 2021
7 changes: 6 additions & 1 deletion .gitignore
@@ -6,4 +6,9 @@ docs/site/
# test cache
.cache/*
tests/__pycache__/*
*.pytest_cache/
*.pytest_cache/
*.pyc

# other local dev info
.vscode/
cookiecutter_data_science.egg-info/
4 changes: 2 additions & 2 deletions README.md
@@ -63,8 +63,8 @@ The directory structure of your new project looks like this:
├── requirements.txt <- The requirements file for reproducing the analysis environment, e.g.
│ generated with `pip freeze > requirements.txt`
├── src <- Source code for use in this project.
│ ├── __init__.py <- Makes src a Python module
├── {{ cookiecutter.module_name }} <- Source code for use in this project.
│ ├── __init__.py <- Makes {{ cookiecutter.module_name }} a Python module
│ │
│ ├── data <- Scripts to download or generate data
│ │ └── make_dataset.py
31 changes: 31 additions & 0 deletions ccds/__main__.py
@@ -0,0 +1,31 @@
import json
from pathlib import Path
import re
import sys

# 2/3 compat
try:
    input = raw_input
except NameError:
    pass

import click

# Monkey-patch jinja to allow variables to not exist, which happens with sub-options
import jinja2
jinja2.StrictUndefined = jinja2.Undefined


# Monkey-patch cookiecutter to allow sub-items
import cookiecutter
from cookiecutter import prompt
from ccds.monkey_patch import prompt_for_config

prompt.prompt_for_config = prompt_for_config

from cookiecutter import cli
main = cli.main


if __name__ == "__main__":
    main()
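
With the patches applied, `ccds` simply re-exposes the stock cookiecutter CLI. A minimal usage sketch (not part of this diff; it assumes the `ccds` package and `cookiecutter` are installed) driving it non-interactively via click's test runner:

```python
# Hypothetical usage sketch, not part of this PR: exercise the patched CLI
# without typing answers. Assumes ccds and cookiecutter are importable.
from click.testing import CliRunner

from ccds.__main__ import main  # cookiecutter's cli.main with prompt_for_config patched

runner = CliRunner()
# --no-input accepts every default, including the first entry of each
# nested choice such as dataset_storage.
result = runner.invoke(
    main,
    ["https://github.com/drivendata/cookiecutter-data-science", "--no-input"],
)
print(result.exit_code, result.output)
```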
156 changes: 156 additions & 0 deletions ccds/monkey_patch.py
@@ -0,0 +1,156 @@
from collections import OrderedDict
import json

import click
from past.builtins import basestring

from future.utils import iteritems

from jinja2.exceptions import UndefinedError

from cookiecutter.exceptions import UndefinedVariableInTemplate
from cookiecutter.environment import StrictEnvironment


from cookiecutter.prompt import (
    prompt_choice_for_config,
    render_variable,
    read_user_variable,
    read_user_choice,
    read_user_dict,
)

def _prompt_choice_and_subitems(cookiecutter_dict, env, key, options, no_input):
    result = {}

    # first, get the selection
    rendered_options = [
        render_variable(env, list(raw.keys())[0], cookiecutter_dict) for raw in options
    ]

    if no_input:
        selected = rendered_options[0]
    else:
        selected = read_user_choice(key, rendered_options)

    selected_item = [
        list(c.values())[0] for c in options if list(c.keys())[0] == selected
    ][0]

    result[selected] = {}

    # then, fill in the sub values for that item
    for subkey, raw in selected_item.items():
        # We are dealing with a regular variable
        val = render_variable(env, raw, cookiecutter_dict)

        if not no_input:
            val = read_user_variable(subkey, val)

        result[selected][subkey] = val

    return result


def prompt_for_config(context, no_input=False):
    """
    Prompts the user to enter new config, using context as a source for the
    field names and sample values.
    :param no_input: Prompt the user at command line for manual configuration?
    """
    cookiecutter_dict = OrderedDict([])
    env = StrictEnvironment(context=context)

    # First pass: Handle simple and raw variables, plus choices.
    # These must be done first because the dictionaries keys and
    # values might refer to them.
    for key, raw in iteritems(context[u'cookiecutter']):
        if key.startswith(u'_'):
            cookiecutter_dict[key] = raw
            continue

        try:
            if isinstance(raw, list):
                if isinstance(raw[0], dict):
                    val = _prompt_choice_and_subitems(
                        cookiecutter_dict, env, key, raw, no_input
                    )
                    cookiecutter_dict[key] = val
                else:
                    # We are dealing with a choice variable
                    val = prompt_choice_for_config(
                        cookiecutter_dict, env, key, raw, no_input
                    )
                    cookiecutter_dict[key] = val
            elif not isinstance(raw, dict):
                # We are dealing with a regular variable
                val = render_variable(env, raw, cookiecutter_dict)

                if not no_input:
                    val = read_user_variable(key, val)

                cookiecutter_dict[key] = val
        except UndefinedError as err:
            msg = "Unable to render variable '{}'".format(key)
            raise UndefinedVariableInTemplate(msg, err, context)

    # Second pass; handle the dictionaries.
    for key, raw in iteritems(context[u'cookiecutter']):

        try:
            if isinstance(raw, dict):
                # We are dealing with a dict variable
                val = render_variable(env, raw, cookiecutter_dict)

                if not no_input:
                    val = read_user_dict(key, val)

                cookiecutter_dict[key] = val
        except UndefinedError as err:
            msg = "Unable to render variable '{}'".format(key)
            raise UndefinedVariableInTemplate(msg, err, context)

    return cookiecutter_dict

# from cookiecutter.main import cookiecutter
# from cookiecutter import prompt
# from cookiecutter.cli import main as cc_main

# class NestedQuestion:
# ''' [{'a': {'val1': 'default1', 'val2': 'default2'}}]

# Interprets lists as questions with multiple options, where the
# and dictionaries as single questions with defaults values.
# '''
# @classmethod
# def update_context(cls, context, question_structure):
# qd = question_structure
# if isinstance(qd, list):
# selection = cls.get_user_option(qd)

# name, vals = list(selection.items())[0]

# context[name] = {}
# cls.update_context(context[name], vals)

# elif isinstance(qd, dict):
# for k, v in qd.items():
# context[k]= {}

# if isinstance(v, (dict, list)):
# context[k] = cls.update_context(context[k], v)
# else:
# context[k] = cls.get_user_input(k, v)

# return context

# @staticmethod
# def get_user_input(key, default):
# return prompt.read_user_variable(key, default)
# # return input(f"{key} [{default}]: ") or default

# @staticmethod
# def get_user_option(options):
# prompt.read_user_choice()

# # input_msg = '\n'.join(
# # f" [{ix + 1}] - {list(value.keys())[0]}" for ix, value in enumerate(options)
# # )

# # prepend = 'Select an item:\n'
# # postpend = "\n - Enter number [1]: "

# # ix = int(input(prepend + input_msg + postpend) or 1) - 1
# # return options[ix]
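
The key behavioral change here is that a list of single-key dicts is treated as a choice with sub-items rather than a plain choice list. An illustrative sketch (not part of this diff; it assumes `ccds` and its dependencies such as `cookiecutter` and `future` are importable) of what the patched `prompt_for_config` returns for such a context when run non-interactively:

```python
# Illustrative sketch, not part of this PR: shows the nested dict that
# _prompt_choice_and_subitems produces for a dataset_storage-style option.
from collections import OrderedDict

from ccds.monkey_patch import prompt_for_config

context = {
    "cookiecutter": OrderedDict(
        [
            ("project_name", "project_name"),
            (
                "dataset_storage",
                [
                    {"none": {}},
                    {"s3": {"bucket": "bucket-name", "aws_profile": "default"}},
                ],
            ),
        ]
    )
}

result = prompt_for_config(context, no_input=True)
# With no_input=True the first option wins, so the rendered context contains
# {"project_name": "project_name", "dataset_storage": {"none": {}}}
print(result)
```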
27 changes: 22 additions & 5 deletions cookiecutter.json
@@ -1,10 +1,27 @@
{
"project_name": "project_name",
"repo_name": "{{ cookiecutter.project_name.lower().replace(' ', '_') }}",
"module_name": "{{ cookiecutter.project_name.lower().replace(' ', '_').replace('-', '_') }}",
"author_name": "Your name (or your organization/company/team)",
"description": "A short description of the project.",
"open_source_license": ["MIT", "BSD-3-Clause", "No license file"],
"s3_bucket": "[OPTIONAL] your-bucket-for-syncing-data (do not include 's3://')",
"aws_profile": "default",
"python_interpreter": ["python3", "python"]
}
"python_version_number": "3.7",
"dataset_storage": [
{"none": {}},
{"azure": {"container": "container-name"}},
{"s3": {"bucket": "bucket-name", "aws_profile": "default"}},
{"gcs": {"bucket": "bucket-name"}}
],
"environment_manager" : [
"none",
"conda",
"virtualenv",
"pipenv"
],
"dependency_file": [
"none",
"requirements.txt",
"environment.yml",
"Pipfile"
],
"open_source_license": ["MIT", "BSD-3-Clause", "No license file"]
}
20 changes: 10 additions & 10 deletions docs/docs/index.md
@@ -54,7 +54,7 @@ Disagree with a couple of the default folder names? Working on a project that's

## Getting started

With this in mind, we've created a data science cookiecutter template for projects in Python. Your analysis doesn't have to be in Python, but the template does provide some Python boilerplate that you'd want to remove (in the `src` folder for example, and the Sphinx documentation skeleton in `docs`).
With this in mind, we've created a data science cookiecutter template for projects in Python. Your analysis doesn't have to be in Python, but the template does provide some Python boilerplate that you'd want to remove (in the `{{ cookiecutter.module_name }}` folder for example, and the Sphinx documentation skeleton in `docs`).

### Requirements

@@ -72,7 +72,7 @@ cookiecutter https://github.com/drivendata/cookiecutter-data-science

### Example

<script type="text/javascript" src="https://asciinema.org/a/9bgl5qh17wlop4xyxu9n9wr02.js" id="asciicast-9bgl5qh17wlop4xyxu9n9wr02" async></script>
<script type="text/javascript" {{ cookiecutter.module_name }}="https://asciinema.org/a/9bgl5qh17wlop4xyxu9n9wr02.js" id="asciicast-9bgl5qh17wlop4xyxu9n9wr02" async></script>

## Directory structure

@@ -103,8 +103,8 @@ cookiecutter https://github.com/drivendata/cookiecutter-data-science
│ generated with `pip freeze > requirements.txt`
├── setup.py <- Make this project pip installable with `pip install -e`
├── src <- Source code for use in this project.
│   ├── __init__.py <- Makes src a Python module
├── {{ cookiecutter.module_name }} <- Source code for use in this project.
│   ├── __init__.py <- Makes {{ cookiecutter.module_name }} a Python module
│ │
│   ├── data <- Scripts to download or generate data
│   │   └── make_dataset.py
@@ -129,7 +129,7 @@ There are some opinions implicit in the project structure that have grown out of

### Data is immutable

Don't ever edit your raw data, especially not manually, and especially not in Excel. Don't overwrite your raw data. Don't save multiple versions of the raw data. Treat the data (and its format) as immutable. The code you write should move the raw data through a pipeline to your final analysis. You shouldn't have to run all of the steps every time you want to make a new figure (see [Analysis is a DAG](#analysis-is-a-dag)), but anyone should be able to reproduce the final products with only the code in `src` and the data in `data/raw`.
Don't ever edit your raw data, especially not manually, and especially not in Excel. Don't overwrite your raw data. Don't save multiple versions of the raw data. Treat the data (and its format) as immutable. The code you write should move the raw data through a pipeline to your final analysis. You shouldn't have to run all of the steps every time you want to make a new figure (see [Analysis is a DAG](#analysis-is-a-dag)), but anyone should be able to reproduce the final products with only the code in `{{ cookiecutter.module_name }}` and the data in `data/raw`.

Also, if data is immutable, it doesn't need source control in the same way that code does. Therefore, ***by default, the data folder is included in the `.gitignore` file.*** If you have a small amount of data that rarely changes, you may want to include the data in the repository. Github currently warns if files are over 50MB and rejects files over 100MB. Some other options for storing/syncing large data include [AWS S3](https://aws.amazon.com/s3/) with a syncing tool (e.g., [`s3cmd`](http://s3tools.org/s3cmd)), [Git Large File Storage](https://git-lfs.github.com/), [Git Annex](https://git-annex.branchable.com/), and [dat](http://dat-data.com/). Currently by default, we ask for an S3 bucket and use [AWS CLI](http://docs.aws.amazon.com/cli/latest/reference/s3/index.html) to sync data in the `data` folder with the server.

@@ -141,18 +141,18 @@ Since notebooks are challenging objects for source control (e.g., diffs of the `

1. Follow a naming convention that shows the owner and the order the analysis was done in. We use the format `<step>-<ghuser>-<description>.ipynb` (e.g., `0.3-bull-visualize-distributions.ipynb`).

2. Refactor the good parts. Don't write code to do the same task in multiple notebooks. If it's a data preprocessing task, put it in the pipeline at `src/data/make_dataset.py` and load data from `data/interim`. If it's useful utility code, refactor it to `src`.
2. Refactor the good parts. Don't write code to do the same task in multiple notebooks. If it's a data preprocessing task, put it in the pipeline at `{{ cookiecutter.module_name }}/data/make_dataset.py` and load data from `data/interim`. If it's useful utility code, refactor it to `{{ cookiecutter.module_name }}`.

Now by default we turn the project into a Python package (see the `setup.py` file). You can import your code and use it in notebooks with a cell like the following:

```
# OPTIONAL: Load the "autoreload" extension so that code can change
%load_ext autoreload

# OPTIONAL: always reload modules so that as you change code in src, it gets loaded
# OPTIONAL: always reload modules so that as you change code in {{ cookiecutter.module_name }}, it gets loaded
%autoreload 2

from src.data import make_dataset
from {{ cookiecutter.module_name }}.data import make_dataset
```

### Analysis is a DAG
@@ -192,10 +192,10 @@ OTHER_VARIABLE=something

#### Use a package to load these variables automatically.

If you look at the stub script in `src/data/make_dataset.py`, it uses a package called [python-dotenv](https://github.com/theskumar/python-dotenv) to load up all the entries in this file as environment variables so they are accessible with `os.environ.get`. Here's an example snippet adapted from the `python-dotenv` documentation:
If you look at the stub script in `{{ cookiecutter.module_name }}/data/make_dataset.py`, it uses a package called [python-dotenv](https://github.com/theskumar/python-dotenv) to load up all the entries in this file as environment variables so they are accessible with `os.environ.get`. Here's an example snippet adapted from the `python-dotenv` documentation:

```python
# src/data/dotenv_example.py
# {{ cookiecutter.module_name }}/data/dotenv_example.py
import os
from dotenv import load_dotenv, find_dotenv

66 changes: 66 additions & 0 deletions hooks/post_gen_project.py
@@ -0,0 +1,66 @@
import os

packages = [
    'flake8',
    'pathlib2',
    'pip',
    'setuptools',
    'wheel',
]

pip_only_packages = [
    'awscli',
    'python-dotenv',
]

{% if cookiecutter.dataset_storage.s3 %}
packages += ['awscli']
{% endif %}

dependencies = '{{ cookiecutter.dependency_file }}'

def write_dependencies():
    if dependencies == 'requirements.txt':
        with open(dependencies, 'w') as f:
            lines = sorted(packages + pip_only_packages)

            lines += [
                "",
                "-e .",
            ]

            f.write("\n".join(lines))

    elif dependencies == 'environment.yml':
        with open(dependencies, 'w') as f:
            lines = ["name: {{ cookiecutter.repo_name }}",
                     "dependencies:"]

            lines += [f"  - {p}" for p in packages]

            lines += ["  - pip:"] + [f"    - {p}" for p in pip_only_packages]

            lines += ['    - -e .']

            lines += ["  - python={{ cookiecutter.python_version_number }}"]

            f.write("\n".join(lines))

    elif dependencies == 'Pipfile':
        with open(dependencies, 'w') as f:
            lines = ["[packages]"]
            lines += [f'{p} = "*"' for p in sorted(packages + pip_only_packages)]

            lines += ['"{{ cookiecutter.module_name }}" = {editable = true, path = "."}']

            lines += [
                "",
                "[requires]",
                'python_version = "{{ cookiecutter.python_version_number }}"',
            ]

            f.write("\n".join(lines))


write_dependencies()
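
For a concrete sense of the hook's output, here is a rough sketch (not part of this diff) of the `requirements.txt` content it would write, assuming the default package lists above and that no S3 storage option is selected:

```python
# Illustrative sketch, not part of this PR: the requirements.txt content the
# hook would write for the default package lists (no s3 storage selected).
expected = "\n".join(
    sorted(
        ["flake8", "pathlib2", "pip", "setuptools", "wheel"]  # packages
        + ["awscli", "python-dotenv"]                         # pip_only_packages
    )
    + ["", "-e ."]  # blank line, then an editable install of the generated project
)
print(expected)
# awscli
# flake8
# pathlib2
# pip
# python-dotenv
# setuptools
# wheel
#
# -e .
```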