Commit
Merge pull request #503 from great-expectations/feature/cli_refactor_rebase

Add test_cli_profile; Various auto-pepping
jcampbell committed Jun 17, 2019
2 parents a5ea721 + f799675 commit 17d8fa9
Showing 8 changed files with 364 additions and 256 deletions.
261 changes: 39 additions & 222 deletions great_expectations/cli/cli.py
@@ -8,13 +8,11 @@
import sys

from pyfiglet import figlet_format

try:
from termcolor import colored
except ImportError:
colored = None

from .supporting_methods import _scaffold_directories_and_notebooks
from great_expectations import __version__, read_csv
from great_expectations.dataset import Dataset, PandasDataset
from great_expectations.data_asset import FileDataAsset
@@ -23,19 +21,20 @@
from great_expectations.render.renderer import DescriptivePageRenderer, PrescriptivePageRenderer
from great_expectations.render.view import DescriptivePageView


from .util import cli_message
from .init import (
scaffold_directories_and_notebooks,
greeting_1,
msg_prompt_lets_begin,
)
from .datasource import (
add_datasource
)

# Take over the entire GE module logging namespace when running CLI
logger = logging.getLogger("great_expectations")

def cli_message(string, color, font="big", figlet=False):
if colored:
if not figlet:
six.print_(colored(string, color))
else:
six.print_(colored(figlet_format(
string, font=font), color))
else:
six.print_(string)
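# (In the new version this helper moves to the .util module, which is why
# cli_message is now imported from .util above.)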


@click.group()
@click.version_option(version=__version__)
@@ -124,236 +123,50 @@ def validate(dataset, expectations_config_file, evaluation_parameters, result_fo
only_return_failures=only_return_failures,
)

# Note: Should this be rendered through cli_message?
# Probably not, on the off chance that the JSON object contains <color> tags
print(json.dumps(result, indent=2))
sys.exit(result['statistics']['unsuccessful_expectations'])
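# The exit status is the number of failed expectations, so a shell or CI
# script can treat any nonzero status as a validation failure.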


@cli.command()
@click.option('--target_directory', '-d', default="./",
help='The root of the project directory where you want to initialize Great Expectations.')
@click.option(
'--target_directory',
'-d',
default="./",
help='The root of the project directory where you want to initialize Great Expectations.'
)
def init(target_directory):
"""Initialze a new Great Expectations project.
"""Initialize a new Great Expectations project.
This guided input walks the user through setting up a project.
It scaffolds directories, sets up notebooks, creates a project file, and
appends to a `.gitignore` file.
"""

#!!! This injects a version tag into the docs. We should test that those versioned docs exist in RTD.
greeting_1 = """
Always know what to expect from your data.
If you're new to Great Expectations, this tutorial is a good place to start:
https://great-expectations.readthedocs.io/en/v%s/intro.html#how-do-i-get-started
""" % __version__

msg_prompt_lets_begin = """
Let's add Great Expectations to your project by scaffolding a new great_expectations directory:
great_expectations
├── great_expectations.yml
├── datasources
├── expectations
├── fixtures
├── notebooks
├── plugins
├── uncommitted
│  ├── validations
│  ├── credentials
│  └── samples
└── .gitignore
OK to proceed?
"""

msg_prompt_choose_data_source = """
Configure a data source
1. Pandas data frames from local filesystem (CSV files)
2. Relational database (SQL)
3. Spark DataFrames from local filesystem (CSV files)
4. None of the above
"""

# msg_prompt_choose_data_source = """
# Time to create expectations for your data. This is done in Jupyter Notebook/Jupyter Lab.
#
# Before we point you to the right notebook, what data does your project work with?
# 1. Directory on local filesystem
# 2. Relational database (SQL)
# 3. DBT (data build tool) models
# 4. None of the above
# """


# msg_prompt_dbt_choose_profile = """
# Please specify the name of the dbt profile (from your ~/.dbt/profiles.yml file Great Expectations \
# should use to connect to the database
# """

# msg_dbt_go_to_notebook = """
# To create expectations for your dbt models start Jupyter and open notebook
# great_expectations/notebooks/using_great_expectations_with_dbt.ipynb -
# it will walk you through next steps.
# """

msg_prompt_filesys_enter_base_path = """
Enter the path of the root directory where the data files are stored
(the path may be either absolute or relative to the current directory)
"""

msg_filesys_go_to_notebook = """
To create expectations for your CSV files, start Jupyter and open the notebook
great_expectations/notebooks/using_great_expectations_with_pandas.ipynb -
it will walk you through the next steps.
To launch with jupyter notebook:
jupyter notebook great_expectations/notebooks/create_expectations_for_csv_files.ipynb
To launch with jupyter lab:
jupyter lab great_expectations/notebooks/create_expectations_for_csv_files.ipynb
"""

msg_prompt_datasource_name = """
Give your new data source a short name
"""

msg_sqlalchemy_config_connection = """
Great Expectations relies on sqlalchemy to connect to relational databases.
Please make sure that you have it installed.
Next, we will configure database credentials and store them in the "{0:s}" section
of this config file: great_expectations/uncommitted/credentials/profiles.yml:
"""

msg_sqlalchemy_go_to_notebook = """
To create expectations for your SQL queries, start Jupyter and open the notebook
great_expectations/notebooks/using_great_expectations_with_sql.ipynb -
it will walk you through configuring the database connection and next steps.
"""

msg_unknown_data_source = """
We are looking for more types of data to support.
Please create a GitHub issue here:
https://github.com/great-expectations/great_expectations/issues/new
In the meantime you can see what Great Expectations can do on CSV files.
To create expectations for your CSV files, start Jupyter and open the notebook
great_expectations/notebooks/using_great_expectations_with_pandas.ipynb -
it will walk you through the next steps.
"""
msg_spark_go_to_notebook = """
To create expectations for your CSV files, start Jupyter and open the notebook
great_expectations/notebooks/using_great_expectations_with_pandas.ipynb -
it will walk you through the next steps.
To launch with jupyter notebook:
jupyter notebook great_expectations/notebooks/create_expectations_for_spark_dataframes.ipynb
To launch with jupyter lab:
jupyter lab great_expectations/notebooks/create_expectations_for_spark_dataframes.ipynb
"""
context = DataContext.create('.')

context = DataContext.create(target_directory)
base_dir = os.path.join(target_directory, "great_expectations")

cli_message("Great Expectations", color="cyan", figlet=True)
six.print_(colored(
figlet_format("Great Expectations", font="big"),
color="cyan"
))

cli_message(greeting_1, color="blue")
cli_message(greeting_1)

if not click.confirm(msg_prompt_lets_begin, default=True):
cli_message(
"OK - run great_expectations init again when ready. Exiting...", color="blue")
"OK - run great_expectations init again when ready. Exiting..."
)
exit(0)

_scaffold_directories_and_notebooks(base_dir)
scaffold_directories_and_notebooks(base_dir)
cli_message(
"\nDone.",
color="blue")


# Shows a list of options to select from

data_source_selection = click.prompt(msg_prompt_choose_data_source, type=click.Choice(["1", "2", "3", "4"]),
show_choices=False)

print(data_source_selection)

# if data_source_selection == "5": # dbt
# dbt_profile = click.prompt(msg_prompt_dbt_choose_profile)
# log_message(msg_dbt_go_to_notebook, color="blue")
# context.add_datasource("dbt", "dbt", profile=dbt_profile)
if data_source_selection == "3": # Spark
path = click.prompt(msg_prompt_filesys_enter_base_path, default='/data/', type=click.Path(exists=True,
file_okay=False,
dir_okay=True,
readable=True),
show_default=True)
if path.startswith("./"):
path = path[2:]

if path.endswith("/"):
basenamepath = path[:-1]
default_data_source_name = os.path.basename(basenamepath)
data_source_name = click.prompt(
msg_prompt_datasource_name, default=default_data_source_name, show_default=True)

cli_message(msg_spark_go_to_notebook, color="blue")
context.add_datasource(data_source_name, "spark", base_directory=path)

elif data_source_selection == "2": # sqlalchemy
data_source_name = click.prompt(
msg_prompt_datasource_name, default="mydb", show_default=True)

cli_message(msg_sqlalchemy_config_connection.format(
data_source_name), color="blue")

drivername = click.prompt("What is the driver for the sqlalchemy connection?", default="postgres",
show_default=True)
host = click.prompt("What is the host for the sqlalchemy connection?", default="localhost",
show_default=True)
port = click.prompt("What is the port for the sqlalchemy connection?", default="5432",
show_default=True)
username = click.prompt("What is the username for the sqlalchemy connection?", default="postgres",
show_default=True)
password = click.prompt("What is the password for the sqlalchemy connection?", default="",
show_default=False, hide_input=True)
database = click.prompt("What is the database name for the sqlalchemy connection?", default="postgres",
show_default=True)

credentials = {
"drivername": drivername,
"host": host,
"port": port,
"username": username,
"password": password,
"database": database
}
context.add_profile_credentials(data_source_name, **credentials)
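# For reference: these keys line up one-to-one with the fields of a SQLAlchemy
# URL, so a profile stored this way can be rendered into a connection string
# (assuming SQLAlchemy 1.3-style URLs):
#
#   from sqlalchemy.engine.url import URL
#   str(URL(**credentials))  # -> "postgres://postgres:...@localhost:5432/postgres"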

cli_message(msg_sqlalchemy_go_to_notebook, color="blue")

context.add_datasource(
data_source_name, "sqlalchemy", profile=data_source_name)

elif data_source_selection == "1": # csv
path = click.prompt(msg_prompt_filesys_enter_base_path, default='/data/', type=click.Path(exists=False,
file_okay=False,
dir_okay=True,
readable=True),
show_default=True)
if path.startswith("./"):
path = path[2:]

default_data_source_name = os.path.basename(path)
data_source_name = click.prompt(
msg_prompt_datasource_name, default=default_data_source_name, show_default=True)

cli_message(msg_filesys_go_to_notebook, color="blue")
context.add_datasource(data_source_name, "pandas", base_directory=path)
)

else:
cli_message(msg_unknown_data_source, color="blue")
add_datasource(context)
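# (All of the interactive prompting above is evidently what moved into
# .datasource.add_datasource, imported at the top of the file.)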


@cli.command()
@@ -377,7 +190,9 @@ def render(render_object):
help='Maximum number of named data assets to profile.')
@click.option('--profile_all_data_assets', '-A', is_flag=True, default=False,
help='Profile ALL data assets within the target data source. If True, this will override --max_data_assets.')
def profile(datasource_name, max_data_assets, profile_all_data_assets):
@click.option('--target_directory', '-d', default="./",
help='The root of a project directory containing a great_expectations/ config.')
def profile(datasource_name, max_data_assets, profile_all_data_assets, target_directory):
"""Profile a great expectations object.
datasource_name: A datasource within this GE context to profile.
Expand All @@ -387,17 +202,19 @@ def profile(datasource_name, max_data_assets, profile_all_data_assets):
max_data_assets = None

# FIXME: By default, this should iterate over all datasources
context = DataContext('.')
context = DataContext(target_directory)
context.profile_datasource(
datasource_name, max_data_assets=max_data_assets)
datasource_name,
max_data_assets=max_data_assets
)


def main():
handler = logging.StreamHandler()
# Just levelname and message. Could re-add other info if we want.
formatter = logging.Formatter(
'%(levelname)s %(message)s')
# '%(asctime)s %(name)-12s %(levelname)-8s %(message)s')
# '%(asctime)s %(name)-12s %(levelname)-8s %(message)s')
handler.setFormatter(formatter)
logger.addHandler(handler)
logger.setLevel(logging.INFO)
(Diffs for the remaining changed files are not shown.)
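The commit message mentions a new test_cli_profile, which lives in one of the
files not shown here. As an illustration only, a minimal test of the profile
command using click's CliRunner might look like the sketch below; the fixture,
paths, and assertion are assumptions, not the actual test:

    from click.testing import CliRunner

    from great_expectations.cli.cli import cli


    def test_cli_profile(tmpdir):
        # Invoke `great_expectations profile <datasource> -d <project dir>`
        # against a hypothetical datasource in an already-initialized project.
        runner = CliRunner()
        result = runner.invoke(cli, ["profile", "my_datasource", "-d", str(tmpdir)])
        assert result.exit_code == 0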
