Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improvement: make Python datasource to use the RestrictedPython sandbox #404

Merged
merged 4 commits on Apr 13, 2015
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
11 changes: 10 additions & 1 deletion redash/query_runner/__init__.py
Expand Up @@ -15,6 +15,7 @@
'TYPE_STRING',
'TYPE_DATE',
'TYPE_FLOAT',
'SUPPORTED_COLUMN_TYPES',
'register',
'get_query_runner',
'import_query_runners'
Expand All @@ -28,6 +29,14 @@
TYPE_DATETIME = 'datetime'
TYPE_DATE = 'date'

SUPPORTED_COLUMN_TYPES = {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why dict and not set?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

these are strings, AFAIK, using a dict means the "in" operator will use a hash search instead of matching the strings. In this case I think a set is the same as a list (maybe a bit better because it might be sorted).

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Interesting point, but does it really matter? Considering we're talking about 6 elements and rarely used function? I think the set variant will be easier to understand.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You are the BDFL :-) will change it to a set.

TYPE_INTEGER : None,
TYPE_FLOAT : None,
TYPE_BOOLEAN : None,
TYPE_STRING : None,
TYPE_DATETIME : None,
TYPE_DATE : None
}

class BaseQueryRunner(object):
def __init__(self, configuration):
Expand Down Expand Up @@ -104,4 +113,4 @@ def validate_configuration(query_runner_type, configuration_json):

def import_query_runners(query_runner_imports):
for runner_import in query_runner_imports:
__import__(runner_import)
__import__(runner_import)
113 changes: 105 additions & 8 deletions redash/query_runner/python.py
@@ -1,9 +1,44 @@
import sys
import json
import logging

from redash.query_runner import *
from redash import models

import importlib

logger = logging.getLogger(__name__)

try:
from RestrictedPython import compile_restricted
from RestrictedPython.Guards import safe_builtins

enabled = True
except ImportError:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No need for this - it's in requirements.txt.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK. Will remove it - unless you want to remove it from the requirements file instead?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No, let's keep it in the requirements. Feels harmless enough.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK. Will remove the try..except

logger.warning("Missing dependencies. Please install RestrictedPython")
logger.warning("You can use pip: pip install RestrictedPython")

enabled = False

# Whitelist of module names the sandboxed script is allowed to import.
# Maps each allowed name to its module object, or None before first import.
# Populated from the data source configuration ("allowedImportModules").
ALLOWED_MODULES = {}

# Custom hooks which controls the way objects/lists/tuples/dicts behave in
# RestrictedPython
def custom_write(obj):
    """RestrictedPython ``_write_`` guard.

    NOTE(review): returns the object unchanged, which permits every
    attribute/item write inside the sandbox -- confirm this is intended.
    """
    return obj

def custom_import(name, globals=None, locals=None, fromlist=(), level=0):
    """Restricted replacement for ``__import__``.

    Only names present in ALLOWED_MODULES may be imported. The loaded
    module object is cached back into the whitelist so repeated imports
    reuse it instead of calling importlib again.

    Raises Exception for any module that is not whitelisted.
    """
    if name not in ALLOWED_MODULES:
        raise Exception("'{0}' is not configured as a supported import module".format(name))

    module = ALLOWED_MODULES[name]
    if module is None:
        module = importlib.import_module(name)
        ALLOWED_MODULES[name] = module

    return module

def get_query_result(query_id):
try:
Expand All @@ -19,10 +54,12 @@ def get_query_result(query_id):

return json.loads(query.latest_query_data.data)


def execute_query(data_source_name, query):
def execute_query(data_source_name_or_id, query):
try:
data_source = models.DataSource.get(models.DataSource.name==data_source_name)
if type(data_source_name) == int:
data_source = models.DataSource.get(models.DataSource.id==data_source_name_or_id)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

you can use DataSource.get_by_id.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Will update the code

else:
data_source = models.DataSource.get(models.DataSource.name==data_source_name_or_id)
except models.DataSource.DoesNotExist:
raise Exception("Wrong data source name: %s." % data_source_name)

Expand All @@ -35,6 +72,26 @@ def execute_query(data_source_name, query):
# TODO: allow avoiding the json.dumps/loads in same process
return json.loads(data)

def add_result_column(result, column_name, friendly_name, column_type):
    """Helper to add a column definition inside a Python script running in re:dash.

    Appends ``{"name", "friendly_name", "type"}`` to ``result["columns"]``,
    creating the list on first use.

    Raises Exception if ``column_type`` is not one of SUPPORTED_COLUMN_TYPES.
    """
    if column_type not in SUPPORTED_COLUMN_TYPES:
        raise Exception("'{0}' is not a supported column type".format(column_type))

    # setdefault replaces the non-idiomatic `if not "columns" in result` check.
    result.setdefault("columns", []).append({
        "name": column_name,
        "friendly_name": friendly_name,
        "type": column_type
    })

def add_result_row(result, values):
    """Helper to append one row of ``values`` to ``result["rows"]``.

    Creates the "rows" list on first use. ``values`` is stored as-is
    (the script is expected to pass a dict keyed by column name).
    """
    # setdefault replaces the non-idiomatic `if not "rows" in result` check.
    result.setdefault("rows", []).append(values)


class Python(BaseQueryRunner):
"""
Expand All @@ -45,24 +102,63 @@ def configuration_schema(cls):
return {
'type': 'object',
'properties': {
}
'allowedImportModules': {
'type': 'string',
'title': 'Modules to import prior to running the script'
}
},
}

    @classmethod
    def enabled(cls):
        # Module-level flag: True only if RestrictedPython imported
        # successfully at load time (see the try/except at the top of the
        # module). Presumably used to disable this query runner when the
        # sandbox dependency is missing -- confirm against callers.
        return enabled

    @classmethod
    def annotate_query(cls):
        # The query text is compiled and executed as Python source
        # (see run_query), so no metadata annotation is prepended.
        # NOTE(review): presumably the annotation comment would not be
        # valid Python -- confirm.
        return False

def __init__(self, configuration_json):
global ALLOWED_MODULES

super(Python, self).__init__(configuration_json)

if "allowedImportModules" in self.configuration and self.configuration["allowedImportModules"]:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

if self.configuration.get('allowedImportModules', None):

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Will update the code

for item in self.configuration["allowedImportModules"].split(","):
ALLOWED_MODULES[item] = None

def run_query(self, query):
try:
error = None

script_globals = {'get_query_result': get_query_result, 'execute_query': execute_query}
script_locals = {'result': None}
# TODO: timeout, sandboxing
exec query in script_globals, script_locals
code = compile_restricted(query, '<string>', 'exec')

safe_builtins["_write_"] = custom_write
safe_builtins["__import__"] = custom_import
safe_builtins["_getattr_"] = getattr
safe_builtins["getattr"] = getattr
safe_builtins["_setattr_"] = setattr
safe_builtins["setattr"] = setattr

script_locals = { "result" : { "rows" : [], "columns" : [] } }

restricted_globals = dict(__builtins__=safe_builtins)
restricted_globals["get_query_result"] = get_query_result
restricted_globals["execute_query"] = execute_query
restricted_globals["add_result_column"] = add_result_column
restricted_globals["add_result_row"] = add_result_row

restricted_globals["TYPE_DATETIME"] = TYPE_DATETIME
restricted_globals["TYPE_BOOLEAN"] = TYPE_BOOLEAN
restricted_globals["TYPE_INTEGER"] = TYPE_INTEGER
restricted_globals["TYPE_STRING"] = TYPE_STRING
restricted_globals["TYPE_DATE"] = TYPE_DATE
restricted_globals["TYPE_FLOAT"] = TYPE_FLOAT

# TODO: Figure out the best way to have a timeout on a script
# One option is to use ETA with Celery + timeouts on workers
# And replacement of worker process every X requests handled.

exec(code) in restricted_globals, script_locals

if script_locals['result'] is None:
raise Exception("result wasn't set to value.")
Expand All @@ -76,4 +172,5 @@ def run_query(self, query):

return json_data, error


register(Python)
1 change: 1 addition & 0 deletions requirements.txt
Expand Up @@ -25,3 +25,4 @@ gunicorn==18.0
celery==3.1.11
jsonschema==2.4.0
click==3.3
RestrictedPython==3.6.0