This repository has been archived by the owner on Sep 3, 2022. It is now read-only.
/
_sql.py
402 lines (328 loc) · 14.8 KB
/
_sql.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
# Copyright 2015 Google Inc. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
# in compliance with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under the License
# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
# or implied. See the License for the specific language governing permissions and limitations under
# the License.
"""Google Cloud Platform library - %%arguments IPython Cell Magic Functionality."""
from __future__ import absolute_import
from __future__ import print_function
from __future__ import unicode_literals
from builtins import str
from past.builtins import basestring
try:
import IPython
import IPython.core.magic
except ImportError:
raise Exception('This module can only be loaded in ipython.')
import argparse
import datetime
import imp
import re
import sys
import time
import datalab.bigquery
import datalab.data
from datalab.utils.commands import CommandParser, handle_magic_line
def _create_sql_parser():
sql_parser = CommandParser(prog="%%sql",
formatter_class=argparse.RawDescriptionHelpFormatter,
description="""
Create a named SQL module with one or more queries.
The cell body should contain an optional initial part defining the default
values for the variables, if any, using Python code, followed by one or more
queries.
Queries should start with 'DEFINE QUERY <name>' in order to bind them to
<module name>.<query name> in the notebook (as datalab.data.SqlStament instances).
The final query can optionally omit 'DEFINE QUERY <name>', as using the module
name in places where a SqlStatement is expected will resolve to the final query
in the module.
Queries can refer to variables with '$<name>', as well as refer to other queries
within the same module, making it easy to compose nested queries and test their
parts.
The Python code defining the variable default values can assign scalar or list/tuple values to
variables, or one of the special functions 'datestring' and 'source'.
When a variable with a 'datestring' default is expanded it will expand to a formatted
string based on the current date, while a 'source' default will expand to a table whose
name is based on the current date.
datestring() takes two named arguments, 'format' and 'offset'. The former is a
format string that is the same as for Python's time.strftime function. The latter
is a string containing a comma-separated list of expressions such as -1y, +2m,
etc; these are offsets from the time of expansion that are applied in order. The
suffix (y, m, d, h, M) correspond to units of years, months, days, hours and
minutes, while the +n or -n prefix is the number of units to add or subtract from
the time of expansion. Three special values 'now', 'today' and 'yesterday' are
also supported; 'today' and 'yesterday' will be midnight UTC on the current date
or previous days date.
source() can take a 'name' argument for a fixed table name, or 'format' and 'offset'
arguments similar to datestring(), but unlike datestring() will resolve to a Table
with the specified name.
""")
sql_parser.add_argument('-m', '--module', help='The name for this SQL module')
sql_parser.add_argument('-d', '--dialect', help='BigQuery SQL dialect',
choices=['legacy', 'standard'])
sql_parser.add_argument('-b', '--billing', type=int, help='BigQuery billing tier')
sql_parser.set_defaults(func=lambda args, cell: sql_cell(args, cell))
return sql_parser
_sql_parser = _create_sql_parser()
# Register the line magic as well as the cell magic so we can at least give people help
# without requiring them to enter cell content first.
@IPython.core.magic.register_line_cell_magic
def sql(line, cell=None):
""" Create a SQL module with one or more queries. Use %sql --help for more details.
The supported syntax is:
%%sql [--module <modulename>]
[<optional Python code for default argument values>]
[<optional named queries>]
[<optional unnamed query>]
At least one query should be present. Named queries should start with:
DEFINE QUERY <name>
on a line by itself.
Args:
args: the optional arguments following '%%sql'.
cell: the contents of the cell; Python code for arguments followed by SQL queries.
"""
if cell is None:
_sql_parser.print_help()
else:
return handle_magic_line(line, cell, _sql_parser)
def _date(val, offset=None):
""" A special pseudo-type for pipeline arguments.
This allows us to parse dates as Python datetimes, including special values like 'now'
and 'today', as well as apply offsets to the datetime.
Args:
val: a string containing the value for the datetime. This can be 'now', 'today' (midnight at
start of day), 'yesterday' (midnight at start of yesterday), or a formatted date that
will be passed to the datetime constructor. Note that 'now' etc are assumed to
be in UTC.
offset: for date arguments a string containing a comma-separated list of
relative offsets to apply of the form <n><u> where <n> is an integer and
<u> is a single character unit (d=day, m=month, y=year, h=hour, m=minute).
Returns:
A Python datetime resulting from starting at <val> and applying the sequence of deltas
specified in <offset>.
"""
if val is None:
return val
if val == '' or val == 'now':
when = datetime.datetime.utcnow()
elif val == 'today':
dt = datetime.datetime.utcnow()
when = datetime.datetime(dt.year, dt.month, dt.day)
elif val == 'yesterday':
dt = datetime.datetime.utcnow() - datetime.timedelta(1)
when = datetime.datetime(dt.year, dt.month, dt.day)
else:
when = datetime.datetime.strptime(val, "%Y%m%d")
if offset is not None:
for part in offset.split(','):
unit = part[-1]
quantity = int(part[:-1])
# We can use timedelta for days and under, but not for years and months
if unit == 'y':
when = datetime.datetime(year=when.year + quantity, month=when.month, day=when.day,
hour=when.hour, minute=when.minute)
elif unit == 'm':
new_year = when.year
new_month = when.month + quantity
if new_month < 1:
new_month = -new_month
new_year += 1 + (new_month // 12)
new_month = 12 - new_month % 12
elif new_month > 12:
new_year += (new_month - 1) // 12
new_month = 1 + (new_month - 1) % 12
when = datetime.datetime(year=new_year, month=new_month, day=when.day,
hour=when.hour, minute=when.minute)
elif unit == 'd':
when += datetime.timedelta(days=quantity)
elif unit == 'h':
when += datetime.timedelta(hours=quantity)
elif unit == 'M':
when += datetime.timedelta(minutes=quantity)
return when
def _resolve_table(v, format, delta):
try:
when = _date(v, delta)
v = time.strftime(format, when.timetuple())
except Exception:
pass
return datalab.bigquery.Table(v)
def _make_string_formatter(f, offset=None):
""" A closure-izer for string arguments that include a format and possibly an offset. """
format = f
delta = offset
return lambda v: time.strftime(format, (_date(v, delta)).timetuple())
def _make_table_formatter(f, offset=None):
""" A closure-izer for table arguments that include a format and possibly an offset. """
format = f
delta = offset
return lambda v: _resolve_table(v, format, delta)
def _make_table(v):
return datalab.bigquery.Table(v)
def _datestring(format, offset=''):
return {'type': 'datestring', 'format': format, 'offset': offset}
def _table(name=None, format=None, offset=''):
return {'type': 'table', 'name': name, 'format': format, 'offset': offset}
def _arguments(code, module):
"""Define pipeline arguments.
Args:
code: the Python code to execute that defines the arguments.
"""
arg_parser = CommandParser.create('')
try:
# Define our special argument 'types' and add them to the environment.
builtins = {'source': _table, 'datestring': _datestring}
env = {}
env.update(builtins)
# Execute the cell which should be one or more calls to arg().
exec(code, env)
# Iterate through the module dictionary. For any newly defined objects,
# add args to the parser.
for key in env:
# Skip internal/private stuff.
if key in builtins or key[0] == '_':
continue
# If we want to support importing query modules into other query modules, uncomment next 4
# Skip imports but add them to the module
# if isinstance(env[key], types.ModuleType):
# module.__dict__[key] = env[key]
# continue
val = env[key]
key = '--%s' % key
if isinstance(val, bool):
if val:
arg_parser.add_argument(key, default=val, action='store_true')
else:
arg_parser.add_argument(key, default=val, action='store_false')
elif isinstance(val, basestring) or isinstance(val, int) or isinstance(val, float) \
or isinstance(val, int):
arg_parser.add_argument(key, default=val)
elif isinstance(val, list):
arg_parser.add_argument(key, default=val, nargs='+')
elif isinstance(val, tuple):
arg_parser.add_argument(key, default=list(val), nargs='+')
# Is this one of our pseudo-types for dates/tables?
elif isinstance(val, dict) and 'type' in val:
if val['type'] == 'datestring':
arg_parser.add_argument(key, default='',
type=_make_string_formatter(val['format'],
offset=val['offset']))
elif val['type'] == 'table':
if val['format'] is not None:
arg_parser.add_argument(key, default='',
type=_make_table_formatter(val['format'],
offset=val['offset']))
else:
arg_parser.add_argument(key, default=val['name'], type=_make_table)
else:
raise Exception('Cannot generate argument for %s of type %s' % (key, type(val)))
else:
raise Exception('Cannot generate argument for %s of type %s' % (key, type(val)))
except Exception as e:
print("%%sql arguments: %s from code '%s'" % (str(e), str(code)))
return arg_parser
def _split_cell(cell, module):
""" Split a hybrid %%sql cell into the Python code and the queries.
Populates a module with the queries.
Args:
cell: the contents of the %%sql cell.
module: the module that the contents will populate.
Returns:
The default (last) query for the module.
"""
lines = cell.split('\n')
code = None
last_def = -1
name = None
define_wild_re = re.compile('^DEFINE\s+.*$', re.IGNORECASE)
define_re = re.compile('^DEFINE\s+QUERY\s+([A-Z]\w*)\s*?(.*)$', re.IGNORECASE)
select_re = re.compile('^SELECT\s*.*$', re.IGNORECASE)
standard_sql_re = re.compile('^(CREATE|WITH|INSERT|DELETE|UPDATE)\s*.*$', re.IGNORECASE)
# TODO(gram): a potential issue with this code is if we have leading Python code followed
# by a SQL-style comment before we see SELECT/DEFINE. When switching to the tokenizer see
# if we can address this.
for i, line in enumerate(lines):
define_match = define_re.match(line)
select_match = select_re.match(line)
standard_sql_match = standard_sql_re.match(line)
if i:
prior_content = ''.join(lines[:i]).strip()
if select_match:
# Avoid matching if previous token was '(' or if Standard SQL is found
# TODO: handle the possibility of comments immediately preceding SELECT
select_match = len(prior_content) == 0 or \
(prior_content[-1] != '(' and not standard_sql_re.match(prior_content))
if standard_sql_match:
standard_sql_match = len(prior_content) == 0 or not standard_sql_re.match(prior_content)
if define_match or select_match or standard_sql_match:
# If this is the first query, get the preceding Python code.
if code is None:
code = ('\n'.join(lines[:i])).strip()
if len(code):
code += '\n'
elif last_def >= 0:
# This is not the first query, so gather the previous query text.
query = '\n'.join([line for line in lines[last_def:i] if len(line)]).strip()
if select_match and name != datalab.data._utils._SQL_MODULE_MAIN and len(query) == 0:
# Avoid DEFINE query name\nSELECT ... being seen as an empty DEFINE followed by SELECT
continue
# Save the query
statement = datalab.data.SqlStatement(query, module)
module.__dict__[name] = statement
# And set the 'last' query to be this too
module.__dict__[datalab.data._utils._SQL_MODULE_LAST] = statement
# Get the query name and strip off our syntactic sugar if appropriate.
if define_match:
name = define_match.group(1)
lines[i] = define_match.group(2)
else:
name = datalab.data._utils._SQL_MODULE_MAIN
# Save the starting line index of the new query
last_def = i
else:
define_wild_match = define_wild_re.match(line)
if define_wild_match:
raise Exception('Expected "DEFINE QUERY <name>"')
if last_def >= 0:
# We were in a query so save this tail query.
query = '\n'.join([line for line in lines[last_def:] if len(line)]).strip()
statement = datalab.data.SqlStatement(query, module)
module.__dict__[name] = statement
module.__dict__[datalab.data._utils._SQL_MODULE_LAST] = statement
if code is None:
code = ''
module.__dict__[datalab.data._utils._SQL_MODULE_ARGPARSE] = _arguments(code, module)
return module.__dict__.get(datalab.data._utils._SQL_MODULE_LAST, None)
def sql_cell(args, cell):
"""Implements the SQL cell magic for ipython notebooks.
The supported syntax is:
%%sql [--module <modulename>]
[<optional Python code for default argument values>]
[<optional named queries>]
[<optional unnamed query>]
At least one query should be present. Named queries should start with:
DEFINE QUERY <name>
on a line by itself.
Args:
args: the optional arguments following '%%sql'.
cell: the contents of the cell; Python code for arguments followed by SQL queries.
"""
name = args['module'] if args['module'] else '_sql_cell'
module = imp.new_module(name)
query = _split_cell(cell, module)
ipy = IPython.get_ipython()
if not args['module']:
# Execute now
if query:
return datalab.bigquery.Query(query, values=ipy.user_ns) \
.execute(dialect=args['dialect'], billing_tier=args['billing']).results
else:
# Add it as a module
sys.modules[name] = module
exec('import %s' % name, ipy.user_ns)