Links to other notebooks in the same folder:
<a href='http://pivotal.io/data-science'><img src='https://raw.githubusercontent.com/crawles/Logos/master/Pivotal_TealOnWhite.png' width='200px' align='right'></a>

<nav class = "navbar navbar-light bg-faded">
    <ul class = "nav navbar-nav">
        <li class = "">
            <a class = "nav-link">notebook1</a>
        </li>
        <li class = "">
            <a class = "nav-link">notebook2</a>
        </li>
        <li class = "">
            <a class = "nav-link">notebook3</a>
        </li>
    </ul>
</nav>
        

# Import useful libraries

In [1]:
from datetime import datetime
from dateutil.relativedelta import relativedelta
import getopt
from IPython.core.display import display, HTML
import os
import sys
import urllib

# If we want to move the graph
# %matplotlib notebook
from IPython.core.magic import register_line_magic, register_cell_magic, register_line_cell_magic
from IPython.display import display
from IPython.display import HTML
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pandas.io.sql as psql
import psycopg2
import seaborn as sns

# Credentials file to connect to database
import credentials
from sql_functions import *
from magic_functions import *
from mpp_plotting_functions import *



In [2]:
# Changes logo to a Pivotal logo.
# Downloads jupyterPrefs.js from GitHub and wraps its text in a <script> tag;
# the HTML object is the cell's last expression, so it is rendered as the
# cell's output.
# NOTE(review): urllib.urlopen is the Python 2 spelling (Python 3 moved it to
# urllib.request.urlopen), and this injects remotely hosted JavaScript into the
# notebook page -- confirm the source is trusted before running.
jPrefs = urllib.urlopen("https://raw.githubusercontent.com/crawles/Logos/master/jupyterPrefs.js").read()
HTML('<script>{}</script>'.format(jPrefs))

In [3]:
# Widen the notebook container to 80% of the page
display(HTML('<style>.container {width:80% !important;}</style>'))

# Default matplotlib look: figure size, line width and font sizes,
# applied in a single update of the global rcParams
mpl.rcParams.update({
    'figure.figsize': (10, 7),
    'lines.linewidth': 3,
    'figure.titlesize': 26,
    'axes.labelsize': 18,
    'axes.titlesize': 22,
    'xtick.labelsize': 14,
    'ytick.labelsize': 14,
    'legend.fontsize': 16,
})

# Give the colours of the current Seaborn palette readable names.
# NOTE(review): the unpacking requires the palette to hold exactly six
# colours -- confirm this for the installed seaborn version.
blue, green, red, purple, yellow, cyan = sns.color_palette()

# Connect to Database

In [4]:
# Open the database connection using the keyword arguments stored in
# credentials.login_info_dict (kept in a separate module so that login
# details are not written into the notebook itself).
conn = psycopg2.connect(**credentials.login_info_dict)
# Commit every statement immediately instead of holding a transaction open.
conn.autocommit = True

In [5]:
# Schema to work in; change this value to point the notebook at another schema.
schema_name = 'template'
# Put that schema on the connection's search path so that unqualified table
# names in later queries are looked up (and created) there.
psql.execute('SET search_path TO {}'.format(schema_name), conn)

<cursor object at 0x11939a718; closed: 0>

# Define magic functions - helpful for interacting with cluster

In [6]:
@register_cell_magic
def readsql(line, cell):
    """
    Run the cell body as a SQL query and keep the result as a DataFrame.

    The cell text is first passed through ``str.format`` with the global
    namespace this function is defined in, so ``{name}`` is replaced by
    the value of the global ``name``. Literal braces in the SQL must
    therefore be doubled (``{{`` and ``}}``). The query is then executed
    on the global connection ``conn`` and the resulting pandas DataFrame
    is rendered below the cell using IPython.display. You'd use this
    for SELECT.

    Line arguments:
        NAME    Optional. Name of the global variable the full DataFrame
                is stored in. If this is not specified, the DataFrame
                is called _df.
        -h N    Optional. Display only the first N rows. The complete
                DataFrame is still stored in the variable.

    Raises:
        Exception: if more than one DataFrame name is specified.
        ValueError: if the value given to -h is not an integer.
        getopt.GetoptError: if the options on the magic line cannot
            be parsed.
    """
    optlist, args = getopt.getopt(line.split(), 'h:')
    optdict = dict(optlist)

    # If there is more than one table name specified,
    # throw an exception.
    if len(args) > 1:
        raise Exception('More than one table name specified')

    # Convert the row limit before touching the database, so that a bad
    # -h value fails immediately instead of after the query has run.
    head_num = int(optdict['-h']) if '-h' in optdict else None

    # Store under the requested name, or fall back to _df.
    df_name = args[0] if args else '_df'

    # conn is the module-level connection object.
    result = psql.read_sql(cell.format(**globals()), conn)
    globals()[df_name] = result

    # Only the displayed view is truncated; the stored DataFrame is complete.
    display(result if head_num is None else result.head(head_num))

@register_cell_magic
def execsql(line, cell):
    """
    Run the cell body as a SQL statement that returns no result set.

    The cell text is filled in with ``str.format`` from the global
    namespace and then executed on the global connection ``conn``.
    You'd use this for CREATE/UPDATE/DELETE.
    """
    # Substitute {name} placeholders with the matching global variables.
    statement = cell.format(**globals())
    # conn is the module-level connection object.
    psql.execute(statement, conn)

@register_cell_magic
def printsql(line, cell):
    """
    Show the SQL query that will be run.

    The cell text is filled in with ``str.format`` from the global
    namespace, exactly as the other SQL magics do, and printed instead
    of being executed.
    """
    # Written as a call so the cell also runs on Python 3; with a single
    # argument the Python 2 print statement produces the same output.
    print(cell.format(**globals()))

# The magics stay registered with IPython after this; removing the plain
# function names avoids name conflicts so that automagic keeps working.
del readsql, execsql, printsql

# Autofill Table Names
One downside of interacting with a remote database from Python is that its table names are not available for tab completion. Running the cell below creates one Python object per schema, named after the schema, with each of that schema's table names stored as an attribute. That way, when we type a schema name followed by a dot, we can use tab completion to list all of its tables.

In [7]:
# Catalogue query: one row per schema, with the names of all of that
# schema's tables collected into a single array column.
sql = '''
SELECT table_schema, array_agg(table_name::TEXT) AS tables
  FROM information_schema.tables
 GROUP BY table_schema;
'''
# Run it on the open connection; the result has the columns
# table_schema and tables.
info_df = psql.read_sql(sql, conn)

class Schema(object):
    """
    Container whose attributes are the table names of one database schema.

    Every name in ``tables`` becomes an attribute whose value is that same
    name as a string, so typing ``<schema>.`` and pressing tab lists the
    tables.
    """

    def __init__(self, tables):
        """
        tables: iterable of table-name strings.
        """
        for t in tables:
            # setattr instead of exec() on generated source: names that
            # contain quotes or are not valid identifiers no longer raise
            # a SyntaxError (they stay reachable through getattr).
            setattr(self, t, t)

# Bind one Schema object per database schema to a global variable of the
# same name, e.g. `template.example_data_table`.
# The columns are read by name (table_schema, tables) and the object is
# assigned directly instead of exec()-ing a generated source line, which
# breaks on names that are not valid Python identifiers or contain quotes.
for _schema, _tables in zip(info_df['table_schema'], info_df['tables']):
    globals()[_schema] = Schema(_tables)

# Examples

## Create Tables
Here we can create tables in SQL simply by using a SQL command and putting <code>%%execsql</code> at the very top of the cell.

In [8]:
%%execsql
DROP TABLE IF EXISTS example_data_table;
CREATE TABLE example_data_table
   AS SELECT 1 AS col1,
             2 AS col2, 
             3 AS col3;
            
DROP TABLE IF EXISTS other_table;
CREATE TABLE other_table
   AS SELECT 1;

Now we can check how autocomplete works:

<img src='autofill.png' width='300px' align='left'>

Try it by typing "<code>template.</code>" below then pressing tab

In [None]:
template.

## Viewing a Table
We can now view one of the tables that we just created.

In [9]:
%%readsql
SELECT *
  FROM example_data_table;

Unnamed: 0,col1,col2,col3
0,1,2,3


By default, this will store the resulting pandas DataFrame into a variable called <code>_df</code>.

In [10]:
_df

Unnamed: 0,col1,col2,col3
0,1,2,3


### Storing the Table to a DataFrame

We can also write this to a specific DataFrame. We do this by appending the desired DataFrame name to '%%readsql'.

In [11]:
%%readsql test_df
SELECT *
  FROM example_data_table

Unnamed: 0,col1,col2,col3
0,1,2,3


In [12]:
test_df

Unnamed: 0,col1,col2,col3
0,1,2,3


Another option is to specify "-h" and a number, which will show the head of the DataFrame with the number of rows specified by the option, but store the entire DataFrame into the specified variable.

In [13]:
%%readsql -h 10 df_head
SELECT generate_series(1, 20);

Unnamed: 0,generate_series
0,1
1,2
2,3
3,4
4,5
5,6
6,7
7,8
8,9
9,10


In [14]:
df_head

Unnamed: 0,generate_series
0,1
1,2
2,3
3,4
4,5
5,6
6,7
7,8
8,9
9,10


## Using Variable Table Names

We can also use variable table or column names in our commands. We simply create a variable beforehand. This variable, when wrapped inside '{' and '}' in our magic function executions, will be replaced by its value. We can view the SQL query that will be executed by using the magic command '%%printsql'

In [15]:
table_name = 'example_data_table'

In [16]:
%%readsql
SELECT *
  FROM {table_name};

Unnamed: 0,col1,col2,col3
0,1,2,3


In [17]:
%%printsql
SELECT *
  FROM {table_name};

SELECT *
  FROM example_data_table;


In [18]:
col_name = 'col1'

In [19]:
%%readsql
SELECT {col_name}
  FROM example_data_table;

Unnamed: 0,col1
0,1


In [20]:
%%printsql
SELECT {col_name}
  FROM example_data_table;

SELECT col1
  FROM example_data_table;
