Links to other notebooks in the same folder:

<nav class = "navbar navbar-light bg-faded">
    <ul class = "nav navbar-nav">
        <li class = "">
            <a class = "nav-link">notebook1</a>
        </li>
        <li class = "">
            <a class = "nav-link">notebook2</a>
        </li>
        <li class = "">
            <a class = "nav-link">notebook3</a>
        </li>
    </ul>
</nav>

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Import-Useful-Libraries" data-toc-modified-id="Import-Useful-Libraries-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Import Useful Libraries</a></span></li><li><span><a href="#Connect-to-Database" data-toc-modified-id="Connect-to-Database-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Connect to Database</a></span></li><li><span><a href="#Autofill-Table-Names" data-toc-modified-id="Autofill-Table-Names-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Autofill Table Names</a></span></li><li><span><a href="#Thread-Manager" data-toc-modified-id="Thread-Manager-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Thread Manager</a></span></li><li><span><a href="#Magic-Functions-Useful-for-Interacting-with-the-Cluster" data-toc-modified-id="Magic-Functions-Useful-for-Interacting-with-the-Cluster-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Magic Functions Useful for Interacting with the Cluster</a></span></li></ul></div>

# Import Useful Libraries

In [None]:
from datetime import date, datetime, time, timedelta
from dateutil.relativedelta import relativedelta
from functools import reduce
import getopt
from itertools import chain, zip_longest, product
import os
import re
import sys
from textwrap import dedent
import threading
import time
import urllib

import graphviz
from IPython.core.display import display, Image, HTML
from IPython.core.magic import register_cell_magic,\
                               register_line_cell_magic,\
                               register_line_magic
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pandas.io.sql as psql
import psycopg2
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import ElasticNet, LinearRegression,\
                                 LogisticRegression
from sklearn.metrics import roc_auc_score, roc_curve, precision_recall_curve
from sklearn.model_selection import KFold, train_test_split
from sklearn.naive_bayes import BernoulliNB, GaussianNB, MultinomialNB
from sklearn.svm import OneClassSVM, SVC, SVR
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor,\
                         export_graphviz,\
                         ExtraTreeClassifier, ExtraTreeRegressor
from sqlalchemy import create_engine, Column, MetaData, Table
from sqlalchemy import all_, and_, any_, not_, or_
from sqlalchemy import alias, between, case, cast, column, distinct, extract,\
                       false, func, intersect, literal, literal_column,\
                       select, text, true, union, union_all
from sqlalchemy import BigInteger, Boolean, Date, DateTime, Integer, Float,\
                       Numeric, String
from sqlalchemy.dialects.postgresql import aggregate_order_by
from treeinterpreter import treeinterpreter as ti
import pydotplus

# Credentials file to connect to database
import credentials


%run mpp-plotting/mpp_plotting
%run python-utils/plotting_utils
%run python-utils/ml_utils
%run python-utils/sql_utils

In [None]:
# Widen the notebook container so wide output is easier to read
display(HTML('<style>.container {width:70% !important;}</style>'))

# Default matplotlib figure size, line width and font sizes
plt.rcParams.update({
    'figure.figsize': (10, 7),
    'lines.linewidth': 3,
    'figure.titlesize': 26,
    'axes.labelsize': 18,
    'axes.titlesize': 22,
    'xtick.labelsize': 14,
    'ytick.labelsize': 14,
    'legend.fontsize': 16,
})

# Seaborn theme, default palette and named colour shortcuts
sns.set_style('darkgrid')
sns.set_palette('colorblind')
# NOTE(review): this unpacking needs the palette to contain exactly six
# colours -- confirm that holds for the installed seaborn version.
blue, green, red, purple, yellow, cyan = sns.color_palette('colorblind')
black, white = (0, 0, 0), (1, 1, 1)

# Connect to Database

In [None]:
# Raw psycopg2 connection, with autocommit switched on
conn = psycopg2.connect(**credentials.login_info_dict)
conn.autocommit = True

# SQLAlchemy engine and metadata built from the same credentials
engine_str = 'postgresql://{user}:{password}@{host}/{dbname}'.format(
    **credentials.login_info_dict
)
engine = create_engine(engine_str)
metadata = MetaData(engine)

# Point the psycopg2 session's search path at the working schema
schema_name = 'template'
psql.execute(f'SET search_path TO {schema_name}', conn)

# Autofill Table Names
One downside of interacting with a remote database in Python is that table names are not available for tab completion. When `refresh_tables` is run, it creates a global object for each schema, with one attribute per table in that schema. That way, when we type a schema name, we can use tab completion to list all of its tables.

In [None]:
class Schema:
    """Namespace whose attributes are the table names of one schema.

    Every name in ``tables`` becomes an attribute holding that same name
    as a string, so typing ``<schema>.`` followed by tab lists the tables.

    Args:
        tables: iterable of table-name strings.
    """

    def __init__(self, tables):
        # setattr rather than exec'ing generated source: a name containing
        # quotes or dashes, or one that is a Python keyword, cannot be
        # pasted into an assignment statement. Such names stay reachable
        # through getattr().
        for t in tables:
            setattr(self, t, t)

def refresh_tables(conn):
    """Refreshes the auto-fill tables.

    Queries ``information_schema.tables`` and, for every schema returned,
    binds a module-level global named after the schema to a ``Schema``
    object built from that schema's table names. An existing global with
    the same name is overwritten.

    Args:
        conn: database connection handed to ``psql.read_sql``.
    """
    sql = '''
    SELECT table_schema, array_agg(table_name::TEXT) AS tables
      FROM information_schema.tables
     GROUP BY table_schema;
    '''
    info_df = psql.read_sql(sql, conn)

    # Write into the module namespace directly instead of exec'ing a
    # generated "global x; x = Schema([...])" snippet, which breaks on
    # schema names that are not valid Python identifiers.
    namespace = globals()
    for schema, tables in zip(info_df['table_schema'], info_df['tables']):
        namespace[schema] = Schema(tables)

# Thread Manager
This class provides a framework to manage concurrent threads. The `%%background` magic function will automatically call the `ThreadManager` when opening and closing threads. We can use the `ThreadManager` to view all current and past threads, including start and finish times and any comments associated with the threads.

In [None]:
class ThreadManager:
    """Keeps a record of the threads started by the ``%%background`` magic.

    Each thread is one row of ``self.thread_df``, indexed by its thread ID,
    with the columns ``start_time``, ``finish_time``, ``exec_time``,
    ``cell_text``, ``comment`` and ``error_message``. An empty string in
    ``finish_time`` or ``error_message`` means "not set".

    ``finish_thread`` and ``raise_thread_error`` are called from worker
    threads while the main thread may be adding or inspecting rows, so
    every access to ``thread_df`` is guarded by a re-entrant lock.
    """

    def __init__(self):
        df_cols = ['start_time', 'finish_time', 'exec_time',
                   'cell_text', 'comment', 'error_message']
        self.thread_df = pd.DataFrame(columns=df_cols)
        # Re-entrant because public methods call one another while
        # already holding the lock.
        self._lock = threading.RLock()

    def __call__(self, num_rows=5):
        """Returns the num_rows most recent threads (Default: 5)"""
        with self._lock:
            # Update exec time if thread has yet to finish
            for ix in self.thread_df.index:
                if self.thread_df.loc[ix, 'finish_time'] == '':
                    start_time = self.thread_df.loc[ix, 'start_time']
                    time_elapsed = datetime.now() - start_time

                    self.thread_df.loc[ix, 'exec_time'] = time_elapsed
            # Newest thread first
            return self.thread_df.tail(num_rows).iloc[::-1]

    def _add_finish_times(self, thread_id):
        """Adds the finish time and exec time to thread_df.

        Returns: a tuple of finish time, exec time
        """
        with self._lock:
            # Set finish time
            finish_time = datetime.now()
            self.thread_df.loc[thread_id, 'finish_time'] = finish_time

            # Set execution time
            start_time = self.thread_df.loc[thread_id, 'start_time']
            exec_time = finish_time - start_time
            self.thread_df.loc[thread_id, 'exec_time'] = exec_time

        return finish_time, exec_time

    def get_error_threads(self):
        """Returns a DataFrame of threads which threw an error."""
        with self._lock:
            return self.thread_df[self.thread_df['error_message'] != '']

    def get_finished_threads(self):
        """Returns a DataFrame of the finished threads."""
        with self._lock:
            return self.thread_df[self.thread_df['finish_time'] != '']

    def get_next_thread_id(self):
        """Returns an integer representing the ID for the next thread."""
        with self._lock:
            return self.thread_df.shape[0]

    def get_unfinished_threads(self):
        """Returns a DataFrame of the unfinished threads."""
        with self._lock:
            return self.thread_df[self.thread_df['finish_time'] == '']

    def add_thread(self, cell_text, comment='N/A', hide_output=False):
        """Adds a new background thread.

        Args:
            cell_text: source code the thread will run.
            comment: label stored alongside the thread.
            hide_output: when True, the "Started Thread" message is not
                printed.

        Returns: the ID assigned to the new thread.
        """
        start_time = datetime.now()
        # Allocate the ID and insert the row under one lock acquisition
        # so two callers cannot be handed the same ID.
        with self._lock:
            thread_id = self.get_next_thread_id()
            self.thread_df.loc[thread_id] = [start_time, '', '',
                                             cell_text, comment, '']
        if not hide_output:
            print('Started Thread {} at {}.\nComment: {}'
                      .format(thread_id, start_time, comment))
        return thread_id

    def finish_thread(self, thread_id, hide_output=False):
        """Completes a thread.

        Args:
            thread_id: ID of the thread to mark as finished.
            hide_output: when True, the "Finished Thread" message is not
                printed.

        Raises:
            Exception: if the thread already has a finish time.
        """
        with self._lock:
            if self.thread_df.loc[thread_id, 'finish_time'] == '':
                finish_time, exec_time = self._add_finish_times(thread_id)
                comment = str(self.thread_df.loc[thread_id, 'comment'])
            else:
                raise Exception('Cannot finish an already completed thread.')

        # Print comment
        if not hide_output:
            print('Finished Thread {} at {}.\nDone in {}.\nComment: {}'
                      .format(thread_id, finish_time, exec_time, comment))

    def raise_thread_error(self, thread_id, error_message):
        """Records that a thread stopped because of an error.

        Sets the finish and exec times and stores
        ``'Exception: <error_message>'`` in the ``error_message`` column.
        Does nothing if the thread already has a finish time.
        """
        with self._lock:
            if self.thread_df.loc[thread_id, 'finish_time'] == '':
                exception_message = 'Exception: {}'.format(error_message)
                self._add_finish_times(thread_id)
                self.thread_df.loc[thread_id, 'error_message'] =\
                    exception_message


thread_manager = ThreadManager()

# Magic Functions Useful for Interacting with the Cluster
These functions allow us to write and run raw SQL in a cell with the magic function at the top.

In [None]:
def _format_sql_cell(cell, optdict):
    """Fills ``{name}`` placeholders in a SQL cell from the notebook globals.

    If ``'-i'`` is in ``optdict`` the cell is returned untouched. If the
    cell contains ``'$'`` characters (as in a dollar-quoted PL/Python
    function body), only the text before the first ``'$'`` and after the
    last one is formatted, so the function body is left alone. Otherwise
    the whole cell is formatted.

    Returns: the SQL text to run.
    """
    if '-i' in optdict:
        return cell

    split_cell = cell.split('$')
    if len(split_cell) > 1:
        split_cell[0] = split_cell[0].format(**globals())
        split_cell[-1] = split_cell[-1].format(**globals())
        return '$'.join(split_cell)
    return cell.format(**globals())


@register_cell_magic
def readsql(line, cell):
    """
    Extract the code in the specific cell (should be valid SQL), and
    execute it using the connection object to the backend database.
    The resulting pandas DataFrame is rendered inline below the cell
    using IPython.display. You'd use this for SELECT.

    Returns a DataFrame with the name specified in the magic function.
    If this is not specified, then the DataFrame is called _df.

    Options:
        -h N  Show only the first N rows of the DataFrame. With N equal
              to 0 nothing is displayed.
        -i    Do not apply string formatting to the cell.

    Raises an Exception if more than one table name is given.
    """

    optlist, args = getopt.getopt(line.split(), 'ih:')
    optdict = dict(optlist)
    # If '-h' tag is specified, set the number of rows to display
    head_num = int(optdict['-h']) if '-h' in optdict else None

    cell = _format_sql_cell(cell, optdict)

    # If there is more than one table name specified,
    # throw an exception.
    if len(args) > 1:
        raise Exception('More than one table name specified.')

    # Store the result under the given name, or as _df by default
    table_name = args[0] if args else '_df'
    result = psql.read_sql(cell, conn)
    globals()[table_name] = result

    if head_num is None:
        display(result)
    elif head_num != 0:
        display(result.head(head_num))

    refresh_tables(conn)


@register_cell_magic
def execsql(line, cell):
    """
    Extract the code in the specific cell (should be valid SQL), and
    execute it using the connection object to the backend database.
    You'd use this for CREATE/UPDATE/DELETE.

    Options:
        -i    Do not apply string formatting to the cell.
    """

    optlist, args = getopt.getopt(line.split(), 'ih:')
    optdict = dict(optlist)

    psql.execute(_format_sql_cell(cell, optdict), conn)
    refresh_tables(conn)


@register_cell_magic
def printsql(line, cell):
    """Show the SQL query that will be run.

    Options:
        -i    Do not apply string formatting to the cell.
    """

    optlist, args = getopt.getopt(line.split(), 'ih:')
    optdict = dict(optlist)

    print(_format_sql_cell(cell, optdict))


# We delete these to avoid name conflicts for automagic to work
del execsql, readsql, printsql

In [None]:
@register_cell_magic
def background(line, cell):
    """Runs whatever is in the cell in a separate thread. This allows
    the user to run cells in the background so that additional cells
    can be run concurrently. This will also micromanage by labelling
    each thread with an ID number.

    Whatever follows after specifying '%%background' will be used as a
    comment to label the process if the ID number is not descriptive
    enough. Passing '-h' or '--hide' suppresses the start/finish
    messages.

    Lines starting with two or more '#' characters split the cell into
    sections; each section runs in its own thread and the text after
    the '#' characters becomes that thread's sub-comment.

    A section containing a for loop whose header line ends with two or
    more '#' characters is unrolled: the code before the loop runs
    immediately in the foreground, then each iteration of the loop body
    is started as its own thread.
    """

    # A for loop header followed, on the same line, by two or more '#'
    # characters. Groups: (loop header, loop comment).
    bg_for_loop_pattern = r'(for .+ in .*:)\s*#{2,}\s*([\w ]*)\n'

    def is_useful_code(code):
        """Returns True if code is useful to use. Code that is
        considered not useful is code that contains only commented
        lines or blank lines.
        """
        stripped_lines = (raw_line.strip() for raw_line in code.split('\n'))
        return any(stripped != '' and not stripped.startswith('#')
                   for stripped in stripped_lines)

    def run_cell(cell_value, thread_id, hide_output):
        """Thread target: runs the code and reports to the thread manager."""
        try:
            exec(cell_value, globals())
        except Exception as error:
            thread_manager.raise_thread_error(thread_id, error)
            # Re-raise the original exception so its type and traceback
            # are what the thread reports.
            raise

        thread_manager.finish_thread(thread_id, hide_output)

    def get_thread_comment(line, sub_comment):
        """Gets the final thread comment by looking at the main comment
        line and the sub-comments.
        """

        # Has main comment if the length of the line is greater than 0
        # or if the line is not all spaces
        has_main_comment = len(line) > 0 and not bool(re.match('^( )+$', line))

        # Has sub comment if it is not None and if it is not equal to
        # the blank string
        has_sub_comment = sub_comment is not None and sub_comment != ''

        if has_main_comment and has_sub_comment:
            return '{} - {}'.format(line, sub_comment)
        if has_main_comment:
            return line
        if has_sub_comment:
            return sub_comment
        return 'N/A'

    def run_in_background(sub_cell, sub_comment, line_comment, hide_output):
        """Run the cell in background and update the thread manager."""
        thread_id = thread_manager.get_next_thread_id()

        # Add thread to thread manager
        thread_comment = get_thread_comment(line_comment, sub_comment)

        thread_manager.add_thread(cell_text=sub_cell,
                                  comment=thread_comment,
                                  hide_output=hide_output
                                 )

        # Run the thread in the background
        thread = threading.Thread(target=run_cell,
                                  args=(sub_cell, thread_id, hide_output)
                                 )
        thread.start()

    def _assign_iterable_from_for_loop(for_loop_lines):
        """Evaluates the iterable of the for loop and returns it.

        The value is also left in the global ``_iterable``.
        """
        range_extract_pattern = 'for .+ in (.*):'
        iterable_str = re.findall(range_extract_pattern, for_loop_lines)[0]

        # Assign iterable based on the for loop string
        exec(f'_iterable = {iterable_str}', globals())
        return globals()['_iterable']

    def _assign_var(for_loop_lines, item):
        """Assigns the loop variable(s) from an item of the iterable.

        Returns: a string of the form '<target> = <item>' used to label
        the thread.
        """
        var_extract_pattern = 'for (.+) in .*:'
        var_str = re.findall(var_extract_pattern, for_loop_lines)[0]

        # Bind the real object through a temporary global rather than
        # pasting str(item) into source code: that breaks on strings
        # containing quotes and on values whose str() is not a valid
        # literal. Executing the assignment keeps tuple targets such
        # as "a, b" working.
        namespace = globals()
        namespace['_background_item'] = item
        try:
            exec(f'{var_str} = _background_item', namespace)
        finally:
            namespace.pop('_background_item', None)

        if isinstance(item, str):
            return f"{var_str} = '{item}'"
        return f'{var_str} = {item}'

    def split_into_sections(cell):
        """Splits the cell into (code, comment) pairs.

        Sections are separated by lines that start with two or more '#'
        characters; the rest of that line, minus leading spaces, is the
        comment of the section that follows. Code before the first
        separator has the comment None. The pairing is done before
        comment-only sections are dropped so comments stay attached to
        the right code.
        """
        pieces = re.split('(?:^#{2,}|\n#{2,})(.*)\n', cell)

        # With one capturing group, re.split alternates code and headers:
        # [code0, header1, code1, header2, code2, ...]
        codes = pieces[0::2]
        comments = [None] + [header.lstrip(' ') for header in pieces[1::2]]

        return [(code, comment)
                    for code, comment in zip(codes, comments)
                        if is_useful_code(code)]

    def extract_cell_contents(sub_cell):
        """Extracts the parts of a section around its first flagged loop.

        Returns: a tuple of pre-loop code, loop header, loop comment and
        loop body with one level (four characters) of indentation removed.
        """
        pre_loop_contents, for_loop_lines, for_loop_comment, loop_contents =\
            re.split(bg_for_loop_pattern, sub_cell)[:4]

        # Remove beginning white space
        loop_contents = re.sub(r'^\s{4}', '', loop_contents)
        # Remove white space at the beginning of each line
        loop_contents = re.sub(r'\n\s{4}', '\n', loop_contents)

        return (pre_loop_contents, for_loop_lines, for_loop_comment,
                loop_contents)


    # Parse line option arguments
    optlist, args = getopt.getopt(line.split(), 'h', ['hide'])
    optdict = dict(optlist)

    hide_output = '-h' in optdict or '--hide' in optdict

    # Join the arguments to form the line comment
    line_comment = ' '.join(args)

    for sub_cell, sub_comment in split_into_sections(cell):
        if not re.search(bg_for_loop_pattern, sub_cell):
            run_in_background(sub_cell, sub_comment, line_comment, hide_output)
            continue

        # Work on this section only, so code from other sections is not
        # executed a second time and the loop found is this section's.
        pre_loop_contents, for_loop_lines, for_loop_comment, loop_contents =\
            extract_cell_contents(sub_cell)

        # Execute cell contents that come before the loop
        exec(pre_loop_contents, globals())

        # NOTE(review): every iteration shares the same global loop
        # variable and thread.start() returns immediately, so a thread
        # that reads the variable late may see a later iteration's value.
        for item in _assign_iterable_from_for_loop(for_loop_lines):
            # Assign the for loop iterating variable before running
            # each iteration of the loop.
            var_comment = _assign_var(for_loop_lines, item)
            loop_sub_comment = f'{for_loop_comment} ({var_comment})'

            run_in_background(loop_contents, loop_sub_comment, line_comment,
                              hide_output)


# We delete these to avoid name conflicts for automagic to work
del background