# Randomize raw tables in db to test for data leakage

Run this notebook as the first step to test for data leakage in your machine learning pipeline.  
Requires a credentials.py file defining the following variables: dbname, user, host, password

In [None]:
import sys
import os
import math
import warnings
from urllib.parse import quote

import pandas as pd
import numpy as np
import psycopg2
from sqlalchemy import create_engine
from IPython.core.interactiveshell import InteractiveShell

from credentials import dbname, user, host, password

# Report each distinct warning only the first time it is raised.
warnings.filterwarnings("once")
# Have IPython display every top-level expression in a cell, not just the last one.
InteractiveShell.ast_node_interactivity = "all"

In [None]:
def execute_sql(statement, dbname, user, host, password, isolation = False, results = True):
    """
    Use psycopg2 to execute a PostgreSQL statement on a fresh connection.

    The connection is opened for this single statement, committed on success
    (so DDL/DML run without autocommit is persisted), and always closed, even
    when the statement raises.

    Input:
        statement (str): SQL statement to run in database
        dbname, user, host, password (str): database credentials
        isolation (bool): indicator for whether to change isolation level to autocommit; True for queries that cannot be run
            from within a transaction (see https://wiki.postgresql.org/wiki/Psycopg2_Tutorial)
        results (bool): indicator for whether the query is expected to output results;
            for example, True for SELECT statements and False for CREATE TABLE statements

    Output:
        relation (dataframe): query results or empty dataframe if results = False

    Raises:
        Any exception raised by psycopg2 while connecting, executing or fetching
        is propagated unchanged; the transaction is not committed in that case.
    """
    # Keyword arguments avoid building a space-delimited DSN string, which
    # breaks when a credential contains spaces or quotes.
    conn = psycopg2.connect(dbname=dbname, user=user, host=host, password=password)
    relation = pd.DataFrame()
    try:
        if isolation:
            conn.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT)
        cur = conn.cursor()
        try:
            cur.execute(statement)
            if results:
                rows = cur.fetchall()
                colnames = [desc[0] for desc in cur.description]
                relation = pd.DataFrame(rows, columns=colnames)
        finally:
            cur.close()
        if not isolation:
            # Without an explicit commit, closing the connection discards the
            # pending transaction and the statement's effects are lost.
            conn.commit()
    finally:
        conn.close()
    return relation


In [None]:
def randomize(df, do_not_randomize = None, seed = None):
    """
    Randomize the column values of a dataframe. Each column is permuted
    independently, so values stay within their column but row-wise
    relationships between columns are broken. The input dataframe is not modified.

    Inputs:
        df (dataframe): dataframe to randomize
        do_not_randomize (list): optional list of strings indicating names of
            columns that should not be randomized; a single column name given
            as a string is treated as a one-element list
        seed (int): optional seed for reproducible permutations; when None
            (default) numpy's global random state is used
    Outputs:
        df_random (dataframe): copy of df with the selected columns permuted
    """
    df_random = df.copy()

    if not do_not_randomize:
        protected = []
    elif isinstance(do_not_randomize, str):
        # A bare string would otherwise be matched by substring ("id" in "officer_id").
        protected = [do_not_randomize]
    else:
        protected = list(do_not_randomize)

    cols = [c for c in df.columns if c not in protected]

    # A dedicated RandomState keeps a seeded run independent of the global state;
    # the unseeded default still draws from np.random.
    rng = np.random if seed is None else np.random.RandomState(seed)

    for col in cols:
        df_random[col] = rng.permutation(df_random[col])

    return df_random

Pull the information schema from the selected database.  

In [None]:
# List every table the connected user can see, across all schemas.
statement = "SELECT * FROM information_schema.tables;"
tables = execute_sql(
    statement=statement,
    dbname=dbname,
    user=user,
    host=host,
    password=password,
)

Select the schema containing raw datasets to be randomized. For this tutorial, the schema is named etl.

In [None]:
# Keep only the rows describing tables that live in the etl schema.
in_etl_schema = tables["table_schema"] == 'etl'
etl = tables[in_etl_schema]
etl.head()

First, create the etl_randomized schema if it doesn't exist yet.

In [None]:
# Run in autocommit mode so the schema creation is persisted; a DDL statement
# left in an uncommitted transaction is discarded when the connection closes.
statement = "CREATE SCHEMA IF NOT EXISTS etl_randomized;"
output = execute_sql(statement, dbname, user, host, password, isolation = True, results = False)

Randomize every table in the etl schema and write the output to the etl_randomized schema. Tables that already contain data in etl_randomized are skipped.

In [None]:
# Build the SQLAlchemy engine once for all uploads. Credentials are URL-escaped
# so characters such as '@' or '/' in the password do not corrupt the URL.
engine = create_engine('postgresql://{}:{}@{}/{}'.format(quote(user, safe=''), quote(password, safe=''), host, dbname))

for table_name in etl.table_name:
    print("Working on table {}".format(table_name))

    # Double-quote the identifier so mixed-case or special-character table
    # names (as reported by information_schema) resolve correctly.
    quoted_name = '"{}"'.format(table_name.replace('"', '""'))

    # Make a new, empty table with the same columns in etl_randomized schema
    statement = "CREATE TABLE IF NOT EXISTS etl_randomized.{} (LIKE etl.{});".format(quoted_name, quoted_name)
    output = execute_sql(statement, dbname, user, host, password, isolation = True, results = False)

    # Check the target before pulling anything, so tables that are already
    # populated are skipped without downloading and shuffling them first.
    statement = "SELECT COUNT(*) FROM etl_randomized.{};".format(quoted_name)
    output = execute_sql(statement, dbname, user, host, password)
    if output.iloc[0,0]>0: #  do nothing if new table already contains data
        print("\t*****SKIPPING TABLE {} -- it already has data".format(table_name))
        continue

    # Pull the table from original schema
    print("\tPulling table")
    statement = "SELECT * FROM etl.{};".format(quoted_name)
    table = execute_sql(statement, dbname, user, host, password)

    # Randomize the table
    print("\tRandomizing")
    randomized_table = randomize(table)

    # Write results into new table
    print("\tUploading randomized version")
    randomized_table.to_sql(table_name, engine, schema = 'etl_randomized', index = False, if_exists='append')

# Release the pooled connections held by the engine.
engine.dispose()

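Spot check a few tables to make sure they're randomized: pull one row from the randomized table, then look up the row with the same identifier in the original table. The remaining column values should not match.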

In [None]:
# Fetch a single row from the randomized arrests table.
statement = "SELECT * FROM etl_randomized.arrests LIMIT 1;"
output = execute_sql(
    statement=statement,
    dbname=dbname,
    user=user,
    host=host,
    password=password,
)
output

In [None]:
# Look up one incident in the original arrests table for comparison.
# NOTE(review): the incident_no literal was presumably copied from a randomized
# row shown by an earlier run -- confirm and update after re-running.
statement = "SELECT * FROM etl.arrests WHERE incident_no = '130856668.0';"
output = execute_sql(
    statement=statement,
    dbname=dbname,
    user=user,
    host=host,
    password=password,
)
output

In [None]:
# Fetch a single row from the randomized eis_complete table.
statement = "SELECT * FROM etl_randomized.eis_complete LIMIT 1;"
output = execute_sql(
    statement=statement,
    dbname=dbname,
    user=user,
    host=host,
    password=password,
)
output

In [None]:
# Look up one row in the original eis_complete table for comparison.
# NOTE(review): the index literal was presumably copied from a randomized row
# shown by an earlier run -- confirm and update after re-running.
statement = "SELECT * FROM etl.eis_complete WHERE index = '1553';"
output = execute_sql(
    statement=statement,
    dbname=dbname,
    user=user,
    host=host,
    password=password,
)
output