# Test for data leakage in pipeline

Run this notebook to test for data leakage in your machine learning pipeline.  
Requires a credentials.py file defining the following variables: dbname, user, host, password

In [1]:
import sys
import os
import math
import warnings

import pandas as pd
import numpy as np
import psycopg2
from sqlalchemy import create_engine
from IPython.core.interactiveshell import InteractiveShell

from credentials import dbname, user, host, password

warnings.filterwarnings(action='once')
InteractiveShell.ast_node_interactivity = "all"

  """)


In [2]:
def execute_sql(statement, dbname, user, host, password, results = True):
    """
    Use psycopg2 to execute PostgreSQL queries
    
    Input:
        statement (str): SQL statement to run in database
        dbname, user, host, password (str): database credentials
        
    Output (if exists):
        relation (dataframe): query results
    """
    conn = psycopg2.connect("dbname={} user={} host={} password={}".format(dbname, user, host, password))
    cur = conn.cursor()
    cur.execute(statement)
    relation = pd.DataFrame()
    if results:
        results = cur.fetchall()
        colnames = [desc[0] for desc in cur.description]
        relation = pd.DataFrame(results, columns=colnames)
    cur.close()
    conn.close()
    return relation


In [3]:
def randomize(df, do_not_randomize = None):
    """
    Randomize column values of a file. Each column is randomized independently.
    
    Inputs:
        df (dataframe): dataframe to randomize
        do_not_randomize (list): optional list of strings indicating names of 
            columns that should not be randomized
    Outputs:
        df (dataframe): dataframe of randomized data
    """     
    df_random = df.copy()
    if do_not_randomize:
        cols = [c for c in df.columns if c not in do_not_randomize]
    else:
        cols = df.columns
        
    for col in cols:
        #print('\t\tRandomizing column ' + col)
        df_random[col] = np.random.permutation(df_random[col])

    return df_random

Pull the information schema from selected database.  

In [4]:
statement = "SELECT * FROM information_schema.tables;"
tables = execute_sql(statement, dbname, user, host, password)

Select the schema containing raw datasets to be randomized. For this tutorial, the schema is named etl.

In [5]:
etl = tables[tables.table_schema == 'etl']
etl.head()

Unnamed: 0,table_catalog,table_schema,table_name,table_type,self_referencing_column_name,reference_generation,user_defined_type_catalog,user_defined_type_schema,user_defined_type_name,is_insertable_into,is_typed,commit_action
15,san_francisco_eis,etl,arrests,BASE TABLE,,,,,,YES,NO,
16,san_francisco_eis,etl,occ_incidents,BASE TABLE,,,,,,YES,NO,
160,san_francisco_eis,etl,uof_employyee,BASE TABLE,,,,,,YES,NO,
193,san_francisco_eis,etl,uof_employee_subjectlink,BASE TABLE,,,,,,YES,NO,
212,san_francisco_eis,etl,dem_id,BASE TABLE,,,,,,YES,NO,


First created the etl_randomized schema if it doesn't yet exist

In [6]:
statement = "CREATE SCHEMA IF NOT EXISTS etl_randomized;"
output = execute_sql(statement, dbname, user, host, password, results = False)

Randomize every table in etl and write output to etl_randomized

In [14]:
for table_name in etl.table_name:
    # Pull the table from original schema
    print("Pulling table {}".format(table_name))
    statement = "SELECT * FROM etl.{};".format(table_name)
    table = execute_sql(statement, dbname, user, host, password)
    
    # Randomize the table
    print("\tRandomizing table {}".format(table_name))
    randomized_table = randomize(table)
    
    # Make a new table in etl_randomized schema
    statement = "CREATE TABLE IF NOT EXISTS etl_randomized.{} (LIKE etl.{});".format(table_name, table_name)
    output = execute_sql(statement, dbname, user, host, password, results = False)
    
    # Write results into new table
    statement = "SELECT COUNT(*) FROM etl_randomized.{};".format(table_name)
    output = execute_sql(statement, dbname, user, host, password)
    if output.iloc[0]>0: #  do nothing if new table already contains data
        print("\t*****SKIPPING TABLE {} -- it already has data".format(table_name))
    else:
        engine = create_engine('postgresql://{}:{}@{}/{}'.format(user, password, host, dbname))
        randomized_table.to_sql(table_name, engine, schema = 'etl_randomized', index = False, if_exists='append')

Pulling table arrests
	Randomizing table arrests


ProgrammingError: relation "etl_randomized.arrests" does not exist
LINE 1: SELECT COUNT(*) FROM etl_randomized.arrests;
                             ^


In [15]:
statement = "CREATE TABLE IF NOT EXISTS etl_randomized.{} (LIKE etl.{});".format(table_name, table_name)
output = execute_sql(statement, dbname, user, host, password, results = False)