# Leakage Detection Tests

A series of tests for leakage, demonstrated on the [Los Angeles Chronic Offenders Leakage](https://github.com/dssg/la_prosecutor) project  
Requires a credentials.py file defining the following variables: dbname, user, host, password, port

In [1]:
import sys
import os
import math
import warnings

import pandas as pd
import numpy as np
import psycopg2
from sqlalchemy import create_engine
from IPython.core.interactiveshell import InteractiveShell

from credentials import dbname, user, host, password, port

pd.set_option('display.max_rows', 500)
warnings.filterwarnings(action='once')
InteractiveShell.ast_node_interactivity = "all"

In [2]:
def execute_sql(statement, dbname, user, host, password, port, isolation = False, results = True):
    """
    Use psycopg2 to execute PostgreSQL queries
    
    Input:
        statement (str): SQL statement to run in database
        dbname, user, host, password (str): database credentials
        isolation (bool): indicator for whether to change isolation level to autocommit; True for queries that cannot be run 
            from within a transation (see https://wiki.postgresql.org/wiki/Psycopg2_Tutorial)
        results (bool): indicator for whether the query is expected to output results;
            for example, True for SELECT statements and False for CREATE TABLE statements
    
    Output:
        relation (dataframe): query results or empty dataframe if results = False
    """
    conn = psycopg2.connect("dbname={} user={} host={} password={}".format(dbname, user, host, password))
    cur = conn.cursor()
    if isolation:
        conn.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT)
    cur.execute(statement)
    relation = pd.DataFrame()
    if results:
        results = cur.fetchall()
        colnames = [desc[0] for desc in cur.description]
        relation = pd.DataFrame(results, columns=colnames)
    cur.close()
    conn.close()
    return relation


In [3]:
def randomize(df, do_not_randomize = None):
    """
    Randomize column values of a file. Each column is randomized independently.
    
    Inputs:
        df (dataframe): dataframe to randomize
        do_not_randomize (list): optional list of strings indicating names of 
            columns that should not be randomized
    Outputs:
        df (dataframe): dataframe of randomized data
    """     
    df_random = df.copy()
    if do_not_randomize:
        cols = [c for c in df.columns if c not in do_not_randomize]
    else:
        cols = df.columns
        
    for col in cols:
        #print('\t\tRandomizing column ' + col)
        df_random[col] = np.random.permutation(df_random[col])

    return df_random

In [4]:
# source: https://overlaid.net/2016/02/08/replace-words-in-files-or-strings-using-python/
def do_replacement(base_text, word_map):
    """
    Helper function for replace_words_in_file. Make replacements in base_text, as 
    indicated in  word_map.
    """
    for key, val in word_map.items():
        base_text = base_text.replace(key, val)
    return base_text

def replace_words_in_file(read_from, write_to, word_map):
    """
    Create copy of a file with certain words replaced
    
    Inputs
        read_from: name of file to read from
        write_to: name of new file to be created
        word_map: dictionary of mappings between words and their replacements
            (e.g. {'old_word': 'new_word'})
    Outputs
        None. Will create a new file with the name given in write_to
        
    """
    print('Generating file ', write_to)
    # Open your desired file as 't' and read the lines into string 'tempstr'
    t = open(read_from, 'r')
    tempstr = t.read()
    t.close()

    # Using the "replace_words" function, we'll pass in our tempstr to be used as the base, 
    # and our device_values to be used as replacement.  
    output = do_replacement(tempstr, word_map)

    # Write out the new config file
    fout = open(write_to, 'w')
    fout.write(output)
    fout.close()

In [5]:
def get_new_filename(filename, suffix, replacement_map=None):
    """
    Output a new filename (str) given a filename and suffix to append. Assumes file extension at the end is separated by a period.
    """
    k = filename.rfind(".")
    new_filename = filename[:k] + suffix + '.' + filename[k+1:]
    if replacement_map:
        new_filename = do_replacement(new_filename, replacement_map)
    return new_filename

## I. Randomize the input data

A number of preprocessing steps were required for this project, because raw cases and bookings data did not come with unique identifiers for the individuals involved. As such, we cannot randomize the raw data because we would be unable to identify individuals in the results. Randomization must be done after entity linkage but before features are built.

Pull the information schema from selected database.

In [7]:
# name of the database schema containing input data to be randomized
INPUT_SCHEMA = 'staging' 

In [9]:
statement = "SELECT * FROM information_schema.tables;"
tables = execute_sql(statement, dbname, user, host, password, port)

Select the schema containing input datasets to be randomized. For this project, the schema is named staging

In [10]:
input_tables = tables.table_name[tables.table_schema == INPUT_SCHEMA]
input_tables = list(input_tables)

First created the _randomized schema if it doesn't yet exist

In [20]:
statement = "CREATE SCHEMA IF NOT EXISTS {}_randomized;".format(INPUT_SCHEMA)
output = execute_sql(statement, dbname, user, host, password, port, isolation=True, results = False)

Copy over all the tables from original input schema, i.e. all tables in input_tables list.

Randomize only a subset of these tables. Tables to be randomized are:    
1) The set of tables from the staging schema that are referenced in the from\_obj section of each of the yaml files in /config/features. These staging tables have been linked to entity identifiers and are the source data for feature generation.  
2) The two tables from the staging schema that are referenced in the experiment config file, used for label generation and state management. IS THIS NECESSARY?

Note that table names beginning with "staging.entity_" are not included in the list because they contain mappings between entity identifiers and other data. These relationships must be maintained so should not be randomized.

Write output to _randomized schema. 

In [11]:
input_tables

['addinfo',
 'booking',
 'booking_addl_info',
 'branch_lkup',
 'case_booking',
 'case_dispo_lkup',
 'case_flag_lkup',
 'case_result_lkup',
 'cases',
 'casesaka',
 'casescharge',
 'casesflag',
 'ccms_to_ucc',
 'charge_categories',
 'charge_lkup',
 'citation',
 'docass',
 'entity_address',
 'entity_aka',
 'entity_all_events',
 'entity_any_name',
 'entity_best_demos',
 'entity_booking',
 'entity_case',
 'entity_cii',
 'entity_dmv',
 'entity_dob',
 'entity_gender',
 'entity_main_number',
 'entity_name',
 'entity_race',
 'feature_case_info',
 'feature_case_results',
 'feature_casesflag_info',
 'feature_charge_info',
 'firstname_gender',
 'firstname_race',
 'labels_1prior_win6mo_lab6mo',
 'labels_casesonly_multiprior_win6mo_lab6mo',
 'labels_multiprior_win6mo_lab6mo',
 'lastname_race',
 'orig_agency_lkup',
 'prob',
 'probcon',
 'states_win6mo_lab6mo',
 'status_h',
 'turkey_list']

In [15]:
input_tables_to_randomize = [
               'booking_addl_info', 
               'branch_lkup',
               'case_booking',  
               'case_dispo_lkup', 
               'case_flag_lkup',
               'case_result_lkup',
               'charge_lkup',
               'feature_casesflag_info', 
               'feature_case_info', 
               'feature_case_results',
               'feature_charge_info',
               'labels_casesonly_multiprior_win6mo_lab6mo',
               'orig_agency_lkup',
               'labels_multiprior_win6mo_lab6mo'
               ]

In [13]:
for table_name in input_tables:
    print("Working on table {}".format(table_name))
    
    if table_name in input_tables_to_randomize: # randomize before adding to _randomized schema
        # Pull the table from original schema
        print("\tPulling table")
        statement = "SELECT * FROM {}.{};".format(INPUT_SCHEMA, table_name)
        table = execute_sql(statement, dbname, user, host, password, port)

        # Randomize the table
        print("\tRandomizing")
        randomized_table = randomize(table)

        # Make a new table in _randomized schema
        print("\tUploading randomized version")
        statement = "CREATE TABLE IF NOT EXISTS {0}_randomized.{1}_randomized (LIKE {0}.{1} INCLUDING ALL);".format(INPUT_SCHEMA, table_name)
        output = execute_sql(statement, dbname, user, host, password, port, isolation = True, results = False)

        # Write results into new table
        statement = "SELECT COUNT(*) FROM {}_randomized.{}_randomized;".format(INPUT_SCHEMA, table_name)
        output = execute_sql(statement, dbname, user, host, password, port)
        if output.iloc[0,0]>0: # do nothing if new table already contains data
            print("\t*****SKIPPING TABLE {}_randomized -- it already has data".format(table_name))
        else:
            engine = create_engine('postgresql://{}:{}@{}:{}/{}'.format(user, password, host, port, dbname))
            randomized_table.to_sql(table_name+'_randomized', engine, schema = '{}_randomized'.format(INPUT_SCHEMA), index = False, if_exists='append')
    
    else: # copy to _randomized schema as is, without randomizing
        # Make a new table in _randomized schema
        print("\tUploading original version without randomizing")
        statement = "CREATE TABLE IF NOT EXISTS {0}_randomized.{1} (LIKE {0}.{1} INCLUDING ALL);".format(INPUT_SCHEMA, table_name)
        output = execute_sql(statement, dbname, user, host, password, port, isolation = True, results = False)
        
        # Copy original data without randomizing
        statement = "SELECT COUNT(*) FROM {}_randomized.{};".format(INPUT_SCHEMA, table_name)
        output = execute_sql(statement, dbname, user, host, password, port)
        if output.iloc[0,0]>0: # do nothing if new table already contains data
            print("\t*****SKIPPING TABLE {} -- it already has data".format(table_name))
        else:
            statement = "INSERT INTO {0}_randomized.{1} (SELECT * from {0}.{1});".format(INPUT_SCHEMA, table_name)
            output = execute_sql(statement, dbname, user, host, password, port, isolation = True, results = False)


Working on table addinfo
	Uploading original version without randomizing
Working on table booking
	Uploading original version without randomizing
Working on table booking_addl_info
	Pulling table
	Randomizing
	Uploading randomized version
Working on table branch_lkup
	Pulling table
	Randomizing
	Uploading randomized version
Working on table case_booking
	Pulling table
	Randomizing
	Uploading randomized version
Working on table case_dispo_lkup
	Pulling table
	Randomizing
	Uploading randomized version
Working on table case_flag_lkup
	Pulling table
	Randomizing
	Uploading randomized version
Working on table case_result_lkup
	Pulling table
	Randomizing
	Uploading randomized version
Working on table cases
	Uploading original version without randomizing
Working on table casesaka
	Uploading original version without randomizing
Working on table casescharge
	Uploading original version without randomizing
Working on table casesflag
	Uploading original version without randomizing
Working on table

Also randomized the labels, table labels_multiprior_win6mo_lab6mo; forgot to include in input_tables_to_randomize list when previous cell was run

## II. Edit config files so that they point to the newly randomized raw schema

Experiment config file being used on the project is prediction_config_multiprior.yaml. Also need to edit all the feature config files under config/features. New versions of config files with an "_edited" suffix will be produced.

In [17]:
CONFIG_FILES = [
    'config/prediction_config_multiprior.yaml', 
    'config/features/booking_info.yaml',
    'config/features/case_flags.yaml',
    'config/features/case_info.yaml',
    'config/features/case_results.yaml',
    'config/features/charge_info.yaml',
    'config/features/days_since.yaml',
    'config/features/demos.yaml',
    'config/features/multi_prior.yaml'
    ]

Make a dictionary of text replacements to apply across every preprocessing & config file. This should include:
- Schema and table names that have been changed, i.e. have a "_randomized" suffix

In [18]:
WORD_MAP = {x:x+'_randomized' for x in input_tables_to_randomize}
WORD_MAP[INPUT_SCHEMA+'.'] = INPUT_SCHEMA+'_randomized.'
WORD_MAP

{'booking_addl_info': 'booking_addl_info_randomized',
 'branch_lkup': 'branch_lkup_randomized',
 'case_booking': 'case_booking_randomized',
 'case_dispo_lkup': 'case_dispo_lkup_randomized',
 'case_flag_lkup': 'case_flag_lkup_randomized',
 'case_result_lkup': 'case_result_lkup_randomized',
 'charge_lkup': 'charge_lkup_randomized',
 'feature_case_info': 'feature_case_info_randomized',
 'feature_case_results': 'feature_case_results_randomized',
 'feature_casesflag_info': 'feature_casesflag_info_randomized',
 'feature_charge_info': 'feature_charge_info_randomized',
 'labels_casesonly_multiprior_win6mo_lab6mo': 'labels_casesonly_multiprior_win6mo_lab6mo_randomized',
 'labels_multiprior_win6mo_lab6mo': 'labels_multiprior_win6mo_lab6mo_randomized',
 'orig_agency_lkup': 'orig_agency_lkup_randomized',
 'staging.': 'staging_randomized.'}

Apply find+replace to every config file. Write feature config files to a new folder (config/features_randomized) that will later be referenced when running the experiment. 

In [19]:
!cd config && mkdir features_randomized

mkdir: cannot create directory ‘features_randomized’: File exists


In [20]:
for filename in CONFIG_FILES:
    new_filename = get_new_filename(filename, "_edited", {'features': 'features_randomized'})
    replace_words_in_file(filename, new_filename, WORD_MAP) 

Generating file  config/prediction_config_multiprior_edited.yaml
Generating file  config/features_randomized/booking_info_edited.yaml
Generating file  config/features_randomized/case_flags_edited.yaml
Generating file  config/features_randomized/case_info_edited.yaml
Generating file  config/features_randomized/case_results_edited.yaml
Generating file  config/features_randomized/charge_info_edited.yaml
Generating file  config/features_randomized/days_since_edited.yaml
Generating file  config/features_randomized/demos_edited.yaml
Generating file  config/features_randomized/multi_prior_edited.yaml


**NOTE**: Be sure to open the outputted files (with "_edited" suffix) to check that no unexpected replacements were made and that all necessary replacements are made. 

Model Group Key: We have to manually add a "purpose" model group key into the experiment config file. Including a "purpose" indicates that this experiment is for leakage detection and helps to distinguish it from existing experiments. Add "purpose: leakage_detection" under the user_metadata section, and add "purpose" under the model_group_keys section.

## III. Run an experiment with this new setup

Now that we've set up randomized versions of the original schemas in the database and created a new config file for a randomized experiment, it's time to run the actual triage experiment. This is done in an aws ec2 instance.

Make sure you have a database.yaml file, required by triage for modeling. See example [here](https://github.com/dssg/la_prosecutor/blob/master/example_database.yaml).

Update run.py to reflect the particulars of this new config.
- set PROJECT_PATH to the local directory where you want to store output
- set n_processes to be less than the number of cores on your machine

Run the following command: python run.py -v -c config/prediction_config_multiprior_edited.yaml -f config/features_randomized/

## IV. Align the results of randomized experiment against original experiment

The model group we're comparing the randomized results to is model_group_id 22; this is the model group created from the original, non-randomized experiment.  

Confirm that we now see model groups where purpose = "leakage-detection"

In [None]:
statement = "SELECT * FROM results.model_groups WHERE model_config LIKE 'leakage_detection'"
model_groups = execute_sql(statement, dbname, user, host, password, port)
model_groups

In [None]:
# create variables for the model_group id's you selected
GROUP_ID_ORIG = 22
GROUP_ID_RAND = ??

Look at the models relevant to each model group

In [None]:
statement = "SELECT * FROM results.models where model_group_id = {};".format(GROUP_ID_ORIG)
models_orig = execute_sql(statement, dbname, user, host, password, port)
models_orig

In [None]:
statement = "SELECT * FROM results.models where model_group_id = {};".format(GROUP_ID_RAND)
models_rand = execute_sql(statement, dbname, user, host, password, port)
models_rand


Pick a pair of comparable models on which to test for leakage, i.e. with matching train_end_time. Let's go with the first row of each table, i.e. model_id numbers 1 (original) and 13 (random).  
Grab the predictions made by each of these models.

In [None]:
# create variables for the model id's you selected
MODEL_ID_ORIG = 1
MODEL_ID_RAND = 13

In [None]:
statement = "SELECT * FROM results.predictions where model_id = {};".format(MODEL_ID_ORIG)
predictions_orig = execute_sql(statement, dbname, user, host, password, port)
predictions_orig

In [None]:
predictions_orig_sorted = predictions_orig.sort_values(by=['score', 'label_value'], ascending=[False, True])

In [None]:
statement = "SELECT * FROM results.predictions where model_id = {};".format(MODEL_ID_RAND)
predictions_rand = execute_sql(statement, dbname, user, host, password, port)
predictions_rand

In [None]:
predictions_rand_sorted = predictions_rand.sort_values(by=['score', 'label_value'], ascending=[False, True])

Grab the evaluation metrics for each of these models.

In [None]:
statement = "SELECT * FROM results.evaluations where model_id = {};".format(MODEL_ID_ORIG)
evaluations_orig = execute_sql(statement, dbname, user, host, password, port)
evaluations_orig

In [None]:
statement = "SELECT * FROM results.evaluations where model_id = {};".format(MODEL_ID_RAND)
evaluations_rand = execute_sql(statement, dbname, user, host, password, port)
evaluations_rand

## V. Review the predictions and important features of randomized model