# Leakage Detection Tests

A series of tests for leakage, demonstrated on the [Los Angeles Chronic Offenders Leakage](https://github.com/dssg/la_prosecutor) project  
Requires a credentials.py file defining the following variables: dbname, user, host, password, port

In [None]:
import sys
import os
import math
import warnings

import pickle
import pandas as pd
import numpy as np
import psycopg2
from catwalk.db import connect
from catwalk.storage import FSModelStorageEngine
from sqlalchemy import create_engine
from IPython.core.interactiveshell import InteractiveShell

from credentials import dbname, user, host, password, port

# Show up to 500 rows when a dataframe is displayed
pd.set_option('display.max_rows', 500)
# Silence all warnings for the rest of the session
warnings.simplefilter('ignore')
# Display the value of every expression statement in a cell, not only the last one
InteractiveShell.ast_node_interactivity = "all"

In [None]:
def execute_sql(statement, dbname, user, host, password, port, isolation = False, results = True):
    """
    Use psycopg2 to execute a PostgreSQL statement and optionally return its results.

    Input:
        statement (str): SQL statement to run in database
        dbname, user, host, password (str): database credentials
        port (str or int): port the database server listens on
        isolation (bool): indicator for whether to change isolation level to autocommit; True for queries that cannot be run
            from within a transaction (see https://wiki.postgresql.org/wiki/Psycopg2_Tutorial)
        results (bool): indicator for whether the query is expected to output results;
            for example, True for SELECT statements and False for CREATE TABLE statements

    Output:
        relation (dataframe): query results or empty dataframe if results = False
    """
    # Keyword arguments avoid quoting problems with credentials that contain
    # spaces, and make sure the requested port is actually used.
    conn = psycopg2.connect(dbname=dbname, user=user, host=host, password=password, port=port)
    try:
        if isolation:
            conn.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT)
        cur = conn.cursor()
        try:
            cur.execute(statement)
            relation = pd.DataFrame()
            if results:
                rows = cur.fetchall()
                colnames = [desc[0] for desc in cur.description]
                relation = pd.DataFrame(rows, columns=colnames)
        finally:
            cur.close()
        if not isolation:
            # Outside autocommit mode the statement runs inside a transaction
            # that would be discarded when the connection is closed.
            conn.commit()
    finally:
        # Release the connection even if the statement failed
        conn.close()
    return relation


In [None]:
def randomize(df, do_not_randomize = None, seed=0):
    """
    Randomize column values of a dataframe. Each column is shuffled independently,
    so the values within a column are preserved but their pairing across columns
    is broken.

    Inputs:
        df (dataframe): dataframe to randomize; it is not modified
        do_not_randomize (list): optional list of strings indicating names of
            columns that should not be randomized
        seed (int): seed for the random number generator, so that the same
            input and seed always give the same output
    Outputs:
        df (dataframe): dataframe of randomized data
    """
    df_random = df.copy()
    excluded = do_not_randomize if do_not_randomize else []

    # Seed a private generator exactly once. Re-seeding before every column
    # would apply the identical permutation to all columns, which only reorders
    # whole rows instead of randomizing the columns independently.
    rng = np.random.RandomState(seed)

    for col in df.columns:
        if col in excluded:
            continue
        df_random[col] = rng.permutation(df_random[col].values)

    return df_random

In [None]:
# source: https://overlaid.net/2016/02/08/replace-words-in-files-or-strings-using-python/
def do_replacement(base_text, word_map):
    """
    Helper function for replace_words_in_file. Make replacements in base_text, as
    indicated in word_map.

    All replacements happen in a single pass over base_text, so the result does
    not depend on the order of word_map and text inserted by one replacement is
    never rewritten by another. When several keys match at the same position the
    longest key wins.

    Inputs
        base_text (str): text to search
        word_map (dict): mapping between words and their replacements
            (e.g. {'old_word': 'new_word'}); empty keys are ignored
    Outputs
        (str) copy of base_text with every occurrence of a key replaced
    """
    import re  # local import keeps this helper usable on its own

    # Longest keys first so that a short key cannot split a longer one
    keys = sorted((key for key in word_map if key), key=len, reverse=True)
    if not keys:
        return base_text

    pattern = re.compile("|".join(re.escape(key) for key in keys))
    return pattern.sub(lambda match: word_map[match.group(0)], base_text)

def replace_words_in_file(read_from, write_to, word_map):
    """
    Create copy of a file with certain words replaced

    Inputs
        read_from: name of file to read from
        write_to: name of new file to be created
        word_map: dictionary of mappings between words and their replacements
            (e.g. {'old_word': 'new_word'})
    Outputs
        None. Will create a new file with the name given in write_to

    """
    print('Generating file ', write_to)

    # Context managers close the files even if reading or writing fails
    with open(read_from, 'r') as source:
        original_text = source.read()

    # Apply every replacement listed in word_map to the file contents
    output = do_replacement(original_text, word_map)

    # Write out the new config file
    with open(write_to, 'w') as target:
        target.write(output)

In [None]:
def get_new_filename(filename, suffix, replacement_map=None):
    """
    Output a new filename (str) given a filename and suffix to append.

    The suffix is inserted just before the file extension
    (e.g. 'config/a.yaml' with suffix '_edited' gives 'config/a_edited.yaml').
    If the filename has no extension, the suffix is appended at the end.

    Inputs
        filename (str): original file name, optionally with directories
        suffix (str): text to insert before the extension
        replacement_map (dict): optional mapping of replacements applied to the
            whole new filename, directories included
    Outputs
        new_filename (str)
    """
    # splitext only looks for the extension in the last path component, and
    # returns an empty extension when there is none
    root, extension = os.path.splitext(filename)
    new_filename = root + suffix + extension
    if replacement_map:
        new_filename = do_replacement(new_filename, replacement_map)
    return new_filename

## I. Randomize the input data

A number of preprocessing steps were required for this project, because raw cases and bookings data did not come with unique identifiers for the individuals involved. As such, we cannot randomize the raw data because we would be unable to identify individuals in the results. Randomization must be done after entity linkage but before features are built.

Pull the information schema from selected database.

In [None]:
# name of the database schema containing input data to be randomized;
# the cells below write the copies to a schema named INPUT_SCHEMA + '_randomized'
INPUT_SCHEMA = 'staging' 

In [None]:
# Pull the table catalog (information_schema.tables) into a dataframe
statement = "SELECT * FROM information_schema.tables;"
tables = execute_sql(statement, dbname, user, host, password, port)

Select the schema containing input datasets to be randomized. For this project, the schema is named staging

In [None]:
# Keep only the names of the tables that live in the input schema, as a plain list
in_input_schema = tables['table_schema'] == INPUT_SCHEMA
input_tables = tables.loc[in_input_schema, 'table_name'].tolist()

First, create the _randomized schema if it doesn't exist yet.

In [None]:
# Create the target schema; results=False because CREATE SCHEMA returns no rows
statement = "CREATE SCHEMA IF NOT EXISTS {}_randomized;".format(INPUT_SCHEMA)
output = execute_sql(statement, dbname, user, host, password, port, isolation=True, results = False)

Copy over all the tables from original input schema, i.e. all tables in input_tables list.

Randomize only a subset of these tables. Tables to be randomized are:    
1) The set of tables from the staging schema that are referenced in the from\_obj section of each of the yaml files in /config/features. These staging tables have been linked to entity identifiers and are the source data for feature generation.  
2) Two labels tables from the staging schema that are referenced in config files: 'labels_casesonly_multiprior_win6mo_lab6mo' and 'labels_multiprior_win6mo_lab6mo'. Only randomizing the outcomes column in these labels tables to maintain the entity_id and outcome_date key pairing. This way, the states table does not need to be rerun.

Note that table names beginning with "staging.entity_" are not included in the list because they contain mappings between entity identifiers and other data. These relationships must be maintained so should not be randomized.

Write output to _randomized schema. 

In [None]:
# Display every table found in the input schema
input_tables

In [None]:
# Tables that are shuffled before being copied; every other table in the
# input schema is copied unchanged
input_tables_to_randomize = [
               'booking_addl_info', 
               'branch_lkup',
               'case_booking',  
               'case_dispo_lkup', 
               'case_flag_lkup',
               'case_result_lkup',
               'charge_lkup',
               'feature_casesflag_info', 
               'feature_case_info', 
               'feature_case_results',
               'feature_charge_info',
               'labels_casesonly_multiprior_win6mo_lab6mo',
               'orig_agency_lkup',
               'labels_multiprior_win6mo_lab6mo'
               ]

In [None]:
# Label tables keep their entity_id / outcome_date columns untouched; only the
# remaining columns are shuffled
label_tables = ['labels_casesonly_multiprior_win6mo_lab6mo','labels_multiprior_win6mo_lab6mo']

# One engine is enough for all uploads; no connection is opened until it is used
engine = create_engine('postgresql://{}:{}@{}:{}/{}'.format(user, password, host, port, dbname))

# enumerate gives every table its own seed (0, 1, 2, ...). Assigning seed = 0
# inside the loop body would reset it on each pass, so all tables would be
# shuffled with seed 0.
for seed, table_name in enumerate(input_tables):
    print("Working on table {}".format(table_name))

    if table_name in input_tables_to_randomize: # randomize before adding to _randomized schema
        # Pull the table from original schema
        print("\tPulling table")
        statement = "SELECT * FROM {}.{};".format(INPUT_SCHEMA, table_name)
        table = execute_sql(statement, dbname, user, host, password, port)

        # Randomize the table
        print("\tRandomizing")
        if table_name in label_tables:
            randomized_table = randomize(table, do_not_randomize=['entity_id', 'outcome_date'], seed = seed)
        else:
            randomized_table = randomize(table, seed = seed)

        # Make a new, empty table with the same structure in _randomized schema
        print("\tUploading randomized version")
        statement = "CREATE TABLE IF NOT EXISTS {0}_randomized.{1}_randomized (LIKE {0}.{1} INCLUDING ALL);".format(INPUT_SCHEMA, table_name)
        output = execute_sql(statement, dbname, user, host, password, port, isolation = True, results = False)

        # Write results into new table
        statement = "SELECT COUNT(*) FROM {}_randomized.{}_randomized;".format(INPUT_SCHEMA, table_name)
        output = execute_sql(statement, dbname, user, host, password, port)
        if output.iloc[0,0]>0: # do nothing if new table already contains data
            print("\t*****SKIPPING TABLE {}_randomized -- it already has data".format(table_name))
        else:
            randomized_table.to_sql(table_name+'_randomized', engine, schema = '{}_randomized'.format(INPUT_SCHEMA), index = False, if_exists='append')

    else: # copy to _randomized schema as is, without randomizing
        # Make a new, empty table with the same structure in _randomized schema
        print("\tUploading original version without randomizing")
        statement = "CREATE TABLE IF NOT EXISTS {0}_randomized.{1} (LIKE {0}.{1} INCLUDING ALL);".format(INPUT_SCHEMA, table_name)
        output = execute_sql(statement, dbname, user, host, password, port, isolation = True, results = False)

        # Copy original data without randomizing
        statement = "SELECT COUNT(*) FROM {}_randomized.{};".format(INPUT_SCHEMA, table_name)
        output = execute_sql(statement, dbname, user, host, password, port)
        if output.iloc[0,0]>0: # do nothing if new table already contains data
            print("\t*****SKIPPING TABLE {} -- it already has data".format(table_name))
        else:
            statement = "INSERT INTO {0}_randomized.{1} (SELECT * from {0}.{1});".format(INPUT_SCHEMA, table_name)
            output = execute_sql(statement, dbname, user, host, password, port, isolation = True, results = False)

The labels table labels_multiprior_win6mo_lab6mo was also randomized. It was left out of the input_tables_to_randomize list the first time the previous cell was run and has since been added to the list.

## II. Edit config files so that they point to the newly randomized raw schema

Experiment config file being used on the project is experiment_config_multiprior.yaml. Also need to edit all the feature config files under config/features. New versions of config files with an "_edited" suffix will be produced and saved in "features_randomized" folder.

In [None]:
# Experiment config plus every feature config that must point at the
# randomized schema
CONFIG_FILES = [
    'config/experiment_config_multiprior.yaml', 
    'config/features/booking_info.yaml',
    'config/features/case_flags.yaml',
    'config/features/case_info.yaml',
    'config/features/case_results.yaml',
    'config/features/charge_info.yaml',
    'config/features/days_since.yaml',
    'config/features/demos.yaml',
    'config/features/multi_prior.yaml'
    ]

Make a dictionary of text replacements to apply across every preprocessing & config file. This should include:
- Schema and table names that have been changed, i.e. have a "_randomized" suffix

In [None]:
# Every randomized table gets a "_randomized" suffix ...
WORD_MAP = dict(
    (table, table + '_randomized') for table in input_tables_to_randomize
)
# ... and so does the schema prefix (the trailing period limits the match to
# schema-qualified references)
WORD_MAP.update({INPUT_SCHEMA + '.': INPUT_SCHEMA + '_randomized.'})
WORD_MAP

Apply find+replace to every config file. Write feature config files to a new folder (config/features_randomized) that will later be referenced when running the experiment. 

In [None]:
# !cd config && mkdir features_randomized

In [None]:
#for filename in CONFIG_FILES:
#    new_filename = get_new_filename(filename, "_edited", {'features': 'features_randomized'})
#    replace_words_in_file(filename, new_filename, WORD_MAP) 

**NOTE**: Be sure to open the outputted files (with "_edited" suffix) to check that no unexpected replacements were made and that all necessary replacements are made. 

Model Group Key: We have to manually add a "purpose" model group key into the experiment config file. Including a "purpose" indicates that this experiment is for leakage detection and helps to distinguish it from existing experiments. Add "purpose: leakage_detection" under the user_metadata section, and add "purpose" under the model_group_keys section.

## III. Run an experiment with this new setup

Now that we've set up randomized versions of the original schemas in the database and created a new config file for a randomized experiment, it's time to run the actual triage experiment. This is done in an aws ec2 instance.

Make sure you have a database.yaml file, required by triage for modeling. See example [here](https://github.com/dssg/la_prosecutor/blob/master/example_database.yaml).

Update run.py to reflect the particulars of this new config.
- set PROJECT_PATH to the local directory where you want to store output
- set n_processes to be less than the number of cores on your machine

Run the following command: python run.py -v -c config/experiment_config_multiprior_edited.yaml -f features_randomized > log.txt

## IV. Align the results of randomized experiment against original experiment

Model group from the original, non-randomized experiment: 22  
Model group from randomized experiment: 115. 

Note: the experiment didn't finish running successfully, so for now we can only look at model_id 1116, 1120, 1124, and 1128 under model_group_id 115, with train_end_time between 2012-07-01 and 2014-01-01. These correspond to models 22, 50, 76, and 100 under model_group_id 22.

First look at model groups

In [None]:
# model_group_id of the original experiment and of the randomized experiment
GROUP_ID_ORIG = 22
GROUP_ID_RAND = 115

In [None]:
# original model group: list every model that belongs to it
statement = "SELECT * FROM results.models where model_group_id = {};".format(GROUP_ID_ORIG)
models_orig = execute_sql(statement, dbname, user, host, password, port)
models_orig

In [None]:
# randomized model group: list every model that belongs to it
# NOTE(review): the result is named model_groups although the rows come from
# results.models, like models_orig in the cell for the original group
statement = "SELECT * FROM results.models WHERE model_group_id = {}".format(GROUP_ID_RAND)
model_groups = execute_sql(statement, dbname, user, host, password, port)
model_groups

Now look at models

In [None]:
# One model from each model group, compared side by side in the cells below
MODEL_ID_ORIG = 22 
MODEL_ID_RAND = 1116 

In [None]:
# Row of results.models for the ORIGINAL model. Stored under an _orig name so
# it is not mistaken for the randomized model queried in the next cell.
statement = "SELECT * FROM results.models where model_id = {};".format(MODEL_ID_ORIG)
models_orig = execute_sql(statement, dbname, user, host, password, port)
models_orig

In [None]:
# Row of results.models for the randomized model
statement = "SELECT * FROM results.models where model_id = {};".format(MODEL_ID_RAND)
models_rand = execute_sql(statement, dbname, user, host, password, port)
models_rand

Now look at predictions. Why does randomized model give the same score for every single person?

In [None]:
# All predictions made by the original model; show the first rows
statement = "SELECT * FROM results.predictions where model_id = {};".format(MODEL_ID_ORIG)
predictions_orig = execute_sql(statement, dbname, user, host, password, port)
predictions_orig.head()

In [None]:
# All predictions made by the randomized model; show the first 100 rows
statement = "SELECT * FROM results.predictions where model_id = {};".format(MODEL_ID_RAND)
predictions_rand = execute_sql(statement, dbname, user, host, password, port)
predictions_rand.head(100)

In [None]:
# Mean of label_value in each predictions table (original, then randomized)
np.mean(predictions_orig.label_value)
np.mean(predictions_rand.label_value)

Why are there different numbers of observations in the two predictions tables?   
Answer: Entity IDs have been reassigned since then, so it's okay that they don't match. However, the overall length of the lists of people should be the same, because they depend on the entity states which have not changed. Going to rerun staging pipeline to check for instabilities and investigate why prediction counts of original model do not match the state.

For example: 

Model with train end time 2012-07-01:  
Only 87,353 entities in common between original and random predictions.  
There are 245,056 entities in original and not in random.  
There are 238,644 entities in random and not in original.

Model with train end time 2013-07-01:  
Only 93,010 entities in common between original and random predictions.  
There are 251,345 entities in original and not in random.  
There are 244,662 entities in random and not in original.

In [None]:
# (rows, columns) of each predictions table
predictions_orig.shape
predictions_rand.shape

In [None]:
# Distinct entity ids that received a prediction from each model
entities_orig = set(predictions_orig['entity_id'])
entities_rand = set(predictions_rand['entity_id'])
# Ids scored by only one of the two models
entities_diff1 = entities_orig.difference(entities_rand)
entities_diff2 = entities_rand.difference(entities_orig)

In [None]:
# Number of entities only in the original predictions, then only in the randomized ones
len(entities_diff1)
len(entities_diff2)

In [None]:
# Number of entities present in both sets of predictions
len(entities_orig.intersection(entities_rand))

Compare train and test matrices between original and randomized model. Issues:   
- There are additional feature columns in original model that are not in randomized model. Ivan/Erika have been working on fixing this.
- Some features do not seem to have been created correctly in randomized model, such as the days_since... column which is only 0.

In [None]:
# Read only the first 5000 rows of the original matrix file and record its column names
train_matrix_orig = pd.read_csv('orig_models/7f857d499d4cb5f198ada6fbfc298932.csv', nrows=5000)
train_orig_cols = set(train_matrix_orig.columns)

In [None]:
# Size and per-column summary statistics of the original sample
train_matrix_orig.shape
train_matrix_orig.describe()

In [None]:
# Read only the first 5000 rows of the randomized matrix file and record its column names
train_matrix_rand = pd.read_csv('experiment_output/matrices/ae5dfa11a6dc725a5ea9058a99eb599b.csv', nrows=5000)
train_rand_cols = set(train_matrix_rand.columns)

In [None]:
# Size and per-column summary statistics of the randomized sample
train_matrix_rand.shape
train_matrix_rand.describe()

In [None]:
# Distribution of one days_since feature in the original sample
train_matrix_orig.days_since_entity_id_50y_last_event_min.describe()


In [None]:
# Distribution of the same days_since feature in the randomized sample
train_matrix_rand.days_since_entity_id_50y_last_event_min.describe()

In [None]:
# Columns found only in the original matrix
diff1 = train_orig_cols.difference(train_rand_cols)
diff1

# Columns found only in the randomized matrix
diff2 = train_rand_cols.difference(train_orig_cols)
diff2

Grab the evaluation metrics for each of these models.

In [None]:
# Evaluation rows stored for the original model; show the first rows
statement = "SELECT * FROM results.evaluations where model_id = {};".format(MODEL_ID_ORIG)
evaluations_orig = execute_sql(statement, dbname, user, host, password, port)
evaluations_orig.head()

In [None]:
# Evaluation rows stored for the randomized model; show the first rows
statement = "SELECT * FROM results.evaluations where model_id = {};".format(MODEL_ID_RAND)
evaluations_rand = execute_sql(statement, dbname, user, host, password, port)
evaluations_rand.head()