# Leakage Detection Tests

A series of tests for leakage, demonstrated on the [Los Angeles Chronic Offenders Leakage](https://github.com/dssg/la_prosecutor) project  
Requires a credentials.py file defining the following variables: dbname, user, host, password, port

In [27]:
import sys
import os
import math
import warnings

import pickle
import pandas as pd
import numpy as np
import psycopg2
from catwalk.db import connect
from catwalk.storage import FSModelStorageEngine
from sqlalchemy import create_engine
from IPython.core.interactiveshell import InteractiveShell

from credentials import dbname, user, host, password, port

pd.set_option('display.max_rows', 500)
warnings.simplefilter('ignore')
InteractiveShell.ast_node_interactivity = "all"

In [28]:
def execute_sql(statement, dbname, user, host, password, port, isolation = False, results = True):
    """
    Use psycopg2 to execute PostgreSQL queries
    
    Input:
        statement (str): SQL statement to run in database
        dbname, user, host, password (str): database credentials
        isolation (bool): indicator for whether to change isolation level to autocommit; True for queries that cannot be run 
            from within a transation (see https://wiki.postgresql.org/wiki/Psycopg2_Tutorial)
        results (bool): indicator for whether the query is expected to output results;
            for example, True for SELECT statements and False for CREATE TABLE statements
    
    Output:
        relation (dataframe): query results or empty dataframe if results = False
    """
    conn = psycopg2.connect("dbname={} user={} host={} password={}".format(dbname, user, host, password))
    cur = conn.cursor()
    if isolation:
        conn.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT)
    cur.execute(statement)
    relation = pd.DataFrame()
    if results:
        results = cur.fetchall()
        colnames = [desc[0] for desc in cur.description]
        relation = pd.DataFrame(results, columns=colnames)
    cur.close()
    conn.close()
    return relation


In [29]:
def randomize(df, do_not_randomize = None, seed=0):
    """
    Randomize column values of a file. Each column is randomized independently.
    
    Inputs:
        df (dataframe): dataframe to randomize
        do_not_randomize (list): optional list of strings indicating names of 
            columns that should not be randomized
    Outputs:
        df (dataframe): dataframe of randomized data
    """     
    df_random = df.copy()
    if do_not_randomize:
        cols = [c for c in df.columns if c not in do_not_randomize]
    else:
        cols = df.columns
        
    for col in cols:
        #print('\t\tRandomizing column ' + col)
        np.random.seed(seed)
        df_random[col] = np.random.permutation(df_random[col])

    return df_random

In [30]:
# source: https://overlaid.net/2016/02/08/replace-words-in-files-or-strings-using-python/
def do_replacement(base_text, word_map):
    """
    Helper function for replace_words_in_file. Make replacements in base_text, as 
    indicated in  word_map.
    """
    for key, val in word_map.items():
        base_text = base_text.replace(key, val)
    return base_text

def replace_words_in_file(read_from, write_to, word_map):
    """
    Create copy of a file with certain words replaced
    
    Inputs
        read_from: name of file to read from
        write_to: name of new file to be created
        word_map: dictionary of mappings between words and their replacements
            (e.g. {'old_word': 'new_word'})
    Outputs
        None. Will create a new file with the name given in write_to
        
    """
    print('Generating file ', write_to)
    # Open your desired file as 't' and read the lines into string 'tempstr'
    t = open(read_from, 'r')
    tempstr = t.read()
    t.close()

    # Using the "replace_words" function, we'll pass in our tempstr to be used as the base, 
    # and our device_values to be used as replacement.  
    output = do_replacement(tempstr, word_map)

    # Write out the new config file
    fout = open(write_to, 'w')
    fout.write(output)
    fout.close()

In [31]:
def get_new_filename(filename, suffix, replacement_map=None):
    """
    Output a new filename (str) given a filename and suffix to append. Assumes file extension at the end is separated by a period.
    """
    k = filename.rfind(".")
    new_filename = filename[:k] + suffix + '.' + filename[k+1:]
    if replacement_map:
        new_filename = do_replacement(new_filename, replacement_map)
    return new_filename

## I. Randomize the input data

A number of preprocessing steps were required for this project, because raw cases and bookings data did not come with unique identifiers for the individuals involved. As such, we cannot randomize the raw data because we would be unable to identify individuals in the results. Randomization must be done after entity linkage but before features are built.

Pull the information schema from selected database.

In [32]:
# name of the database schema containing input data to be randomized
INPUT_SCHEMA = 'staging' 

In [33]:
statement = "SELECT * FROM information_schema.tables;"
tables = execute_sql(statement, dbname, user, host, password, port)

Select the schema containing input datasets to be randomized. For this project, the schema is named staging

In [34]:
input_tables = tables.table_name[tables.table_schema == INPUT_SCHEMA]
input_tables = list(input_tables)

First created the _randomized schema if it doesn't yet exist

In [35]:
statement = "CREATE SCHEMA IF NOT EXISTS {}_randomized;".format(INPUT_SCHEMA)
output = execute_sql(statement, dbname, user, host, password, port, isolation=True, results = False)

Copy over all the tables from original input schema, i.e. all tables in input_tables list.

Randomize only a subset of these tables. Tables to be randomized are:    
1) The set of tables from the staging schema that are referenced in the from\_obj section of each of the yaml files in /config/features. These staging tables have been linked to entity identifiers and are the source data for feature generation.  
2) Two labels tables from the staging schema that are referenced in config files: 'labels_casesonly_multiprior_win6mo_lab6mo' and 'labels_multiprior_win6mo_lab6mo'. Only randomizing the outcomes column in these labels tables to maintain the entity_id and outcome_date key pairing. This way, the states table does not need to be rerun.

Note that table names beginning with "staging.entity_" are not included in the list because they contain mappings between entity identifiers and other data. These relationships must be maintained so should not be randomized.

Write output to _randomized schema. 

In [36]:
input_tables

['addinfo',
 'booking',
 'booking_addl_info',
 'case_booking',
 'cases',
 'casesaka',
 'casescharge',
 'casesflag',
 'ccms_to_ucc',
 'citation',
 'docass',
 'entity_address',
 'entity_aka',
 'entity_all_events',
 'entity_any_name',
 'entity_best_demos',
 'entity_booking',
 'entity_case',
 'entity_cii',
 'entity_dmv',
 'entity_dob',
 'entity_gender',
 'entity_main_number',
 'entity_name',
 'entity_race',
 'feature_case_info',
 'feature_case_results',
 'feature_casesflag_info',
 'feature_charge_info',
 'firstname_gender',
 'firstname_race',
 'labels_1prior_win6mo_lab6mo',
 'labels_casesonly_multiprior_win6mo_lab6mo',
 'labels_multiprior_win6mo_lab6mo',
 'lastname_race',
 'prob',
 'probcon',
 'states_win6mo_lab6mo',
 'status_h',
 'turkey_list',
 'charge_categories',
 'case_flag_lkup',
 'case_dispo_lkup',
 'orig_agency_lkup',
 'branch_lkup',
 'case_result_lkup',
 'charge_lkup']

In [37]:
input_tables_to_randomize = [
               'booking_addl_info', 
               'branch_lkup',
               'case_booking',  
               'case_dispo_lkup', 
               'case_flag_lkup',
               'case_result_lkup',
               'charge_lkup',
               'feature_casesflag_info', 
               'feature_case_info', 
               'feature_case_results',
               'feature_charge_info',
               'labels_casesonly_multiprior_win6mo_lab6mo',
               'orig_agency_lkup',
               'labels_multiprior_win6mo_lab6mo'
               ]

In [40]:
for table_name in input_tables:
    seed = 0 # initiate to 0, increment in each iteration
    
    print("Working on table {}".format(table_name))
    
    if table_name in input_tables_to_randomize: # randomize before adding to _randomized schema
        # Pull the table from original schema
        print("\tPulling table")
        statement = "SELECT * FROM {}.{};".format(INPUT_SCHEMA, table_name)
        table = execute_sql(statement, dbname, user, host, password, port)

        # Randomize the table
        print("\tRandomizing")
        if table_name in ['labels_casesonly_multiprior_win6mo_lab6mo','labels_multiprior_win6mo_lab6mo']:
            randomized_table = randomize(table, do_not_randomize=['entity_id', 'outcome_date'], seed = seed)          
        else:
            randomized_table = randomize(table, seed = seed)

        # Make a new table in _randomized schema
        print("\tUploading randomized version")
        statement = "CREATE TABLE IF NOT EXISTS {0}_randomized.{1}_randomized (LIKE {0}.{1} INCLUDING ALL);".format(INPUT_SCHEMA, table_name)
        output = execute_sql(statement, dbname, user, host, password, port, isolation = True, results = False)

        # Write results into new table
        statement = "SELECT COUNT(*) FROM {}_randomized.{}_randomized;".format(INPUT_SCHEMA, table_name)
        output = execute_sql(statement, dbname, user, host, password, port)
        if output.iloc[0,0]>0: # do nothing if new table already contains data
            print("\t*****SKIPPING TABLE {}_randomized -- it already has data".format(table_name))
        else:
            engine = create_engine('postgresql://{}:{}@{}:{}/{}'.format(user, password, host, port, dbname))
            randomized_table.to_sql(table_name+'_randomized', engine, schema = '{}_randomized'.format(INPUT_SCHEMA), index = False, if_exists='append')
    
    else: # copy to _randomized schema as is, without randomizing
        # Make a new table in _randomized schema
        print("\tUploading original version without randomizing")
        statement = "CREATE TABLE IF NOT EXISTS {0}_randomized.{1} (LIKE {0}.{1} INCLUDING ALL);".format(INPUT_SCHEMA, table_name)
        output = execute_sql(statement, dbname, user, host, password, port, isolation = True, results = False)
        
        # Copy original data without randomizing
        statement = "SELECT COUNT(*) FROM {}_randomized.{};".format(INPUT_SCHEMA, table_name)
        output = execute_sql(statement, dbname, user, host, password, port)
        if output.iloc[0,0]>0: # do nothing if new table already contains data
            print("\t*****SKIPPING TABLE {} -- it already has data".format(table_name))
        else:
            statement = "INSERT INTO {0}_randomized.{1} (SELECT * from {0}.{1});".format(INPUT_SCHEMA, table_name)
            output = execute_sql(statement, dbname, user, host, password, port, isolation = True, results = False)
    seed += 1

Working on table addinfo
	Uploading original version without randomizing
	*****SKIPPING TABLE addinfo -- it already has data
Working on table booking
	Uploading original version without randomizing
	*****SKIPPING TABLE booking -- it already has data
Working on table booking_addl_info
	Pulling table
	Randomizing
	Uploading randomized version
	*****SKIPPING TABLE booking_addl_info_randomized -- it already has data
Working on table case_booking
	Pulling table
	Randomizing
	Uploading randomized version
	*****SKIPPING TABLE case_booking_randomized -- it already has data
Working on table cases
	Uploading original version without randomizing
	*****SKIPPING TABLE cases -- it already has data
Working on table casesaka
	Uploading original version without randomizing
	*****SKIPPING TABLE casesaka -- it already has data
Working on table casescharge
	Uploading original version without randomizing
	*****SKIPPING TABLE casescharge -- it already has data
Working on table casesflag
	Uploading original 

Also randomized the labels, table labels_multiprior_win6mo_lab6mo; forgot to include in input_tables_to_randomize list when previous cell was run

## II. Edit config files so that they point to the newly randomized raw schema

Experiment config file being used on the project is experiment_config_multiprior.yaml. Also need to edit all the feature config files under config/features. New versions of config files with an "_edited" suffix will be produced and saved in "features_randomized" folder.

In [None]:
CONFIG_FILES = [
    'config/experiment_config_multiprior.yaml', 
    'config/features/booking_info.yaml',
    'config/features/case_flags.yaml',
    'config/features/case_info.yaml',
    'config/features/case_results.yaml',
    'config/features/charge_info.yaml',
    'config/features/days_since.yaml',
    'config/features/demos.yaml',
    'config/features/multi_prior.yaml'
    ]

Make a dictionary of text replacements to apply across every preprocessing & config file. This should include:
- Schema and table names that have been changed, i.e. have a "_randomized" suffix

In [41]:
WORD_MAP = {x:x+'_randomized' for x in input_tables_to_randomize}
WORD_MAP[INPUT_SCHEMA+'.'] = INPUT_SCHEMA+'_randomized.'
WORD_MAP

{'booking_addl_info': 'booking_addl_info_randomized',
 'branch_lkup': 'branch_lkup_randomized',
 'case_booking': 'case_booking_randomized',
 'case_dispo_lkup': 'case_dispo_lkup_randomized',
 'case_flag_lkup': 'case_flag_lkup_randomized',
 'case_result_lkup': 'case_result_lkup_randomized',
 'charge_lkup': 'charge_lkup_randomized',
 'feature_case_info': 'feature_case_info_randomized',
 'feature_case_results': 'feature_case_results_randomized',
 'feature_casesflag_info': 'feature_casesflag_info_randomized',
 'feature_charge_info': 'feature_charge_info_randomized',
 'labels_casesonly_multiprior_win6mo_lab6mo': 'labels_casesonly_multiprior_win6mo_lab6mo_randomized',
 'labels_multiprior_win6mo_lab6mo': 'labels_multiprior_win6mo_lab6mo_randomized',
 'orig_agency_lkup': 'orig_agency_lkup_randomized',
 'staging.': 'staging_randomized.'}

Apply find+replace to every config file. Write feature config files to a new folder (config/features_randomized) that will later be referenced when running the experiment. 

In [None]:
# !cd config && mkdir features_randomized

In [None]:
#for filename in CONFIG_FILES:
#    new_filename = get_new_filename(filename, "_edited", {'features': 'features_randomized'})
#    replace_words_in_file(filename, new_filename, WORD_MAP) 

**NOTE**: Be sure to open the outputted files (with "_edited" suffix) to check that no unexpected replacements were made and that all necessary replacements are made. 

Model Group Key: We have to manually add a "purpose" model group key into the experiment config file. Including a "purpose" indicates that this experiment is for leakage detection and helps to distinguish it from existing experiments. Add "purpose: leakage_detection" under the user_metadata section, and add "purpose" under the model_group_keys section.

## III. Run an experiment with this new setup

Now that we've set up randomized versions of the original schemas in the database and created a new config file for a randomized experiment, it's time to run the actual triage experiment. This is done in an aws ec2 instance.

Make sure you have a database.yaml file, required by triage for modeling. See example [here](https://github.com/dssg/la_prosecutor/blob/master/example_database.yaml).

Update run.py to reflect the particulars of this new config.
- set PROJECT_PATH to the local directory where you want to store output
- set n_processes to be less than the number of cores on your machine

Run the following command: python run.py -v -c config/experiment_config_multiprior_edited.yaml -f features_randomized > log.txt

## IV. Align the results of randomized experiment against original experiment

Model group from the original, non-randomized experiment: 22, model_id 521 (matches train end date of randomized experiment)  
Model group from randomized experiment: 112, model_id 1113  

In [42]:
GROUP_ID_ORIG = 22
GROUP_ID_RAND = 112

In [43]:
# original model group
statement = "SELECT * FROM results.models where model_group_id = {};".format(GROUP_ID_ORIG)
models_orig = execute_sql(statement, dbname, user, host, password, port)
models_orig

Unnamed: 0,model_id,model_group_id,model_hash,run_time,batch_run_time,model_type,model_parameters,model_comment,batch_comment,config,experiment_hash,train_end_time,test,train_matrix_uuid,train_label_window
0,22,22,77f84946bf25e4e5ef2879cc126c45ee,2017-09-26 22:06:39.349441,2017-09-26 22:06:31.322862,catwalk.estimators.classifiers.ScaledLogisticR...,"{'penalty': 'l1', 'C': 1}",,,,f58631bba52780245ce2b57e181b97a5,2012-07-01,False,7f857d499d4cb5f198ada6fbfc298932,180 days
1,50,22,b1c57c7b607cbbb48a5ad1b8eed22678,2017-09-27 01:03:23.889844,2017-09-27 01:03:17.143301,catwalk.estimators.classifiers.ScaledLogisticR...,"{'penalty': 'l1', 'C': 1}",,,,f58631bba52780245ce2b57e181b97a5,2013-01-01,False,9abc62d7bb8ba6f76b159e42978db856,180 days
2,76,22,e745ecd44c0808091c83a1ac92ec5e84,2017-09-27 04:13:12.106100,2017-09-27 04:13:04.753935,catwalk.estimators.classifiers.ScaledLogisticR...,"{'penalty': 'l1', 'C': 1}",,,,f58631bba52780245ce2b57e181b97a5,2013-07-01,False,3c24fb375687ee858081c270c2e2523a,180 days
3,100,22,0b86a9335051571a1abb7b0281b3afa4,2017-09-27 07:14:26.207931,2017-09-27 07:14:17.470914,catwalk.estimators.classifiers.ScaledLogisticR...,"{'penalty': 'l1', 'C': 1}",,,,f58631bba52780245ce2b57e181b97a5,2014-01-01,False,1abb834182a2c7d2813571290db4092c,180 days
4,127,22,b1f2449b07561c3656a2f196a6ff5b40,2017-09-27 10:40:08.474958,2017-09-27 10:40:00.779670,catwalk.estimators.classifiers.ScaledLogisticR...,"{'penalty': 'l1', 'C': 1}",,,,f58631bba52780245ce2b57e181b97a5,2014-07-01,False,a8d2f56c60a0eb0c6817a556ea04ad01,180 days
5,154,22,4377998afedbf7eaeecfa845ce594fa7,2017-09-27 14:20:25.823376,2017-09-27 14:20:17.907213,catwalk.estimators.classifiers.ScaledLogisticR...,"{'penalty': 'l1', 'C': 1}",,,,f58631bba52780245ce2b57e181b97a5,2015-01-01,False,068f2995bfd9084867566271e0ed028b,180 days
6,180,22,539f1a5a91f898eb10e87215c1a9d075,2017-09-27 17:53:51.344478,2017-09-27 17:53:40.796521,catwalk.estimators.classifiers.ScaledLogisticR...,"{'penalty': 'l1', 'C': 1}",,,,f58631bba52780245ce2b57e181b97a5,2015-07-01,False,13b7167eaa4f276d2cc2304ca11bf31b,180 days
7,205,22,5c225133baad5556244fe722fe098bdf,2017-09-27 21:28:12.925761,2017-09-27 21:28:05.338159,catwalk.estimators.classifiers.ScaledLogisticR...,"{'penalty': 'l1', 'C': 1}",,,,f58631bba52780245ce2b57e181b97a5,2016-01-01,False,760c7a17a6800bffc6616d6c0dd4f611,180 days
8,232,22,ca28b4335551d6e75a332329d876a340,2017-09-28 01:08:52.532772,2017-09-28 01:08:43.338539,catwalk.estimators.classifiers.ScaledLogisticR...,"{'penalty': 'l1', 'C': 1}",,,,f58631bba52780245ce2b57e181b97a5,2016-07-01,False,b0db98ce151fdeec11f0eeaea061d56f,180 days
9,258,22,564296d9b3353ba25b507b0dff835085,2017-09-28 04:59:54.356953,2017-09-28 04:59:46.265510,catwalk.estimators.classifiers.ScaledLogisticR...,"{'penalty': 'l1', 'C': 1}",,,,f58631bba52780245ce2b57e181b97a5,2017-01-01,False,740dce77bc130c0a469b5bd46bb4aa35,180 days


In [44]:
# randomized model group 
statement = "SELECT * FROM results.models WHERE model_group_id = {}".format(GROUP_ID_RAND)
model_groups = execute_sql(statement, dbname, user, host, password, port)
model_groups

Unnamed: 0,model_id,model_group_id,model_hash,run_time,batch_run_time,model_type,model_parameters,model_comment,batch_comment,config,experiment_hash,train_end_time,test,train_matrix_uuid,train_label_window
0,1113,112,fa6f62424b9d18cfee2c2b31ce37f33c,2018-05-02 19:01:14.825899,2018-05-02 19:00:15.451838,catwalk.estimators.classifiers.ScaledLogisticR...,"{'penalty': 'l1', 'C': 1}",,,,ac327328c1e4881280d1f946390e34df,2017-07-01,False,0828a23f6b0d636924a7b9f0a4c89ec9,180 days


In [45]:
MODEL_ID_ORIG = 521
MODEL_ID_RAND = 1113

In [46]:
# randomized model
statement = "SELECT * FROM results.models where model_id = {};".format(MODEL_ID_ORIG)
models_rand = execute_sql(statement, dbname, user, host, password, port)
models_rand

Unnamed: 0,model_id,model_group_id,model_hash,run_time,batch_run_time,model_type,model_parameters,model_comment,batch_comment,config,experiment_hash,train_end_time,test,train_matrix_uuid,train_label_window
0,521,22,73c83fc936f7463a4213f4ae68a2fa96,2017-10-05 22:06:21.864805,2017-10-05 22:06:14.131792,catwalk.estimators.classifiers.ScaledLogisticR...,"{'penalty': 'l1', 'C': 1}",,,,3ae1a6b647dc2c9031dcfd0036722d59,2017-07-01,False,25b3364739391116963fd4a1f1d1291a,180 days


In [47]:
statement = "SELECT * FROM results.models where model_id = {};".format(MODEL_ID_RAND)
models_rand = execute_sql(statement, dbname, user, host, password, port)
models_rand

Unnamed: 0,model_id,model_group_id,model_hash,run_time,batch_run_time,model_type,model_parameters,model_comment,batch_comment,config,experiment_hash,train_end_time,test,train_matrix_uuid,train_label_window
0,1113,112,fa6f62424b9d18cfee2c2b31ce37f33c,2018-05-02 19:01:14.825899,2018-05-02 19:00:15.451838,catwalk.estimators.classifiers.ScaledLogisticR...,"{'penalty': 'l1', 'C': 1}",,,,ac327328c1e4881280d1f946390e34df,2017-07-01,False,0828a23f6b0d636924a7b9f0a4c89ec9,180 days


In [48]:
statement = "SELECT * FROM results.predictions where model_id = {};".format(MODEL_ID_ORIG)
predictions_orig = execute_sql(statement, dbname, user, host, password, port)
predictions_orig.head()

Unnamed: 0,model_id,entity_id,as_of_date,score,label_value,rank_abs,rank_pct,matrix_uuid,test_label_window
0,521,1,2017-07-01,0.004601681861918,0,,,2d6d75cc75f290801592df0ed32109fb,180 days
1,521,2,2017-07-01,0.0136164505928362,0,,,2d6d75cc75f290801592df0ed32109fb,180 days
2,521,4,2017-07-01,0.0102403881577497,0,,,2d6d75cc75f290801592df0ed32109fb,180 days
3,521,6,2017-07-01,0.0020673067347159,0,,,2d6d75cc75f290801592df0ed32109fb,180 days
4,521,11,2017-07-01,0.0942134032862156,0,,,2d6d75cc75f290801592df0ed32109fb,180 days


In [49]:
statement = "SELECT * FROM results.predictions where model_id = {};".format(MODEL_ID_RAND)
predictions_rand = execute_sql(statement, dbname, user, host, password, port)
predictions_rand.head(100)

Unnamed: 0,model_id,entity_id,as_of_date,score,label_value,rank_abs,rank_pct,matrix_uuid,test_label_window
0,1113,1,2017-07-01,0.0704480512697282,0.0,,,500d25f0ba035bad035f1dd775f618bc,180 days
1,1113,2,2017-07-01,0.0704480512697282,,,,500d25f0ba035bad035f1dd775f618bc,180 days
2,1113,4,2017-07-01,0.0704480512697282,,,,500d25f0ba035bad035f1dd775f618bc,180 days
3,1113,6,2017-07-01,0.0704480512697282,0.0,,,500d25f0ba035bad035f1dd775f618bc,180 days
4,1113,11,2017-07-01,0.0704480512697282,,,,500d25f0ba035bad035f1dd775f618bc,180 days
5,1113,32,2017-07-01,0.0704480512697282,0.0,,,500d25f0ba035bad035f1dd775f618bc,180 days
6,1113,61,2017-07-01,0.0704480512697282,0.0,,,500d25f0ba035bad035f1dd775f618bc,180 days
7,1113,66,2017-07-01,0.0704480512697282,,,,500d25f0ba035bad035f1dd775f618bc,180 days
8,1113,68,2017-07-01,0.0704480512697282,0.0,,,500d25f0ba035bad035f1dd775f618bc,180 days
9,1113,75,2017-07-01,0.0704480512697282,0.0,,,500d25f0ba035bad035f1dd775f618bc,180 days


In [50]:
predictions_rand.shape
predictions_orig.shape

(423363, 9)

(423431, 9)

### V. Compare the matrices between original and randomized model.

Compare train and test matrices between original and randomized model. Issues:   
- There are additional feature columns in original model that are not in randomized model. Not sure why.
- The same entity id is duplicated many times in the training matrix. No difference in any of the columns, purely duplicate rows.
- Features of the entity do not seem to have been randomized, such as days since ...

In [51]:
train_matrix_orig = pd.read_csv('model_521/25b3364739391116963fd4a1f1d1291a.csv', nrows=100)
train_orig_cols = set(train_matrix_orig.columns)

In [52]:
train_matrix_orig.head()


Unnamed: 0,entity_id,as_of_date,case_results_entity_id_1y_fine_max,case_results_entity_id_1y_fine_sum,case_results_entity_id_1y_fine_avg,case_results_entity_id_1y_jail_max,case_results_entity_id_1y_jail_sum,case_results_entity_id_1y_jail_avg,case_results_entity_id_1y_jail_prob_max,case_results_entity_id_1y_jail_prob_sum,...,num_prior_entity_id_10y_any_sum,num_prior_entity_id_10y_nonexcl_sum,num_prior_entity_id_10y_nonexcl_avg,num_prior_entity_id_50y_cases_sum,num_prior_entity_id_50y_bookings_sum,num_prior_entity_id_50y_any_sum,num_prior_entity_id_50y_nonexcl_sum,num_prior_entity_id_50y_nonexcl_avg,days_since_entity_id_50y_last_event_min,outcome
0,1,2017-01-01 00:00:00,0.0,0.0,0.0,0,0,0.0,0,0,...,0,0,0.0,3,0,3,3,1.0,5305,0
1,2,2017-01-01 00:00:00,0.0,0.0,0.0,0,0,0.0,0,0,...,2,2,1.0,3,1,4,4,1.0,1575,0
2,4,2017-01-01 00:00:00,0.0,0.0,0.0,0,0,0.0,0,0,...,1,1,1.0,3,0,3,3,1.0,2986,0
3,6,2017-01-01 00:00:00,0.0,0.0,0.0,0,0,0.0,0,0,...,0,0,0.0,2,0,2,2,1.0,6138,0
4,11,2017-01-01 00:00:00,0.0,0.0,0.0,0,0,0.0,0,0,...,2,2,1.0,1,1,2,2,1.0,1264,1


In [53]:
train_matrix_orig.days_since_entity_id_50y_last_event_min[train_matrix_orig.entity_id == 4]

2    2986
Name: days_since_entity_id_50y_last_event_min, dtype: int64

In [54]:
train_matrix_rand = pd.read_csv('experiment_output/matrices/0828a23f6b0d636924a7b9f0a4c89ec9.csv', nrows=100)
train_rand_cols = set(train_matrix_rand.columns)

In [56]:
train_matrix_rand.days_since_entity_id_50y_last_event_min[train_matrix_rand.entity_id == 4]

8     2986
9     2986
10    2986
11    2986
Name: days_since_entity_id_50y_last_event_min, dtype: int64

In [57]:
diff1 = train_orig_cols - train_rand_cols
diff1

diff2 = train_rand_cols - train_orig_cols
diff2

{'charge_info_entity_id_10y_chrg_cat_id_29_max',
 'charge_info_entity_id_10y_chrg_cat_id_29_sum',
 'charge_info_entity_id_1y_chrg_cat_id_29_max',
 'charge_info_entity_id_1y_chrg_cat_id_29_sum',
 'charge_info_entity_id_2y_chrg_cat_id_29_max',
 'charge_info_entity_id_2y_chrg_cat_id_29_sum',
 'charge_info_entity_id_50y_chrg_cat_id_29_max',
 'charge_info_entity_id_50y_chrg_cat_id_29_sum',
 'charge_info_entity_id_5y_chrg_cat_id_29_max',
 'charge_info_entity_id_5y_chrg_cat_id_29_sum',
 'freq_chgs_entity_id_7d_max_cat_id_29_max'}

set()

Issues:
- Even though the staging tables appear to be randomized, the features are not
- There are columns in original model not in randomized model

In [58]:
test_matrix_orig = pd.read_csv('model_521/2d6d75cc75f290801592df0ed32109fb.csv', nrows=100)
test_orig_cols = list(test_matrix_orig.columns)

In [59]:
test_matrix_orig.head()


Unnamed: 0,entity_id,as_of_date,case_results_entity_id_1y_fine_max,case_results_entity_id_1y_fine_sum,case_results_entity_id_1y_fine_avg,case_results_entity_id_1y_jail_max,case_results_entity_id_1y_jail_sum,case_results_entity_id_1y_jail_avg,case_results_entity_id_1y_jail_prob_max,case_results_entity_id_1y_jail_prob_sum,...,num_prior_entity_id_10y_any_sum,num_prior_entity_id_10y_nonexcl_sum,num_prior_entity_id_10y_nonexcl_avg,num_prior_entity_id_50y_cases_sum,num_prior_entity_id_50y_bookings_sum,num_prior_entity_id_50y_any_sum,num_prior_entity_id_50y_nonexcl_sum,num_prior_entity_id_50y_nonexcl_avg,days_since_entity_id_50y_last_event_min,outcome
0,1,2017-07-01 00:00:00,0,0,0.0,0,0,0.0,0,0,...,0,0,0.0,3,0,3,3,1.0,5486,0
1,2,2017-07-01 00:00:00,0,0,0.0,0,0,0.0,0,0,...,2,2,1.0,3,1,4,4,1.0,1756,0
2,4,2017-07-01 00:00:00,0,0,0.0,0,0,0.0,0,0,...,1,1,1.0,3,0,3,3,1.0,3167,0
3,6,2017-07-01 00:00:00,0,0,0.0,0,0,0.0,0,0,...,0,0,0.0,2,0,2,2,1.0,6319,0
4,11,2017-07-01 00:00:00,0,0,0.0,0,0,0.0,0,0,...,3,3,1.0,1,2,3,3,1.0,149,0


In [13]:
test_matrix_rand = pd.read_csv('experiment_output/matrices/500d25f0ba035bad035f1dd775f618bc.csv', nrows=100)
test_rand_cols = list(test_matrix_rand.columns)

In [60]:
test_matrix_rand.head()


Unnamed: 0,entity_id,as_of_date,demos_entity_id_50y_age_max,demos_entity_id_50y_age_first_case_max,demos_entity_id_50y_dob_imputed_max,demos_entity_id_50y_gender_male_max,demos_entity_id_50y_gender_imputed_name_max,demos_entity_id_50y_gender_imputed_pop_max,demos_entity_id_50y_race_H_max,demos_entity_id_50y_race_B_max,...,num_prior_entity_id_10y_bookings_sum,num_prior_entity_id_10y_any_sum,num_prior_entity_id_10y_nonexcl_sum,num_prior_entity_id_10y_nonexcl_avg,num_prior_entity_id_50y_cases_sum,num_prior_entity_id_50y_bookings_sum,num_prior_entity_id_50y_any_sum,num_prior_entity_id_50y_nonexcl_sum,num_prior_entity_id_50y_nonexcl_avg,outcome
0,1,2017-07-01 00:00:00,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.0
1,2,2017-07-01 00:00:00,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,
2,4,2017-07-01 00:00:00,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,
3,6,2017-07-01 00:00:00,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.0
4,11,2017-07-01 00:00:00,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,


Grab the evaluation metrics for each of these models.

In [None]:
statement = "SELECT * FROM results.evaluations where model_id = {};".format(MODEL_ID_ORIG)
evaluations_orig = execute_sql(statement, dbname, user, host, password, port)
evaluations_orig

In [None]:
statement = "SELECT * FROM results.evaluations where model_id = {};".format(MODEL_ID_RAND)
evaluations_rand = execute_sql(statement, dbname, user, host, password, port)
evaluations_rand

## VI. Review the predictions and important features of randomized model