Purpose: Demonstrate how leakage can be detected using a data randomization process.  
Process: Create fake data using bernoulli GLM data-generating process. Intentionally leak information and then show that this code helps detect the leakage.

In [4]:
import pandas as pd
import sys
import numpy as np
import os


In [22]:
def randomize(filepath, index_col = None):
    '''
    Randomize column values of a file. Each column is randomized independently.
    
    Inputs:
        filepath (str): path to file to randomize; may be of type csv or txt
        index_col (str): optional name of column to use as index; will not be randomized
    
    Outputs:
        df (dataframe): dataframe representation of randomized data
        Output file will be generated, named as original file name + "_randomized"
    '''
    
    # Treat csv and txt differently
    filename, file_extension = os.path.splitext(filepath)
    if file_extension == '.csv':
        sep = ","
    elif file_extension == '.txt':
        sep = "\t"
        
    # Randomize each column
    df = pd.read_csv(filepath, sep = sep, index_col = index_col) 
    cols = df.columns
    for col in cols:
        print('Randomizing column ' + col)
        np.random.shuffle(df[col])
        
    # Print to new csv or txt
    new_file = filename + '_randomized' + file_extension
    df.to_csv(new_file)   
    
    return df

In [35]:
def generate_data(n):
    '''
    Generate n rows x 5 columns of random integers, drawn from discrete
    uniform distribution between -10 and 10
    Regression equation: y = intercept + x1 + x2 + x3 + x4 + x5 + error
    Intercept is column of 1's to capture model intercept
    Error is normally distributed with mean = 0 and variance = 1
    
    Inputs:
        n (int): number of observations to generate
    Outputs:
        Output files will be generated: fake_data.csv and fake_data_no_outcome.csv
    '''
    print("Generating {} observations...".format(n))
    df = pd.DataFrame(np.random.randint(-10,10, size=(n,5)), 
                 columns=['x1', 'x2', 'x3', 'x4', 'x5'])
    df['intercept'] = 1
    df['actual_y'] = df.sum(axis = 1) + np.random.normal()
    df.index.name = 'idx'
    df.to_csv('fake_data.csv')
    
    df_no_actual = df.drop(['actual_y'], axis = 1)
    df_no_actual.to_csv('fake_data_no_outcome.csv')
    return df


In [36]:
def test_randomization(file_no_outcome, file_with_outcome, index_col = None):
    ''' 
    Function to test the randomization function by comparing mean squared error 
    (MSE) before and after randomization
    MSE = 1/n * sum((actual - predicted)**2)
    '''
    # Before randomization
    orig_df = pd.read_csv(file_with_outcome, index_col = index_col)
    
    orig_df['pred_y'] = orig_df[['x1', 'x2', 'x3', 'x4', 'x5']].sum(axis = 1)
    orig_df['sq_error'] = (orig_df['actual_y'] - orig_df['pred_y'])**2
    orig_mse =  np.sum(orig_df['sq_error']) / orig_df.shape[0]
    print('original mean squared error = ' + str(orig_mse))
    
    # After randomization
    random_df = randomize(file_no_outcome, index_col)
    random_df = random_df.join(orig_df['actual_y'], how='inner')
    
    random_df['pred_y'] = random_df[['x1', 'x2', 'x3', 'x4', 'x5']].sum(axis = 1)
    random_df['sq_error'] = (random_df['actual_y'] - random_df['pred_y'])**2
    random_mse =  np.sum(random_df['sq_error']) / random_df.shape[0]
    print('random mean squared error = ' + str(random_mse))
    
    return random_df

In [37]:
generate_data(100)
file_no_outcome = 'fake_data_no_outcome.csv'
file_with_outcome = 'fake_data.csv'
df = test_randomization(file_no_outcome, file_with_outcome, 'idx')

Generating 100 observations...
original mean squared error = 2.1487900900386587
Randomizing column x1
Randomizing column x2
Randomizing column x3
Randomizing column x4
Randomizing column x5
Randomizing column intercept
random mean squared error = 295.96879009003874
