Purpose: Demonstrate how leakage can be detected using a data randomization process.  
Process: Create fake data using bernoulli GLM data-generating process (DGP). Intentionally leak information and then show that this code helps detect the leakage.

In [1]:
import pandas as pd
import sys
import numpy as np
import os
import math
from sklearn import linear_model
from sklearn.model_selection import TimeSeriesSplit, train_test_split
from sklearn import metrics 
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import warnings
warnings.filterwarnings(action='once')

In [2]:
# Set seed
np.random.seed(0)

In [3]:
def randomize(filepath, index_col = None):
    '''
    Randomize column values of a file. Each column is randomized independently.
    
    Inputs:
        filepath (str): path to file to randomize; may be of type csv or txt
        index_col (str): optional name of column to use as index; will not be randomized
    
    Outputs:
        df (dataframe): dataframe representation of randomized data
        Output file will be generated, named as original file name + "_randomized"
    '''
    
    # Treat csv and txt differently
    filename, file_extension = os.path.splitext(filepath)
    if file_extension == '.csv':
        sep = ","
    elif file_extension == '.txt':
        sep = "\t"
        
    # Randomize each column
    df = pd.read_csv(filepath, sep = sep, index_col = index_col) 
    cols = df.columns
    for col in cols:
        print('... Randomizing column ' + col)
        df[col] = np.random.permutation(df[col])
        
    # Print to new csv or txt
    new_file = filename + '_randomized' + file_extension
    df.to_csv(new_file)   
    
    return df

In [13]:
def generate_data(period, n_rows, m_columns, threshold, betas, intercept):
    '''
    Generate fake data for a classification task, based on data generating process where:
    - Features are drawn iid from standard normal distribution
    - Regression equation: y = intercept + beta_1 * x1 + .. + beta_m * xm + error
    - Error is drawn from standard normal distribution
    - Outcome is 1 if p value from logistic function exceeds threshold, 0 otherwise
    
    Inputs:
        period (int): time period
        n_rows (int): number of observations to generate
        m_columns (int): number of features to generate
        betas (list): list of numerical values to use as coefficients; length must equal m_columns, 
            i.e. don't include intercept
        intercept (int): constant number to use as intercept of regression equation
        
    Outputs:
        X (dataframe): matrix of feature variables
        y (series): outcome variable
    '''
    print("Generating {} observations...".format(n_rows))
    
    # generate column names
    colnames = []
    for i in range(m_columns):
        colnames.append('x' + str(i + 1))
    
    # generate fake data
    df = pd.DataFrame(np.random.normal(size=(n_rows, m_columns)), columns = colnames)
    df.insert(loc = 0, column = 'intercept', value = intercept)
    betas = [1] + betas
    df['z'] = np.multiply(np.array(betas), df).sum(axis=1) + np.random.normal()
    df['pr'] = df['z'].apply(lambda x: 1/(1 + math.exp(x)))
    df['outcome'] = np.where(df['pr'] > threshold, 1, 0)
    df.index.name = 'idx'
    df['period'] = period
    
    return df

In [5]:
def run_logreg(X_train, y_train, X_test, y_test):
    
    print("Baseline accuracy: " + str(round(1 - np.mean(y_train), 2)))
    
    # Create a logistic regression object
    logreg = linear_model.LogisticRegression()

    # Train model with training set
    logreg.fit(X_train, y_train)

    # Make predictions with test set
    y_pred = logreg.predict(X_test)

    # Save probabilities
    probs = logreg.predict_proba(X_test)

    # Print accuracy and AUC
    print("Accuracy: " + str(metrics.accuracy_score(y_test, y_pred)))
    print("AUC Score: " + str(metrics.roc_auc_score(y_test, probs[:, 1])))

    # Print confusion matrix
    print(metrics.confusion_matrix(y_test, y_pred))
    
    return logreg

In [6]:
def logreg_pipeline(df):
    '''
    Put raw inputs through basic machine learning pipeline.
    
    Inputs:
        dataframe: Pandas dataframe representing raw input file
    
    OUtputs:
        logistic regression model object
    '''
    # Extract features and outcome from dataframe
    X = df.drop(['z', 'pr', 'outcome'], axis = 1)
    y = df['outcome']
    
    # Split into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)
   
    return run_logreg(X_train, y_train, X_test, y_test)

## Case 1: Stable DGP (same parameters and threshold over time)
### Non-temporal example with stable DGP

Generate data for 5 periods with constant DGP

In [7]:
frames = []
for i in range(5):
    period_df = generate_data(period = i, n_rows = 200, m_columns = 5, threshold = 0.8, betas = [1,2,3,4,5], intercept = 1)
    frames.append(period_df)
df = pd.concat(frames)
df.to_csv('fake_data.csv')

Generating 200 observations...
Generating 200 observations...
Generating 200 observations...
Generating 200 observations...
Generating 200 observations...


Run original input file through pipeline.

In [8]:
logreg_pipeline(df)

Baseline accuracy: 0.6
Accuracy: 0.951515151515
AUC Score: 0.993717636308
[[186  10]
 [  6 128]]


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

Put the input file through randomization process, then run it through the pipeline. Model performs no better than random, as expected.

In [9]:
df_random = randomize('fake_data.csv', 'idx') # creates csv named fake_data.csv
logreg_pipeline(df_random)

... Randomizing column intercept
... Randomizing column x1
... Randomizing column x2
... Randomizing column x3
... Randomizing column x4
... Randomizing column x5
... Randomizing column z
... Randomizing column pr
... Randomizing column outcome
... Randomizing column period
Baseline accuracy: 0.61
Accuracy: 0.584848484848
AUC Score: 0.475776256571
[[193   0]
 [137   0]]


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

Try introducing an obvious case of data leakage by using the probability column as a feature. Run this through the pipeline. Model performs well, as expected.

In [10]:
df['leaky_col'] = df['pr']
logreg_pipeline(df)

Baseline accuracy: 0.6
Accuracy: 0.978787878788
AUC Score: 0.998730769231
[[194   6]
 [  1 129]]


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

Put the input file through randomization process. Again, introduce data leakage into the pipeline. Model performs better than random, thus suggesting the existence of data leakage.

In [11]:
df_random = randomize('fake_data.csv', 'idx') # generates csv named "fake_data_randomized.csv"
df['leaky_col'] = df['pr']
logreg_pipeline(df)

... Randomizing column intercept
... Randomizing column x1
... Randomizing column x2
... Randomizing column x3
... Randomizing column x4
... Randomizing column x5
... Randomizing column z
... Randomizing column pr
... Randomizing column outcome
... Randomizing column period
Baseline accuracy: 0.6
Accuracy: 0.966666666667
AUC Score: 0.997206917661
[[190   8]
 [  3 129]]


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

## Case 2: Regime change - intercept
### The DGP is stable over time, then the intercept suddenly changes

Generate data for 5 periods with intercept change in period 2

In [15]:
frames = []
for i in [0,1]:
    period_df = generate_data(period = i, n_rows = 200, m_columns = 5, threshold = 0.8, betas = [1,2,3,4,5], intercept = 1)
    frames.append(period_df)
for i in [2,3,4]:
    period_df = generate_data(period = i, n_rows = 200, m_columns = 5, threshold = 0.8, betas = [1,2,3,4,5], intercept = 10) 
    frames.append(period_df)
df = pd.concat(frames)
df.to_csv('fake_data.csv')


Generating 200 observations...
Generating 200 observations...
Generating 200 observations...
Generating 200 observations...
Generating 200 observations...
