## Purpose  
Demonstrate how leakage can be detected using a data randomization process.  
## Process
Create fake data using bernoulli GLM data-generating process (DGP). Intentionally leak information and then show that this code helps detect the leakage.

In [1]:
import pandas as pd
import sys
import numpy as np
import os
import math
from sklearn import linear_model
from sklearn.model_selection import TimeSeriesSplit, train_test_split
from sklearn import metrics 
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import warnings
warnings.filterwarnings(action='once')

In [2]:
# Set seed
np.random.seed(0)

In [31]:
def randomize(filepath, index_col = None, do_not_randomize = None):
    '''
    Randomize column values of a file. Each column is randomized independently.
    
    Inputs:
        filepath (str): path to file to randomize; may be of type csv or txt
        index_col (str): optional name of column to use as index; will not 
            be randomized
        do_not_randomize (list): optional list of strings indicating names of 
            columns that should not be randomized
    Outputs:
        df (dataframe): dataframe representation of randomized data
        Output file will be generated, named as original file name + "_randomized"
    '''
    
    # Treat csv and txt differently
    filename, file_extension = os.path.splitext(filepath)
    if file_extension == '.csv':
        sep = ","
    elif file_extension == '.txt':
        sep = "\t"
        
    # Randomize column by column
    df = pd.read_csv(filepath, sep = sep, index_col = index_col) 
    
    if do_not_randomize:
        cols = [c for c in df.columns if c not in do_not_randomize]
    else:
        cols = df.columns
        
    for col in cols:
        print('... Randomizing column ' + col)
        df[col] = np.random.permutation(df[col])
        
    # Print to new csv or txt
    new_file = filename + '_randomized' + file_extension
    df.to_csv(new_file)   
    
    return df

In [4]:
def generate_data(period, n_rows, m_columns, threshold, betas, intercept):
    '''
    Generate fake data for a classification task, based on data generating process where:
    - Features are drawn iid from standard normal distribution
    - Regression equation: y = intercept + beta_1 * x1 + .. + beta_m * xm + error
    - Error is drawn from standard normal distribution
    - Outcome is 1 if p value from logistic function exceeds threshold, 0 otherwise
    
    Inputs:
        period (int): time period
        n_rows (int): number of observations to generate
        m_columns (int): number of features to generate
        betas (list): list of numerical values to use as coefficients; length must equal m_columns, 
            i.e. don't include intercept
        intercept (int): constant number to use as intercept of regression equation
        
    Outputs:
        X (dataframe): matrix of feature variables
        y (series): outcome variable
    '''
    print("Generating {} observations...".format(n_rows))
    
    # generate column names
    colnames = []
    for i in range(m_columns):
        colnames.append('x' + str(i + 1))
    
    # generate fake data
    df = pd.DataFrame(np.random.normal(size=(n_rows, m_columns)), columns = colnames)
    df.insert(loc = 0, column = 'intercept', value = intercept)
    betas = [1] + betas
    df['z'] = np.multiply(np.array(betas), df).sum(axis=1) + np.random.normal()
    df['pr'] = df['z'].apply(lambda x: 1/(1 + math.exp(x)))
    df['outcome'] = np.where(df['pr'] > threshold, 1, 0)
    df.index.name = 'idx'
    df['period'] = period
    
    return df

In [5]:
def run_logreg(X_train, y_train, X_test, y_test):
    
    print("Baseline accuracy: " + str(round(1 - np.mean(y_train), 2)))
    
    # Create a logistic regression object
    logreg = linear_model.LogisticRegression()

    # Train model with training set
    logreg.fit(X_train, y_train)

    # Make predictions with test set
    y_pred = logreg.predict(X_test)

    # Save probabilities
    probs = logreg.predict_proba(X_test)

    # Print accuracy and AUC
    print("Accuracy: " + str(metrics.accuracy_score(y_test, y_pred)))
    print("AUC Score: " + str(metrics.roc_auc_score(y_test, probs[:, 1])))

    # Print confusion matrix
    print(metrics.confusion_matrix(y_test, y_pred))
    
    return logreg#, X_test, y_test, y_pred

In [6]:
def get_X_y(df):
    '''
    Split data frame into X and y objects 
    '''
    X = df.drop(['z', 'pr', 'outcome'], axis = 1)
    y = df['outcome']
    return X, y

def logreg_pipeline(df, temporal = False):
    '''
    Put raw inputs through basic machine learning pipeline.
    
    Inputs:
        dataframe: Pandas dataframe representing raw input file
        temporal (booL): Indicator for whether a temporal TimeSeriesSplit
            should be use instead of non-temporal train_test_split
    OUtputs:
        list of logistic regression model objects
    '''
    
    # Save models
    models = []
    
    # Split into train and test sets
    if temporal: # do temporal train-test split
        periods = np.sort(df.period.unique())
        splits = TimeSeriesSplit(n_splits = max(periods))
        print("Time series split")
        for train_index, test_index in splits.split(periods):
            print("Train periods: ", train_index, "Test periods: ", test_index)
            df_train = df[df.period.isin(train_index)]
            df_test = df[df.period.isin(test_index)]
            X_train, y_train = get_X_y(df_train)
            X_test, y_test = get_X_y(df_test)
            models.append(run_logreg(X_train, y_train, X_test, y_test))

    else: # do random, non-temporal train-test split
        print("Train test split -- test size = 0.33")
        X, y = get_X_y(df)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)
        models.append(run_logreg(X_train, y_train, X_test, y_test))
    return models

## Case 1: Stable DGP (same parameters and threshold over time)
### Non-temporal example with stable DGP

Generate data for 5 periods with constant DGP

In [32]:
frames = []
for i in range(5):
    period_df = generate_data(period = i, n_rows = 200, m_columns = 5, threshold = 0.8, betas = [1,2,3,4,5], intercept = 1)
    frames.append(period_df)
df = pd.concat(frames)
df.to_csv('fake_data.csv')

Generating 200 observations...
Generating 200 observations...
Generating 200 observations...
Generating 200 observations...
Generating 200 observations...


Run original input file through pipeline.

In [8]:
logreg_pipeline(df)

Train test split -- test size = 0.33
Baseline accuracy: 0.6
Accuracy: 0.951515151515
AUC Score: 0.993717636308
[[186  10]
 [  6 128]]


[LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
           intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
           penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
           verbose=0, warm_start=False)]

Put the input file through randomization process, then run it through the pipeline. Model performs no better than random, as expected.

In [33]:
df_random = randomize('fake_data.csv', 'idx', ['period', 'outcome']) # creates csv named fake_data_randomized.csv
logreg_pipeline(df_random)

... Randomizing column intercept
... Randomizing column x1
... Randomizing column x2
... Randomizing column x3
... Randomizing column x4
... Randomizing column x5
... Randomizing column z
... Randomizing column pr
Train test split -- test size = 0.33
Baseline accuracy: 0.59
Accuracy: 0.572727272727
AUC Score: 0.489920956091
[[182  11]
 [130   7]]


[LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
           intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
           penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
           verbose=0, warm_start=False)]

Try introducing an obvious case of data leakage by using the outcome column as a feature. Run this through the pipeline. Model performs well, as expected.

In [10]:
df['leaky_col'] = df['outcome']
logreg_pipeline(df)

Train test split -- test size = 0.33
Baseline accuracy: 0.6
Accuracy: 1.0
AUC Score: 1.0
[[200   0]
 [  0 130]]


[LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
           intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
           penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
           verbose=0, warm_start=False)]

Put the input file through randomization process. Again, introduce data leakage into the pipeline. Model performs better than random, thus suggesting the existence of data leakage.

In [34]:
df_random = randomize('fake_data.csv', 'idx', ['period', 'outcome']) # generates csv named "fake_data_randomized.csv"
df_random['leaky_col'] = df_random['outcome']
logreg_pipeline(df_random)

... Randomizing column intercept
... Randomizing column x1
... Randomizing column x2
... Randomizing column x3
... Randomizing column x4
... Randomizing column x5
... Randomizing column z
... Randomizing column pr
Train test split -- test size = 0.33
Baseline accuracy: 0.59
Accuracy: 1.0
AUC Score: 1.0
[[191   0]
 [  0 139]]


[LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
           intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
           penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
           verbose=0, warm_start=False)]

## Case 2: Regime change - intercept
### The DGP is stable over time, then the intercept suddenly changes

Generate data for 6 periods with intercept changes in periods 2 and 4  
Dataset is in temporal order by default

In [81]:
frames = []
for i in [0,1]:
    period_df = generate_data(period = i, n_rows = 200, m_columns = 5, threshold = 0.8, betas = [1,2,3,4,5], intercept = 1)
    frames.append(period_df)
for i in [2,3]:
    period_df = generate_data(period = i, n_rows = 200, m_columns = 5, threshold = 0.8, betas = [1,2,3,4,5], intercept = 2) 
    frames.append(period_df)
for i in [4,5]:
    period_df = generate_data(period = i, n_rows = 200, m_columns = 5, threshold = 0.8, betas = [1,2,3,4,5], intercept = 3) 
    frames.append(period_df)
df = pd.concat(frames)
df.to_csv('fake_data.csv')

Generating 200 observations...
Generating 200 observations...
Generating 200 observations...
Generating 200 observations...
Generating 200 observations...
Generating 200 observations...


See what happens when you use regular split (thus introducing leakage) vs the correct time series split

Run pipeline using appropriate temporal train-test split

In [82]:
logreg_pipeline(df, temporal=True)

Time series split
Train periods:  [0] Test periods:  [1]
Baseline accuracy: 0.76
Accuracy: 0.745
AUC Score: 0.98686716792
[[105   0]
 [ 51  44]]
Train periods:  [0 1] Test periods:  [2]
Baseline accuracy: 0.64
Accuracy: 0.805
AUC Score: 0.977114121511
[[78 38]
 [ 1 83]]
Train periods:  [0 1 2] Test periods:  [3]
Baseline accuracy: 0.62
Accuracy: 0.835
AUC Score: 0.988204456094
[[76 33]
 [ 0 91]]
Train periods:  [0 1 2 3] Test periods:  [4]
Baseline accuracy: 0.6
Accuracy: 0.875
AUC Score: 0.991779448622
[[80 25]
 [ 0 95]]
Train periods:  [0 1 2 3 4] Test periods:  [5]
Baseline accuracy: 0.59
Accuracy: 0.915
AUC Score: 0.989298929893
[[84 15]
 [ 2 99]]


[LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
           intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
           penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
           verbose=0, warm_start=False),
 LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
           intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
           penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
           verbose=0, warm_start=False),
 LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
           intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
           penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
           verbose=0, warm_start=False),
 LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
           intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
           penalty='l2', random_state

Put input file through randomization process, then run it through the pipeline. Model performs no better than random, as expected.

In [83]:
df_random = randomize('fake_data.csv', 'idx', ['period', 'outcome']) # creates csv named fake_data_randomized.csv
logreg_pipeline(df_random, temporal = True)

... Randomizing column intercept
... Randomizing column x1
... Randomizing column x2
... Randomizing column x3
... Randomizing column x4
... Randomizing column x5
... Randomizing column z
... Randomizing column pr
Time series split
Train periods:  [0] Test periods:  [1]
Baseline accuracy: 0.76
Accuracy: 0.525
AUC Score: 0.427468671679
[[105   0]
 [ 95   0]]
Train periods:  [0 1] Test periods:  [2]
Baseline accuracy: 0.64
Accuracy: 0.42
AUC Score: 0.402914614122
[[  0 116]
 [  0  84]]
Train periods:  [0 1 2] Test periods:  [3]
Baseline accuracy: 0.62
Accuracy: 0.45
AUC Score: 0.479685452163
[[  9 100]
 [ 10  81]]
Train periods:  [0 1 2 3] Test periods:  [4]
Baseline accuracy: 0.6
Accuracy: 0.475
AUC Score: 0.565614035088
[[ 8 97]
 [ 8 87]]
Train periods:  [0 1 2 3 4] Test periods:  [5]
Baseline accuracy: 0.59
Accuracy: 0.51
AUC Score: 0.477247724772
[[11 88]
 [10 91]]


[LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
           intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
           penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
           verbose=0, warm_start=False),
 LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
           intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
           penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
           verbose=0, warm_start=False),
 LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
           intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
           penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
           verbose=0, warm_start=False),
 LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
           intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
           penalty='l2', random_state

Introduce data leakage by using non-temporal train-test split. Run this through the pipeline. Model performs well, as expected.

In [84]:
logreg_pipeline(df)

Train test split -- test size = 0.33
Baseline accuracy: 0.56
Accuracy: 0.94696969697
AUC Score: 0.985123966942
[[221  10]
 [ 11 154]]


[LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
           intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
           penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
           verbose=0, warm_start=False)]

Put the input file through randomization process. Again, introduce data leakage into the pipeline by using non-temporal train-test split. Model performs no better than random. EXPECTING TO SEE IT 

In [85]:
df_random = randomize('fake_data.csv', 'idx', ['period', 'outcome']) # generates csv named "fake_data_randomized.csv"
logreg_pipeline(df_random)

... Randomizing column intercept
... Randomizing column x1
... Randomizing column x2
... Randomizing column x3
... Randomizing column x4
... Randomizing column x5
... Randomizing column z
... Randomizing column pr
Train test split -- test size = 0.33
Baseline accuracy: 0.58
Accuracy: 0.55303030303
AUC Score: 0.555475113122
[[185  36]
 [141  34]]


[LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
           intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
           penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
           verbose=0, warm_start=False)]