In [0]:
import pandas as pd
import numpy as np
import random
import time
import multiprocessing
from sklearn.utils import resample
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import SVC

In [0]:
import warnings
warnings.filterwarnings('ignore')

Load dataset from google drive to google colab

In [0]:
# Code to read csv file into Colaboratory:
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
# Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [0]:
link = 'https://drive.google.com/open?id=11XyIbLKXB8ILlL_brU2wX7f5GKWOFG1k'
fluff, id = link.split('=')
downloaded = drive.CreateFile({'id':id}) 
downloaded.GetContentFile('train_data_capped6.csv')  
trainDF = pd.read_csv('train_data_capped6.csv')

In [0]:
link = 'https://drive.google.com/open?id=14ciuYIzjQ4U4UxbjAu7XSl1JpNc0JIKg'
fluff, id = link.split('=')
downloaded = drive.CreateFile({'id':id}) 
downloaded.GetContentFile('train_downsampled_data_capped6.csv')  
train_dsDF = pd.read_csv('train_downsampled_data_capped6.csv')

In [0]:
link = 'https://drive.google.com/open?id=1Qf25d-OOKITQm6P4eOYYqFHxk6i6pHa-'
fluff, id = link.split('=')
downloaded = drive.CreateFile({'id':id}) 
downloaded.GetContentFile('test_data_capped6.csv')  
testDF = pd.read_csv('test_data_capped6.csv')

In [0]:
link = 'https://drive.google.com/open?id=14ssWruo6Or2s3m4WRxYzqTzB_Lb_eR7d'
fluff, id = link.split('=')
downloaded = drive.CreateFile({'id':id}) 
downloaded.GetContentFile('oot_data_capped6.csv')  
ootDF = pd.read_csv('oot_data_capped6.csv')

In [0]:
print('train:',trainDF.shape)
print('train_dsDF:',train_dsDF.shape)
print('test:',testDF.shape)
print('oot:',ootDF.shape)

train: (635996, 26)
train_dsDF: (101079, 26)
test: (159000, 26)
oot: (166493, 26)


**Define functions**


In [0]:
X_train = trainDF.iloc[:,:-1]
y_train = trainDF.iloc[:,-1]
X_train_ds = train_dsDF.iloc[:,:-1]
y_train_ds = train_dsDF.iloc[:,-1]
X_test = testDF.iloc[:,:-1]
y_test = testDF.iloc[:,-1]
X_oot = ootDF.iloc[:,:-1]
y_oot = ootDF.iloc[:,-1]

fdr_score: Calculate FDR

In [0]:
def fdr_score(y_pred_prob, y_obs, alpha=0.03):
  n_rows = int(round(len(y_obs)*alpha))
  numBads = sum(y_obs)
  df = pd.DataFrame([y_pred_prob,y_obs]).T
  df.columns = ['pred_prob','label']
  df = df.sort_values('pred_prob',ascending=False)
  return df.head(n_rows)['label'].sum()/numBads*100

trn_resample: Subsample the good records in the training data to match the specified good_bad_ratio


In [0]:
def trn_resample(X, y, good_bad_ratio=10):
  df = pd.concat([X, y], axis=1)
  goods = df[df.fraud_label==0]
  bads = df[df.fraud_label==1]
  n_bads = bads.shape[0]
  goods_downsampled = resample(goods,
                  replace=False,
                  n_samples=n_bads*good_bad_ratio)
  df_downsampled = pd.concat([goods_downsampled, bads])
  X_downsampled = df_downsampled.iloc[:,:-1].values
  y_downsampled = df_downsampled.iloc[:,-1].values
  return X_downsampled, y_downsampled

fit_avg_fdr: Refitting model multiple times to get averaged FDR on training/testing/oot data

In [0]:
def fit_avg_fdr(model, k=5):
  fdr_trainList = []
  fdr_testList = []
  fdr_ootList = []
  accuracyList = []
    
  for i in range(k):
    X_train_downsampled, y_train_downsampled = trn_resample(X_train, y_train)
    model.fit(X_train_downsampled, y_train_downsampled)
    
    y_train_pred_prob = model.predict_proba(X_train)[:,1]
    y_test_pred_prob = model.predict_proba(X_test)[:,1]
    y_oot_pred_prob = model.predict_proba(X_oot)[:,1]
    
    fdr_trainList.append(fdr_score(y_train_pred_prob,y_train))
    fdr_testList.append(fdr_score(y_test_pred_prob,y_test))
    fdr_ootList.append(fdr_score(y_oot_pred_prob,y_oot))
    accuracyList.append(model.score(X_train_downsampled, y_train_downsampled))
    
  score_summary = np.array([fdr_trainList, fdr_testList, fdr_ootList, accuracyList]).T
  avg_scores = score_summary.mean(0)
  
  return score_summary, avg_scores

cross_val_fdr: Performs k-fold cross validation and calculates the FDR on the validation set

In [0]:
import random
from sklearn.utils import resample

def cross_val_fdr(model, X, y, cv=5, verbose=True):
  arrX = X.values
  arry = y.values
  n = len(arry)
    
  # Calculate fold sizes
  fold_size = int(n/cv)
  fold_sizes = [fold_size]*cv
  remainder = n - fold_size*cv
  for i in range(remainder):
    fold_sizes[i]+=1
        
  # Split folds
  folds_X = []
  folds_y = []
  for fs in fold_sizes:
    index = random.choices(range(len(arrX)), k=fs)
    folds_X.append(arrX[index,:])
    folds_y.append(arry[index])
    arrX = np.delete(arrX, index, 0)
    arry = np.delete(arry, index, 0)
        
  # Fit model
  FDRs = []
  new = True
  for i in range(cv):
    X_tst = folds_X[i]
    y_tst = folds_y[i]
        
    # Combine folds
    for j in [j for j in range(cv) if j!=i]:
      if new:
        X_trn = folds_X[j]
        y_trn = folds_y[j]
        new = False
      else:
        X_trn = np.concatenate((X_trn, folds_X[j]), axis=0)
        y_trn = np.concatenate((y_trn, folds_y[j]), axis=0)
            
    #print("Train:", len(X_trn))
    #print("Train fraud", sum(y_trn))
    #print("Test:", len(X_tst))
    #print("Test fraud", sum(y_tst))
            
    # Resample training set
    X_trn_df = pd.DataFrame(X_trn)
    X_trn_df.columns = X.columns
    y_trn_df = pd.DataFrame(y_trn)
    y_trn_df.columns = [y.name]
    X_trn_downsampled, y_trn_downsampled = trn_resample(X_trn_df, y_trn_df)
    
    #print("Train resampled:", len(X_trn_downsampled))
    #print("Train resampled fraud:", sum(y_trn_downsampled))

    # Calculate FDR
    model.fit(X_trn_downsampled, y_trn_downsampled)
    y_pred_prob = model.predict_proba(X_tst)[:,1]
    fdr = fdr_score(y_pred_prob, y_tst)
    #print(fdr)
    FDRs.append(fdr)
    new = True
    if verbose:
      print('CV {} processed, FDR={:.2f}%'.format(i+1, fdr))
        
  return np.array(FDRs)

# one bug: total param combinations/njobs has to be >= 2
# set njobs smaller when you don't have that many param combinations

Randomized Search and Grid Search (brute force): Searches through random/all combinations of specified parameters and returns the combination with the highest FDR with k-fold validation. When n_iter is not specified in Randomized Search, n_combination/10 searches will be performed.

In [0]:
import itertools as it

def get_paramsList(params_grid):
  allNames = sorted(params_grid)
  combinations = it.product(*(params_grid[Name] for Name in allNames))
  all_params = list(combinations)
  return allNames, all_params

In [0]:
def split(a, n):
  k, m = divmod(len(a), n)
  return [a[i * k + min(i, m):(i + 1) * k + min(i + 1, m)] for i in range(n)]

In [0]:
def main_func(model, X, y, param_names, all_params, cv, fdr_dict):
  max_fdr = 0
  best_params = 0
  
  for cur_params in all_params:
    starttime = time.time()
    params = dict(zip(param_names, cur_params))
    print(params)
    model.set_params(**params)
    fdr = cross_val_fdr(model, X, y, verbose=False, cv=cv).mean()
    print('{}, FDR: {:.2f}%, {} sec elapsed'.format(params, fdr, round(time.time() - starttime)))
    #print('fdr:', fdr)
    if fdr > max_fdr:
      max_fdr = fdr
      best_params = params
  
  fdr_dict[max_fdr] = best_params

In [0]:
def randomized_search_cv(model, params_grid, X, y, n_iter='auto', cv=5, n_jobs=1):
  
  tot_starttime = time.time()
  
  param_names, all_params = get_paramsList(params_grid)
  print("Total combination:", len(all_params))
  if n_iter=='auto':
    test_size = int(len(all_params)/10)
  else:
    test_size = n_iter
  print("Randomized search size:", test_size)
  
  selected = []
  for i in range(test_size):
    taken_params = random.choice(all_params)
    all_params.remove(taken_params)
    selected.append(taken_params)
  
  if n_jobs == -1:
    n_jobs = multiprocessing.cpu_count()
      
  paramsList = list(split(selected, n_jobs))
  
  manager = multiprocessing.Manager()
  fdr_dict = manager.dict()
  processes = []
  for i in range(n_jobs):
    p = multiprocessing.Process(target=main_func, 
                                args=(model, X, y, param_names, paramsList[i], cv, fdr_dict))
    processes.append(p)
    p.start()
      
  for process in processes:
    process.join()
  
  final_max_fdr = max(fdr_dict.keys())
  final_best_params = fdr_dict[final_max_fdr]
  
  print('Total time elapsed: {:.1f} minutes'.format((time.time() - tot_starttime)/60))
  
  return final_max_fdr, final_best_params

summary: Summarize FDR on training/testing/oot data after model is fitted

In [0]:
def summary(model):
  y_train_pred_prob = model.predict_proba(X_train)[:,1]
  y_test_pred_prob = model.predict_proba(X_test)[:,1]
  y_oot_pred_prob = model.predict_proba(X_oot)[:,1]
  print('Train FDR at 3%: {:.2f}%'.format(fdr_score(y_train_pred_prob,y_train)))
  print('Test FDR at 3%: {:.2f}%'.format(fdr_score(y_test_pred_prob,y_test)))
  print('OOT FDR at 3%: {:.2f}%'.format(fdr_score(y_oot_pred_prob,y_oot)))
    
#  print(pd.DataFrame([X_train.columns, model.feature_importances_]).T.sort_values(1,ascending=False).rename({0:"feature",1:"importance"},axis=1))

**Modeling**

Logistic Regression

In [0]:
test_grid = {
    'penalty': ["elasticnet","l1","l2"],
    'solver' : ["saga"],
    'l1_ratio' : [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
}

In [0]:
lr = LogisticRegression()
lr_best_fdr, lr_best_params = randomized_search_cv(lr, test_grid, X_train, y_train, n_jobs=-1, n_iter=16)
lr_best_fdr, lr_best_params

Total combination: 33
Randomized search size: 16
{'l1_ratio': 0.3, 'penalty': 'l2', 'solver': 'saga'}
{'l1_ratio': 0.4, 'penalty': 'l1', 'solver': 'saga'}
{'l1_ratio': 0.8, 'penalty': 'l1', 'solver': 'saga'}
{'l1_ratio': 0.9, 'penalty': 'l2', 'solver': 'saga'}
{'l1_ratio': 0.3, 'penalty': 'l2', 'solver': 'saga'}, FDR: 55.59%, 91 sec elapsed
{'l1_ratio': 0.6, 'penalty': 'l1', 'solver': 'saga'}
{'l1_ratio': 0.9, 'penalty': 'l2', 'solver': 'saga'}, FDR: 55.50%, 93 sec elapsed
{'l1_ratio': 1, 'penalty': 'l1', 'solver': 'saga'}
{'l1_ratio': 0.4, 'penalty': 'l1', 'solver': 'saga'}, FDR: 55.00%, 100 sec elapsed
{'l1_ratio': 0.5, 'penalty': 'l1', 'solver': 'saga'}
{'l1_ratio': 0.8, 'penalty': 'l1', 'solver': 'saga'}, FDR: 54.87%, 101 sec elapsed
{'l1_ratio': 0.7, 'penalty': 'l1', 'solver': 'saga'}
{'l1_ratio': 0.6, 'penalty': 'l1', 'solver': 'saga'}, FDR: 55.79%, 101 sec elapsed
{'l1_ratio': 0.9, 'penalty': 'elasticnet', 'solver': 'saga'}
{'l1_ratio': 1, 'penalty': 'l1', 'solver': 'saga'}, FDR

(55.90990025203754, {'l1_ratio': 1, 'penalty': 'l1', 'solver': 'saga'})

In [0]:
# Default
model = LogisticRegression()
result = model.fit(X_train_ds,y_train_ds)
print('Default model summary:')
summary(model)

# Randomized search result
model = LogisticRegression(penalty='l1',solver='saga',l1_ratio=1)
result = model.fit(X_train_ds,y_train_ds)
print('Randomized search result summary:')
summary(model)

Default model summary:
Train FDR at 3%: 55.62%
Test FDR at 3%: 54.33%
OOT FDR at 3%: 52.68%
Randomized search result summary:
Train FDR at 3%: 55.61%
Test FDR at 3%: 54.29%
OOT FDR at 3%: 52.68%


In [0]:
for i in np.arange(0,1.1,0.1):
  model = LogisticRegression(penalty='elasticnet',solver='saga',l1_ratio=i)
  result = model.fit(X_train_ds,y_train_ds)

l1_ratio: 0.0
Train FDR at 3%: 55.61%
Test FDR at 3%: 54.29%
OOT FDR at 3%: 52.68%
l1_ratio: 0.1
Train FDR at 3%: 55.60%
Test FDR at 3%: 54.29%
OOT FDR at 3%: 52.68%
l1_ratio: 0.2
Train FDR at 3%: 55.61%
Test FDR at 3%: 54.29%
OOT FDR at 3%: 52.68%
l1_ratio: 0.30000000000000004
Train FDR at 3%: 55.61%
Test FDR at 3%: 54.29%
OOT FDR at 3%: 52.68%
l1_ratio: 0.4
Train FDR at 3%: 55.61%
Test FDR at 3%: 54.29%
OOT FDR at 3%: 52.68%
l1_ratio: 0.5
Train FDR at 3%: 55.61%
Test FDR at 3%: 54.29%
OOT FDR at 3%: 52.68%
l1_ratio: 0.6000000000000001
Train FDR at 3%: 55.61%
Test FDR at 3%: 54.29%
OOT FDR at 3%: 52.68%
l1_ratio: 0.7000000000000001
Train FDR at 3%: 55.61%
Test FDR at 3%: 54.29%
OOT FDR at 3%: 52.68%
l1_ratio: 0.8
Train FDR at 3%: 55.61%
Test FDR at 3%: 54.29%
OOT FDR at 3%: 52.68%
l1_ratio: 0.9
Train FDR at 3%: 55.61%
Test FDR at 3%: 54.29%
OOT FDR at 3%: 52.68%
l1_ratio: 1.0
Train FDR at 3%: 55.61%
Test FDR at 3%: 54.29%
OOT FDR at 3%: 52.68%


Neural Network (MLPClassifier)

In [0]:
test_grid1 = {
  'activation' : ['relu','tanh','logistic','identity'],
  'solver' : ['adam'],
  'alpha' : np.arange(0,0.001,0.0001),
  'learning_rate': ['constant','invscaling','adaptive'],
  'learning_rate_init' : np.arange(0,0.1,0.0001),
  'max_iter' : [100,200,300,400,500],
}

In [0]:
test_grid2 = {
  'activation' : ['relu','tanh','logistic','identity'],
  'solver' : ['sgd'],
  'alpha' : np.arange(0,0.001,0.0001),
  'learning_rate': ['constant','invscaling','adaptive'],
  'learning_rate_init' : np.arange(0,0.1,0.0001),
  'max_iter' : [100,200,300,400,500],
  'momentum' : [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1],
  'nesterovs_momentum' : [True, False]
}

In [0]:
nn1 = MLPClassifier()
nn1_best_fdr, nn1_best_params = randomized_search_cv(nn1, test_grid1, X_train, y_train, n_jobs=-1, n_iter=16)
nn1_best_fdr, nn1_best_params

Total combination: 600000
Randomized search size: 16
{'activation': 'relu', 'alpha': 0.0004, 'learning_rate': 'constant', 'learning_rate_init': 0.0669, 'max_iter': 200, 'solver': 'adam'}
{'activation': 'tanh', 'alpha': 0.0002, 'learning_rate': 'constant', 'learning_rate_init': 0.0694, 'max_iter': 500, 'solver': 'adam'}
{'activation': 'logistic', 'alpha': 0.0007, 'learning_rate': 'adaptive', 'learning_rate_init': 0.0024000000000000002, 'max_iter': 200, 'solver': 'adam'}
{'activation': 'identity', 'alpha': 0.0001, 'learning_rate': 'adaptive', 'learning_rate_init': 0.0064, 'max_iter': 400, 'solver': 'adam'}
{'activation': 'identity', 'alpha': 0.0001, 'learning_rate': 'adaptive', 'learning_rate_init': 0.0064, 'max_iter': 400, 'solver': 'adam'}, FDR: 55.30%, 340 sec elapsed
{'activation': 'logistic', 'alpha': 0.0009000000000000001, 'learning_rate': 'constant', 'learning_rate_init': 0.0407, 'max_iter': 200, 'solver': 'adam'}
{'activation': 'relu', 'alpha': 0.0004, 'learning_rate': 'constant'

(56.90327539047405,
 {'activation': 'logistic',
  'alpha': 0.0008,
  'learning_rate': 'invscaling',
  'learning_rate_init': 0.0397,
  'max_iter': 300,
  'solver': 'adam'})

In [0]:
nn2 = MLPClassifier()
nn2_best_fdr, nn2_best_params = randomized_search_cv(nn2, test_grid2, X_train, y_train, n_jobs=-1, n_iter=16)
nn2_best_fdr, nn2_best_params

Total combination: 12000000
Randomized search size: 16
{'activation': 'tanh', 'alpha': 0.0001, 'learning_rate': 'constant', 'learning_rate_init': 0.083, 'max_iter': 400, 'momentum': 0.1, 'nesterovs_momentum': True, 'solver': 'sgd'}
{'activation': 'identity', 'alpha': 0.0002, 'learning_rate': 'adaptive', 'learning_rate_init': 0.014700000000000001, 'max_iter': 200, 'momentum': 0.4, 'nesterovs_momentum': False, 'solver': 'sgd'}
{'activation': 'tanh', 'alpha': 0.00030000000000000003, 'learning_rate': 'adaptive', 'learning_rate_init': 0.0618, 'max_iter': 500, 'momentum': 0.9, 'nesterovs_momentum': True, 'solver': 'sgd'}
{'activation': 'identity', 'alpha': 0.0006000000000000001, 'learning_rate': 'constant', 'learning_rate_init': 0.0262, 'max_iter': 100, 'momentum': 0.1, 'nesterovs_momentum': True, 'solver': 'sgd'}
{'activation': 'identity', 'alpha': 0.0006000000000000001, 'learning_rate': 'constant', 'learning_rate_init': 0.0262, 'max_iter': 100, 'momentum': 0.1, 'nesterovs_momentum': True, 

(56.842084370159775,
 {'activation': 'relu',
  'alpha': 0.00030000000000000003,
  'learning_rate': 'constant',
  'learning_rate_init': 0.07590000000000001,
  'max_iter': 200,
  'momentum': 0.1,
  'nesterovs_momentum': False,
  'solver': 'sgd'})

In [0]:
# Default
model = MLPClassifier()
print('Default model summary:')
fit_avg_fdr(model)

Default model summary:


(array([[56.78528676, 55.46364824, 53.8139145 ,  0.95573759],
        [56.71999129, 55.8119286 , 53.8977368 ,  0.95566834],
        [56.85058222, 55.68132347, 53.8139145 ,  0.95583652],
        [56.79616933, 55.76839356, 54.02347024,  0.95595524],
        [56.75263902, 55.8119286 , 53.85582565,  0.95577716]]),
 array([56.78093373, 55.70744449, 53.88097234,  0.95579497]))

In [0]:
# Randomized search result 1
model = MLPClassifier(activation='logistic',alpha=0.0008,learning_rate='invscaling',learning_rate_init=0.0397,max_iter=300,solver='adam')
print('Randomized search result 1 summary:')
fit_avg_fdr(model)

Randomized search result 1 summary:


(array([[56.42616172, 55.72485851, 53.85582565,  0.95502528],
        [56.44792687, 55.89899869, 53.68818106,  0.95560898],
        [56.53498749, 55.8119286 , 53.98155909,  0.95551005],
        [56.28468821, 55.85546365, 53.64626991,  0.9557277 ],
        [56.3391011 , 55.76839356, 53.47862531,  0.95479773]]),
 array([56.40657308, 55.8119286 , 53.7300922 ,  0.95533395]))

In [0]:
# Randomized search result 2
model = MLPClassifier(activation='relu',alpha=0.0003,learning_rate='constant',learning_rate_init=0.07590000000000001,max_iter=200,solver='sgd',\
                      momentum=0.1,nesterovs_momentum=False)
print('Randomized search result 2 summary:')
fit_avg_fdr(model)

Randomized search result 2 summary:


(array([[56.58940037, 55.68132347, 53.93964795,  0.95535175],
        [56.55675264, 55.46364824, 53.64626991,  0.95546058],
        [56.63293068, 55.85546365, 53.98155909,  0.95552983],
        [56.68734356, 55.59425337, 53.93964795,  0.95555951],
        [56.66557841, 55.59425337, 53.85582565,  0.95525282]]),
 array([56.62640113, 55.63778842, 53.87259011,  0.9554309 ]))