In [6]:
import numpy as np
import pandas as pd
import gc
from sklearn.naive_bayes import GaussianNB,BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split,StratifiedKFold,cross_val_score
from sklearn import metrics

In [7]:
def changeDType(df,flag=False):
    """Downcast 32/64-bit numeric columns of ``df`` in place to reduce memory use.

    Columns whose dtype is ``int64``/``int32`` are passed through
    ``pd.to_numeric(..., downcast='integer')``; columns whose dtype is
    ``float64``/``float32`` through ``pd.to_numeric(..., downcast='float')``.
    Every other column is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify. It is mutated; nothing is returned.
    flag : bool, default False
        Has no effect on the result. Kept only so existing calls that pass it
        keep working.
    """
    for column in df.columns:
        # Look the dtype up once; an integer column can never also match the float branch.
        current = df[column].dtype
        if current == 'int64' or current == 'int32':
            df[column] = pd.to_numeric(df[column],downcast='integer')
        elif current == 'float64' or current == 'float32':
            df[column] = pd.to_numeric(df[column],downcast='float')

In [8]:
# Load the preprocessed training and test CSV files into DataFrames.
# NOTE(review): the training file name is spelled 'preprocessesTrainv2.csv'
# ("preprocesses", not "preprocessed") — confirm it matches the file on disk.
dfTrain = pd.read_csv('../input/preprocessedtrainv2/preprocessesTrainv2.csv')
dfTest = pd.read_csv('../input/preprocessedtestv2/preprocessedTestV2.csv')

In [9]:
# Downcast the numeric columns of both frames in place (changeDType returns nothing).
changeDType(dfTrain)
changeDType(dfTest)

In [10]:
# Sanity check: print (rows, columns) of each frame.
# The training frame has one more column than the test frame — presumably the
# 'isFraud' label, which the training functions drop from the features.
print(dfTrain.shape)
print(dfTest.shape)

(386593, 130)
(147635, 129)


In [11]:
def stratifiedKFoldWithGNB(df):
    """Cross-validate a Gaussian naive Bayes classifier and score it on a hold-out part.

    ``df`` must contain an 'isFraud' column, used as the label; every other
    column is used as a feature. The frame is split 80/20 with a fixed seed
    (random_state=9). The 80% part is run through a 9-fold StratifiedKFold:
    in each fold the same model object is refitted, then its accuracy and
    ROC-AUC on the fold's validation rows are printed. Finally the ROC-AUC on
    the 20% hold-out part is printed.

    Returns the model object, which holds the fit from the last fold.
    """
    fit_part,holdout_part = train_test_split(df,test_size=0.2,random_state=9,shuffle=True)

    features = fit_part.drop(['isFraud'],axis=1)
    labels = fit_part['isFraud']

    model = GaussianNB()
    folds = StratifiedKFold(n_splits=9)

    for fit_rows,val_rows in folds.split(features,labels):
        # Refit the single model object on this fold's training rows.
        model.fit(features.iloc[fit_rows],labels.iloc[fit_rows])

        val_features = features.iloc[val_rows]
        val_labels = labels.iloc[val_rows]

        print("Accuracy ", model.score(val_features,val_labels))

        # Column 1 of predict_proba is used as the positive-class score.
        val_scores = model.predict_proba(val_features)[:,1]
        fold_auc = metrics.roc_auc_score(val_labels,val_scores)
        print("validation-AUC : ",fold_auc,"\n===============================")

    # Score the hold-out part with whatever fit the last fold left in the model.
    holdout_features = holdout_part.drop(['isFraud'],axis=1)
    holdout_labels = holdout_part['isFraud']
    holdout_scores = model.predict_proba(holdout_features)[:,1]

    holdout_auc = metrics.roc_auc_score(holdout_labels,holdout_scores)
    print("Test - AUC : ",holdout_auc,"\n===============================")

    return model

In [12]:
def evaluateModel(df,model,output_path="./result_treeBased.csv"):
    """Min-max scale ``df``, predict with ``model`` and save the predictions as CSV.

    Each column of ``df`` is rescaled to [0, 1] with that column's own minimum
    and maximum, ``(df - min) / (max - min)``. The earlier expression
    ``df - df.min()/df.max()-df.min()`` lacked parentheses and therefore only
    shifted each column by a constant instead of scaling it.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric feature frame. It is not modified.
    model : object
        Anything with a ``predict(features)`` method.
        NOTE(review): the scaling uses the statistics of ``df`` itself, so the
        model should have been fitted on features prepared the same way.
    output_path : str, default "./result_treeBased.csv"
        Where the single-column ('isFraud') result file is written.

    Returns
    -------
    None
    """
    # Cast to float64 so the range of narrow integer columns (e.g. int8) cannot overflow.
    lowest = df.min().astype('float64')
    spread = df.max().astype('float64') - lowest
    # A constant column has zero spread; divide by 1 so it becomes 0 instead of NaN.
    spread = spread.replace(0, 1)

    scaled = (df - lowest) / spread

    y_pred = model.predict(scaled)

    result = pd.DataFrame(y_pred)
    result.columns = ['isFraud']
    result.to_csv(output_path)

    print("Result file saved")
    

In [None]:
# Cross-validate the Gaussian naive Bayes model on dfTrain and keep the returned model.
# This cell has no recorded output in the notebook (execution count is None).
model = stratifiedKFoldWithGNB(dfTrain)

In [None]:
# Predict on dfTest with the model from the previous cell and write the result CSV.
# NOTE(review): stratifiedKFoldWithGNB fits on untransformed features, while
# evaluateModel transforms its input before predicting — confirm this mismatch
# is intended. The output file is also named "result_treeBased.csv".
evaluateModel(dfTest,model)

In [13]:
# Baseline score of 0.733 obtained with DecisionTreeClassifier left at its default parameters

In [40]:
def _minMaxScale(frame):
    """Return ``frame`` with every column rescaled to [0, 1] by its own min and max."""
    # Cast to float64 so the range of narrow integer columns (e.g. int8) cannot overflow.
    lowest = frame.min().astype('float64')
    spread = frame.max().astype('float64') - lowest
    # A constant column has zero spread; divide by 1 so it becomes 0 instead of NaN.
    return (frame - lowest) / spread.replace(0, 1)


def stratifiedKFoldWithDecisionTree(df,random_state=None):
    """Cross-validate a decision tree on ``df`` and score it on a stratified hold-out part.

    ``df`` must contain an 'isFraud' column, used as the label; every other
    column is used as a feature. The frame is split 70/30, stratified on the
    label. The 70% part is min-max scaled and run through a 9-fold
    StratifiedKFold: in each fold the same tree object is refitted, then its
    accuracy and ROC-AUC on the fold's validation rows are printed. Finally
    the ROC-AUC on the scaled 30% hold-out part is printed.

    The features are scaled as ``(X - min) / (max - min)``. The earlier
    expression ``X - X.min()/X.max()-X.min()`` lacked parentheses and only
    shifted each column by a constant.

    NOTE(review): each frame (training part, hold-out part) is scaled with its
    own statistics, which mirrors how evaluateModel treats the frame it is
    given. Fitting the scaling on the training part only would be stricter.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric features plus the 'isFraud' label column.
    random_state : int or None, default None
        Forwarded to ``train_test_split`` and ``DecisionTreeClassifier``.
        ``None`` keeps the previous unseeded behaviour; pass an integer to make
        a run repeatable.

    Returns
    -------
    The tree object, which holds the fit from the last fold.
    """
    train,test = train_test_split(df,test_size=0.30,stratify=df['isFraud'],
                                  random_state=random_state)

    X = _minMaxScale(train.drop(['isFraud'],axis=1))
    Y = train['isFraud']

    model = DecisionTreeClassifier(min_samples_split=0.01,random_state=random_state)
    skf = StratifiedKFold(n_splits=9)

    for train_index,test_index in skf.split(X,Y):

        X_train = X.iloc[train_index]
        Y_train = Y.iloc[train_index]

        X_val = X.iloc[test_index]
        Y_val = Y.iloc[test_index]

        # Refit the single tree object on this fold's training rows.
        model.fit(X_train,Y_train)

        print("Accuracy ", model.score(X_val,Y_val))

        # Column 1 of predict_proba is used as the positive-class score.
        prob = model.predict_proba(X_val)[:,1]

        auc = metrics.roc_auc_score(Y_val,prob)
        print("validation-AUC : ",auc,"\n===============================")

    # Score the hold-out part with whatever fit the last fold left in the model.
    X_test = _minMaxScale(test.drop(['isFraud'],axis=1))
    Y_test = test['isFraud']

    prob = model.predict_proba(X_test)[:,1]

    auc = metrics.roc_auc_score(Y_test,prob)
    print("Test - AUC : ",auc,"\n===============================")

    return model

In [22]:
# Run settings noted by the author: max_depth=45, train size 0.65.
# NOTE(review): the function definition in this notebook sets only
# min_samples_split=0.01 and test_size=0.30, so these values were presumably
# edited into the function for this run.
model = stratifiedKFoldWithDecisionTree(dfTrain)
evaluateModel(dfTest,model)

Accuracy  0.9662977687045593
validation-AUC :  0.7506544309523122 
Accuracy  0.9670857060993517
validation-AUC :  0.7643644200566445 
Accuracy  0.9657605386626553
validation-AUC :  0.7514000790860482 
Accuracy  0.9645428172343398
validation-AUC :  0.7234155987965025 
Accuracy  0.9648651552594821
validation-AUC :  0.7381985204087266 
Accuracy  0.9648997134670487
validation-AUC :  0.745280865235026 
Accuracy  0.9642550143266476
validation-AUC :  0.7396150301405812 
Accuracy  0.9671203438395416
validation-AUC :  0.7425861501872023 
Accuracy  0.9664398280802292
validation-AUC :  0.7514153411236382 
Test - AUC :  0.7436405835411604 
Result file saved


In [20]:
# Run settings noted by the author: max_depth=45, train size 0.65
# (presumably edited into the function for this run; not in the definition shown).
model = stratifiedKFoldWithDecisionTree(dfTrain)
evaluateModel(dfTest,model)

Accuracy  0.9652591239568783
validation-AUC :  0.7501015260304369 
Accuracy  0.9643995558898321
validation-AUC :  0.7504744792060841 
Accuracy  0.9666559220658286
validation-AUC :  0.7565628607958159 
Accuracy  0.9665126607213209
validation-AUC :  0.7546340071140681 
Accuracy  0.9649726012678629
validation-AUC :  0.7470190315012488 
Accuracy  0.965580229226361
validation-AUC :  0.738707779291131 
Accuracy  0.9644340974212035
validation-AUC :  0.7486145933977456 
Accuracy  0.9662965616045845
validation-AUC :  0.7536650966183576 
Accuracy  0.9661174785100286
validation-AUC :  0.7583354669887279 
Test - AUC :  0.7405147737421733 
Result file saved


In [25]:
# Run settings noted by the author: max_depth=45, min_samples_split=0.15, train size 0.65
# (presumably edited into the function for this run; not in the definition shown).
# Author's note: "test acc - 0.56" — where this figure comes from is not shown here.

model = stratifiedKFoldWithDecisionTree(dfTrain)
evaluateModel(dfTest,model)

Accuracy  0.9697002256366176
validation-AUC :  0.8162067470234019 
Accuracy  0.9699509329895061
validation-AUC :  0.8085621416794447 
Accuracy  0.9699151176533791
validation-AUC :  0.811468757150155 
Accuracy  0.9698793023172523
validation-AUC :  0.816664973761183 
Accuracy  0.969485333619856
validation-AUC :  0.8227330655863252 
Accuracy  0.9695558739255015
validation-AUC :  0.8147285661050738 
Accuracy  0.9691618911174785
validation-AUC :  0.832645348651722 
Accuracy  0.969878223495702
validation-AUC :  0.8132167514678503 
Accuracy  0.9699498567335244
validation-AUC :  0.8126248440421068 
Test - AUC :  0.8183615046875131 
Result file saved


In [29]:
# Run settings noted by the author: max_depth=45, train size 0.7
# (max_depth presumably edited into the function for this run; not in the definition shown).
model = stratifiedKFoldWithDecisionTree(dfTrain)
evaluateModel(dfTest,model)

Accuracy  0.9660780205527287
validation-AUC :  0.7528473117062886 
Accuracy  0.9664438458212777
validation-AUC :  0.7538237524241657 
Accuracy  0.9642488942099837
validation-AUC :  0.7473655128441489 
Accuracy  0.9660768923772781
validation-AUC :  0.76412410979652 
Accuracy  0.9678395636557137
validation-AUC :  0.7594945487976298 
Accuracy  0.9659106026340295
validation-AUC :  0.7589514314100126 
Accuracy  0.9666755354529732
validation-AUC :  0.7655517401537173 
Accuracy  0.9659438605826792
validation-AUC :  0.7527312259853011 
Accuracy  0.9649461221231874
validation-AUC :  0.7453563945234142 
Test - AUC :  0.7380288550463399 
Result file saved


In [32]:
# Run settings noted by the author: max_depth=55, train size 0.7
# (max_depth presumably edited into the function for this run; not in the definition shown).
model = stratifiedKFoldWithDecisionTree(dfTrain)
evaluateModel(dfTest,model)

Accuracy  0.9657454521267751
validation-AUC :  0.751965765040716 
Accuracy  0.9666766437194453
validation-AUC :  0.7520987147413718 
Accuracy  0.9666101300342546
validation-AUC :  0.7586290920062342 
Accuracy  0.9661434082745777
validation-AUC :  0.7525946887798064 
Accuracy  0.967440468271917
validation-AUC :  0.7630483360982468 
Accuracy  0.9663096980178263
validation-AUC :  0.7569717645895313 
Accuracy  0.9651456698150858
validation-AUC :  0.743999678703003 
Accuracy  0.9654449913529334
validation-AUC :  0.7516279065366381 
Accuracy  0.9662764400691766
validation-AUC :  0.7446863248701644 
Test - AUC :  0.7403981587164249 
Result file saved


In [34]:
# Run settings noted by the author: max_depth=60, train size 0.7
# (max_depth presumably edited into the function for this run; not in the definition shown).
model = stratifiedKFoldWithDecisionTree(dfTrain)
evaluateModel(dfTest,model)

Accuracy  0.9676078353121155
validation-AUC :  0.7557786294182975 
Accuracy  0.9654461405434168
validation-AUC :  0.7473423888738308 
Accuracy  0.9658784794971566
validation-AUC :  0.7658402697658833 
Accuracy  0.9648463482772383
validation-AUC :  0.7552588639929275 
Accuracy  0.9678395636557137
validation-AUC :  0.7700232635427053 
Accuracy  0.9677730477584142
validation-AUC :  0.7562224746197977 
Accuracy  0.9663762139151257
validation-AUC :  0.7735679740629775 
Accuracy  0.9664759877610749
validation-AUC :  0.7580313998424426 
Accuracy  0.966342955966476
validation-AUC :  0.7565163134800317 
Test - AUC :  0.7487510723671457 
Result file saved


In [37]:
# Run settings noted by the author: max_depth=85, min_samples_split=0.25
# (presumably edited into the function for this run; not in the definition shown).
model = stratifiedKFoldWithDecisionTree(dfTrain)
evaluateModel(dfTest,model)

Accuracy  0.9698360437660049
validation-AUC :  0.8061214208347638 
Accuracy  0.970035584821577
validation-AUC :  0.810860189289565 
Accuracy  0.9696032458678373
validation-AUC :  0.8093257799464868 
Accuracy  0.9695689769854995
validation-AUC :  0.8199753090547333 
Accuracy  0.9699015564719968
validation-AUC :  0.8131142619103917 
Accuracy  0.9699680723692963
validation-AUC :  0.8137220031704248 
Accuracy  0.9693694292936011
validation-AUC :  0.8285243394565854 
Accuracy  0.9697685246773979
validation-AUC :  0.8135689069737603 
Accuracy  0.9710323267260875
validation-AUC :  0.8068310271827843 
Test - AUC :  0.8063844443090914 
Result file saved


In [39]:
# Run settings noted by the author: max_depth=50, min_samples_split=0.25
# (presumably edited into the function for this run; not in the definition shown).
model = stratifiedKFoldWithDecisionTree(dfTrain)
evaluateModel(dfTest,model)

Accuracy  0.970035584821577
validation-AUC :  0.8221376498358006 
Accuracy  0.9699690711363863
validation-AUC :  0.8077216135846972 
Accuracy  0.9703681532475307
validation-AUC :  0.8197099305356632 
Accuracy  0.9696022349341492
validation-AUC :  0.8111730399858108 
Accuracy  0.9698350405746974
validation-AUC :  0.8113352655310458 
Accuracy  0.9691366236530531
validation-AUC :  0.8058080144606078 
Accuracy  0.969635492882799
validation-AUC :  0.8071009650229598 
Accuracy  0.9703339098044432
validation-AUC :  0.8246864734630163 
Accuracy  0.9704336836503924
validation-AUC :  0.8186472253947147 
Test - AUC :  0.8100868524811953 
Result file saved


In [41]:
# Run settings noted by the author: min_samples_split=0.01 only, which matches
# the DecisionTreeClassifier arguments in the function definition in this notebook.
model = stratifiedKFoldWithDecisionTree(dfTrain)
evaluateModel(dfTest,model)

Accuracy  0.9698693006086002
validation-AUC :  0.8672780905825258 
Accuracy  0.9703681532475307
validation-AUC :  0.8601701567115494 
Accuracy  0.9694037048122651
validation-AUC :  0.8442009458666628 
Accuracy  0.9695024610882
validation-AUC :  0.8572557136141927 
Accuracy  0.9696687508314488
validation-AUC :  0.8693222942862509 
Accuracy  0.9705001995476918
validation-AUC :  0.8640521894543443 
Accuracy  0.9702341359584941
validation-AUC :  0.8677293719039724 
Accuracy  0.9699348144206466
validation-AUC :  0.8714728253637625 
Accuracy  0.9698017826260477
validation-AUC :  0.8563495074900435 
Test - AUC :  0.8633491122305389 
Result file saved
