In [6]:
import numpy as np
import pandas as pd
import gc
from sklearn.naive_bayes import GaussianNB,BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split,StratifiedKFold,cross_val_score
from sklearn import metrics

In [7]:
def changeDType(df,flag=False):
    """Downcast the numeric columns of ``df`` in place to the smallest fitting dtype.

    Columns whose dtype is int64/int32 are passed through
    ``pd.to_numeric(..., downcast='integer')`` and columns whose dtype is
    float64/float32 through ``pd.to_numeric(..., downcast='float')``.
    Every other column is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    flag : bool, optional
        Has no effect. It is kept only so existing calls that pass it keep
        working (it previously guarded the creation of a list that was never
        read).

    Returns
    -------
    None
    """
    for col in df.columns:
        colType = df[col].dtype

        if colType in ('int64', 'int32'):
            df[col] = pd.to_numeric(df[col], downcast='integer')
        elif colType in ('float64', 'float32'):
            # An integer column never turns into a float one by being
            # downcast, so the two cases are mutually exclusive.
            df[col] = pd.to_numeric(df[col], downcast='float')

In [8]:
# Load the preprocessed train and test tables from the ../input directory.
# NOTE(review): the train file is spelled "preprocessesTrainv2.csv" while the
# test file is "preprocessedTestV2.csv" - presumably both match the files on
# disk; confirm before "correcting" either name.
dfTrain = pd.read_csv('../input/preprocessedtrainv2/preprocessesTrainv2.csv')
dfTest = pd.read_csv('../input/preprocessedtestv2/preprocessedTestV2.csv')

In [9]:
# Shrink the int/float columns of both frames; changeDType assigns the
# downcast columns back onto the frame it is given and returns nothing.
changeDType(dfTrain)
changeDType(dfTest)

In [10]:
# Sanity check: (rows, columns) of each frame. The train frame presumably has
# one column more than the test frame (the 'isFraud' target) - confirm.
print(dfTrain.shape)
print(dfTest.shape)

(386593, 130)
(147635, 129)


In [11]:
def stratifiedKFoldWithGNB(df):
    """Cross-validate a Gaussian naive Bayes model on ``df`` and score a holdout.

    ``df`` must contain the binary target column 'isFraud'; all other columns
    are used as features. A seeded 80/20 shuffle split sets aside a holdout
    part. The 80% part is run through a 9-fold stratified loop that prints the
    accuracy and ROC AUC of every validation fold. The same estimator object is
    refitted in each fold, so the estimator that is finally scored on the
    holdout and returned is the one fitted during the last fold.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature columns plus the 'isFraud' target.

    Returns
    -------
    GaussianNB
        The estimator as fitted in the final fold.
    """
    target = 'isFraud'
    separator = "\n==============================="

    devPart, holdoutPart = train_test_split(df, test_size=0.2, random_state=9, shuffle=True)

    features = devPart.drop([target], axis=1)
    labels = devPart[target]

    model = GaussianNB()
    folds = StratifiedKFold(n_splits=9)

    for fitIdx, valIdx in folds.split(features, labels):
        model.fit(features.iloc[fitIdx], labels.iloc[fitIdx])

        valX = features.iloc[valIdx]
        valY = labels.iloc[valIdx]

        print("Accuracy ", model.score(valX, valY))

        # Column 1 of predict_proba is the probability of the positive class.
        valAuc = metrics.roc_auc_score(valY, model.predict_proba(valX)[:, 1])
        print("validation-AUC : ", valAuc, separator)

    holdoutX = holdoutPart.drop([target], axis=1)
    holdoutY = holdoutPart[target]

    holdoutAuc = metrics.roc_auc_score(holdoutY, model.predict_proba(holdoutX)[:, 1])
    print("Test - AUC : ", holdoutAuc, separator)

    return model

In [12]:
def evaluateModel(df,model,outputPath="./result_treeBased.csv"):
    """Predict 'isFraud' labels for ``df`` with ``model`` and save them as CSV.

    The features are handed to the model exactly as given. This function used
    to apply ``df - df.min()/df.max()-df.min()`` first. Because of operator
    precedence that expression is ``df - (min/max) - min``, i.e. a per-column
    shift rather than min-max scaling, and the shift was derived from the
    scoring frame's own statistics. The model was therefore queried in a
    feature space different from the one it had been fitted in.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature columns only, expressed the same way as the data ``model`` was
        fitted on.
    model : object
        Fitted estimator exposing ``predict``.
    outputPath : str, optional
        Destination CSV file. Defaults to the path this function has always
        written to.

    Returns
    -------
    None
        The predictions are written to ``outputPath`` as one 'isFraud' column
        (hard labels from ``predict``, plus the default index column).
    """
    y_pred = model.predict(df)

    result = pd.DataFrame(y_pred)
    result.columns = ['isFraud']
    result.to_csv(outputPath)

    print("Result file saved")
    

In [None]:
# Fit and cross-validate the Gaussian naive Bayes baseline; the fold and
# holdout scores are printed by the function, the fitted estimator is kept.
model = stratifiedKFoldWithGNB(dfTrain)

In [None]:
# Write predictions for the test frame with whatever estimator `model`
# currently holds (depends on which training cell was run last).
evaluateModel(dfTest,model)

In [13]:
# Baseline: a DecisionTreeClassifier with default parameters scored 0.733
# (presumably ROC AUC; the metric was not recorded).

In [26]:
def stratifiedKFoldWithDecisionTree(df, test_size=0.35, n_splits=9, random_state=None, **treeParams):
    """Cross-validate a decision tree on ``df`` and score it on a stratified holdout.

    ``df`` must contain the binary target column 'isFraud'; all other columns
    are used as features. A stratified holdout of ``test_size`` is set aside,
    the remainder goes through a stratified ``n_splits``-fold loop that prints
    the accuracy and ROC AUC of every validation fold, and finally the ROC AUC
    on the holdout is printed. The same estimator object is refitted in each
    fold, so the returned estimator is the one fitted during the last fold.

    Features are used as they are. This function used to apply
    ``X - X.min()/X.max()-X.min()``, which by operator precedence is a
    per-column shift rather than min-max scaling. The shift was computed
    separately for the training part and for the holdout, so the holdout was
    scored in a different feature space from the one the tree was fitted in.
    A shift applied consistently does not change which splits a tree picks,
    so dropping it keeps the fitted model equivalent while making the holdout
    score meaningful. The returned model expects unshifted feature values.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature columns plus the 'isFraud' target.
    test_size : float, optional
        Fraction of rows held out for the final score (default 0.35).
    n_splits : int, optional
        Number of stratified folds (default 9).
    random_state : int or None, optional
        Seed passed to both the holdout split and the tree. ``None`` (the
        default) keeps the previous unseeded behaviour, where repeated runs
        give different scores.
    **treeParams
        Extra keyword arguments for ``DecisionTreeClassifier``, for example
        ``min_samples_split=0.15``. ``max_depth`` defaults to 45.

    Returns
    -------
    DecisionTreeClassifier
        The estimator as fitted in the final fold.
    """
    treeParams.setdefault('max_depth', 45)

    train, test = train_test_split(
        df, test_size=test_size, stratify=df['isFraud'], random_state=random_state
    )

    X = train.drop(['isFraud'], axis=1)
    Y = train['isFraud']

    model = DecisionTreeClassifier(random_state=random_state, **treeParams)
    skf = StratifiedKFold(n_splits=n_splits)

    for train_index, val_index in skf.split(X, Y):

        X_train = X.iloc[train_index]
        Y_train = Y.iloc[train_index]

        X_val = X.iloc[val_index]
        Y_val = Y.iloc[val_index]

        model.fit(X_train, Y_train)

        print("Accuracy ", model.score(X_val, Y_val))

        # Column 1 of predict_proba is the probability of the positive class.
        prob = model.predict_proba(X_val)[:, 1]

        auc = metrics.roc_auc_score(Y_val, prob)
        print("validation-AUC : ", auc, "\n===============================")

    X_test = test.drop(['isFraud'], axis=1)
    Y_test = test['isFraud']

    prob = model.predict_proba(X_test)[:, 1]

    auc = metrics.roc_auc_score(Y_test, prob)
    print("Test - AUC : ", auc, "\n===============================")

    return model

In [22]:
# Experiment: decision tree with max_depth=45; 65% of the rows are used for
# training/CV (holdout test_size=0.35).
model = stratifiedKFoldWithDecisionTree(dfTrain)
evaluateModel(dfTest,model)

Accuracy  0.9662977687045593
validation-AUC :  0.7506544309523122 
Accuracy  0.9670857060993517
validation-AUC :  0.7643644200566445 
Accuracy  0.9657605386626553
validation-AUC :  0.7514000790860482 
Accuracy  0.9645428172343398
validation-AUC :  0.7234155987965025 
Accuracy  0.9648651552594821
validation-AUC :  0.7381985204087266 
Accuracy  0.9648997134670487
validation-AUC :  0.745280865235026 
Accuracy  0.9642550143266476
validation-AUC :  0.7396150301405812 
Accuracy  0.9671203438395416
validation-AUC :  0.7425861501872023 
Accuracy  0.9664398280802292
validation-AUC :  0.7514153411236382 
Test - AUC :  0.7436405835411604 
Result file saved


In [20]:
# Experiment (repeat run): decision tree with max_depth=45; 65% of the rows
# are used for training/CV (holdout test_size=0.35).
model = stratifiedKFoldWithDecisionTree(dfTrain)
evaluateModel(dfTest,model)

Accuracy  0.9652591239568783
validation-AUC :  0.7501015260304369 
Accuracy  0.9643995558898321
validation-AUC :  0.7504744792060841 
Accuracy  0.9666559220658286
validation-AUC :  0.7565628607958159 
Accuracy  0.9665126607213209
validation-AUC :  0.7546340071140681 
Accuracy  0.9649726012678629
validation-AUC :  0.7470190315012488 
Accuracy  0.965580229226361
validation-AUC :  0.738707779291131 
Accuracy  0.9644340974212035
validation-AUC :  0.7486145933977456 
Accuracy  0.9662965616045845
validation-AUC :  0.7536650966183576 
Accuracy  0.9661174785100286
validation-AUC :  0.7583354669887279 
Test - AUC :  0.7405147737421733 
Result file saved


In [25]:
# Experiment log for the output below: max_depth=45, min_samples_split=0.15,
# 65% of the rows used for training/CV (holdout test_size=0.35).
# NOTE(review): the function definition shown in this notebook does not set
# min_samples_split, so it was presumably edited between runs; re-running this
# cell as-is will not use that setting.
# "test acc - 0.56": presumably the score of the saved predictions on the
# external test set - metric and source were not recorded; confirm.

model = stratifiedKFoldWithDecisionTree(dfTrain)
evaluateModel(dfTest,model)

Accuracy  0.9697002256366176
validation-AUC :  0.8162067470234019 
Accuracy  0.9699509329895061
validation-AUC :  0.8085621416794447 
Accuracy  0.9699151176533791
validation-AUC :  0.811468757150155 
Accuracy  0.9698793023172523
validation-AUC :  0.816664973761183 
Accuracy  0.969485333619856
validation-AUC :  0.8227330655863252 
Accuracy  0.9695558739255015
validation-AUC :  0.8147285661050738 
Accuracy  0.9691618911174785
validation-AUC :  0.832645348651722 
Accuracy  0.969878223495702
validation-AUC :  0.8132167514678503 
Accuracy  0.9699498567335244
validation-AUC :  0.8126248440421068 
Test - AUC :  0.8183615046875131 
Result file saved


In [None]:
# Experiment: decision tree with max_depth=45 (cell not executed in this
# saved copy of the notebook - it has no execution count or output).
model = stratifiedKFoldWithDecisionTree(dfTrain)
evaluateModel(dfTest,model)