In [15]:
import numpy as np
import pandas as pd
import gc
from sklearn.naive_bayes import GaussianNB,BernoulliNB
from sklearn.model_selection import train_test_split,StratifiedKFold,cross_val_score
from sklearn import metrics

In [17]:
def changeDType(df,flag=False):
    
    if(flag):
        numericDtype = ['int32','int64','float64','float32']
    
    for i in df.columns:
        if (df[i].dtype == 'int64' or df[i].dtype == 'int32'):
            df[i] = pd.to_numeric(df[i],downcast='integer')
        
        if (df[i].dtype == 'float64' or df[i].dtype == 'float32'):
            df[i] = pd.to_numeric(df[i],downcast='float')

In [18]:
dfTrain = pd.read_csv('../input/preprocessedtrainv2/preprocessesTrainv2.csv')
dfTest = pd.read_csv('../input/preprocessedtestv2/preprocessedTestV2.csv')

In [19]:
changeDType(dfTrain)
changeDType(dfTest)

In [20]:
print(dfTrain.shape)
print(dfTest.shape)

(386593, 130)
(147635, 129)


In [24]:
def stratifiedKFoldWithGNB(df):
    
    train,test = train_test_split(df,test_size=0.2,random_state=9,shuffle=True)
    
    X = train.drop(['isFraud'],axis=1)
    Y = train['isFraud']
    
    model = GaussianNB()
    skf = StratifiedKFold(n_splits=9)
    
    for train_index,test_index in skf.split(X,Y):
        
        X_train = X.iloc[train_index]
        Y_train = Y.iloc[train_index]
        
        X_test = X.iloc[test_index]
        Y_test = Y.iloc[test_index]
        
        model.fit(X_train,Y_train)
        
        print("Accuracy ", model.score(X_test,Y_test))
        
        prob = model.predict_proba(X_test)[:,1]
        
        auc = metrics.roc_auc_score(Y_test,prob)
        print("validation-AUC : ",auc,"\n===============================")
    
    
    X_test = test.drop(['isFraud'],axis=1)
    Y_test = test['isFraud']
    
    prob = model.predict_proba(X_test)[:,1]
        
    auc = metrics.roc_auc_score(Y_test,prob)
    print("Test - AUC : ",auc,"\n===============================")

    return model;

In [27]:
def evaluateModel(df,model):
    
    y_pred = model.predict(df)
    
    result = pd.DataFrame(y_pred)
    
    result.columns = ['isFraud']
    result.to_csv("./result_treeBased.csv")
    
    print("Result file saved")
    

In [26]:
model = stratifiedKFoldWithGNB(dfTrain)

Accuracy  0.9168606681410779
validation-AUC :  0.7762230886195978 
Accuracy  0.9180246769875451
validation-AUC :  0.7886925256321891 
Accuracy  0.9170643696892097
validation-AUC :  0.7768770459713715 
Accuracy  0.9186357816319404
validation-AUC :  0.7775932996077486 
Accuracy  0.9168897683622396
validation-AUC :  0.7861269554193199 
Accuracy  0.9159003608427424
validation-AUC :  0.7812587205446654 
Accuracy  0.915667559073449
validation-AUC :  0.7720066334654947 
Accuracy  0.9160725198614789
validation-AUC :  0.7910392923761957 
Accuracy  0.9150248814131479
validation-AUC :  0.7775599680233957 
Test - AUC :  0.7857209774547483 


In [29]:
evaluateModel(dfTest,model)

Result file saved
