In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split,StratifiedKFold,cross_val_score,GridSearchCV,RandomizedSearchCV
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier

In [4]:
def changeDType(df,flag=False):
    
    if(flag):
        numericDtype = ['int32','int64','float64','float32']
    
    for i in df.columns:
        if (df[i].dtype == 'int64' or df[i].dtype == 'int32'):
            df[i] = pd.to_numeric(df[i],downcast='integer')
        
        if (df[i].dtype == 'float64' or df[i].dtype == 'float32'):
            df[i] = pd.to_numeric(df[i],downcast='float')

In [5]:
dfTest = pd.read_csv('/kaggle/input/preprocessedtestv3/preprocessedTestV3.csv')
dfTrain = pd.read_csv('/kaggle/input/preprocessedtrainv3/preprocessedTrainV3.csv')
changeDType(dfTrain)
changeDType(dfTest)

Hyper opt Tune

In [6]:
def evaluateModel(df,model):
    
    df = df - df.min()/df.max()-df.min()
    
    y_pred = model.predict(df)
    
    result = pd.DataFrame(y_pred)
    
    result.columns = ['isFraud']
    result.to_csv("./result_xgb_final.csv")
    
    print("Result file saved")

In [7]:
train,test = train_test_split(dfTrain,test_size=0.25,stratify=dfTrain['isFraud'])

XTrain = train.drop(['isFraud'],axis=1)
YTrain = train['isFraud']

XTest = test.drop(['isFraud'],axis=1)
YTest = test['isFraud']


XTrain = XTrain - XTrain.min()/XTrain.max()-XTrain.min()
XTest = XTest - XTest.min()/XTest.max()-XTest.min()



In [83]:


from hyperopt import hp,fmin,tpe,STATUS_OK,Trials

space = {'criterion': hp.choice('criterion', ['entropy', 'gini']),
        'max_depth': hp.choice('max_depth', range(10,80)),
        'max_features': hp.choice('max_features', ['auto', 'sqrt','log2', None]),
        'min_samples_leaf': hp.uniform('min_samples_leaf', 0, 0.5),
        'min_samples_split' : hp.uniform ('min_samples_split', 0, 1),
        'n_estimators' : hp.choice('n_estimators', range(30,200))
    }

def objective(space):
    model = RandomForestClassifier(criterion = space['criterion'], max_depth = space['max_depth'],
                                 max_features = space['max_features'],
                                 min_samples_leaf = space['min_samples_leaf'],
                                 min_samples_split = space['min_samples_split'],
                                 n_estimators = space['n_estimators'], 
                                 )
    
    accuracy = cross_val_score(model, XTrain, YTrain, cv = 5).mean()
    model.fit(XTrain,YTrain)
    prob = model.predict_proba(XTest)[:,1]
    auc = metrics.roc_auc_score(YTest,prob)

    # We aim to maximize accuracy, therefore we return it as a negative value
    print("AUC : ",auc)
    print("cross val score : ",accuracy)
    return {'loss': -auc, 'status': STATUS_OK }

In [84]:

from sklearn.model_selection import cross_val_score
trials = Trials()
best = fmin(fn= objective,
            space= space,
            algo= tpe.suggest,
            max_evals = 50,
            trials= trials)
best


AUC :                                                 
0.7400204837547408                                    
cross val score :                                     
0.9650097237548263                                    
AUC :                                                                              
0.5                                                                                
cross val score :                                                                  
0.9650097237548263                                                                 
AUC :                                                                              
0.7979725097315361                                                               
cross val score :                                                                
0.9650097237548263                                                               
AUC :                                                                              
0.5                           

{'criterion': 1,
 'max_depth': 25,
 'max_features': 2,
 'min_samples_leaf': 0.00117120112564284,
 'min_samples_split': 0.07848095125098334,
 'n_estimators': 17}

In [8]:
hyperjmodel= RandomForestClassifier(criterion= 'gini',
 max_depth= 25,
 max_features= 'log2',
 min_samples_leaf= 0.00117120112564284,
 min_samples_split= 0.07848095125098334,
 n_estimators= 17)

In [9]:
mod=hyperjmodel.fit(XTrain,YTrain)

In [10]:
Xfinal=(dfTest-dfTest.min())/(dfTest.max()- dfTest.min())
Yprdicted =mod.predict(Xfinal)

In [12]:

    
    result = pd.DataFrame(Yprdicted)
    
    result.columns = ['isFraud']
    result.to_csv("./result_rdf_js.csv")
    
    print("Result file saved")

Result file saved


In [59]:
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

paramSpace = {'criterion': hp.choice('criterion', ['entropy', 'gini']),
        'max_depth': hp.quniform('max_depth', range(5,80)),
        #'max_features': hp.choice('max_features', ['auto', 'sqrt','log2', None]),
        'min_samples_leaf': hp.uniform('min_samples_leaf', 0, 0.5),
        'min_samples_split' : hp.uniform ('min_samples_split', 0, 1),
        'n_estimators' : hp.choice('n_estimators', range(50,200))
    }

In [38]:
def tuneParams():
    trialModels = Trials()
    
    bestParam = fmin(fn=objectiveFn,space=paramSpace,algo=tpe.suggest,max_evals=99,trials=trialModels)
    
    print(bestParam)
    return [bestParam,trialModels]

In [39]:
def rdmClassifier(df,params=None,splits=5):
    
    tmpPara = {'n_estimator':550,'eval_metric':'auc','verbosity':1,'n_jobs':-1,'reg_alpha':8.0,'colsample_bytree':0.8,'learning_rate':0.17500000000000002,'gamma':0.6000000000000001,'reg_lambda':1.20000000000000001,'max_depth':10,'min_child_weight':1.0,'scale_pos_weight':60.0,'subsample':0.700000000000001,'objective':'binary:logistic'}
    
    if(params!=None):
        tmpPara = params
    
    X_train = df.drop(['isFraud'],axis=1)
    Y_train = df['isFraud']
    
    X_train = X_train - X_train.min()/X_train.max() - X_train.min()
    
    skf = StratifiedKFold(n_splits=splits)
#     if()
    rdmc = RandomForestClassifie(**tmpPara)
    
    for trainIndx,testIndx in skf.split(X_train,Y_train):
        
        Xtrain = X_train.iloc[trainIndx]
        Ytrain = Y_train.iloc[trainIndx]
        
        Xtest = X_train.iloc[testIndx]
        Ytest = Y_train.iloc[testIndx]
    
        xgbc.fit(Xtrain,Ytrain)
        
        print("Accuracy : ",rdmc.score(Xtest,Ytest))
        
        prob = xgbc.predict_proba(Xtest)[:,1]
        
        auc = metrics.roc_auc_score(Ytest,prob)
        print("validation-AUC : ",auc,"\n===============================")
        
        
    return xgbc

In [None]:
param_models = tuneParams()

In [None]:
param = {
    'n_estimators' : [30,50,65,75,90,100,110,120,135,145,150,165,175,184,190,200,205,215],
    'criterion':['gini','entropy'],
    'max_depth' : [None,10,12,15,17,20,24,29,35,40,45,50],
    'min_samples_split' : [0.2,0.25,0.3,0.35,0.4,0.5,0.58,0.67,0.74,0.84,0.95,1.0],
    'max_features' : ['auto',0.2,0.3,0.4,0.5,0.6,0.65,0.7,0.75,0.8],
    'bootstrap' : [True],
    'max_samples' : [0.3,0.35,0.4,0.5,0.55,0.65,0.7,0.75,0.8],
    'n_jobs':[-1]
 }

In [13]:
def stratifiedKFoldWithRandomForest(df):
    
    train,test = train_test_split(df,test_size=0.2)
    
    X = train.drop(['isFraud'],axis=1)
    Y = train['isFraud']
    
    X = X - X.min()/X.max() - X.min()
    
    model = RandomForestClassifier(n_estimators=100,n_jobs=-1)
    skf = StratifiedKFold(n_splits=5)
    
    for train_index,test_index in skf.split(X,Y):
        
        X_train = X.iloc[train_index]
        Y_train = Y.iloc[train_index]
        
        X_test = X.iloc[test_index]
        Y_test = Y.iloc[test_index]
        
        model.fit(X_train,Y_train)
        
        print("Accuracy ", model.score(X_test,Y_test))
        
        prob = model.predict_proba(X_test)[:,1]
        
        auc = metrics.roc_auc_score(Y_test,prob)
        print("validation-AUC : ",auc,"\n===============================")
    
    
    X_test = test.drop(['isFraud'],axis=1)
    Y_test = test['isFraud']
    
    prob = model.predict_proba(X_test)[:,1]
        
    auc = metrics.roc_auc_score(Y_test,prob)
    print("Test - AUC : ",auc,"\n===============================")

    return model

In [None]:
def evaluateModel(df,model):
    
    df = df - df.min() / df.max()-df.min()
    
    y_pred = model.predict(df)
    
    result = pd.DataFrame(y_pred)
    
    result.columns = ['isFraud']
    result.to_csv("./result_rf.csv")
    
    print("Result file saved")


In [None]:
model = stratifiedKFoldWithRandomForest(dfTrain)
evaluateModel(dfTest,model)

In [None]:
def tuneParams(df,para):
#     (n_estimators=100, *, criterion="gini", max_depth=None, min_samples_split=2,
# min_samples_leaf=1, min_weight_fraction_leaf=0.0,
# max_features="auto", max_leaf_nodes=None, min_impurity_decrease=0.0, bootstrap=True, oob_score=False,
# n_jobs=None, random_state=None, verbose=0, warm_start=False, class_weight=None, ccp_alpha=0.0, max_samples=None
  
    params = para
    
    rf = RandomForestClassifier()
    
    gridSearchCV = RandomizedSearchCV(estimator=rf,n_iter=30,cv=7,scoring='roc_auc',n_jobs=-1,verbose=2,param_distributions=params)
    
    train,test = train_test_split(df,test_size=0.23)
    
    X_train = train.drop(['isFraud'],axis=1)
    Y_train = train['isFraud']
    
    X_train = X_train - X_train.min()/X_train.max() - X_train.min()
    
    gridSearchCV.fit(X_train,Y_train)
    
    bestParams = gridSearchCV.best_params_
    
    print(bestParams)
    
    return gridSearchCV.best_estimator_


In [None]:
model = tuneParams(dfTrain)

In [None]:
train,test = train_test_split(dfTrain,test_size=0.23)

X_train = train.drop(['isFraud'],axis=1)
Y_train = train['isFraud']

X_train = X_train - X_train.min()/X_train.max() - X_train.min()

model.fit(X_train,Y_train)

X_test = test.drop(['isFraud'],axis=1)
Y_test = test['isFraud']

X_test = X_test - X_test.min()/X_test.max()-X_test.min()

prob = model.predict_proba(X_test)[:,1]

auc = metrics.roc_auc_score(Y_test,prob)
print("validation-AUC : ",auc,"\n===============================")


In [None]:
evaluateModel(dfTest,model)

## tunning 2

In [None]:
param = {
    'n_estimators' : [30,50,65,75,90,100,110,120,135,145,150,165,175,184,190,200,205,215],
    'criterion':['gini','entropy'],
    'max_depth' : [None,10,12,15,17,20,24,29,35,40,45,50],
    'min_samples_split' : [0.2,0.25,0.3,0.35,0.4,0.5,0.58,0.67,0.74,0.84,0.95,1.0],
    'max_features' : ['auto',0.2,0.3,0.4,0.5,0.6,0.65,0.7,0.75,0.8],
    'bootstrap' : [True],
    'max_samples' : [0.3,0.35,0.4,0.5,0.55,0.65,0.7,0.75,0.8],
    'n_jobs':[-1]
 }

model2 = tuneParams(dfTrain,param)

In [None]:
model2

In [None]:
evaluateModel(dfTest,model2)