In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split,StratifiedKFold,cross_val_score,GridSearchCV,RandomizedSearchCV
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier

In [3]:
def changeDType(df,flag=False):
    
    if(flag):
        numericDtype = ['int32','int64','float64','float32']
    
    for i in df.columns:
        if (df[i].dtype == 'int64' or df[i].dtype == 'int32'):
            df[i] = pd.to_numeric(df[i],downcast='integer')
        
        if (df[i].dtype == 'float64' or df[i].dtype == 'float32'):
            df[i] = pd.to_numeric(df[i],downcast='float')

In [4]:
dfTest = pd.read_csv('../input/preprocessedtestv2/preprocessedTestV2.csv')
dfTrain = pd.read_csv('../input/preprocessedtrainv2/preprocessesTrainv2.csv')
changeDType(dfTrain)
changeDType(dfTest)

In [5]:
def stratifiedKFoldWithRandomForest(df):
    
    train,test = train_test_split(df,test_size=0.2)
    
    X = train.drop(['isFraud'],axis=1)
    Y = train['isFraud']
    
    X = X - X.min()/X.max() - X.min()
    
    model = RandomForestClassifier(n_estimators=100,n_jobs=-1)
    skf = StratifiedKFold(n_splits=5)
    
    for train_index,test_index in skf.split(X,Y):
        
        X_train = X.iloc[train_index]
        Y_train = Y.iloc[train_index]
        
        X_test = X.iloc[test_index]
        Y_test = Y.iloc[test_index]
        
        model.fit(X_train,Y_train)
        
        print("Accuracy ", model.score(X_test,Y_test))
        
        prob = model.predict_proba(X_test)[:,1]
        
        auc = metrics.roc_auc_score(Y_test,prob)
        print("validation-AUC : ",auc,"\n===============================")
    
    
    X_test = test.drop(['isFraud'],axis=1)
    Y_test = test['isFraud']
    
    prob = model.predict_proba(X_test)[:,1]
        
    auc = metrics.roc_auc_score(Y_test,prob)
    print("Test - AUC : ",auc,"\n===============================")

    return model

In [6]:
def evaluateModel(df,model):
    
    df = df - df.min() / df.max()-df.min()
    
    y_pred = model.predict(df)
    
    result = pd.DataFrame(y_pred)
    
    result.columns = ['isFraud']
    result.to_csv("./result_rf.csv")
    
    print("Result file saved")


In [7]:
model = stratifiedKFoldWithRandomForest(dfTrain)
evaluateModel(dfTest,model)

Accuracy  0.9797106135316466
validation-AUC :  0.90879403535621 
Accuracy  0.9793549430118826
validation-AUC :  0.9169977549234725 
Accuracy  0.9791771077520006
validation-AUC :  0.9100250360706923 
Accuracy  0.9802602861531
validation-AUC :  0.9130266570259747 
Accuracy  0.9806964788049277
validation-AUC :  0.9143387997178959 
Test - AUC :  0.8917185501753043 
Result file saved


In [25]:
def tuneParams(df,para):
#     (n_estimators=100, *, criterion="gini", max_depth=None, min_samples_split=2,
# min_samples_leaf=1, min_weight_fraction_leaf=0.0,
# max_features="auto", max_leaf_nodes=None, min_impurity_decrease=0.0, bootstrap=True, oob_score=False,
# n_jobs=None, random_state=None, verbose=0, warm_start=False, class_weight=None, ccp_alpha=0.0, max_samples=None
  
    params = para
    
    rf = RandomForestClassifier()
    
    gridSearchCV = RandomizedSearchCV(estimator=rf,n_iter=30,cv=7,scoring='roc_auc',n_jobs=-1,verbose=2,param_distributions=params)
    
    train,test = train_test_split(df,test_size=0.23)
    
    X_train = train.drop(['isFraud'],axis=1)
    Y_train = train['isFraud']
    
    X_train = X_train - X_train.min()/X_train.max() - X_train.min()
    
    gridSearchCV.fit(X_train,Y_train)
    
    bestParams = gridSearchCV.best_params_
    
    print(bestParams)
    
    return gridSearchCV.best_estimator_


In [18]:
model = tuneParams(dfTrain)

Fitting 7 folds for each of 30 candidates, totalling 210 fits
[CV] END bootstrap=True, criterion=gini, max_depth=10, max_features=0.4, max_samples=None, min_samples_split=0.67, n_estimators=205, n_jobs=-1; total time=   7.6s
[CV] END bootstrap=True, criterion=gini, max_depth=10, max_features=0.4, max_samples=None, min_samples_split=0.67, n_estimators=205, n_jobs=-1; total time=   7.7s
[CV] END bootstrap=True, criterion=gini, max_depth=10, max_features=0.4, max_samples=None, min_samples_split=0.67, n_estimators=205, n_jobs=-1; total time=   7.6s
[CV] END bootstrap=True, criterion=gini, max_depth=10, max_features=0.4, max_samples=None, min_samples_split=0.67, n_estimators=205, n_jobs=-1; total time=   7.9s
[CV] END bootstrap=True, criterion=gini, max_depth=20, max_features=0.5, max_samples=0.5, min_samples_split=1, n_estimators=145, n_jobs=-1; total time=   0.5s
[CV] END bootstrap=True, criterion=gini, max_depth=20, max_features=0.5, max_samples=0.5, min_samples_split=1, n_estimators=145

105 fits failed out of a total of 210.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
7 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/conda/lib/python3.7/site-packages/sklearn/ensemble/_forest.py", line 467, in fit
    for i, t in enumerate(trees)
  File "/opt/conda/lib/python3.7/site-packages/joblib/parallel.py", line 1054, in __call__
    self.retrieve()
  File "/opt/conda/lib/python3.7/site-packages/joblib/parallel.py", line 933, in retrieve
    self._output.extend(job.get(timeout=self.timeout))
  File "/opt/conda/lib/python3.7/mul

{'n_jobs': -1, 'n_estimators': 150, 'min_samples_split': 0.2, 'max_samples': 0.35, 'max_features': 0.3, 'max_depth': 12, 'criterion': 'gini', 'bootstrap': True}


In [21]:
train,test = train_test_split(dfTrain,test_size=0.23)

X_train = train.drop(['isFraud'],axis=1)
Y_train = train['isFraud']

X_train = X_train - X_train.min()/X_train.max() - X_train.min()

model.fit(X_train,Y_train)

X_test = test.drop(['isFraud'],axis=1)
Y_test = test['isFraud']

X_test = X_test - X_test.min()/X_test.max()-X_test.min()

prob = model.predict_proba(X_test)[:,1]

auc = metrics.roc_auc_score(Y_test,prob)
print("validation-AUC : ",auc,"\n===============================")


validation-AUC :  0.8118387196907935 


In [24]:
evaluateModel(dfTest,model)

Result file saved


## tunning 2

In [26]:
param = {
    'n_estimators' : [30,50,65,75,90,100,110,120,135,145,150,165,175,184,190,200,205,215],
    'criterion':['gini','entropy'],
    'max_depth' : [None,10,12,15,17,20,24,29,35,40,45,50],
    'min_samples_split' : [0.2,0.25,0.3,0.35,0.4,0.5,0.58,0.67,0.74,0.84,0.95,1.0],
    'max_features' : ['auto',0.2,0.3,0.4,0.5,0.6,0.65,0.7,0.75,0.8],
    'bootstrap' : [True],
    'max_samples' : [0.3,0.35,0.4,0.5,0.55,0.65,0.7,0.75,0.8],
    'n_jobs':[-1]
 }

model2 = tuneParams(dfTrain,param)

Fitting 7 folds for each of 30 candidates, totalling 210 fits
[CV] END bootstrap=True, criterion=entropy, max_depth=12, max_features=0.8, max_samples=0.7, min_samples_split=0.95, n_estimators=90, n_jobs=-1; total time=   3.5s
[CV] END bootstrap=True, criterion=entropy, max_depth=12, max_features=0.8, max_samples=0.7, min_samples_split=0.95, n_estimators=90, n_jobs=-1; total time=   3.6s
[CV] END bootstrap=True, criterion=entropy, max_depth=12, max_features=0.8, max_samples=0.7, min_samples_split=0.95, n_estimators=90, n_jobs=-1; total time=   3.6s
[CV] END bootstrap=True, criterion=entropy, max_depth=12, max_features=0.8, max_samples=0.7, min_samples_split=0.95, n_estimators=90, n_jobs=-1; total time=   3.6s
[CV] END bootstrap=True, criterion=entropy, max_depth=12, max_features=0.8, max_samples=0.7, min_samples_split=0.95, n_estimators=90, n_jobs=-1; total time=   3.3s
[CV] END bootstrap=True, criterion=entropy, max_depth=12, max_features=0.8, max_samples=0.7, min_samples_split=0.95, n

In [1]:
model2

NameError: name 'model2' is not defined

In [28]:
evaluateModel(dfTest,model2)

Result file saved
