In [16]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

In [32]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

from xgboost import XGBClassifier
from hyperopt import hp, fmin, tpe, Trials, STATUS_OK
from lightgbm.sklearn import LGBMRegressor

from sklearn.model_selection import GridSearchCV


import sklearn
from sklearn.metrics import confusion_matrix, roc_curve, roc_auc_score, precision_recall_curve,auc
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score, accuracy_score
from sklearn.metrics import mean_squared_error


## Import Data

In [18]:
# Load creditcard.csv (path is relative to the working directory) into a DataFrame.
df=pd.read_csv('creditcard.csv')
# Preview the first rows to sanity-check the columns.
df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [19]:
# Per-column summary statistics (count, mean, std, min, quartiles, max).
df.describe()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
count,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,...,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0
mean,94813.859575,1.16598e-15,3.416908e-16,-1.37315e-15,2.086869e-15,9.604066e-16,1.490107e-15,-5.556467e-16,1.177556e-16,-2.406455e-15,...,1.656562e-16,-3.44485e-16,2.578648e-16,4.471968e-15,5.340915e-16,1.687098e-15,-3.666453e-16,-1.220404e-16,88.349619,0.001727
std,47488.145955,1.958696,1.651309,1.516255,1.415869,1.380247,1.332271,1.237094,1.194353,1.098632,...,0.734524,0.7257016,0.6244603,0.6056471,0.5212781,0.482227,0.4036325,0.3300833,250.120109,0.041527
min,0.0,-56.40751,-72.71573,-48.32559,-5.683171,-113.7433,-26.16051,-43.55724,-73.21672,-13.43407,...,-34.83038,-10.93314,-44.80774,-2.836627,-10.2954,-2.604551,-22.56568,-15.43008,0.0,0.0
25%,54201.5,-0.9203734,-0.5985499,-0.8903648,-0.8486401,-0.6915971,-0.7682956,-0.5540759,-0.2086297,-0.6430976,...,-0.2283949,-0.5423504,-0.1618463,-0.3545861,-0.3171451,-0.3269839,-0.07083953,-0.05295979,5.6,0.0
50%,84692.0,0.0181088,0.06548556,0.1798463,-0.01984653,-0.05433583,-0.2741871,0.04010308,0.02235804,-0.05142873,...,-0.02945017,0.006781943,-0.01119293,0.04097606,0.0165935,-0.05213911,0.001342146,0.01124383,22.0,0.0
75%,139320.5,1.315642,0.8037239,1.027196,0.7433413,0.6119264,0.3985649,0.5704361,0.3273459,0.597139,...,0.1863772,0.5285536,0.1476421,0.4395266,0.3507156,0.2409522,0.09104512,0.07827995,77.165,0.0
max,172792.0,2.45493,22.05773,9.382558,16.87534,34.80167,73.30163,120.5895,20.00721,15.59499,...,27.20284,10.50309,22.52841,4.584549,7.519589,3.517346,31.6122,33.84781,25691.16,1.0


### Check the balance of Data

In [20]:
# Count rows per value of the 'Class' label to see how imbalanced the target is.
df['Class'].value_counts()

0    284315
1       492
Name: Class, dtype: int64

## Split Data to Train and Test

In [21]:
# Every column except the last one is a model input; the last column is the target.
features, labels = df.iloc[:, :-1], df.iloc[:, -1]

In [22]:
# 80/20 hold-out split; stratifying on the labels keeps the class ratio the same in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=100,
    stratify=labels,
)

In [23]:
# For each label split, show its share of all rows and its per-class counts.
for dataset in (y_train, y_test):
    share = round(len(dataset) / len(labels), 2)
    print(share)
    print(dataset.value_counts())

0.8
0    227451
1       394
Name: Class, dtype: int64
0.2
0    56864
1       98
Name: Class, dtype: int64


## Normalize Data

In [24]:
# Scaler configured to map every feature into the [0, 1] range.
Scaled=MinMaxScaler(feature_range = (0, 1))
# It makes more sense to fit on the training data only and then transform the train, test and validation data with that fit.

In [25]:
# Fit the scaler on the training split and replace X_train with its scaled version.
# Note: X_train is overwritten, so re-running this cell refits the scaler on already-scaled data.
X_train=Scaled.fit(X_train).transform(X_train)

In [41]:
# Scale the test split with the scaler fitted on the training split (no refit here).
X_test=Scaled.transform(X_test)

# Create XGBoost Hyperparameter Tuning

In [36]:
from sklearn.model_selection import KFold, cross_val_score


In [37]:
# Seed for the CV splitter. It is assigned here because this cell is the first one
# that reads it; without this, a fresh top-to-bottom run fails with NameError.
random_state = 42

# Number of cross-validation folds used by the grid search.
num_folds = 2

# shuffle=True is required whenever random_state is given: scikit-learn rejects a
# seeded KFold that does not shuffle, because the seed would have no effect.
kf = KFold(n_splits=num_folds, shuffle=True, random_state=random_state)

In [28]:
# Seed passed to the CV splitter, the parameter grid and the LightGBM model.
random_state=42


In [38]:
# Candidate values for the grid search:
#   learning_rate -> 3 log-spaced values from 1e-3 to 1e-1
#   max_depth     -> the 8 integers 5..12
#   n_estimators  -> 5 evenly spaced integers from 800 to 1200
# random_state is a single-value list so every candidate uses the same seed.
param_grid = dict(
    learning_rate=np.logspace(-3, -1, 3),
    max_depth=np.linspace(5, 12, 8, dtype=int),
    n_estimators=np.linspace(800, 1200, 5, dtype=int),
    random_state=[random_state],
)

In [39]:
# Base estimator handed to the grid search; seeded with the shared random_state.
# NOTE(review): this is a regressor fitted on the 0/1 'Class' label — confirm that is intended rather than a classifier.
model = LGBMRegressor(random_state=random_state)

In [41]:
# Exhaustive search over param_grid, scored by negated MSE (higher is better),
# using the kf splitter and all available cores.
gs = GridSearchCV(
    model,
    param_grid,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    cv=kf,
    verbose=False,
)

In [None]:
# Run the grid search on the scaled training split.
# The grid has 3 * 8 * 5 = 120 candidates, each cross-validated over num_folds folds, so this cell is slow.
gs.fit(X_train, y_train)


In [None]:
# MSE of the best grid-search model on the hold-out split.
# The hold-out data in this notebook is X_test / y_test; the previous names
# (test_data / test_targets) were never defined.
gs_test_score = mean_squared_error(y_test, gs.predict(X_test))


In [None]:
# best_score_ holds the negated MSE (see the scoring argument), so flip the sign for display.
best_mse = -gs.best_score_
best_params = gs.best_params_
print("Best MSE {:.3f} params {}".format(best_mse, best_params))


In [11]:
# Search space for hyperparameter tuning with hyperopt.
#
# Quick reference for the hyperopt sampling expressions:
#   hp.choice(label, options)        - returns one of the options, which should be a list or tuple.
#   hp.randint(label, upper)         - returns a random integer in the range [0, upper).
#   hp.uniform(label, low, high)     - returns a value uniformly between low and high.
#   hp.quniform(label, low, high, q) - returns round(uniform(low, high) / q) * q; the value is
#                                      rounded to a multiple of q but is still a float, so cast
#                                      it with int() wherever an integer is required.
#   hp.normal(label, mean, std)      - returns a real value that is normally distributed
#                                      with the given mean and std.

parameters={'max_depth': hp.quniform("max_depth", 3, 18, 1),
        'gamma': hp.uniform ('gamma', 1,9),
        'reg_alpha' : hp.quniform('reg_alpha', 40,180,1),
        'reg_lambda' : hp.uniform('reg_lambda', 0,1),
        'colsample_bytree' : hp.uniform('colsample_bytree', 0.5,1),
        'min_child_weight' : hp.quniform('min_child_weight', 0, 10, 1),
        'n_estimators': 180   # fixed value, not searched
}

In [14]:
from sklearn.ensemble import GradientBoostingClassifier 
from sklearn.model_selection import GridSearchCV

# scikit-learn gradient boosting classifier with all-default hyperparameters.
gb=GradientBoostingClassifier()


In [15]:
# GridSearchCV needs an explicit list of candidates per parameter. The hyperopt
# `parameters` space holds sampler expressions (and XGBoost-specific keys), which
# GridSearchCV rejects with "Parameter values for parameter (max_depth) need to be
# a sequence". This grid is a coarse cut of the same max_depth range (3..18) and
# keeps the same fixed n_estimators.
gb_param_grid = {'max_depth': [3, 8, 13, 18],
                 'n_estimators': [180]}

cv=GridSearchCV(gb, gb_param_grid, cv=5)

In [None]:

def hyperparameter_tuning(parameters):
    """Hyperopt objective: train one XGBoost classifier and score it by F1.

    Parameters
    ----------
    parameters : dict
        One configuration sampled from the search space. Must contain
        'n_estimators', 'max_depth', 'gamma', 'reg_alpha', 'min_child_weight'
        and 'colsample_bytree'; 'reg_lambda' is used when present.

    Returns
    -------
    dict
        ``{'loss': -f1, 'status': STATUS_OK}``. The F1 score is negated
        because hyperopt minimises the loss.

    Notes
    -----
    Reads X_train / y_train / X_test / y_test from the notebook namespace.
    NOTE(review): early stopping and the F1 score both use the test split, so
    the tuned parameters are selected on test data — consider a separate
    validation split.
    """
    # Read from the sampled configuration passed in by fmin (the previous body
    # read an undefined global named `space`).
    optional = {}
    if 'reg_lambda' in parameters:
        # Sampled by the search space but previously never forwarded to the model.
        optional['reg_lambda'] = parameters['reg_lambda']

    # hp.quniform yields floats, hence the int() casts for integer-valued settings.
    clf = XGBClassifier(n_estimators=parameters['n_estimators'],
                        max_depth=int(parameters['max_depth']),
                        gamma=parameters['gamma'],
                        reg_alpha=int(parameters['reg_alpha']),
                        min_child_weight=parameters['min_child_weight'],
                        colsample_bytree=parameters['colsample_bytree'],
                        **optional)
    evaluation = [(X_train, y_train), (X_test, y_test)]

    # NOTE(review): recent xgboost releases expect eval_metric and
    # early_stopping_rounds on the constructor rather than fit() — confirm
    # against the installed version.
    clf.fit(X_train, y_train,
            eval_set=evaluation, eval_metric="rmse",
            early_stopping_rounds=10, verbose=False)

    # predict() already returns class labels, so no probability threshold is needed.
    pred = clf.predict(X_test)
    f1score = f1_score(y_test, pred)
    print("SCORE:", f1score)
    # Change the metric here if a different objective is wanted.
    return {'loss': -f1score, 'status': STATUS_OK}


# Run the hyperparameter search: 100 TPE-guided evaluations of the objective.
# `trials` records every evaluation; `best` receives the best configuration found.
trials = Trials()
best = fmin(fn=hyperparameter_tuning,
            space=parameters,   # the search space dict; no variable named `space` exists
            algo=tpe.suggest,
            max_evals=100,
            trials=trials)

print(best)