In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

from subprocess import check_output

# Show what is available in the input directory (raw `ls` output, decoded for printing).
input_listing = check_output(["ls", "../input"])
print(input_listing.decode("utf8"))

# Any results you write to the current directory are saved as output.

In [None]:
import pandas as pd

In [None]:
# Load the transactions table used by the exploratory cells below.
creditcard_csv = '../input/creditcard.csv'
df = pd.read_csv(creditcard_csv)

In [None]:
# Preview the first five rows.
df.head(n=5)

In [None]:
%matplotlib inline
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from imblearn.combine import SMOTEENN 

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score

# Seed NumPy's global random number generator so steps that draw from it are repeatable.
np.random.seed(5)

In [None]:
# Number of rows for each value of the Class label.
df['Class'].value_counts()

In [None]:
# Move the label column to the front so it is the first column in every preview.
# `class` is a reserved word in Python, hence the trailing underscore in `class_`.
class_ = df['Class']
# Build a re-ordered frame instead of dropping/inserting in place: the cell stays
# idempotent and never leaves `df` in a half-modified state if it is interrupted.
df = df[['Class'] + [col for col in df.columns if col != 'Class']]
df.head()

 **Check missing values**

In [None]:
# One boolean per column: True if that column contains at least one missing value.
df.isnull().any(axis=0)

In [None]:
# Share of each Class value, expressed as a percentage of all rows.
class_counts = df.Class.value_counts()
fraud_rate = 100 * class_counts / df.shape[0]
fraud_rate

In [None]:
# Summary statistics, transposed so that each column of df becomes a row.
df.describe().transpose()

In [None]:
# Compare fraud and normal transactions: mean of every column per Class value,
# transposed so that the columns of df become rows and the classes become columns.
fraud_summary = df.groupby('Class')
q = fraud_summary.mean().transpose()
q

In [None]:
# Pairwise correlation of all columns, drawn as a heat map.
corr = df.corr()
tick_labels = corr.columns.values

fig, ax = plt.subplots()
fig.set_size_inches(11.7, 8.27)  # the size of A4 paper, in inches
sns.heatmap(
    corr,
    xticklabels=tick_labels,
    yticklabels=tick_labels,
    cmap='YlGnBu',
    ax=ax,
)
plt.title('Heatmap of Correlation Matrix')

A one-sample t-test checks whether the mean of a sample differs from a known population mean. Here we test whether the average amount of the transactions classified as fraud differs from the average amount over the entire population of transactions.

In [None]:
import scipy.stats as stats

# Mean transaction amount over all rows vs. over the fraud rows only.
amount_population = df.Amount.mean()
fraud_amounts = df.loc[df.Class == 1, 'Amount']
amount_fraud = fraud_amounts.mean()
print('mean amount of population: {}, mean amount of fraud transaction: {}'.format(amount_population, amount_fraud))

# One-sample t-test of the fraud amounts against the population mean printed above.
# (The reference value used to be the mean of the non-fraud rows, which is neither
# the value printed nor the "entire population" the accompanying text refers to.)
stats.ttest_1samp(fraud_amounts, amount_population)

In [None]:
# Amounts of the fraud transactions: the sample of the one-sample t-test.
fraud_amounts = df.loc[df['Class'] == 1, 'Amount']

# A one-sample t-test on n observations has n - 1 degrees of freedom.
degree_freedom = len(fraud_amounts) - 1
conf_level = 0.99

# Two-sided critical values of the t-distribution at the chosen confidence level
# (these are quantiles of the distribution, not quartiles).
LQ = stats.t.ppf((1 - conf_level) / 2, degree_freedom)  # lower critical value
RQ = stats.t.ppf((1 + conf_level) / 2, degree_freedom)  # upper critical value

print ('The t-distribution left quartile range is: ' + str(LQ))
print ('The t-distribution right quartile range is: ' + str(RQ))

# The t statistic to compare against the critical values: fraud amounts tested
# against the mean amount of the entire population of transactions.
stats.ttest_1samp(fraud_amounts, df['Amount'].mean())

3.4. Distribution plots

In [None]:
# Plotting every pair of columns would be too slow, so restrict the pair plot to
# the label, Amount, Time and the first two V-features (principal components).
pair_cols = ['Class', 'Amount', 'Time', 'V1', 'V2']
sns.pairplot(df.loc[:, pair_cols], hue='Class')

In [None]:
# Kernel density estimate of the Time column, drawn separately for normal (blue)
# and fraud (red) transactions on the same axes.
fig = plt.figure(figsize=(16,9),)
ax=sns.kdeplot(df.loc[(df['Class'] == 0), 'Time'] , color='b', shade=True,label='normal transaction')
ax=sns.kdeplot(df.loc[(df['Class'] == 1), 'Time'] , color='r', shade=True, label='fraud transaction')
# The plotted variable is Time, so the title must say so (it used to say "amount").
plt.title('Transaction time distribution - normal V.S. fraud')

In [None]:
# Scatter plot of Amount against Time, one colour per Class value.
# fit_reg=False turns off the regression line lmplot would otherwise draw.
sns.lmplot(data=df, x='Time', y='Amount', hue='Class', fit_reg=False)

In [None]:
# Joint distribution of Time and Amount, one figure per class:
# normal transactions in blue, fraud transactions in red.
normal_tx = df[df['Class'] == 0]
fraud_tx = df[df['Class'] == 1]
sns.jointplot(data=normal_tx, x='Time', y='Amount', color='b')
sns.jointplot(data=fraud_tx, x='Time', y='Amount', color='r')

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

# Features: every column except the label and Time. Labels: the Class column.
features_raw = df.drop(['Class', 'Time'], axis=1).values
y = df['Class'].values

# Stratified 80/20 split. The splitter is asked for two splits and the second one
# is kept; this reproduces the partition the original loop ended up with, because
# that loop overwrote its variables on every iteration.
sss = StratifiedShuffleSplit(n_splits=2, test_size=0.2, random_state=5)
*_, (train_index, test_index) = sss.split(features_raw, y)

# Fit the scaler on the training rows only and apply it to everything, so that no
# statistics of the held-out rows leak into the training data.
scaler = StandardScaler().fit(features_raw[train_index])
X = scaler.transform(features_raw)

X_train_ = X[train_index, :]
y_train_ = y[train_index]
X_test = X[test_index, :]
y_test = y[test_index]

In [None]:
# How many positive (Class == 1) labels ended up on each side of the split,
# as an absolute count and as a percentage of that side.
y_train_pos = y_train_[y_train_ == 1]
y_test_pos = y_test[y_test == 1]

n_train_pos = y_train_pos.shape[0]
n_test_pos = y_test_pos.shape[0]
train_pos_pct = n_train_pos*100. / y_train_.shape[0]
test_pos_pct = n_test_pos*100. / y_test.shape[0]

print('# positive in train data: {}, {}%'.format(n_train_pos, train_pos_pct))
print('# positive in test data: {}, {}%'.format(n_test_pos, test_pos_pct))

In [None]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import recall_score
from sklearn.model_selection import StratifiedKFold

def kfold_cv(Model, X, y, n_splits=10, smote=False, verbose=False):
    """Pick the regularization strength C by stratified k-fold cross-validation.

    For every C in 10**-3 ... 10**3 (seven values) a fresh
    ``Model(class_weight='balanced', C=C)`` is fitted on each training fold and
    scored on the matching held-out fold by ROC AUC and recall. The C with the
    highest mean recall is selected. The same folds are used for every C.

    Args:
        Model: estimator class (not an instance) that accepts the keyword
            arguments ``class_weight`` and ``C`` and provides ``fit``,
            ``predict`` and ``predict_proba``.
        X: 2-D array of features; rows are selected by integer position.
        y: 1-D array of labels aligned with the rows of ``X``. Column 1 of
            ``predict_proba`` is used as the score of the positive class.
        n_splits: number of cross-validation folds.
        smote: if True, each training fold is resampled with SMOTEENN before
            fitting; the held-out fold is never resampled.
        verbose: if True, print the AUC of every fold.

    Returns:
        Tuple ``(bestC, bestauc, bestrecall)``: the selected C together with
        its mean AUC and mean recall over the folds.
    """
    skf = StratifiedKFold(n_splits=n_splits, random_state=5, shuffle=True)
    C = np.logspace(-3, 3, num=7, base=10)

    def sub_cv(model):
        """Return (mean AUC, mean recall) of ``model`` over the folds of ``skf``."""
        resample = None
        if smote:
            sme = SMOTEENN(random_state=5)
            # `fit_sample` was removed from newer imbalanced-learn releases in
            # favour of `fit_resample`; fall back only for old installations.
            resample = getattr(sme, 'fit_resample', None)
            if resample is None:
                resample = sme.fit_sample

        aucs = []
        recalls = []
        for fold, (train_index, test_index) in enumerate(skf.split(X, y), start=1):
            X_train, y_train = X[train_index, :], y[train_index]
            X_test, y_test = X[test_index, :], y[test_index]
            if resample is not None:
                X_train, y_train = resample(X_train, y_train)

            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            y_score = model.predict_proba(X_test)[:, 1]

            score = roc_auc_score(y_test, y_score, average='micro')
            if verbose:
                print('Trained {} th model, AUC score: {}'.format(fold, score))
            aucs.append(score)
            recalls.append(recall_score(y_test, y_pred))
        return sum(aucs) / len(aucs), sum(recalls) / len(recalls)

    # None marks "nothing selected yet", so the first candidate is always taken
    # and an invalid C of 0 can never be returned, even if every recall is 0.
    bestC = None
    bestauc = 0
    bestrecall = 0
    for c in C:
        model = Model(class_weight='balanced', C=c)
        auc, recall = sub_cv(model)
        if bestC is None or recall > bestrecall:
            bestauc = auc
            bestC = c
            bestrecall = recall
        print('C: {}, AUC: {}, recall: {}, best C: {}'.format(c, auc, recall, bestC))
    return bestC, bestauc, bestrecall

In [None]:
# Tune C for a class-weighted logistic regression on the training split only,
# using 5 folds and without per-fold output.
Model = LogisticRegression
cv_result = kfold_cv(Model, X_train_, y_train_, n_splits=5, verbose=False)
bestC, bestauc, bestrecall = cv_result
print('Best C: {}'.format(bestC))

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import xgboost
from sklearn.model_selection import cross_val_score, train_test_split, StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score, log_loss
from sklearn.ensemble import GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression

def getCvMetrics(cfr, X, y):
    """Evaluate a classifier with stratified 5-fold cross-validation.

    The classifier is refitted on each training fold and scored on the
    held-out fold by log-loss and ROC AUC. Per-fold values and their means are
    printed.

    Args:
        cfr: classifier providing ``fit`` and ``predict_proba``. It is fitted
            in place, so after the call it holds the model of the last fold.
        X: features; a NumPy array or a pandas DataFrame.
        y: labels aligned with the rows of ``X``; a NumPy array or a pandas
            Series. Column 1 of ``predict_proba`` is used as the score of the
            positive class.

    Returns:
        Tuple ``(mean_log_loss, mean_auc)`` over the five folds.
    """
    # The folds are integer positions. Indexing a DataFrame with them would
    # select columns, and indexing a Series would look them up as labels, so
    # pandas inputs are reduced to their underlying arrays first.
    if isinstance(X, (pd.DataFrame, pd.Series)):
        X = X.values
    if isinstance(y, pd.Series):
        y = y.values

    #Stratified K-Fold
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=5)
    #iterate through the training and test cross validation segments and
    #run the classifier on each one, aggregating the results into a list
    print('Iterating through the training and cross validation sets...')
    ploss = []
    aucs = []
    for train, cval in cv.split(X, y):
        cfr.fit(X[train], y[train])
        # Predict once per fold and reuse the probabilities for both metrics.
        proba = cfr.predict_proba(X[cval])
        loss = log_loss(y[cval], proba)
        auc = roc_auc_score(y[cval], proba[:, 1])
        print(loss, auc)
        ploss.append(loss)
        aucs.append(auc)

    #print out the mean of the cross-validated results
    mean_loss = np.array(ploss).mean()
    mean_auc = np.array(aucs).mean()
    print('Mean log-loss: %f. Mean AUC %f' % (mean_loss, mean_auc))
    return mean_loss, mean_auc

# --- Load -------------------------------------------------------------------
print('Loading data...')
dfcard = pd.read_csv('../input/creditcard.csv')
print(dfcard.shape)

# --- Target and features ----------------------------------------------------
# The label is the Class column; every other column is used as a feature.
fraud = dfcard['Class']
print('Fraud ratio: %f' % fraud.mean())
print(fraud.value_counts())

train = dfcard.drop(['Class'], axis=1)
print('Train set shape {}'.format(train.shape))
print(train.columns)

# --- Hold-out validation ----------------------------------------------------
print('Training...')
Xtrain, Xval, ytrain, yval = train_test_split(
    train,
    fraud,
    test_size=0.2,
    stratify=fraud,
    random_state=4,
)

# Alternatives the author tried, with the scores noted next to them at the time:
#   xgboost.XGBClassifier(max_depth=6, learning_rate=0.05, n_estimators=100,
#                         base_score=0.0017, nthread=-1)               -> 0.97172
#   GradientBoostingClassifier(max_depth=6, learning_rate=0.05,
#                              n_estimators=100)                        -> 0.867587
#   LogisticRegression(C=1000, class_weight='balanced')
# Cross-validated AUCs recorded from earlier runs (getCvMetrics / cross_val_score):
#   xgb:    0.982599 [0.9871420, 0.988217, 0.983688, 0.983114, 0.970831]
#   etrees: 0.979302 [0.990166, 0.982376, 0.978804, 0.976500, 0.968663]
#   xgb (cross_val_score, 5-fold):
#           0.971972 [0.98227762, 0.95253806, 0.95207493, 0.98965631, 0.98331328]
clf = ExtraTreesClassifier(
    n_estimators=200,
    class_weight='balanced',
    max_depth=7,
    random_state=12,
    n_jobs=-1,
)  # 0.982360

# Fit on the training part, then score the validation part with the predicted
# probability of the positive class.
clf.fit(Xtrain, ytrain)
val_scores = clf.predict_proba(Xval)[:, 1]
print('Val AUC: %f'%roc_auc_score(yval, val_scores))

In [None]:
# Import the submodule explicitly: a bare `import sklearn` does not by itself
# guarantee that `sklearn.metrics` has been loaded.
import sklearn.metrics

# Confusion matrix of the fitted classifier on the validation split
# (rows: true class, columns: predicted class).
sklearn.metrics.confusion_matrix(yval,clf.predict(Xval))