In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import seaborn as sns
%matplotlib inline
from imblearn.over_sampling import SMOTE
try:
    from sklearn.cross_validation import KFold
except ImportError:
    # sklearn.cross_validation was removed in scikit-learn 0.20; fall back to its
    # successor so the import cell still runs on current releases.
    from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, recall_score,precision_score, f1_score, roc_auc_score, roc_curve, confusion_matrix, accuracy_score


# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
# Print the names of the entries found in ../input, the directory the CSV is read from.
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

In [None]:
# Load the credit-card CSV into a single DataFrame; every later cell works from `data`.
data = pd.read_csv("../input/creditcard.csv")

In [None]:
# Size of the loaded frame as a (rows, columns) tuple.
data.shape

In [None]:
# Preview the first few rows to check the column names and value ranges.
data.head()

In [None]:
# Number of missing values per column.
# Vectorised count: avoids iterating every cell with the Python built-in sum().
data.isnull().sum()

In [None]:
# Number of distinct values per column; a very low count hints at a categorical variable.
# dropna=False counts NaN as a value, matching what len(Series.unique()) reports.
data.nunique(dropna=False)

In [None]:
# Pairwise correlation between the feature columns (the target 'Class' is left out).
# seaborn and the inline matplotlib backend are already set up in the imports cell,
# so they are not imported/activated again here.
corr = data.drop(['Class'], axis=1).corr()

# Boolean mask that hides the upper triangle (diagonal included), since the
# matrix is symmetric. The built-in `bool` is used because the `np.bool` alias
# was deprecated in NumPy 1.20 and removed in 1.24.
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))

# Custom diverging colour map
cmap = sns.diverging_palette(220, 10, as_cmap=True)

# Draw the heat map with the mask and a square aspect ratio; only every 5th
# tick label is shown on each axis.
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3,
            square=True, xticklabels=5, yticklabels=5,
            linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)

In [None]:
# Bar chart of the number of rows per class label.
# The column is passed by keyword: newer seaborn releases treat the first
# positional argument as `data`, not as the x variable.
sns.countplot(x='Class', data=data)
# Exact row counts per class label.
print(data['Class'].value_counts())

The following points are clear from the above analysis:
     1. The baseline accuracy (always predicting the majority class) is already about 99.8%, so accuracy alone is not an informative metric here.
     2. The dataset is clearly highly imbalanced.
     3. The dataset therefore needs to be "undersampled" or "oversampled" so that both output labels are represented by the same number of samples.

In [None]:
# Remove the 'Time' column. Written as a re-bindable drop with errors='ignore'
# so the cell can be re-run safely; `del data['Time']` raises KeyError the
# second time it is executed.
data = data.drop('Time', axis=1, errors='ignore')

In [None]:
# Three-way split: 20% of the rows are held out as the test set, then 20% of the
# remainder becomes the validation (cv) set.
# The intermediate result has its own name so that re-running this cell always
# yields the same partitions; reassigning train_df from itself would shrink the
# training set on every re-execution.
train_val_df, test_df = train_test_split(data, test_size = 0.2, random_state = 144)
train_df, cv_df = train_test_split(train_val_df, test_size = 0.2, random_state = 144)

In [None]:
# Print the (rows, columns) size of each partition in the order train, validation, test.
for split_df in (train_df, cv_df, test_df):
    print(split_df.shape)

In [None]:
# Predictors: every training column except the target.
x_train = train_df.drop('Class', axis=1)
# Target: kept as a one-column DataFrame (double brackets), not a Series.
y_train = train_df[['Class']]

In [None]:
# Oversample the training split with SMOTE (seeded for reproducibility).
oversampler = SMOTE(random_state=144)
# imbalanced-learn renamed `fit_sample` to `fit_resample` and later removed the
# old name; use the new method when it exists and fall back otherwise.
smote_resample = getattr(oversampler, "fit_resample", None) or oversampler.fit_sample
# Labels are passed as a flat 1-D array so the resampler does not receive a
# column vector.
x_os_train, y_os_train = smote_resample(x_train, y_train.values.ravel())

In [None]:
# Class balance after oversampling.
# Counting on a flattened array gives the right answer whether y_os_train is an
# ndarray, a Series or a one-column DataFrame; boolean-masking a DataFrame keeps
# every row, so len() would report the total instead of the class count.
os_labels = np.asarray(y_os_train).ravel()
print("No.of variables with class labels as 1: ", int((os_labels == 1).sum()))
print("No.of variables with class labels as 0: ", int((os_labels == 0).sum()))

In [None]:
def printing_Kfold_scores(x_train, y_train, c_parm_range=(0.01, 0.1, 1, 10), n_splits=5):
    """Pick the logistic-regression ``C`` with the best cross-validated recall.

    For each candidate ``C`` an L1-penalised ``LogisticRegression`` is fitted on
    the training part of every fold and scored with ``recall_score`` on the
    held-out part. Per-fold recall and the mean recall are printed for every
    candidate.

    Parameters
    ----------
    x_train : array-like of shape (n_samples, n_features)
        Feature matrix (DataFrame or ndarray).
    y_train : array-like
        Class labels, row-aligned with ``x_train``. A one-column DataFrame,
        a Series or a 1-D array are all accepted; the values are flattened.
    c_parm_range : sequence of numbers, optional
        Candidate values for ``C``. Defaults to ``(0.01, 0.1, 1, 10)``.
    n_splits : int, optional
        Number of folds. Defaults to 5.

    Returns
    -------
    The element of ``c_parm_range`` with the highest mean recall (the first
    one on ties).

    Raises
    ------
    ValueError
        If ``c_parm_range`` is empty.
    """
    # Imported locally on purpose: the notebook-level ``KFold`` may be the legacy
    # ``sklearn.cross_validation`` class, whose constructor and iteration
    # protocol differ from the ``model_selection`` API used below.
    from sklearn.model_selection import KFold

    c_parm_range = list(c_parm_range)
    if not c_parm_range:
        raise ValueError("c_parm_range must contain at least one candidate value")

    # Plain arrays make DataFrame, Series and ndarray inputs behave the same and
    # keep fit/predict inputs consistent with each other.
    features = np.asarray(x_train)
    labels = np.asarray(y_train).ravel()

    # NOTE(review): folds are contiguous, unshuffled row blocks. If the rows are
    # ordered by class (e.g. synthetic samples appended at the end), some folds
    # may contain a single class - confirm this is intended.
    folds = KFold(n_splits=n_splits, shuffle=False)

    mean_recalls = []
    for c_parm in c_parm_range:
        print("___________________________________________")
        print("C parameter: ", c_parm)
        print("___________________________________________")
        print("    ")

        recall_accs = []
        for iteration, (train_idx, test_idx) in enumerate(folds.split(features), start=1):
            # liblinear is named explicitly because it supports the L1 penalty;
            # the default solver of recent scikit-learn releases does not.
            lr = LogisticRegression(C=c_parm, penalty='l1', solver='liblinear')
            lr.fit(features[train_idx], labels[train_idx])

            fold_pred = lr.predict(features[test_idx])
            recall_acc = recall_score(labels[test_idx], fold_pred)
            recall_accs.append(recall_acc)

            print("Iteration : ", iteration, " : recall score = ", recall_acc)

        # Average over ALL folds (not just the last fold's score).
        mean_recall = np.mean(recall_accs)
        mean_recalls.append(mean_recall)

        print("                ")
        print("Mean recall score ", mean_recall)
        print("                ")

    best_c = c_parm_range[int(np.argmax(mean_recalls))]
    print("Best C parameter by mean recall: ", best_c)
    return best_c

In [None]:
# Run the C-parameter search on the oversampled training data, wrapping both
# inputs in DataFrames as the function expects.
best_c = printing_Kfold_scores(
    x_train=pd.DataFrame(x_os_train),
    y_train=pd.DataFrame(y_os_train),
)

In [None]:
# Predictors of the validation split: every column except the target.
x_cv = cv_df.drop('Class', axis=1)
# Target of the validation split, kept as a one-column DataFrame.
y_cv = cv_df[['Class']]

In [None]:
# L1-penalised logistic regression trained on the oversampled data.
# The solver is named explicitly: since scikit-learn 0.22 the default solver is
# lbfgs, which rejects penalty='l1'; liblinear supports it on old and new
# releases alike.
model = LogisticRegression(C=1, penalty= 'l1', solver='liblinear')
# np.ravel guarantees a 1-D label vector whatever container y_os_train is.
model.fit(x_os_train, np.ravel(y_os_train))

In [None]:
# Hard class predictions for the validation split.
y_pred = model.predict(x_cv)
# Store the matrix under its own name: assigning it to `confusion_matrix` would
# shadow the sklearn function and make every later confusion_matrix(...) call
# fail with "'numpy.ndarray' object is not callable".
cv_confusion = confusion_matrix(y_cv, y_pred)
print(cv_confusion)

In [None]:
# Per-class summary of the validation predictions.
print(classification_report(y_cv, y_pred))

# Headline metrics, printed one per line in a fixed order.
for metric_label, metric_fn in (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
):
    print(metric_label, metric_fn(y_cv, y_pred))

In [None]:
# roc_auc_score and roc_curve are already imported in the notebook's first cell.

# Predicted probability of the positive class for every validation row.
# Both the AUC and the curve are computed from these continuous scores; feeding
# hard 0/1 predictions to roc_auc_score would give the area of a single
# operating point, which does not match the plotted curve.
cv_scores = model.predict_proba(x_cv)[:, 1]
logit_roc_auc = roc_auc_score(y_cv, cv_scores)
fpr, tpr, thresholds = roc_curve(y_cv, cv_scores)

plt.figure()
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
# Diagonal reference line where TPR equals FPR.
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.show()

In [None]:
# Display the ROC AUC value stored in logit_roc_auc.
logit_roc_auc

In [None]:
# Choose the ROC point where the gap between true-positive rate and
# false-positive rate is largest, and read off the matching score threshold.
tpr_fpr_gap = tpr - fpr
optimal_idx = tpr_fpr_gap.argmax()
optimal_threshold = thresholds[optimal_idx]
optimal_threshold

In [None]:
# Re-label the validation rows with the tuned threshold.
# The threshold must be applied to the model's predicted probabilities;
# applying it to the true labels y_cv just reproduces y_cv and makes the
# metrics computed from y_pred_th trivially perfect.
# `>=` matches roc_curve's convention (a row is positive when score >= threshold).
y_pred_th = (model.predict_proba(x_cv)[:, 1] >= optimal_threshold).astype(int)

In [None]:
# Classification report comparing y_pred_th with the validation labels y_cv.
print(classification_report(y_cv, y_pred_th))

In [None]:
# Imported under an alias so this cell still reaches the sklearn function even
# when the notebook-level name `confusion_matrix` has been rebound to an array
# (calling that array raises "'numpy.ndarray' object is not callable").
from sklearn.metrics import confusion_matrix as compute_confusion_matrix

# Confusion matrix of the threshold-based labels against the validation labels.
threshold_confusion = compute_confusion_matrix(y_cv, y_pred_th)
print(threshold_confusion)