**Import the Data and import useful libraries**

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plot
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.linear_model import LogisticRegression
try:
    from sklearn.cross_validation import KFold, cross_val_score
except ImportError:
    # sklearn.cross_validation was removed in scikit-learn 0.20; the same names
    # live in sklearn.model_selection (note: KFold's constructor differs there).
    from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import confusion_matrix,precision_recall_curve,auc,roc_auc_score,roc_curve,recall_score,classification_report, precision_score
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

from subprocess import check_output

# Show which files are present in the input directory.
input_listing = check_output(["ls", "../input"])
print(input_listing.decode("utf8"))

# Any results you write to the current directory are saved as output.

# Read creditcard.csv into the DataFrame used by every later cell.
data = pd.read_csv('../input/creditcard.csv')

**Check if classes are imbalanced**

In [None]:
# Count the rows of each class, ordered by class label, and draw them as bars.
# Series.value_counts replaces the deprecated top-level pd.value_counts helper.
count_classes = data['Class'].value_counts(sort = True).sort_index()
count_classes.plot(kind = 'bar')
plot.title("Fraud class histogram")
plot.xlabel("Class")
plot.ylabel("Frequency")

There is a clear imbalance between the two classes. We will need to sample the data to make the two classes equally represented in the training and test data.

Now let's see what ranges the data has.

In [None]:
# Summary statistics for every column, used to compare the feature ranges.
data.describe()

All features show a mean very close to zero except for Amount.

**Normalize the Amount column and drop the not so useful Time column**

In [None]:
# StandardScaler expects a 2-D input, so Amount is reshaped into a single column.
amount_column = data['Amount'].values.reshape(-1, 1)
data['normAmount'] = StandardScaler().fit_transform(amount_column)

# Time and the unscaled Amount are no longer needed once normAmount exists.
data = data.drop(['Time', 'Amount'], axis=1)
data.head()

In [None]:
# Build a balanced sample: every fraud row plus an equally sized random draw
# of non-fraud rows.

# Index labels and number of data points in the minority class
fraud_indices = np.array(data[data.Class == 1].index)
number_records_fraud = len(fraud_indices)

nonfraud_indices = data[data.Class == 0].index

# A dedicated seeded generator makes the undersampled subset identical on
# every run without touching NumPy's global random state.
sampling_rng = np.random.RandomState(0)
random_nonfraud_indices = np.array(
    sampling_rng.choice(nonfraud_indices, number_records_fraud, replace = False)
)

# concatenate everything together
sampled_indices = np.concatenate([fraud_indices,random_nonfraud_indices])

# dataset -- the collected values are index labels, so select with .loc
# (label based) rather than .iloc (position based).
sampled_data = data.loc[sampled_indices,:]

# .loc with a boolean column mask replaces the removed .ix indexer;
# y_sampled stays a one-column DataFrame, as later cells expect.
X_sampled = sampled_data.loc[:, sampled_data.columns != 'Class']
y_sampled = sampled_data.loc[:, sampled_data.columns == 'Class']

# Showing ratio
print("Percentage of legit transactions: ", len(sampled_data[sampled_data.Class == 0])/len(sampled_data))
print("Percentage of fraud transactions: ", len(sampled_data[sampled_data.Class == 1])/len(sampled_data))
print("Total number of transactions in sampled data: ", len(sampled_data))

**Divide into training and test**

In [None]:
# Hold out 30% of the balanced sample for testing; the fixed random_state
# keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_sampled,
    y_sampled,
    test_size=0.3,
    random_state=0,
)

**Create the first classifier by fitting a decision tree**

In [None]:
# Decision tree with default settings, fitted on the training split
# (fit returns the estimator itself, so construction and fitting chain).
clf = tree.DecisionTreeClassifier().fit(X_train, y_train)

**Let's evaluate the decision tree with the precision recall curve**
First, get the predictions for the test data.

In [None]:
# Class probabilities from the decision tree for every test row
# (one column per class).
y_test_pred = clf.predict_proba(X_test)

In [None]:
# Column 1 of predict_proba holds the score for class 1 (fraud).
tree_fraud_scores = y_test_pred[:, 1]
precision, recall, thresholds = precision_recall_curve(y_test, tree_fraud_scores)

In [None]:
# Precision-recall curve of the decision tree, drawn through an explicit Axes.
fig, ax = plot.subplots()
ax.plot(recall, precision, color='navy', label='Precision-Recall curve')
ax.set_ylabel('precision')
ax.set_xlabel('recall')

For this data set, we need to evaluate the classifiers with a recall metric because our priority is to detect fraudulent transactions. Undetected fraud cases (fraud wrongly classified as legit) are more problematic than cases where we wrongly classify legit transactions as fraud. Let's see our best recall score.

In [None]:
# Highest recall on the decision-tree curve among points with recall below 1.
below_one = recall < 1
recall[below_one].max()

**Now let's train a logistic regression model on the same data and compare it to our tree classifier**

Define a function which does k-fold cross validation training for multiple values of the C parameter and returns the C parameter which results in the best model with regards to the recall metric

In [None]:
def best_Kfold_Cparam(x_train_data,y_train_data,c_param_range,k,random_state=None):
    """Pick the logistic-regression C value with the best cross-validated recall.

    For every candidate in ``c_param_range`` an L1-penalised
    ``LogisticRegression`` is trained and scored with ``recall_score`` on each
    of ``k`` shuffled folds; the candidate with the highest mean recall wins
    (the first one on ties).

    Parameters
    ----------
    x_train_data : pandas.DataFrame
        Training features; rows are selected positionally with ``.iloc``.
    y_train_data : array-like
        Training labels, one per row of ``x_train_data``. A one-column
        DataFrame, a Series or an array are all accepted; it is flattened
        to 1-D.
    c_param_range : iterable
        Candidate values for the ``C`` parameter.
    k : int
        Number of folds.
    random_state : int or None, optional
        Seed for the fold shuffling. ``None`` (default) leaves the shuffle
        unseeded.

    Returns
    -------
    The element of ``c_param_range`` with the highest mean recall.

    Raises
    ------
    ValueError
        If ``c_param_range`` is empty.
    """
    # Imported locally on purpose: the module-level KFold may come from the
    # legacy sklearn.cross_validation module, whose constructor and iteration
    # protocol differ from the model_selection API used here.
    from sklearn.model_selection import KFold

    c_values = list(c_param_range)
    if not c_values:
        raise ValueError('c_param_range must contain at least one value')

    fold = KFold(n_splits=k, shuffle=True, random_state=random_state)
    targets = np.asarray(y_train_data).ravel()

    mean_recalls = []
    for c_param in c_values:
        print('C parameter: ', c_param)

        recall_accs = []
        for train_indices, validation_indices in fold.split(x_train_data):
            # liblinear is named explicitly because it supports the L1
            # penalty; newer scikit-learn defaults to a solver that does not.
            lr = LogisticRegression(C = c_param, penalty = 'l1', solver = 'liblinear')
            lr.fit(x_train_data.iloc[train_indices,:], targets[train_indices])

            # Predict on the same kind of object (DataFrame) the model was fitted on.
            y_pred_sample = lr.predict(x_train_data.iloc[validation_indices,:])

            recall_accs.append(recall_score(targets[validation_indices], y_pred_sample))

        # keep the mean recall aligned with its C parameter by position
        mean_recall = np.mean(recall_accs)
        mean_recalls.append(mean_recall)
        print('')
        print('Mean recall score ', mean_recall)
        print('')

    # np.argmax returns the first maximum, so ties go to the earliest candidate.
    best_c = c_values[int(np.argmax(mean_recalls))]

    print('Best model w/ regards to recall has C parameter ', best_c)

    return best_c

In [None]:
# Candidate C values, spanning 1e-5 to 10 in powers of ten; 10-fold search.
c_grid = [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10]
best = best_Kfold_Cparam(X_train, y_train, c_grid, 10)

0.001 is our best C parameter value. Now train a logistic classifier for real

In [None]:
# Final logistic regression on the full training split.
# solver='liblinear' is spelled out because the L1 penalty is not supported by
# the default solver of newer scikit-learn releases; on older releases
# liblinear already was the default, so results there are unchanged.
flr = LogisticRegression(C = 0.001, penalty = 'l1', solver = 'liblinear')
flr.fit(X_train,y_train.values.ravel())

Now let's evaluate our regression model and compare it to the decision tree previously created

In [None]:
# Class probabilities from the logistic regression for every test row.
y_pred_test = flr.predict_proba(X_test)

In [None]:
# Column 1 of predict_proba holds the score for class 1 (fraud).
logreg_fraud_scores = y_pred_test[:, 1]
precision_2, recall_2, thresholds_2 = precision_recall_curve(y_test, logreg_fraud_scores)

In [None]:
# Overlay the precision-recall curves of both models. Each curve gets its own
# label and a legend is drawn, so the colours can be told apart.
plot.plot(recall, precision, color='navy', label='Decision tree')
plot.plot(recall_2, precision_2, color='red', label='Logistic regression')
plot.ylabel('precision')
plot.xlabel('recall')
plot.legend(loc='lower left')

Recall stays higher for the regression model for a wider range of precision values
Our regression model seems to be doing better !

Let's try AdaBoost.

In [None]:
# Boosted-ensemble classifier used for the third model.
from sklearn.ensemble import AdaBoostClassifier
# NOTE(review): cross_val_score is not called in any of the cells shown here.
from sklearn.model_selection import cross_val_score

# AdaBoost classifier configured with n_estimators=200; it is fitted in the next cell.
AdaBoostClf = AdaBoostClassifier(n_estimators=200)



In [None]:
# Flatten the one-column label frame into a 1-D array before fitting.
train_labels = y_train.values.ravel()
AdaBoostClf = AdaBoostClf.fit(X=X_train, y=train_labels)

Evaluate the model with precision-recall

In [None]:
# Class probabilities from AdaBoost for every test row.
# NOTE: this rebinds y_test_pred, which earlier held the decision tree's probabilities.
y_test_pred = AdaBoostClf.predict_proba(X_test)

In [None]:
# Column 1 of predict_proba holds the score for class 1 (fraud).
adaboost_fraud_scores = y_test_pred[:, 1]
precision_3, recall_3, thresholds_3 = precision_recall_curve(y_test, adaboost_fraud_scores)

In [None]:
# Overlay the precision-recall curves of all three models. Each curve gets its
# own label and a legend is drawn, so the colours can be told apart.
plot.plot(recall, precision, color='navy', label='Decision tree')
plot.plot(recall_2, precision_2, color='red', label='Logistic regression')
plot.plot(recall_3, precision_3, color='green', label='AdaBoost')
plot.ylabel('precision')
plot.xlabel('recall')
plot.legend(loc='lower left')