In [None]:
#The number of fraudulent transactions is 492 out of 284807; only a small part of the whole dataset belongs to the fraudulent class.
#Since the dataset has 31 features, it is necessary to find out how fraudulent transactions differ from normal ones.
#There are not any null values in dataset.  
#Fraudulent instances account for only 0.172% of all transactions.
#Therefore, the dataset has a strong imbalanced nature and the problem is two-class classification. 
#From the summary statistics of the data, all features from V1 to V28 have a mean value of (approximately) zero.
#This can be interpreted as all anonymized features having been centered to mean 0.
#As mentioned in Data explanation, these variables are the result of PCA transformations.
#Therefore, PCA transformation is not applied to the dataset again.
#Feature selection is carried out based on the results of the EDA part.
#Among the features used for modelling, only the 'Amount' feature has a mean different from zero; thus, 'Amount' may need to be rescaled so that all variables are on a comparable scale while developing the machine learning model.


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns #to visualization
import matplotlib.pyplot as plt # to plot the graphs
import matplotlib.gridspec as gridspec # to do the grid of plots
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import confusion_matrix, precision_score ,auc, roc_curve
from sklearn import svm, tree, linear_model, neighbors, naive_bayes, ensemble, discriminant_analysis, neural_network
from imblearn.over_sampling import SMOTE
from sklearn.decomposition import PCA
import warnings
warnings.filterwarnings('ignore') #ignore warning messages 

In [None]:
data = pd.read_csv('../input/creditcard.csv')
data.head() #for obtaining first five rows of dataset
data.info() #information about dataset
data.describe()
data[["Time","Amount"]].describe() #important stats from original values
data[data.Amount>10000]### there are only seven points above 10,000, thus these values are excluded from the dataset
# Keep the exact complement of the outlier filter above (<= rather than <, so a
# transaction of exactly 10,000 is not dropped as well) and take an explicit copy,
# because later cells add columns to data_new and must not write into a slice of `data`.
data_new=data[data.Amount<=10000].copy()
data_new.describe()

In [None]:
# Interpret 'Time' as a number of seconds (unit='s') and split it into clock components.
timedelta = pd.to_timedelta(data_new['Time'], unit='s')
time_parts = timedelta.dt.components  # build the components frame once instead of once per column
# .assign() returns a new frame, so the new columns are not written into a filtered
# slice of `data` (no SettingWithCopyWarning) and re-running the cell is harmless.
data_new = data_new.assign(
    Time_min=time_parts.minutes.astype(int),   #new variable for further analysis
    Time_hour=time_parts.hours.astype(int),    #new variable for further analysis
)


#Exploring the distribution of each Class through hours and then through minutes
# One figure per time component; in each figure the normal class (green) is drawn
# first and the fraud class (red) second.
for time_col, fig_title, x_limits in (
        ("Time_hour", 'Fraud x Normal Transactions by Hours (Red: Fraud; Green:Normal)', [-1,25]),
        ("Time_min", 'Fraud x Normal Transactions by minutes', [-1,61])):
    plt.figure(figsize=(12,5))
    for class_label, curve_color in ((0, 'g'), (1, 'r')):
        sns.distplot(data_new.loc[data_new['Class'] == class_label, time_col],
                     color=curve_color)
    plt.title(fig_title, fontsize=12)
    plt.xlim(x_limits)
    plt.show()


#Separate the fraud and the non-fraud transactions
df_fraud = data_new.loc[data_new['Class'] == 1]
df_normal = data_new.loc[data_new['Class'] == 0]

# Print the Amount summary of each group under its heading
for heading, subset in (("Fraud transaction statistics", df_fraud),
                        ("\nNormal transaction statistics", df_normal)):
    print(heading)
    print(subset["Amount"].describe())


#Feature engineering to a better visualization of the values

# Logarithmic transformation due to the skewness of the fraud transaction distribution;
# the 0.01 offset means an Amount of 0 does not produce -inf.
# .assign() builds a new frame instead of inserting a column into a filtered slice of `data`.
data_new = data_new.assign(Amount_log=np.log(data_new.Amount + 0.01))

# Two side-by-side panels on explicit axes: raw Amount (left) and log Amount (right), both by Class
fig, (ax, ax1) = plt.subplots(1, 2, figsize=(14,6))

sns.boxplot(x ="Class",y="Amount",
            data=data_new, ax=ax)
ax.set_title("Class x Amount", fontsize=20)
ax.set_xlabel("Is Fraud?", fontsize=16)
ax.set_ylabel("Amount(US)", fontsize = 16)

sns.boxplot(x ="Class",y="Amount_log", data=data_new, ax=ax1)
ax1.set_title("Class x Amount", fontsize=20)
ax1.set_xlabel("Is Fraud?", fontsize=16)
ax1.set_ylabel("Amount(Log)", fontsize = 16)

plt.subplots_adjust(hspace = 0.6, top = 0.8)

plt.show()


#Amount against time (minutes first, then hours) for fraud and normal transactions
for time_col, plot_title in (
        ("Time_min", "Amounts by Minutes of Frauds and Normal Transactions"),
        ("Time_hour", "Amounts by Hour of Frauds and Normal Transactions")):
    # fit_reg=False turns lmplot into a plain scatter plot, coloured by Class
    ax = sns.lmplot(x=time_col, y="Amount", hue='Class', data=data_new,
                    fit_reg=False, aspect=1.8)
    plt.title(plot_title, fontsize=8)
    plt.show()

### distribution of each class for the synthetic variables V1-V28

feature_cols = data_new.columns[1:29]      # positional slice: columns 1..28 of the frame
fraud_rows = data_new.Class == 1
normal_rows = data_new.Class == 0

plt.figure(figsize=(12, 28*4))
gs = gridspec.GridSpec(28, 1)              # one stacked panel per feature
for i, cn in enumerate(feature_cols):
    ax = plt.subplot(gs[i])
    # fraud first, then normal, overlaid on the same axes
    sns.distplot(data_new.loc[fraud_rows, cn], bins=50)
    sns.distplot(data_new.loc[normal_rows, cn], bins=50)
    ax.set_xlabel('')
    ax.set_title('feature: ' + str(cn))
plt.show()


##Correlation matrix for the whole data (the unfiltered `data` frame)
plt.figure(figsize=(14,12))
colormap = plt.cm.Greens
corr_all = data.corr()

# Annotated, square-celled heat map with thin white separators between cells
sns.heatmap(corr_all, annot=True, square=True, vmax=1.0,
            cmap=colormap, linewidths=0.1, linecolor='white')
plt.show()


##


#Feature selection

data_new2 = data_new[["V3","V4","V9","V10","V11","V12","V17","V19","Amount","Class"]]

#correlation matrix of the selected features, one panel per class
f, (ax1, ax2) = plt.subplots(1,2,figsize =( 15, 8))

# drop(columns=...) replaces drop([...], 1): the positional axis argument is no longer
# accepted by recent pandas releases, while the keyword form works on old and new versions.
sns.heatmap(data_new2.query('Class==1').drop(columns=['Class']).corr(), vmax = .8, square=True, ax = ax1, cmap = 'YlGnBu')
ax1.set_title('Fraud')

sns.heatmap(data_new2.query('Class==0').drop(columns=['Class']).corr(), vmax = .8, square=True, ax = ax2, cmap = 'YlGnBu');
ax2.set_title('Normal')

plt.show()


In [None]:
#Since PCA has already been applied to the dataset, only the important variables identified in the EDA are used in the machine learning models.
#Random under-sampling, random over-sampling and SMOTE are used as data-level strategies for the imbalanced credit card dataset.

In [None]:
###Machine Learning Model:

###packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import confusion_matrix, precision_score ,auc, roc_curve
from sklearn import svm, tree, linear_model, neighbors, naive_bayes, ensemble, discriminant_analysis, neural_network
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import precision_score, recall_score, confusion_matrix, classification_report, roc_curve, precision_recall_curve
import itertools
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import roc_auc_score



#####After EDA Part Reduced Dataset& Analysis

df = pd.read_csv('../input/creditcard.csv')

# Vectorised counts (the built-in sum() would iterate over the Series element by element)
n_normal = int((df.Class == 0).sum())
n_fraud = int((df.Class == 1).sum())
print('The number of normal transactions : ' + str(n_normal))
print('The number of frauds : ' + str(n_fraud))
print ('The percentage of fraud of all transactions : ' + str(float(n_fraud)/float(len(df.Class))*100.0))
# Series.value_counts() replaces the deprecated top-level pd.value_counts(); it is printed
# because a bare expression in the middle of a cell is never displayed.
print(df['Class'].value_counts())

# Keep only the variables selected in the EDA (plus Class); every other V-feature is dropped
df_reduced = df.drop(['V1','V2','V5','V6','V7','V8','V14','V13','V15','V16' ,'V18','V20','V21','V22','V23','V24','V25','V26','V27','V28'], axis =1)
# Standardise Amount (zero mean, unit variance); the raw Amount and Time columns are then removed
df_reduced['Amount_Stand'] = StandardScaler().fit_transform(df_reduced['Amount'].values.reshape(-1, 1)) ###Standart normalizer on Amount
df_reduced = df_reduced.drop(['Time', 'Amount'], axis=1)
df_reduced.head()


In [None]:
##special thanks to this amazing work: https://www.kaggle.com/vincentlugat/votingclassifier-f1-score-0-88-data-viz

#####confusion_Matrix

# confusion matrix
def plot_confusion_matrix(cm, classes,
                          normalize = False,
                          title = 'Confusion matrix',
                          cmap = plt.cm.Blues) :
    """Draw a confusion matrix as an annotated heat map on the current figure.

    Parameters
    ----------
    cm : numpy.ndarray
        Square confusion matrix; rows are drawn on the 'True label' axis and
        columns on the 'Predicted label' axis.
    classes : sequence
        Tick labels, one per row/column of ``cm``.
    normalize : bool, default False
        When True, each row is divided by its total so cells show per-true-class
        fractions (printed with two decimals). Rows that sum to zero stay at 0.
        When False the raw values of ``cm`` are shown.
    title : str
        Plot title.
    cmap : matplotlib colormap
        Colormap used for the cells.

    The function does not call ``plt.show()``; the caller decides when to render.
    """
    if normalize:
        row_totals = cm.sum(axis = 1)[:, np.newaxis]
        # np.divide(..., where=...) leaves empty rows at 0 instead of producing nan
        cm = np.divide(cm.astype(float), row_totals,
                       out = np.zeros(cm.shape, dtype = float),
                       where = row_totals != 0)

    plt.imshow(cm, interpolation = 'nearest', cmap = cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation = 0)
    plt.yticks(tick_marks, classes)

    # Cells darker than half of the maximum get white text so the number stays readable
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])) :
        cell_text = '{:.2f}'.format(cm[i, j]) if normalize else cm[i, j]
        plt.text(j, i, cell_text,
                 horizontalalignment = 'center',
                 color = 'white' if cm[i, j] > thresh else 'black')

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')


####recall-F1 Score-Precision

def show_metrics(matrix = None):
    """Print precision, recall and F1 score of a 2x2 confusion matrix.

    Parameters
    ----------
    matrix : array-like of shape (2, 2), optional
        Confusion matrix read as [[tn, fp], [fn, tp]]. When omitted, the
        notebook-level variable ``cm`` is used, so existing ``show_metrics()``
        calls keep working unchanged.

    A metric whose denominator is zero (for example no positive prediction at all)
    is reported as 0.000 instead of nan.
    """
    counts = np.asarray(cm if matrix is None else matrix)
    tp = counts[1,1]
    fn = counts[1,0]
    fp = counts[0,1]

    precision = tp / (tp + fp) if (tp + fp) else 0.0
    recall = tp / (tp + fn) if (tp + fn) else 0.0
    # F1 is the harmonic mean of precision and recall
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0

    print('Precision =     {:.3f}'.format(precision))
    print('Recall    =     {:.3f}'.format(recall))
    print('F1_score  =     {:.3f}'.format(f1))

# precision-recall curve
def plot_precision_recall(precision = None, recall = None):
    """Plot a precision-recall curve (step outline, shaded area and line).

    Parameters
    ----------
    precision, recall : array-like, optional
        Curve coordinates, e.g. the first two values returned by
        ``precision_recall_curve``. When omitted, the notebook-level variables
        ``precision_score`` and ``recall_score`` are used; the model cells rebind
        those names to the curve arrays, so existing ``plot_precision_recall()``
        calls keep working unchanged.
    """
    if precision is None:
        precision = precision_score
    if recall is None:
        recall = recall_score

    plt.step(recall, precision, color = 'b', alpha = 0.2,
             where = 'post')
    plt.fill_between(recall, precision, step ='post', alpha = 0.2,
                 color = 'b')

    plt.plot(recall, precision, linewidth=2)
    plt.xlim([0.0,1])
    plt.ylim([0.0,1.05])
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Precision Recall Curve')
    plt.show();

# ROC curve
def plot_roc(fpr_max = 1.0):
    """Plot the ROC curve held in the notebook-level variables ``fpr`` and ``tpr``.

    Parameters
    ----------
    fpr_max : float, default 1.0
        Upper limit of the false-positive-rate axis. The default shows the whole
        curve together with the dashed chance diagonal. The axis used to be clipped
        at 0.001, which hides everything except the left-most sliver of the curve;
        pass ``fpr_max=0.001`` to get that zoomed view back.
    """
    plt.plot(fpr, tpr, label = 'ROC curve', linewidth = 2)
    plt.plot([0,1],[0,1], 'k--', linewidth = 2)   # chance-level reference line
    plt.xlim([0.0,fpr_max])
    plt.ylim([0.0,1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve')
    plt.show();
#feature importance plot
def plot_feature_importance(model, feature_names = None):
    """Bar plot of a fitted model's feature importances, largest first.

    Parameters
    ----------
    model : fitted estimator exposing ``feature_importances_``
    feature_names : sequence of str, optional
        One label per importance value, in the column order the model was trained
        on. When omitted, the notebook-level variable ``predictors`` is used, so
        existing ``plot_feature_importance(model)`` calls keep working unchanged.
    """
    if feature_names is None:
        feature_names = predictors
    tmp = pd.DataFrame({'Feature': list(feature_names), 'Feature importance': model.feature_importances_})
    tmp = tmp.sort_values(by='Feature importance',ascending=False)
    plt.figure(figsize = (12,12))
    plt.title('Features importance',fontsize=14)
    s = sns.barplot(x='Feature',y='Feature importance',data=tmp)
    s.set_xticklabels(s.get_xticklabels(),rotation=90)   # vertical labels so long names do not overlap
    plt.show()


In [None]:
#####Logistic Regression

#####LogisticRegression with Three Sampling Method
#SMOTE
X = df_reduced.drop('Class', axis=1)
y = df_reduced.Class

##training size 0.8
# Split BEFORE resampling and stratify on the label: the test set then holds only real
# transactions, so no synthetic or duplicated sample can leak into the evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=3, stratify=y)

# Balance the training portion only. fit_resample is the current imbalanced-learn API
# (fit_sample was removed); random_state makes the resampling reproducible.
X_resample, y_resample = SMOTE(random_state=3).fit_resample(X_train, y_train)
print ( 'The number of transactions after resampling : ' + str(len(X_resample)))
print ('If the number of frauds is equal to the number of normal tansactions? ' + str((y_resample == 0).sum() == (y_resample == 1).sum())
       )

logistic_reg = LogisticRegression()
logistic_reg.fit(X_resample, y_resample)
y_pred = logistic_reg.predict(X_test)
y_score = logistic_reg.decision_function(X_test)

# Confusion matrix & metrics
cm = confusion_matrix(y_test, y_pred)
class_names = [0,1]
plt.figure()
plot_confusion_matrix(cm,
                      classes = class_names,
                      title = 'Logistic Regression Confusion matrix')
plt.show()
show_metrics()
## ROC_AUC Curve Score
# AUC is computed from the continuous scores; hard 0/1 predictions reduce the curve to one threshold
print('ROC AUC Score:',metrics.roc_auc_score(y_test, y_score))
# ROC curve
fpr, tpr, t = roc_curve(y_test, y_score)
plot_roc()
# Precision-recall curve
# These two names are read as globals by plot_precision_recall(); note that they rebind
# the sklearn functions imported under the same names.
precision_score, recall_score, thresholds = precision_recall_curve(y_test, y_score)
plot_precision_recall()


In [None]:
#####Logistic Regression

# df_reduced is built once in the "After EDA Part Reduced Dataset" cell; the CSV is not
# reloaded and re-reduced here, because that repeated exactly the same steps.

#####LogisticRegression with Three Sampling Method
#Random.Oversampler
X = df_reduced.drop('Class', axis=1)
y = df_reduced.Class

##training size 0.8
# Split BEFORE resampling and stratify on the label: the test set then holds only real
# transactions, so no duplicated minority row can appear in both train and test.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=3, stratify=y)

# Balance the training portion only. fit_resample is the current imbalanced-learn API
# (fit_sample was removed); random_state makes the resampling reproducible.
X_resample, y_resample = RandomOverSampler(random_state=3).fit_resample(X_train, y_train)
print ( 'The number of transactions after resampling : ' + str(len(X_resample)))
print ('If the number of frauds is equal to the number of normal tansactions? ' + str((y_resample == 0).sum() == (y_resample == 1).sum())
       )

logistic_reg = LogisticRegression()
logistic_reg.fit(X_resample, y_resample)
y_pred = logistic_reg.predict(X_test)
y_score = logistic_reg.decision_function(X_test)

# Confusion matrix & metrics
cm = confusion_matrix(y_test, y_pred)
class_names = [0,1]
plt.figure()
plot_confusion_matrix(cm,
                      classes = class_names,
                      title = 'Logistic Regression Confusion matrix')
plt.show()
show_metrics()
## ROC_AUC Curve Score
# AUC is computed from the continuous scores; hard 0/1 predictions reduce the curve to one threshold
print('ROC AUC Score:',metrics.roc_auc_score(y_test, y_score))
# ROC curve
fpr, tpr, t = roc_curve(y_test, y_score)
plot_roc()
# Precision-recall curve
# These two names are read as globals by plot_precision_recall(); note that they rebind
# the sklearn functions imported under the same names.
precision_score, recall_score, thresholds = precision_recall_curve(y_test, y_score)
plot_precision_recall()


In [None]:
#####Logistic Regression

#####LogisticRegression with Three Sampling Method
#Random.Undersampler
X = df_reduced.drop('Class', axis=1)
y = df_reduced.Class

##training size 0.8
# Split BEFORE resampling and stratify on the label, so the model is evaluated on
# real transactions with the original class balance rather than on a resampled set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=3, stratify=y)

# Balance the training portion only. fit_resample is the current imbalanced-learn API
# (fit_sample was removed); random_state makes the resampling reproducible.
X_resample, y_resample = RandomUnderSampler(random_state=3).fit_resample(X_train, y_train)
print ( 'The number of transactions after resampling : ' + str(len(X_resample)))
print ('If the number of frauds is equal to the number of normal tansactions? ' + str((y_resample == 0).sum() == (y_resample == 1).sum())
       )

logistic_reg = LogisticRegression()
logistic_reg.fit(X_resample, y_resample)
y_pred = logistic_reg.predict(X_test)
y_score = logistic_reg.decision_function(X_test)

# Confusion matrix & metrics
cm = confusion_matrix(y_test, y_pred)
class_names = [0,1]
plt.figure()
plot_confusion_matrix(cm,
                      classes = class_names,
                      title = 'Logistic Regression Confusion matrix')
plt.show()
show_metrics()
## ROC_AUC Curve Score
# AUC is computed from the continuous scores; hard 0/1 predictions reduce the curve to one threshold
print('ROC AUC Score:',metrics.roc_auc_score(y_test, y_score))
# ROC curve
fpr, tpr, t = roc_curve(y_test, y_score)
plot_roc()
# Precision-recall curve
# These two names are read as globals by plot_precision_recall(); note that they rebind
# the sklearn functions imported under the same names.
precision_score, recall_score, thresholds = precision_recall_curve(y_test, y_score)
plot_precision_recall()


In [None]:
#####random forest with three sampling methods (SMOTE)

X = df_reduced.drop('Class', axis=1)
y = df_reduced.Class

##training size 0.8
# Split BEFORE resampling and stratify on the label: the test set then holds only real
# transactions, so no synthetic sample can leak into the evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=3, stratify=y)

# Balance the training portion only. fit_resample is the current imbalanced-learn API
# (fit_sample was removed); random_state makes the resampling reproducible.
X_resample, y_resample = SMOTE(random_state=3).fit_resample(X_train, y_train)
print ( 'The number of transactions after resampling : ' + str(len(X_resample)))
print ('If the number of frauds is equal to the number of normal tansactions? ' + str((y_resample == 0).sum() == (y_resample == 1).sum())
       )

from sklearn.ensemble import RandomForestClassifier

random_forest = RandomForestClassifier(n_jobs = -1,
                                random_state = 42)

random_forest.fit(X_resample, y_resample)
y_pred = random_forest.predict(X_test)
y_score = random_forest.predict_proba(X_test)[:,1]   # probability of the fraud class

# Confusion matrix & metrics
cm = confusion_matrix(y_test, y_pred)
class_names = [0,1]
plt.figure()
plot_confusion_matrix(cm,
                      classes = class_names,
                      title = 'Random Forest Confusion matrix')
plt.show()
show_metrics()
## ROC_AUC Curve Score
# AUC is computed from the predicted probabilities; hard 0/1 predictions reduce the curve to one threshold
print('ROC AUC Score:',metrics.roc_auc_score(y_test, y_score))
# ROC curve
fpr, tpr, t = roc_curve(y_test, y_score)
plot_roc()
# Precision-recall curve
# These two names are read as globals by plot_precision_recall(); note that they rebind
# the sklearn functions imported under the same names.
precision_score, recall_score, thresholds = precision_recall_curve(y_test, y_score)
plot_precision_recall()


In [None]:
#####random forest with three sampling methods (random over-sampling)

X = df_reduced.drop('Class', axis=1)
y = df_reduced.Class

##training size 0.8
# Split BEFORE resampling and stratify on the label: the test set then holds only real
# transactions, so no duplicated minority row can appear in both train and test.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=3, stratify=y)

# Balance the training portion only. fit_resample is the current imbalanced-learn API
# (fit_sample was removed); random_state makes the resampling reproducible.
X_resample, y_resample = RandomOverSampler(random_state=3).fit_resample(X_train, y_train)
print ( 'The number of transactions after resampling : ' + str(len(X_resample)))
print ('If the number of frauds is equal to the number of normal tansactions? ' + str((y_resample == 0).sum() == (y_resample == 1).sum())
       )

from sklearn.ensemble import RandomForestClassifier

random_forest = RandomForestClassifier(n_jobs = -1,
                                random_state = 42)

random_forest.fit(X_resample, y_resample)
y_pred = random_forest.predict(X_test)
y_score = random_forest.predict_proba(X_test)[:,1]   # probability of the fraud class

# Confusion matrix & metrics
cm = confusion_matrix(y_test, y_pred)
class_names = [0,1]
plt.figure()
plot_confusion_matrix(cm,
                      classes = class_names,
                      title = 'Random Forest Confusion matrix')
plt.show()
show_metrics()
## ROC_AUC Curve Score
# AUC is computed from the predicted probabilities; hard 0/1 predictions reduce the curve to one threshold
print('ROC AUC Score:',metrics.roc_auc_score(y_test, y_score))
# ROC curve
fpr, tpr, t = roc_curve(y_test, y_score)
plot_roc()
# Precision-recall curve
# These two names are read as globals by plot_precision_recall(); note that they rebind
# the sklearn functions imported under the same names.
precision_score, recall_score, thresholds = precision_recall_curve(y_test, y_score)
plot_precision_recall()


In [None]:
#####random forest with three sampling methods (random under-sampling)

X = df_reduced.drop('Class', axis=1)
y = df_reduced.Class

##training size 0.8
# Split BEFORE resampling and stratify on the label, so the model is evaluated on
# real transactions with the original class balance rather than on a resampled set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=3, stratify=y)

# Balance the training portion only. fit_resample is the current imbalanced-learn API
# (fit_sample was removed); random_state makes the resampling reproducible.
X_resample, y_resample = RandomUnderSampler(random_state=3).fit_resample(X_train, y_train)
print ( 'The number of transactions after resampling : ' + str(len(X_resample)))
print ('If the number of frauds is equal to the number of normal tansactions? ' + str((y_resample == 0).sum() == (y_resample == 1).sum())
       )

from sklearn.ensemble import RandomForestClassifier

random_forest = RandomForestClassifier(n_jobs = -1,
                                random_state = 42)

random_forest.fit(X_resample, y_resample)
y_pred = random_forest.predict(X_test)
y_score = random_forest.predict_proba(X_test)[:,1]   # probability of the fraud class

# Confusion matrix & metrics
cm = confusion_matrix(y_test, y_pred)
class_names = [0,1]
plt.figure()
plot_confusion_matrix(cm,
                      classes = class_names,
                      title = 'Random Forest Confusion matrix')
plt.show()
show_metrics()
## ROC_AUC Curve Score
# AUC is computed from the predicted probabilities; hard 0/1 predictions reduce the curve to one threshold
print('ROC AUC Score:',metrics.roc_auc_score(y_test, y_score))
# ROC curve
fpr, tpr, t = roc_curve(y_test, y_score)
plot_roc()
# Precision-recall curve
# These two names are read as globals by plot_precision_recall(); note that they rebind
# the sklearn functions imported under the same names.
precision_score, recall_score, thresholds = precision_recall_curve(y_test, y_score)
plot_precision_recall()
### Feature importance:
# Take the labels from the training columns so they always match the order (and the
# real names, e.g. 'Amount_Stand') of the features the model was fitted on.
predictors = list(X.columns)
plot_feature_importance(random_forest)

In [None]:
### Decision tree algorithm with three sampling methods (SMOTE)

X = df_reduced.drop('Class', axis=1)
y = df_reduced.Class

##training size 0.8
# test_size=0.2 gives the 0.8 training share stated above (test_size=0.8 trained on only 20%).
# The split happens BEFORE resampling and is stratified on the label, so the test set
# holds only real transactions and no synthetic sample leaks into the evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=3, stratify=y)

# Balance the training portion only. fit_resample is the current imbalanced-learn API
# (fit_sample was removed); random_state makes the resampling reproducible.
X_resample, y_resample = SMOTE(random_state=3).fit_resample(X_train, y_train)
print ( 'The number of transactions after resampling : ' + str(len(X_resample)))
print ('If the number of frauds is equal to the number of normal tansactions? ' + str((y_resample == 0).sum() == (y_resample == 1).sum())
       )

decision_tree=tree.DecisionTreeClassifier(max_depth=3)
decision_tree.fit(X_resample, y_resample)
y_pred = decision_tree.predict(X_test)
y_score = decision_tree.predict_proba(X_test)[:,1]   # probability of the fraud class

# Confusion matrix & metrics
cm = confusion_matrix(y_test, y_pred)
class_names = [0,1]
plt.figure()
plot_confusion_matrix(cm,
                      classes = class_names,
                      title = 'Decision Tree Confusion matrix')
plt.show()
show_metrics()
## ROC_AUC Curve Score
# AUC is computed from the predicted probabilities; hard 0/1 predictions reduce the curve to one threshold
print('ROC AUC Score:',metrics.roc_auc_score(y_test, y_score))
# ROC curve
fpr, tpr, t = roc_curve(y_test, y_score)
plot_roc()
# Precision-recall curve
# These two names are read as globals by plot_precision_recall(); note that they rebind
# the sklearn functions imported under the same names.
precision_score, recall_score, thresholds = precision_recall_curve(y_test, y_score)
plot_precision_recall()
# Labels come from the training columns so they match the fitted feature order and names
predictors = list(X.columns)
plot_feature_importance(decision_tree)


In [None]:
### Decision tree algorithm with three sampling methods (random over-sampling)

X = df_reduced.drop('Class', axis=1)
y = df_reduced.Class

##training size 0.8
# test_size=0.2 gives the 0.8 training share stated above (test_size=0.8 trained on only 20%).
# The split happens BEFORE resampling and is stratified on the label, so no duplicated
# minority row can appear in both the training and the test data.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=3, stratify=y)

# Balance the training portion only. fit_resample is the current imbalanced-learn API
# (fit_sample was removed); random_state makes the resampling reproducible.
X_resample, y_resample = RandomOverSampler(random_state=3).fit_resample(X_train, y_train)
print ( 'The number of transactions after resampling : ' + str(len(X_resample)))
print ('If the number of frauds is equal to the number of normal tansactions? ' + str((y_resample == 0).sum() == (y_resample == 1).sum())
       )

decision_tree=tree.DecisionTreeClassifier(max_depth=3)
decision_tree.fit(X_resample, y_resample)
y_pred = decision_tree.predict(X_test)
y_score = decision_tree.predict_proba(X_test)[:,1]   # probability of the fraud class

# Confusion matrix & metrics
cm = confusion_matrix(y_test, y_pred)
class_names = [0,1]
plt.figure()
plot_confusion_matrix(cm,
                      classes = class_names,
                      title = 'Decision Tree Confusion matrix')
plt.show()
show_metrics()
## ROC_AUC Curve Score
# AUC is computed from the predicted probabilities; hard 0/1 predictions reduce the curve to one threshold
print('ROC AUC Score:',metrics.roc_auc_score(y_test, y_score))
# ROC curve
fpr, tpr, t = roc_curve(y_test, y_score)
plot_roc()
# Precision-recall curve
# These two names are read as globals by plot_precision_recall(); note that they rebind
# the sklearn functions imported under the same names.
precision_score, recall_score, thresholds = precision_recall_curve(y_test, y_score)
plot_precision_recall()
# Labels come from the training columns so they match the fitted feature order and names
predictors = list(X.columns)
plot_feature_importance(decision_tree)


In [None]:
### Decision tree algorithm with three sampling methods (random under-sampling)

X = df_reduced.drop('Class', axis=1)
y = df_reduced.Class

##training size 0.8
# test_size=0.2 gives the 0.8 training share stated above (test_size=0.8 trained on only 20%).
# The split happens BEFORE resampling and is stratified on the label, so the model is
# evaluated on real transactions with the original class balance.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=3, stratify=y)

# Balance the training portion only. fit_resample is the current imbalanced-learn API
# (fit_sample was removed); random_state makes the resampling reproducible.
X_resample, y_resample = RandomUnderSampler(random_state=3).fit_resample(X_train, y_train)
print ( 'The number of transactions after resampling : ' + str(len(X_resample)))
print ('If the number of frauds is equal to the number of normal tansactions? ' + str((y_resample == 0).sum() == (y_resample == 1).sum())
       )

decision_tree=tree.DecisionTreeClassifier(max_depth=3)
decision_tree.fit(X_resample, y_resample)
y_pred = decision_tree.predict(X_test)
y_score = decision_tree.predict_proba(X_test)[:,1]   # probability of the fraud class

# Confusion matrix & metrics
cm = confusion_matrix(y_test, y_pred)
class_names = [0,1]
plt.figure()
plot_confusion_matrix(cm,
                      classes = class_names,
                      title = 'Decision Tree Confusion matrix')
plt.show()
show_metrics()
## ROC_AUC Curve Score
# AUC is computed from the predicted probabilities; hard 0/1 predictions reduce the curve to one threshold
print('ROC AUC Score:',metrics.roc_auc_score(y_test, y_score))
# ROC curve
fpr, tpr, t = roc_curve(y_test, y_score)
plot_roc()
# Precision-recall curve
# These two names are read as globals by plot_precision_recall(); note that they rebind
# the sklearn functions imported under the same names.
precision_score, recall_score, thresholds = precision_recall_curve(y_test, y_score)
plot_precision_recall()
# Labels come from the training columns so they match the fitted feature order and names
predictors = list(X.columns)
plot_feature_importance(decision_tree)


In [None]:
##Support vector classifier with three sampling methods (SMOTE)

X = df_reduced.drop('Class', axis=1)
y = df_reduced.Class

##training size 0.8
# Split BEFORE resampling and stratify on the label: the test set then holds only real
# transactions, so no synthetic sample can leak into the evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=3, stratify=y)

# Balance the training portion only. fit_resample is the current imbalanced-learn API
# (fit_sample was removed); random_state makes the resampling reproducible.
X_resample, y_resample = SMOTE(random_state=3).fit_resample(X_train, y_train)
print ( 'The number of transactions after resampling : ' + str(len(X_resample)))
print ('If the number of frauds is equal to the number of normal tansactions? ' + str((y_resample == 0).sum() == (y_resample == 1).sum())
       )

# (a stray, unassigned svm.LinearSVC() call that had no effect was removed here)
svmlinear=svm.LinearSVC()

svmlinear.fit(X_resample, y_resample)
y_pred = svmlinear.predict(X_test)
y_score = svmlinear.decision_function(X_test)
# Confusion matrix & metrics
cm = confusion_matrix(y_test, y_pred)
class_names = [0,1]
plt.figure()
plot_confusion_matrix(cm,
                      classes = class_names,
                      title = 'SVC Confusion matrix')
plt.show()
show_metrics()

## ROC_AUC Curve Score
# AUC is computed from the continuous scores; hard 0/1 predictions reduce the curve to one threshold
print('ROC AUC Score:',metrics.roc_auc_score(y_test, y_score))
# ROC curve
fpr, tpr, t = roc_curve(y_test, y_score)
plot_roc()

# Precision-recall curve
# These two names are read as globals by plot_precision_recall(); note that they rebind
# the sklearn functions imported under the same names.
precision_score, recall_score, thresholds = precision_recall_curve(y_test, y_score)
plot_precision_recall()





In [None]:
##Support vector classifier with three sampling methods (random over-sampling)

X = df_reduced.drop('Class', axis=1)
y = df_reduced.Class

##training size 0.8
# Split BEFORE resampling and stratify on the label: the test set then holds only real
# transactions, so no duplicated minority row can appear in both train and test.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=3, stratify=y)

# Balance the training portion only. fit_resample is the current imbalanced-learn API
# (fit_sample was removed); random_state makes the resampling reproducible.
X_resample, y_resample = RandomOverSampler(random_state=3).fit_resample(X_train, y_train)
print ( 'The number of transactions after resampling : ' + str(len(X_resample)))
print ('If the number of frauds is equal to the number of normal tansactions? ' + str((y_resample == 0).sum() == (y_resample == 1).sum())
       )

# (a stray, unassigned svm.LinearSVC() call that had no effect was removed here)
svmlinear=svm.LinearSVC()

svmlinear.fit(X_resample, y_resample)
y_pred = svmlinear.predict(X_test)
y_score = svmlinear.decision_function(X_test)
# Confusion matrix & metrics
cm = confusion_matrix(y_test, y_pred)
class_names = [0,1]
plt.figure()
plot_confusion_matrix(cm,
                      classes = class_names,
                      title = 'SVC Confusion matrix')
plt.show()
show_metrics()

## ROC_AUC Curve Score
# AUC is computed from the continuous scores; hard 0/1 predictions reduce the curve to one threshold
print('ROC AUC Score:',metrics.roc_auc_score(y_test, y_score))
# ROC curve
fpr, tpr, t = roc_curve(y_test, y_score)
plot_roc()

# Precision-recall curve
# These two names are read as globals by plot_precision_recall(); note that they rebind
# the sklearn functions imported under the same names.
precision_score, recall_score, thresholds = precision_recall_curve(y_test, y_score)
plot_precision_recall()





In [None]:
##Support vector classifier with three sampling methods (random under-sampling)

X = df_reduced.drop('Class', axis=1)
y = df_reduced.Class

##training size 0.8
# Split BEFORE resampling and stratify on the label, so the model is evaluated on
# real transactions with the original class balance rather than on a resampled set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=3, stratify=y)

# Balance the training portion only. fit_resample is the current imbalanced-learn API
# (fit_sample was removed); random_state makes the resampling reproducible.
X_resample, y_resample = RandomUnderSampler(random_state=3).fit_resample(X_train, y_train)
print ( 'The number of transactions after resampling : ' + str(len(X_resample)))
print ('If the number of frauds is equal to the number of normal tansactions? ' + str((y_resample == 0).sum() == (y_resample == 1).sum())
       )

# (a stray, unassigned svm.LinearSVC() call that had no effect was removed here)
svmlinear=svm.LinearSVC()

svmlinear.fit(X_resample, y_resample)
y_pred = svmlinear.predict(X_test)
y_score = svmlinear.decision_function(X_test)
# Confusion matrix & metrics
cm = confusion_matrix(y_test, y_pred)
class_names = [0,1]
plt.figure()
plot_confusion_matrix(cm,
                      classes = class_names,
                      title = 'SVC Confusion matrix')
plt.show()
show_metrics()

## ROC_AUC Curve Score
# AUC is computed from the continuous scores; hard 0/1 predictions reduce the curve to one threshold
print('ROC AUC Score:',metrics.roc_auc_score(y_test, y_score))
# ROC curve
fpr, tpr, t = roc_curve(y_test, y_score)
plot_roc()

# Precision-recall curve
# These two names are read as globals by plot_precision_recall(); note that they rebind
# the sklearn functions imported under the same names.
precision_score, recall_score, thresholds = precision_recall_curve(y_test, y_score)
plot_precision_recall()





In [None]:

## Gaussian Naive Bayes with SMOTE over-sampling
X = df_reduced.drop('Class', axis=1)
y = df_reduced.Class

##training size 0.8
# test_size=0.2 gives the 0.8 training share stated above (test_size=0.8 trained on only 20%).
# The split happens BEFORE resampling and is stratified on the label, so the test set
# holds only real transactions and no synthetic sample leaks into the evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=3, stratify=y)

# Balance the training portion only. fit_resample is the current imbalanced-learn API
# (fit_sample was removed); random_state makes the resampling reproducible.
X_resample, y_resample = SMOTE(random_state=3).fit_resample(X_train, y_train)
print ( 'The number of transactions after resampling : ' + str(len(X_resample)))
print ('If the number of frauds is equal to the number of normal tansactions? ' + str((y_resample == 0).sum() == (y_resample == 1).sum())
       )

Naive_bayes_model=naive_bayes.GaussianNB()
Naive_bayes_model.fit(X_resample, y_resample)
y_pred = Naive_bayes_model.predict(X_test)
# The method name was missing here (a syntax error); the fraud-class probability is the score
y_score = Naive_bayes_model.predict_proba(X_test)[:,1]

# Confusion matrix & metrics
cm = confusion_matrix(y_test, y_pred)
class_names = [0,1]
plt.figure()
plot_confusion_matrix(cm,
                      classes = class_names,
                      title = 'Naive Bayes Confusion matrix')
plt.show()
show_metrics()
## ROC_AUC Curve Score
# AUC is computed from the predicted probabilities; hard 0/1 predictions reduce the curve to one threshold
print('ROC AUC Score:',metrics.roc_auc_score(y_test, y_score))

# ROC curve
fpr, tpr, t = roc_curve(y_test, y_score)
plot_roc()
# Precision-recall curve
# These two names are read as globals by plot_precision_recall(); note that they rebind
# the sklearn functions imported under the same names.
precision_score, recall_score, thresholds = precision_recall_curve(y_test, y_score)
plot_precision_recall()

In [None]:

## Gaussian Naive Bayes with random under-sampling
X = df_reduced.drop('Class', axis=1)
y = df_reduced.Class

##training size 0.8
# test_size=0.2 gives the 0.8 training share stated above (test_size=0.8 trained on only 20%).
# The split happens BEFORE resampling and is stratified on the label, so the model is
# evaluated on real transactions with the original class balance.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=3, stratify=y)

# Balance the training portion only. fit_resample is the current imbalanced-learn API
# (fit_sample was removed); random_state makes the resampling reproducible.
X_resample, y_resample = RandomUnderSampler(random_state=3).fit_resample(X_train, y_train)
print ( 'The number of transactions after resampling : ' + str(len(X_resample)))
print ('If the number of frauds is equal to the number of normal tansactions? ' + str((y_resample == 0).sum() == (y_resample == 1).sum())
       )

Naive_bayes_model=naive_bayes.GaussianNB()
Naive_bayes_model.fit(X_resample, y_resample)
y_pred = Naive_bayes_model.predict(X_test)
# The method name was missing here (a syntax error); the fraud-class probability is the score
y_score = Naive_bayes_model.predict_proba(X_test)[:,1]

# Confusion matrix & metrics
cm = confusion_matrix(y_test, y_pred)
class_names = [0,1]
plt.figure()
plot_confusion_matrix(cm,
                      classes = class_names,
                      title = 'Naive Bayes Confusion matrix')
plt.show()
show_metrics()
## ROC_AUC Curve Score
# AUC is computed from the predicted probabilities; hard 0/1 predictions reduce the curve to one threshold
print('ROC AUC Score:',metrics.roc_auc_score(y_test, y_score))

# ROC curve
fpr, tpr, t = roc_curve(y_test, y_score)
plot_roc()
# Precision-recall curve
# These two names are read as globals by plot_precision_recall(); note that they rebind
# the sklearn functions imported under the same names.
precision_score, recall_score, thresholds = precision_recall_curve(y_test, y_score)
plot_precision_recall()

In [None]:

## Gaussian Naive Bayes with random over-sampling
X = df_reduced.drop('Class', axis=1)
y = df_reduced.Class

##training size 0.8
# test_size=0.2 gives the 0.8 training share stated above (test_size=0.8 trained on only 20%).
# The split happens BEFORE resampling and is stratified on the label, so no duplicated
# minority row can appear in both the training and the test data.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=3, stratify=y)

# Balance the training portion only. fit_resample is the current imbalanced-learn API
# (fit_sample was removed); random_state makes the resampling reproducible.
X_resample, y_resample = RandomOverSampler(random_state=3).fit_resample(X_train, y_train)
print ( 'The number of transactions after resampling : ' + str(len(X_resample)))
print ('If the number of frauds is equal to the number of normal tansactions? ' + str((y_resample == 0).sum() == (y_resample == 1).sum())
       )

Naive_bayes_model=naive_bayes.GaussianNB()
Naive_bayes_model.fit(X_resample, y_resample)
y_pred = Naive_bayes_model.predict(X_test)
# The method name was missing here (a syntax error); the fraud-class probability is the score
y_score = Naive_bayes_model.predict_proba(X_test)[:,1]

# Confusion matrix & metrics
cm = confusion_matrix(y_test, y_pred)
class_names = [0,1]
plt.figure()
plot_confusion_matrix(cm,
                      classes = class_names,
                      title = 'Naive Bayes Confusion matrix')
plt.show()
show_metrics()
## ROC_AUC Curve Score
# Printed like in the other model cells (a bare expression in the middle of a cell is never shown),
# and computed from the predicted probabilities rather than from hard 0/1 predictions.
print('ROC AUC Score:',metrics.roc_auc_score(y_test, y_score))

# ROC curve
fpr, tpr, t = roc_curve(y_test, y_score)
plot_roc()
# Precision-recall curve
# These two names are read as globals by plot_precision_recall(); note that they rebind
# the sklearn functions imported under the same names.
precision_score, recall_score, thresholds = precision_recall_curve(y_test, y_score)
plot_precision_recall()

In [None]:
#### Model Training#####

## Summary for below part: SMOTE sampling with training size 0.8 provides better results.


##As mentioned above, in order to deal with imbalanced between classes in data sampling is needed. 
#Total number of instances in dataset are 284807. 
#Only 492 of instances are fraudulent transactions.  
#As mentioned above, fraudulent instances account for %0.172 of all transactions.  
#In order to solve the imbalanced issue, sampling methods are used. 
#These methods are SMOTE, random-under sampling, and random-over sampling. 
#The kernels above are repeated again by changing the sampling method and the training size.
#For three sampling methods, the training sizes from 50% to 90% are tested. 
#In random-under sampling, the majority class decreases randomly from 284315 to 492, which is the size of the minority class.  
#In random-over sampling and SMOTE, the minority class size increases from 492 to 284315. 
#The total number of instances in the dataset is 568630 after resampling.

#In the logistic regression model with random-over sampling, train size’s change does not highly affect precision, recall, F1 score and AUC values. 
#However, increases in training size result in increases in accuracy metrics.
#In SMOTE sampling, train size change slightly increases precision, recall, F1 score and AUC values. 
#In random-under sampling, both accuracy metrics have better values than over-sampling methods. 
#However, ROC Curve graph is similar to the straight line. 
#This shows that in random-under sampling, the logistic regression classifier’s prediction becomes independent from the label, 
#which is not better than a classifier that guesses randomly. 

#In the Naive-Bayes model with random-over sampling& SMOTE sampling, 
#train size’s change again does not highly affect the accuracy metrics’ results.
#Both over-sampling methods provide worse accuracy results than random-under sampling. 
#However, this is similar to the logistic regression model performance with random-under sampling. 
#The reason is an over-fitted model because of the lost information by under-sampling method. 

#In the random forest model with random-over sampling& SMOTE sampling, train size’s change affects accuracy metrics’ result.
#In random-over sampling, when train size increase from 50% to 80%, precision increases from 0.944 to 1.
#Also, recall and F1 score increase to 1. Moreover, the AUC value increases from 0.93 to 0.99. 
#For sampling methods, SMOTE provides the best performance in terms of accuracy metrics and AUC values which is 0.99 when train size is 50%. Train size’s changes also do not affect performance in SMOTE sampling.

#In the decision tree algorithm with SMOTE sampling, train size change does not affect accuracy metrics’ result. 
#Also, the AUC value does not improve by changing the train size. Only sampling method affects model’s performance. 
#Random-under sampling method decreases the model’s performance. 
#The AUC value is 0.91 in random-under sampling, which is smaller than when the SMOTE sampling method is used.

#In the support vector classifier algorithm with SMOTE sampling, train size’s change does not affect accuracy metrics’ result. 
#Also, the AUC value does not increase, when train size increases from 50% to 80%. 
#Random-over sampling also provides similar results to SMOTE sampling. 
#Like other algorithms, random under-sampling provides worse results in terms of accuracy metrics and AUC results.


In [None]:
### Parameter tuning-grid search

###Following table is obtained by using kernels above and grid search


#In this part, both methodologies and performance with sampling methods’ choices will be compared. 
#As mentioned above, in the EDA part, the following variables are excluded from the data: V1, V2, V5, V6, V7, V8, V13, V14, V15, V16, V17, V18, V20, V21, V22, V23, V24, V25, V26, V27, V28.
#Moreover, in the EDA part, the ‘Time’ feature does not show a significant effect on determining whether a transaction is fraudulent or not. Thus, the ‘Time’ feature is also excluded from the data.
# Since variables between V1-V28 are all transformed and anonymous variables, another PCA transformation is not applied to the dataset again. As suggested in the ‘About Data’ section, Area Under Curve (AUC) is used as a key performance indicator.

#As a data-level solution for imbalanced data, random-over sampling and SMOTE sampling provide similar results, which are far better than random-under sampling.
#Therefore, SMOTE sampling is chosen for further analysis. SMOTE sampling provides slightly better results in logistic regression and random forest. Since SMOTE sampling with logistic regression, decision tree and random forest provides better results, grid search is applied to these three algorithms for parameter tuning.

#According to the results, the Random Forest, Logistic Regression and Decision Tree algorithms provide better AUC values.
#For the Precision metric, Random Forest, Logistic Regression and SVC provide good results. The Support Vector Classifier (SVC) provides a good result for precision; however, its Recall score is 0.608, which is not as satisfactory as that of the other algorithms.

# Best parameters for logistic regression:
#C:100, Weight: Balanced, Penalty: L1

# Best parameters for decision tree:
# Maximum depth:9

# Best parameters for random forest:
#n_estimators = 500, max_features = 3, min_samples_leaf = 1, min_samples_split = 10




In [None]:
# Further research:
#In further research, as a data-level approach, cluster based data approach and informed over sampling may be used to investigate the data.  
#MCC and Kappa can be used for comparing the various algorithms as an evaluation metric. 
#Moreover, ensemble methods such as gradient tree boosting and XG boosting can be used for analyzing data further.


In [None]:
#The following works are used in this study. Since this is my first EDA and machine learning analysis, sorry for any possible errors :)
#I have to thank all these people for their valuable Kaggle work.
#https://github.com/ireneliu521/Credit-Card-Fraud_J2D_Project_Python/blob/master/Credit%20Card%20Fraud%20Detection.ipynb
#https://www.kaggle.com/vincentlugat/votingclassifier-f1-score-0-88-data-viz
#https://www.kaggle.com/kabure/credit-card-fraud-prediction-rf-smote
#https://www.kaggle.com/qianchao/smote-with-imbalance-data