##  Credit Card Fraud Detection

### Exploratory Data Analysis (EDA)

In [None]:
# Import 

import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import numpy as np


from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler

%matplotlib inline 

RANDOM_SEED = 33

plt.style.use('bmh')

In [None]:
# Begin the analysis by inspecting the raw data that is available.

import os

print(os.listdir("../input"))

df = pd.read_csv("../input/creditcard.csv")
df.head()

In [None]:
# Print pandas' summary of the loaded frame.
df.info()

In [None]:
# Drop the column this analysis treats as unnecessary ('Time') and preview
# the resulting frame.
df2 = df.drop(columns=['Time'])
df2.head()

In [None]:
# Distribution of the transaction amount for each class.
# `density=True` replaces the deprecated `normed=True` argument; each call
# normalizes its own histogram, so the two classes can be compared despite
# the very different number of records.

bins=80
plt.figure(figsize=(20,4))
# Plot the Amount column. Plotting the Class label itself (as before) is
# constant within each subset and collapses into a single bar.
plt.hist(df2.Amount[df2.Class==1],bins=bins,density=True,alpha=0.8,label='Fraud',color='red')
plt.hist(df2.Amount[df2.Class==0],bins=bins,density=True,alpha=0.8,label='Not Fraud',color='blue')
plt.legend(loc='upper right')
plt.xlabel('Valor')
plt.ylabel('% de Registros')
plt.title('Transacciones vs Valor')
plt.show()

In [None]:
# Show how many records belong to each class, as text and as a bar chart.
print(df2.Class.value_counts())
sns.countplot(data=df2, x='Class')

plt.xlabel("Class")
plt.title("Fraud class histogram")
plt.ylabel("Frequency")

 The dataset (df2) is heavily imbalanced.

### Correlation of variables

In [None]:
# Build the correlation matrix and display only the strongly correlated pairs
# (coefficient >= 0.5 or <= -0.4); every other cell is masked out as NaN.

corr_base = df2.corr()
plt.figure(figsize=(12, 10))

sns.heatmap(
    corr_base.where((corr_base <= -0.4) | (corr_base >= 0.5)),
    vmin=-1.0, vmax=1.0, cmap='viridis', linewidths=0.1,
    square=True, annot=True, annot_kws={"size": 8},
);

In [None]:
# Separate the label (Class = 1: fraud, Class = 0: no fraud) from the
# remaining variables, which are studied next.
x = df2.drop(columns=['Class'])
y = df2['Class']

In [None]:
# PCA: fit 29 whitened principal components on the feature frame and keep the
# projected features in a new DataFrame.

from sklearn.decomposition import PCA as sklearnPCA

sklearn_pca = sklearnPCA(whiten=True, n_components=29).fit(x)
features_pca = pd.DataFrame(sklearn_pca.transform(x))

In [None]:
# Bar chart of the variance explained by each principal component.
# The number of bars is read from the fitted PCA instead of being hard-coded,
# so the bar positions and heights always have matching lengths.
n_dim = len(sklearn_pca.explained_variance_)
plt.figure(figsize=(12, 5))
rects1 = plt.bar(np.arange(n_dim),sklearn_pca.explained_variance_, color='r')
plt.xlabel('Principal component')
plt.ylabel('Explained variance')
plt.title('Explained variance per principal component')
print(sklearn_pca.explained_variance_)

In [None]:
# Violin plots of the features, split by class, in three groups of up to ten
# columns each. Generating these graphs takes some computing time.

# Min-max scale every feature so all violins share the same [0, 1] axis.
x_scaled=(x-x.min())/(x.max()-x.min())
sub_df1=pd.concat([y,x_scaled.iloc[:,0:10]],axis=1)
sub_df2=pd.concat([y,x_scaled.iloc[:,10:20]],axis=1)
sub_df3=pd.concat([y,x_scaled.iloc[:,20:30]],axis=1)

# Long format: one row per (record, variable) pair, as seaborn expects.
sub_df11=pd.melt(sub_df1,id_vars="Class",var_name="Variable",value_name='Valor')
sub_df22=pd.melt(sub_df2,id_vars="Class",var_name="Variable",value_name='Valor')
sub_df33=pd.melt(sub_df3,id_vars="Class",var_name="Variable",value_name='Valor')

# One figure per group. The previous version opened a fourth figure after the
# last plot, which rendered as an empty figure in the cell output.
for melted in (sub_df11, sub_df22, sub_df33):
    plt.figure(figsize=(20,8))
    sns.violinplot(x="Variable",y="Valor",hue="Class",data=melted, split=True)



### DataSet balancing and data normalization


At this point the dataset contains only the variables that, a priori, may carry some information about fraud. Now let's look at how the records are distributed across the labels.

As you will see below, a common problem in operational analytics is that the dataset is imbalanced: there are many more samples of one class (label) than of the other.

This is especially important in anomaly detection and fraud detection, where there are many more legitimate transactions than fraudulent ones.

In [None]:
# Share of each class in the full dataset, shown as a pie chart.
# `Series.value_counts()` is used because the top-level `pd.value_counts`
# function is deprecated in recent pandas releases.
count_classes = df2['Class'].value_counts(sort = True).sort_index()
labels = 'Fraud', 'Not Fraud'
# Fractions are listed in the same order as `labels`: fraud (1) first.
sizes = [count_classes[1]/(count_classes[1]+count_classes[0]), count_classes[0]/(count_classes[1]+count_classes[0])]
explode = (0, 0.5,)
colors = ['red', 'lightblue']
fig1, ax1 = plt.subplots()
ax1.pie(sizes, explode=explode, colors=colors, labels=labels, autopct='%1.1f%%',
        shadow=True, startangle=45)
ax1.axis('equal')  # Equal aspect ratio so the pie is drawn as a circle.
plt.title("Distribution of the Dataset in labeled classes")
plt.show()

The dataset is very imbalanced. There are techniques to improve its balance, but the most important thing is to take this imbalance into account when analyzing the results of the models we apply. We will come back to this when we analyze the results obtained.

In [None]:
# Display the dimensions of df2.
df2.shape

#### DataSet Normalization

We will look for the columns whose minimum value is less than -1 and whose maximum value is greater than 1. 
To do this we rely on the DataFrame method `describe`, which gives us this information and more.

In [None]:
# Summary statistics with one row per column (after transposing); keep the
# columns whose values extend below -1 and above 1.
tt = df2.describe().T
tt[(tt['min'] < -1) & (tt['max'] > 1)]


In [None]:
# Histogram of the transaction amounts, using 50 bins.
plt.figure(figsize=(20, 8))
plt.hist(df2['Amount'], bins=50)

In [None]:
# Every feature column is normalized: the anonymized components V1..V28
# followed by the transaction amount.

columns_to_norm = ['V' + str(idx) for idx in range(1, 29)]
columns_to_norm.append('Amount')

In [None]:
from sklearn import preprocessing

# Rescale every listed column with a min-max scaler, overwriting the columns
# of df2 in place.
# NOTE(review): the scaler is fitted on the whole of df2, i.e. before the
# train/test split performed later in the notebook, so test-set statistics
# influence the scaling. Consider fitting on the training split only.
min_max_scaler = preprocessing.MinMaxScaler()
min_max_scaler.fit(df2[columns_to_norm])
df2[columns_to_norm] = min_max_scaler.transform(df2[columns_to_norm])

In [None]:
# Repeat the range check after scaling: list any column whose values still
# extend below -1 and above 1.
tt = df2.describe().T
tt[(tt['min'] < -1) & (tt['max'] > 1)]

There are no more variables to normalize, all are in the expected ranges.


In [None]:
# Helper used by the modelling cells to visualize confusion matrices.

from sklearn.metrics import confusion_matrix, classification_report, auc, precision_recall_curve, roc_curve
def plot_confusion_matrix(y_test, pred):
    """Plot raw and row-normalized confusion matrices and print a report.

    Parameters
    ----------
    y_test : array-like
        True binary labels, with 0 for legitimate and 1 for fraud.
    pred : array-like
        Predicted labels for the same samples, in the same order.

    Returns
    -------
    None
        Two heatmaps are drawn side by side (absolute counts on the left,
        counts divided by the number of true samples of each class on the
        right) and scikit-learn's classification report is printed.
    """
    # Pin the label order so the matrix is always 2x2, even when one class is
    # absent from the inputs. The previous per-class `value_counts()` lookup
    # raised a KeyError in that case and required a pandas Series.
    cfn_matrix = confusion_matrix(y_test, pred, labels=[0, 1])

    # Each row holds the samples of one true class, so the row total equals
    # the number of true samples of that class. Adding the boolean mask turns
    # a zero total into 1, which keeps an empty row at 0 instead of dividing
    # by zero.
    row_totals = cfn_matrix.sum(axis=1, keepdims=True)
    norm_cfn_matrix = cfn_matrix / (row_totals + (row_totals == 0))

    fig = plt.figure(figsize=(12,5))

    # Left panel: absolute counts. `fmt='d'` prints them as plain integers
    # rather than in scientific notation.
    ax = fig.add_subplot(1,2,1)
    sns.heatmap(cfn_matrix,cmap='coolwarm_r',linewidths=0.5,annot=True,fmt='d',ax=ax)
    ax.set_title('Confusion matrix')
    ax.set_ylabel('True label')
    ax.set_xlabel('Predicted label')

    # Right panel: per-class rates, printed with four decimals so that very
    # small error rates remain visible.
    ax = fig.add_subplot(1,2,2)
    sns.heatmap(norm_cfn_matrix,cmap='coolwarm_r',linewidths=0.5,annot=True,fmt='.4f',ax=ax)
    ax.set_title('Standardized Confusion matrix')
    ax.set_ylabel('True label')
    ax.set_xlabel('Predicted label')
    plt.show()

    print('---Report de classifition---')
    print(classification_report(y_test,pred))

#### Data preparation for learning and launch of Model

The first step is to separate the data into training and test sets. In this exercise a simple approach is used: 80% of the data for training and 20% for testing. More advanced methods can be applied, such as adding a validation set or a greater randomization of the data, but this approach is sufficient for this case.

It is necessary to separate the data into variables and labels.


In [None]:
from sklearn.model_selection import train_test_split

# 80/20 split of the whole frame; features and labels are separated afterwards.
# The split is stratified on the label because fraud cases are so rare that a
# purely random split can leave the test set with a noticeably different
# (or very small) share of them.
X_train, X_test = train_test_split(df2, test_size=0.2, random_state=RANDOM_SEED,
                                   stratify=df2['Class'])
Y_train = X_train['Class']
X_train = X_train.drop(['Class'], axis=1)
Y_test = X_test['Class']
X_test = X_test.drop(['Class'], axis=1)

Let's start with a linear model: a regularized linear classifier trained with Stochastic Gradient Descent (SGD). It is a model that is strong enough for a first test.


In [None]:
from sklearn import metrics

# Baseline: linear classifier (hinge loss, L2 penalty) trained with SGD on the
# full, imbalanced training split. `tol=None` disables early stopping, so
# training runs for exactly `max_iter` epochs.
# The former `n_iter=None` argument is no longer passed: it was removed from
# SGDClassifier and raises a TypeError on current scikit-learn versions.
sgd_clf=SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=5,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=42, shuffle=True,
       tol=None, verbose=0, warm_start=False)

sgd_clf.fit(X_train, Y_train)
Y_train_predicted=sgd_clf.predict(X_train)
Y_test_predicted=sgd_clf.predict(X_test)

# Evaluate on the held-out test split.
plot_confusion_matrix(Y_test, Y_test_predicted)

#### Balance the dataset (undersampling)

A strategy to follow is to reduce the number of legitimate (non-fraud) samples in order to balance the dataset. Since it is not an especially large dataset, this operation must be done with care, because drastically reducing the number of samples will penalize the model by not leaving enough data for its learning.

In [None]:
from sklearn.utils import shuffle

# Rebuild a single training frame and separate it by class.
Train_Data= pd.concat([X_train, Y_train], axis=1)
X_1 =Train_Data[ Train_Data["Class"]==1 ]
X_0=Train_Data[Train_Data["Class"]==0]

# Shuffle each class reproducibly before subsampling.
X_0=shuffle(X_0,random_state=42).reset_index(drop=True)
X_1=shuffle(X_1,random_state=42).reset_index(drop=True)

# Number of non-fraud samples kept per fraud sample.
ALPHA=1.15

# Undersample the majority class: keep all fraud rows and only
# round(len(X_1) * ALPHA) non-fraud rows.
X_0=X_0.iloc[:round(len(X_1)*ALPHA),:]
data_d=pd.concat([X_1, X_0])

# `Series.value_counts()` is used because the top-level `pd.value_counts`
# function is deprecated in recent pandas releases.
count_classes = data_d['Class'].value_counts(sort = True).sort_index()
labels = 'Fraud', 'Not Fraud'
sizes = [count_classes[1]/(count_classes[1]+count_classes[0]), count_classes[0]/(count_classes[1]+count_classes[0])]
explode = (0, 0.05,)
colors = ['red', 'lightblue']
fig1, ax1 = plt.subplots()
ax1.pie(sizes, explode=explode, colors=colors, labels=labels, autopct='%1.1f%%',
        shadow=True, startangle=45)
ax1.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.
# The title previously contained a mis-encoded character in place of the
# accented "ó".
plt.title("Distribución del dataset en clases")
plt.show()

In [None]:
# Preview the first rows of the undersampled training frame.
data_d.head()

In [None]:
# Display the dimensions of the undersampled training frame.
data_d.shape

In [None]:
# Convert the dataframe to a matrix (array).
# NOTE(review): `dataset` is not referenced again in the visible cells;
# confirm whether it is still needed.
dataset=data_d.values

In [None]:
# Split the undersampled frame into features (X_d) and labels (Y_d).
X_d = data_d.drop(columns=['Class'])
Y_d = data_d['Class']

In [None]:
# Same linear SGD classifier as the baseline, now trained on the undersampled
# (balanced) data and evaluated on the original test split.
# The former `n_iter=None` argument is no longer passed: it was removed from
# SGDClassifier and raises a TypeError on current scikit-learn versions.
sgd_clf_d=SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=5,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=42, shuffle=True,
       tol=None, verbose=0, warm_start=False)

sgd_clf_d.fit(X_d, Y_d)
Y_test_predicted=sgd_clf_d.predict(X_test)

plot_confusion_matrix(Y_test, Y_test_predicted)






#### Next we train a more complex and powerful model, RandomForest, on the same balanced dataset and analyze the results.

In [None]:
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import cross_val_score
from sklearn.datasets import make_blobs
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier

# Train a 100-tree random forest on the undersampled data, using all
# available cores, then score it on the original test split.
rf = RandomForestClassifier(
    n_jobs=-1,
    random_state=0,
    max_depth=None,
    n_estimators=100,
).fit(X_d, Y_d)
Y_test_predicted = rf.predict(X_test)

plot_confusion_matrix(Y_test, Y_test_predicted)

### This is my first kernel. If you liked it, I would appreciate an upvote.