In [1]:
import pandas as pd
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier as dc
from sklearn.svm import SVC as svm
from sklearn.linear_model import LogisticRegression as lr
from sklearn.naive_bayes import GaussianNB as nb


from sklearn.metrics import confusion_matrix as cm
from sklearn.metrics import recall_score as recall
from sklearn.metrics import precision_score as precision
from sklearn.metrics import accuracy_score as acc

In [2]:
creditcard = pd.read_csv('creditcard.csv', encoding='ISO-8859-1')

In [3]:
frauds = creditcard[creditcard.Class == 1]

#Downsampling the normal credit card transactions
normals = creditcard[creditcard.Class == 0]

#upsampling the fraus credit card transactions
sets = frauds
for i in range(0, int(len(normals)/len(frauds))):
    sets = frauds.append(sets)

#shuffling the multipled frauds data now
sets = sets[0:len(normals)]

In [4]:
normals = normals.append(sets)
zeros = normals[normals.Amount==0]
ones = normals[normals.Amount == 1]

normals = normals[(normals.Amount!=0) & (normals.Amount!=1)]
#normals = normals[normals.Amount >= 1]
normals = normals.append(zeros[:1400]).append(ones[0:10000])
ones = ones[10000:].append(zeros[1400:])

In [5]:
rand=42
#shuffling the balanced data now
normals = shuffle(normals, random_state = rand)
'''
splitted the data set into features(x) and targets(y)
'''
y = normals['Class']
del normals['Class']
x = normals
#splitted the x&y dataframes into training and test datasets
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size = 0.1, random_state = rand)


## Decision Tree

In [6]:
clf = dc(random_state= rand)
clf.fit(x_train, y_train)
features = normals.columns
scores = clf.feature_importances_

y_result = clf.predict(x_test)


print(cm(y_test, y_result))
#recall score
print(recall(y_test, y_result))
#precision score
print(round(precision(y_test, y_result),3))
#accuracy score
print(round(acc(y_test, y_result),3))
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
fpr, tpr, thresholds = roc_curve(y_test, y_result)
print(auc(fpr, tpr))

[[27903     9]
 [    0 20463]]
1.0
1.0
1.0
0.99983877902


Observações:
- Treinamento super rápido
- Acurácia superior à 98%

## SVM

In [None]:
clf = svm(random_state= rand)
clf.fit(x_train, y_train)
features = normals.columns

y_result = clf.predict(x_test)


print(cm(y_test, y_result))
#recall score
print(recall(y_test, y_result))
#precision score
print(round(precision(y_test, y_result),3))
#accuracy score
print(round(acc(y_test, y_result),3))
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
fpr, tpr, thresholds = roc_curve(y_test, y_result)
print(auc(fpr, tpr))

## Logistic Regression

In [7]:
clf = lr(random_state= rand)
clf.fit(x_train, y_train)
features = normals.columns

y_result = clf.predict(x_test)


print(cm(y_test, y_result))
#recall score
print(recall(y_test, y_result))
#precision score
print(round(precision(y_test, y_result),3))
#accuracy score
print(round(acc(y_test, y_result),3))
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
fpr, tpr, thresholds = roc_curve(y_test, y_result)
print(auc(fpr, tpr))

[[27448   464]
 [ 3084 17379]]
0.849288960563
0.974
0.927
0.916332643079


## Nayve Bayes

In [9]:
clf = nb()
clf.fit(x_train, y_train)
features = normals.columns

y_result = clf.predict(x_test)


print(cm(y_test, y_result))
#recall score
print(recall(y_test, y_result))
#precision score
print(round(precision(y_test, y_result),3))
#accuracy score
print(round(acc(y_test, y_result),3))
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
fpr, tpr, thresholds = roc_curve(y_test, y_result)
print(auc(fpr, tpr))

[[27632   280]
 [ 5161 15302]]
0.747788691785
0.982
0.888
0.868878582063


Comentários:
    - Absurdamente mais rápido
    - Acurária não tão boa