# Fraud detection

Original: 
* https://github.com/oreilly-mlsec/book-resources/blob/master/chapter2/logistic-regression-fraud-detection.ipynb
* https://github.com/oreilly-mlsec/book-resources/blob/master/chapter2/select-from-model-nslkdd.ipynb

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix

In [2]:
# Read in the data from the CSV file
df = pd.read_csv('payment_fraud.csv')

In [3]:
df.head(30)

Unnamed: 0,accountAgeDays,numItems,localTime,paymentMethod,paymentMethodAgeDays,label
0,29,1,4.745402,paypal,28.204861,0
1,725,1,4.742303,storecredit,0.0,0
2,845,1,4.921318,creditcard,0.0,0
3,503,1,4.886641,creditcard,0.0,0
4,2000,1,5.040929,creditcard,0.0,0
5,119,1,4.962055,paypal,0.0,0
6,2000,1,4.921349,paypal,0.0,0
7,371,1,4.876771,creditcard,0.0,0
8,2000,1,4.748314,creditcard,0.0,0
9,4,1,4.461622,creditcard,0.0,0


In [4]:
df.loc[df['label'] == 1].head(30)

Unnamed: 0,accountAgeDays,numItems,localTime,paymentMethod,paymentMethodAgeDays,label
109,1,4,4.836982,creditcard,0.0,1
243,1,1,4.836982,paypal,0.002778,1
280,1,1,4.921318,creditcard,0.0,1
362,1,1,4.52458,paypal,0.0,1
420,1,1,4.748314,paypal,0.0,1
422,1,1,4.745402,creditcard,0.003472,1
461,1,1,4.921318,creditcard,0.0,1
504,1,1,4.921349,creditcard,0.007639,1
578,1,2,4.505662,creditcard,0.0,1
648,1,1,4.895263,paypal,0.007639,1


In [5]:
df.describe()

Unnamed: 0,accountAgeDays,numItems,localTime,paymentMethodAgeDays,label
count,39221.0,39221.0,39221.0,39221.0,39221.0
mean,857.563984,1.084751,4.748232,122.641326,0.014278
std,804.788212,0.566899,0.38936,283.569177,0.118636
min,1.0,1.0,0.421214,0.0,0.0
25%,72.0,1.0,4.742303,0.0,0.0
50%,603.0,1.0,4.886641,0.0125,0.0
75%,1804.0,1.0,4.962055,87.510417,0.0
max,2000.0,29.0,5.040929,1999.580556,1.0


In [6]:
df['label'].value_counts()

0    38661
1      560
Name: label, dtype: int64

In [7]:
# Convert categorical feature into dummy variables with one-hot encoding
df = pd.get_dummies(df, columns=['paymentMethod'])
df.sample(3)

Unnamed: 0,accountAgeDays,numItems,localTime,paymentMethodAgeDays,label,paymentMethod_creditcard,paymentMethod_paypal,paymentMethod_storecredit
4692,39,1,4.962055,0.001389,0,0,1,0
5279,14,1,4.745402,0.0,0,1,0,0
10430,2000,1,4.962055,1010.061111,0,1,0,0


In [8]:
# Split dataset up into train and test sets
# Try 0.33, 0.1, 0.9996
X_train, X_test, y_train, y_test = train_test_split(
    df.drop('label', axis=1), df['label'],
    test_size=0.1, random_state=17)

In [9]:
# Initialize and train classifier model
clf = LogisticRegression().fit(X_train, y_train)

# Make predictions on test set
y_pred = clf.predict(X_test)



In [10]:
# Compare test set predictions with ground truth labels
print(accuracy_score(y_pred, y_test))
print(confusion_matrix(y_test, y_pred))

1.0
[[3873    0]
 [   0   50]]


In [11]:
import numpy as np
from sklearn.feature_selection import SelectFromModel
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier(random_state=0)
clf.fit(X_train, y_train)
sfm = SelectFromModel(clf, prefit=True)

In [12]:
X_train_new = sfm.transform(X_train)
print("Original num features: {}, selected num features: {}"
      .format(X_train.shape[1], X_train_new.shape[1]))

Original num features: 7, selected num features: 1


In [13]:
indices = np.argsort(clf.feature_importances_)[::-1]

In [14]:
for idx, i in enumerate(indices):
    print("{}.\t{} - {}".format(idx, X_train.columns[i], clf.feature_importances_[i]))

0.	accountAgeDays - 1.0
1.	paymentMethod_storecredit - 0.0
2.	paymentMethod_paypal - 0.0
3.	paymentMethod_creditcard - 0.0
4.	paymentMethodAgeDays - 0.0
5.	localTime - 0.0
6.	numItems - 0.0
