# Fraud detection

Original: https://github.com/oreilly-mlsec/book-resources/blob/master/chapter2/logistic-regression-fraud-detection.ipynb

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix

In [3]:
# Load the payment records from the CSV file into a DataFrame
csv_path = 'payment_fraud.csv'
df = pd.read_csv(csv_path)

In [7]:
# Preview the first rows of the loaded data
preview_rows = 30
df.head(preview_rows)

Unnamed: 0,accountAgeDays,numItems,localTime,paymentMethod,paymentMethodAgeDays,label
0,29,1,4.745402,paypal,28.204861,0
1,725,1,4.742303,storecredit,0.0,0
2,845,1,4.921318,creditcard,0.0,0
3,503,1,4.886641,creditcard,0.0,0
4,2000,1,5.040929,creditcard,0.0,0
5,119,1,4.962055,paypal,0.0,0
6,2000,1,4.921349,paypal,0.0,0
7,371,1,4.876771,creditcard,0.0,0
8,2000,1,4.748314,creditcard,0.0,0
9,4,1,4.461622,creditcard,0.0,0


In [13]:
# Summary statistics (count, mean, std, quartiles) for the numeric columns
summary_stats = df.describe()
summary_stats

Unnamed: 0,accountAgeDays,numItems,localTime,paymentMethodAgeDays,label
count,39221.0,39221.0,39221.0,39221.0,39221.0
mean,857.563984,1.084751,4.748232,122.641326,0.014278
std,804.788212,0.566899,0.38936,283.569177,0.118636
min,1.0,1.0,0.421214,0.0,0.0
25%,72.0,1.0,4.742303,0.0,0.0
50%,603.0,1.0,4.886641,0.0125,0.0
75%,1804.0,1.0,4.962055,87.510417,0.0
max,2000.0,29.0,5.040929,1999.580556,1.0


In [15]:
# Count how many rows carry each label value
label_counts = df['label'].value_counts()
label_counts

0    38661
1      560
Name: label, dtype: int64

In [16]:
# Convert the categorical paymentMethod feature into one-hot dummy columns.
# `df` is reassigned here, so the guard keeps this cell safe to re-run: once
# the column has been encoded it no longer exists, and calling get_dummies
# again with columns=['paymentMethod'] would raise a KeyError.
if 'paymentMethod' in df.columns:
    df = pd.get_dummies(df, columns=['paymentMethod'])

# Fixed seed so the displayed sample rows are the same on every run
df.sample(3, random_state=17)

Unnamed: 0,accountAgeDays,numItems,localTime,paymentMethodAgeDays,label,paymentMethod_creditcard,paymentMethod_paypal,paymentMethod_storecredit
20546,2000,1,4.886641,0.0,0,0,1,0
5768,1534,1,4.886641,0.0,0,0,1,0
21485,1065,1,5.017904,0.06875,0,1,0,0


In [59]:
# Separate the feature columns from the target column, then hold out part of
# the rows as a test set.
# Experiment: change test_size to 0.33, 0.1 or 0.9996 and re-run the cells
# below to see how the amount of training data affects the results.
features = df.drop(columns='label')
labels = df['label']
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.1, random_state=17)

In [60]:
# Create the logistic regression classifier with default settings
clf = LogisticRegression()

# Fit it on the training split
clf.fit(X_train, y_train)

# Predict labels for the held-out test split
y_pred = clf.predict(X_test)



In [61]:
# Compare test set predictions with ground truth labels.
# scikit-learn metrics take (y_true, y_pred) in that order. Accuracy happens
# to be symmetric, but keeping the order consistent avoids silently wrong
# results if this line is adapted to an asymmetric metric.
print(accuracy_score(y_test, y_pred))

# Confusion matrix: rows are true labels, columns are predicted labels.
# NOTE: the fraud class is rare in this data, so accuracy alone can look high
# even for a weak model; read it together with the confusion matrix.
print(confusion_matrix(y_test, y_pred))

1.0
[[3873    0]
 [   0   50]]
