In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix

In [2]:
# Load the payment-fraud dataset into a DataFrame.
# The path is relative, so the notebook must be run from the directory that
# contains the `datasets/` folder.
df = pd.read_csv("datasets/payment_fraud.csv")

In [3]:
# Preview three randomly chosen rows of the raw data (unseeded, so the rows
# shown change on every run).
df.sample(n=3)

Unnamed: 0,accountAgeDays,numItems,localTime,paymentMethod,paymentMethodAgeDays,label
34018,2000,1,5.034622,creditcard,521.127083,0
34335,610,1,4.886641,creditcard,0.0,0
21421,2,1,4.745402,creditcard,0.000694,0


In [4]:
# One-hot encode the categorical 'paymentMethod' column: it is replaced by one
# indicator column per category, named paymentMethod_<category>.
# NOTE(review): this rebinds `df`, so the cell is not idempotent; re-running it
# without first re-running the load cell is expected to fail because the
# 'paymentMethod' column no longer exists.
df = pd.get_dummies(data=df, columns=['paymentMethod'])

# Preview three random rows of the encoded frame.
df.sample(n=3)

Unnamed: 0,accountAgeDays,numItems,localTime,paymentMethodAgeDays,label,paymentMethod_creditcard,paymentMethod_paypal,paymentMethod_storecredit
14191,5,1,4.876771,0.000694,0,1,0,0
2501,69,1,4.742303,0.0,0,1,0,0
31437,1491,1,4.921349,128.216667,0,0,1,0


In [5]:
# Partition the data into training and test sets.
#   - features: every column except 'label'
#   - target:   the 'label' column
# 33% of the rows are held out for testing; the fixed random_state makes the
# split reproducible across runs.
# NOTE(review): the split is not stratified on the label; consider whether
# `stratify=df['label']` is wanted if the classes are imbalanced.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns='label'),
    df['label'],
    test_size=0.33,
    random_state=17,
)

In [6]:
# Spot-check three random rows of the training features.
X_train.sample(n=3)

Unnamed: 0,accountAgeDays,numItems,localTime,paymentMethodAgeDays,paymentMethod_creditcard,paymentMethod_paypal,paymentMethod_storecredit
33864,1217,1,4.742303,0.0,1,0,0
7123,1601,1,4.886641,0.045833,1,0,0
7081,2000,1,4.921318,248.470833,0,1,0


In [7]:
# Spot-check three random rows of the test features.
X_test.sample(n=3)

Unnamed: 0,accountAgeDays,numItems,localTime,paymentMethodAgeDays,paymentMethod_creditcard,paymentMethod_paypal,paymentMethod_storecredit
8744,130,1,5.040929,128.990972,0,1,0
21697,2,1,5.034622,1.899306,1,0,0
29727,87,1,4.895263,0.529167,0,1,0


In [8]:
# Spot-check three random training labels.
y_train.sample(n=3)

10940    0
13550    0
29611    0
Name: label, dtype: int64

In [9]:
# Spot-check three random test labels.
y_test.sample(n=3)

17795    0
8705     0
8916     0
Name: label, dtype: int64

In [10]:
# Build a logistic-regression classifier with scikit-learn's default settings,
# then fit it on the training split.
clf = LogisticRegression()
clf.fit(X_train, y_train)

# Predict a class label for every row of the held-out test features.
y_pred = clf.predict(X_test)



In [11]:
# Fraction of test-set predictions that exactly match the ground-truth labels.
# scikit-learn metrics take their arguments as (y_true, y_pred). Accuracy is
# symmetric, so the value is the same either way, but the conventional order
# matches the confusion_matrix call and stays correct if this metric is later
# replaced by an asymmetric one (e.g. precision or recall).
accuracy_score(y_test, y_pred)

0.999922738159623

In [12]:
# Confusion matrix for the test set: rows are the true classes, columns are the
# predicted classes.
print(confusion_matrix(y_true=y_test, y_pred=y_pred))

[[12753     0]
 [    1   189]]
