In [30]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix

In [31]:
# Load the payment-fraud dataset from its CSV file into a DataFrame.
# The path is relative, so it resolves against the kernel's working directory.
df = pd.read_csv(filepath_or_buffer='datasets/payment_fraud.csv')

In [32]:
# Show three randomly chosen rows; no seed is passed, so the rows differ per run.
df.sample(n=3)

Unnamed: 0,accountAgeDays,numItems,localTime,paymentMethod,paymentMethodAgeDays,label
14249,2000,1,4.52458,creditcard,0.0,0
753,2000,1,4.876771,paypal,0.001389,0
38932,512,1,5.017904,paypal,0.000694,0


In [33]:
# One-hot encode the categorical 'paymentMethod' column: it is replaced by one
# indicator column per category ('paymentMethod_<value>').
# NOTE: `df` is rebound here, so re-running this cell without first reloading
# the CSV fails because 'paymentMethod' no longer exists in the frame.
df = pd.get_dummies(data=df, columns=['paymentMethod'])

# Preview three random rows of the encoded frame.
df.sample(n=3)

Unnamed: 0,accountAgeDays,numItems,localTime,paymentMethodAgeDays,label,paymentMethod_creditcard,paymentMethod_paypal,paymentMethod_storecredit
10888,2000,1,4.876771,0.00625,0,1,0,0
37531,3,1,4.962055,2.073611,0,1,0,0
26890,420,1,4.921349,0.0,0,1,0,0


In [34]:
# Hold out 33% of the rows for testing. Features are every column except
# 'label'; the target is the 'label' column. The fixed random_state makes the
# shuffle (and therefore the split) reproducible. The split is not stratified.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns='label'),
    df['label'],
    test_size=0.33,
    random_state=17,
)

In [35]:
# Spot-check three random rows of the training features.
X_train.sample(n=3)

Unnamed: 0,accountAgeDays,numItems,localTime,paymentMethodAgeDays,paymentMethod_creditcard,paymentMethod_paypal,paymentMethod_storecredit
5609,2000,1,4.965339,0.005556,1,0,0
30592,2000,1,4.461622,0.010417,1,0,0
10230,997,1,3.575983,0.006944,1,0,0


In [36]:
# Spot-check three random rows of the held-out test features.
X_test.sample(n=3)

Unnamed: 0,accountAgeDays,numItems,localTime,paymentMethodAgeDays,paymentMethod_creditcard,paymentMethod_paypal,paymentMethod_storecredit
28033,248,1,4.836982,63.867361,1,0,0
5996,21,1,4.742303,20.833333,1,0,0
36716,142,1,5.034622,141.965972,1,0,0


In [37]:
# Spot-check three random training labels.
y_train.sample(n=3)

33165    0
34486    0
33880    0
Name: label, dtype: int64

In [38]:
# Spot-check three random held-out test labels.
y_test.sample(n=3)

5933     0
14637    0
33441    0
Name: label, dtype: int64

In [39]:
# Build a logistic-regression classifier with scikit-learn's default
# hyperparameters, then fit it on the training split.
clf = LogisticRegression()
clf.fit(X_train, y_train)

# Predict a label for every row of the held-out test features.
y_pred = clf.predict(X_test)



In [40]:
# Fraction of test rows whose predicted label equals the ground-truth label.
# Arguments follow scikit-learn's documented (y_true, y_pred) order, matching
# the confusion_matrix call in this notebook. Accuracy is symmetric in its two
# arguments, so the reported value is unchanged.
accuracy_score(y_test, y_pred)

0.999922738159623

In [41]:
# Confusion matrix on the test split: rows are true labels, columns are
# predicted labels (scikit-learn convention).
print(confusion_matrix(y_true=y_test, y_pred=y_pred))

[[12753     0]
 [    1   189]]
