In [2]:
# example of evaluating a decision tree with random undersampling
from numpy import mean
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from imblearn.pipeline import Pipeline
from imblearn.under_sampling import RandomUnderSampler
from sklearn import neighbors
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

In [3]:
df = pd.read_csv("data\processed_dataset1.csv")
df.columns

Index(['Unnamed: 0', 'trans_date_trans_time', 'cc_num', 'merchant', 'category',
       'amt', 'first', 'last', 'gender', 'street', 'city', 'state', 'zip',
       'lat', 'long', 'city_pop', 'job', 'dob', 'trans_num', 'unix_time',
       'merch_lat', 'merch_long', 'is_fraud', 'Time', 'Recorded_times',
       'Categorized_times', 'Categorized_trasaction_purpose'],
      dtype='object')

In [4]:
# Logistic Regression
# define dataset
X = np.array(df[['lat','long']].to_numpy()) 
y = df["is_fraud"]

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(X,y, 
                                     train_size = 0.7, 
                                     test_size = 0.3, 
                                     random_state = 70)

# define pipeline
steps = [('under', RandomUnderSampler()), ('model', LogisticRegression())]
pipeline = Pipeline(steps=steps)
clf = pipeline.fit(X,y)

# evaluate pipeline
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
scores = cross_val_score(pipeline, X, y, scoring='f1_micro', cv=cv, n_jobs=-1)
score = mean(scores)
print('F1 Score: %.3f' % score)
print()
#evaluation on training data
y_pred = clf.predict(X_train)

print("Training dataset:")
print(confusion_matrix(y_train, y_pred))
print(classification_report(y_train, y_pred))
print()

#evaluation on test data
print("Testing dataset:")
y_pred = clf.predict(X_test)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

F1 Score: 0.472

Training dataset:
[[172178 215315]
 [   654    856]]
              precision    recall  f1-score   support

           0       1.00      0.44      0.61    387493
           1       0.00      0.57      0.01      1510

    accuracy                           0.44    389003
   macro avg       0.50      0.51      0.31    389003
weighted avg       0.99      0.44      0.61    389003


Testing dataset:
[[74119 91962]
 [  286   349]]
              precision    recall  f1-score   support

           0       1.00      0.45      0.62    166081
           1       0.00      0.55      0.01       635

    accuracy                           0.45    166716
   macro avg       0.50      0.50      0.31    166716
weighted avg       0.99      0.45      0.61    166716



In [27]:
# Decision Tree
# define dataset
# X = np.array(df[['cc_num','amt','zip','lat','long','city_pop','merch_lat','merch_long']].to_numpy()) # F1 = 0.862
# X = np.array(df[['amt','lat','long','city_pop','merch_lat','merch_long']].to_numpy()) # F1 = 0.862
X = np.array(df[['lat','long']].to_numpy()) # F1 = 0.770
y = df["is_fraud"]

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(X,y, 
                                     train_size = 0.7, 
                                     test_size = 0.3, 
                                     random_state = 70)

# define pipeline
steps = [('under', RandomUnderSampler()), ('model', DecisionTreeClassifier())]
pipeline = Pipeline(steps=steps)
clf = pipeline.fit(X,y)

# evaluate pipeline
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
scores = cross_val_score(pipeline, X, y, scoring='f1_micro', cv=cv, n_jobs=-1)
score = mean(scores)
print('F1 Score: %.3f' % score)
print()
#evaluation on training data
y_pred = clf.predict(X_train)

print("Training dataset:")
print(confusion_matrix(y_train, y_pred))
print(classification_report(y_train, y_pred))
print()

#evaluation on test data
print("Testing dataset:")
y_pred = clf.predict(X_test)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

F1 Score: 0.771

Training dataset:
[[299103  88390]
 [    21   1489]]
              precision    recall  f1-score   support

           0       1.00      0.77      0.87    387493
           1       0.02      0.99      0.03      1510

    accuracy                           0.77    389003
   macro avg       0.51      0.88      0.45    389003
weighted avg       1.00      0.77      0.87    389003


Testing dataset:
[[127985  38096]
 [     8    627]]
              precision    recall  f1-score   support

           0       1.00      0.77      0.87    166081
           1       0.02      0.99      0.03       635

    accuracy                           0.77    166716
   macro avg       0.51      0.88      0.45    166716
weighted avg       1.00      0.77      0.87    166716



In [28]:
# KNN
X = np.array(df[['lat','long']].to_numpy()) # F1 = 0.770
y = df["is_fraud"]

# Train Test split 
X_train, X_test, y_train, y_test = train_test_split(X,y, 
                                     train_size = 0.7, 
                                     test_size = 0.3, 
                                     random_state = 70)

# define pipeline
steps = [('under', RandomUnderSampler()), ('model', neighbors.KNeighborsClassifier())]
pipeline = Pipeline(steps=steps)
clf = pipeline.fit(X,y)

# evaluate pipeline
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
scores = cross_val_score(pipeline, X, y, scoring='f1_micro', cv=cv, n_jobs=-1)
score = mean(scores)
print('F1 Score: %.3f' % score)
print()

#evaluation on training data

y_pred = clf.predict(X_train)

print("Training dataset:")
print(confusion_matrix(y_train, y_pred))
print(classification_report(y_train, y_pred))
print()

#evaluation on test data

y_pred = clf.predict(X_test)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

F1 Score: 0.777

Training dataset:
[[307097  80396]
 [   372   1138]]
              precision    recall  f1-score   support

           0       1.00      0.79      0.88    387493
           1       0.01      0.75      0.03      1510

    accuracy                           0.79    389003
   macro avg       0.51      0.77      0.46    389003
weighted avg       0.99      0.79      0.88    389003


[[131536  34545]
 [   155    480]]
              precision    recall  f1-score   support

           0       1.00      0.79      0.88    166081
           1       0.01      0.76      0.03       635

    accuracy                           0.79    166716
   macro avg       0.51      0.77      0.46    166716
weighted avg       1.00      0.79      0.88    166716



In [29]:
# Random Forest
X = np.array(df[['lat','long']].to_numpy()) # F1 = 0.770
y = df["is_fraud"]

# Train Test split 
X_train, X_test, y_train, y_test = train_test_split(X,y, 
                                     train_size = 0.7, 
                                     test_size = 0.3, 
                                     random_state = 70)

# define pipeline
steps = [('under', RandomUnderSampler()), ('model', RandomForestClassifier())]
pipeline = Pipeline(steps=steps)
clf = pipeline.fit(X,y)

# evaluate pipeline
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
scores = cross_val_score(pipeline, X, y, scoring='f1_micro', cv=cv, n_jobs=-1)
score = mean(scores)
print('F1 Score: %.3f' % score)
print()

#evaluation on training data

y_pred = clf.predict(X_train)

print("Training dataset:")
print(confusion_matrix(y_train, y_pred))
print(classification_report(y_train, y_pred))
print()

#evaluation on test data

y_pred = clf.predict(X_test)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

F1 Score: 0.777

Training dataset:
[[300134  87359]
 [    17   1493]]
              precision    recall  f1-score   support

           0       1.00      0.77      0.87    387493
           1       0.02      0.99      0.03      1510

    accuracy                           0.78    389003
   macro avg       0.51      0.88      0.45    389003
weighted avg       1.00      0.78      0.87    389003


[[128404  37677]
 [     8    627]]
              precision    recall  f1-score   support

           0       1.00      0.77      0.87    166081
           1       0.02      0.99      0.03       635

    accuracy                           0.77    166716
   macro avg       0.51      0.88      0.45    166716
weighted avg       1.00      0.77      0.87    166716

