In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix 
import joblib

In [2]:
# Load the prepared modelling table from disk and preview its first rows.
data_path = '../../../data/model_data.csv'
df = pd.read_csv(data_path)
df.head()

Unnamed: 0,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,CASH_IN,CASH_OUT,DEBIT,PAYMENT,TRANSFER
0,9839.64,170136.0,160296.36,0.0,0.0,0,0,0,0,1,0
1,1864.28,21249.0,19384.72,0.0,0.0,0,0,0,0,1,0
2,181.0,181.0,0.0,0.0,0.0,1,0,0,0,0,1
3,181.0,181.0,0.0,21182.0,0.0,1,0,1,0,0,0
4,11668.14,41554.0,29885.86,0.0,0.0,0,0,0,0,1,0


In [3]:
# Separate the target from the features.
# y: the 'isFraud' label column as a NumPy array.
y = df['isFraud'].to_numpy()

# X: every column that is an actual model input.
#   * 'isFraud' is the label itself and must not appear among the features.
#   * 'Unnamed: 0' is the row index that was written into the CSV (0, 1, 2, ...
#     in the preview); it describes file order, not the transaction, and would
#     otherwise be fed to the model as a feature.
# drop() returns a new frame rather than mutating df, so this cell can be
# re-run without a KeyError on 'isFraud'; errors='ignore' keeps the cell
# working when the index column is not present in the file.
NON_FEATURE_COLUMNS = ['isFraud', 'Unnamed: 0']
X = df.drop(columns=NON_FEATURE_COLUMNS, errors='ignore').to_numpy()

In [4]:
# Rescale every feature column of X (the full feature matrix, not just the
# training rows) into the [0, 1] range.
# NOTE(review): the scaler is fitted before the train/test split, so the test
# rows contribute to the learned min/max values. Fit on the training rows only
# if strict separation between the splits is required.
scaler = MinMaxScaler()
X = scaler.fit_transform(X)

# The labels are intentionally NOT passed through the scaler:
#   * calling scaler.fit_transform(y) would refit `scaler` on the label column
#     and discard the feature minima/maxima learned above, leaving an object
#     that can no longer transform new feature rows;
#   * it would also turn y into a float column vector of shape (n, 1), while
#     the classifier and the metric functions expect a 1-D array of class
#     labels. The preview shows isFraud as 0/1 already, so no rescaling is
#     needed.

In [5]:
# Hold out 20% of the rows for evaluation; random_state pins the shuffle so the
# split is reproducible.
# stratify keeps the class proportions of y the same in the training and test
# splits, which matters because the classes are imbalanced (the notebook
# oversamples the training data for that reason). np.ravel only flattens the
# array handed to `stratify` (in case y is a column vector); y itself is
# passed through unchanged.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1, stratify=np.ravel(y)
)

In [6]:
# Counter the class imbalance by resampling the training split with SMOTE.
# Only X_train / y_train are replaced; the test split is left as it was.
oversample = SMOTE(random_state=1)
resampled = oversample.fit_resample(X_train, y_train)
X_train, y_train = resampled

In [None]:
# Fit a decision tree on the training data.
# random_state is pinned to 1 -- the same seed used for the train/test split
# and for SMOTE -- because the tree builder permutes features at each split and
# breaks ties between equally good splits at random; without a seed, repeated
# runs can produce different trees and different metrics.
model = DecisionTreeClassifier(random_state=1)
model.fit(X_train, y_train)

In [None]:
# Predict a label for every row of the held-out split.
y_pred = model.predict(X_test)

# Score the predictions against the true test labels, in the order
# accuracy -> precision -> recall.
accuracy, precision, recall = (
    score_fn(y_test, y_pred)
    for score_fn in (accuracy_score, precision_score, recall_score)
)

# Report all three metrics with six decimal places.
summary = f'Accuracy: {accuracy: .6f} \nPrecision: {precision: .6f} \nRecall: {recall: .6f}'
print(summary)

In [None]:
# Tabulate true test labels against predicted labels
# (scikit-learn convention: rows are true classes, columns are predicted classes).
conf_mat = confusion_matrix(y_test, y_pred)
print(conf_mat)

In [None]:
# Persist the fitted classifier to disk with joblib.
# NOTE(review): only `model` is written here; the MinMaxScaler used on the
# features is not saved by this cell -- confirm whether inference code needs it.
model_path = '../../../trained_models/decision_tree.pkl'
joblib.dump(model, model_path)