In [19]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.utils import class_weight
from sklearn.metrics import accuracy_score

In [13]:
# read in data 
df = pd.read_csv('model_data.csv')
df.head()

Unnamed: 0,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,cash_in,cash_out,debit,payment,transfer
0,9839.64,170136.0,160296.36,0.0,0.0,0,0,0,0,1,0
1,1864.28,21249.0,19384.72,0.0,0.0,0,0,0,0,1,0
2,181.0,181.0,0.0,0.0,0.0,1,0,0,0,0,1
3,181.0,181.0,0.0,21182.0,0.0,1,0,1,0,0,0
4,11668.14,41554.0,29885.86,0.0,0.0,0,0,0,0,1,0


In [14]:
# split into label (isFraud) and data, drop label from data 
y = df['isFraud'].to_numpy()

df.drop(columns=['isFraud'], inplace=True)
X = df.to_numpy()

In [15]:
# split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# rescale x_train to prepare for model
scaler = MinMaxScaler(feature_range=(0,1))
x_train = scaler.fit_transform(X_train)

In [22]:
# compute class weights to balance data
class_weights = class_weight.compute_class_weight(class_weight='balanced', classes=np.array([0,1]), y=y_train)
class_weights = {0:class_weights[0], 1:class_weights[1]}
class_weights

{0: 0.5006458110800959, 1: 387.61011270179716}

In [23]:
# create and train model
model = RandomForestClassifier(random_state=1, class_weight=class_weights)
model.fit(X_train, y_train)

In [24]:
# make predictions
y_pred = model.predict(X_test)

# calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.9996754481644354