In [11]:
import pandas as pd
from sklearn.utils import resample

In [2]:
# Read the raw CSV into the working DataFrame used by the cells below.
raw_csv_path = '../data/raw/data.csv'
df = pd.read_csv(raw_csv_path)

In [3]:
# Inspect the column labels of the loaded dataset; the bare expression on the
# last line makes the notebook render the resulting Index.
df.columns

Index(['step', 'type', 'amount', 'nameOrig', 'oldbalanceOrg', 'newbalanceOrig',
       'nameDest', 'oldbalanceDest', 'newbalanceDest', 'isFraud',
       'isFlaggedFraud'],
      dtype='object')

In [9]:
# Number of rows labelled as fraud (isFraud == 1); needed later as the
# target sample size when undersampling the non-fraud rows.
NUMBER_OF_FRAUD_TRANSACTIONS = int((df['isFraud'] == 1).sum())

# Take a look at how many transactions are fraud vs. non-fraud. This is kept
# as the last expression of the cell so the notebook actually displays it.
df['isFraud'].value_counts()

In [6]:
# Columns excluded from the modelling data (see the "Drop Columns" notes).
columns_to_drop = ['step', 'nameDest', 'nameOrig', 'isFlaggedFraud']
clean_df = df.drop(columns=columns_to_drop)

# NOTE(review): to_csv writes the row index as an unnamed first column by
# default - confirm that whatever reads this file expects that.
clean_df.to_csv('../data/processed/model_data.csv')

## Drop Columns 

Drop three columns: step, nameDest and nameOrig, since we do not need the step or the account names of a transaction to train our model. We will also have to rebalance the dataset: the majority of our data is non-fraudulent, so without rebalancing the model might just label everything as not fraudulent and still have a high accuracy.

Additionally, the model does not need the isFlaggedFraud column, which is the output of a naive model, so it can be removed as well.

## Rebalance
There are two different approaches we can take to rebalance the dataset:

Oversampling or undersampling, each with different advantages and disadvantages. We will go with undersampling to avoid overfitting our model; this means we will be removing some of our non-fraudulent data.

In [16]:
# Split the data by class. Every fraud row (isFraud == 1) is kept; the
# non-fraud rows (isFraud == 0) are the ones that get undersampled.
fraud_df = df[df['isFraud'] == 1]
nonfraud_df = df[df['isFraud'] == 0]

# Undersample the non-fraud transactions: draw, without replacement, as many
# non-fraud rows as there are fraud rows. random_state is fixed so the same
# sample is drawn on every run.
df_undersampled_nonfraud = resample(nonfraud_df,
                                    replace=False,
                                    n_samples=NUMBER_OF_FRAUD_TRANSACTIONS,
                                    random_state=42)

# Combine the fraud rows with the undersampled non-fraud rows.
# NOTE(review): this is built from `df`, not `clean_df`, so the columns
# dropped for model_data.csv are still present here - confirm that is intended.
df_rebalanced = pd.concat([fraud_df, df_undersampled_nonfraud])

# Check to see the dataset has been rebalanced (both classes should have
# the same count).
print(df_rebalanced['isFraud'].value_counts())

isFraud
1    8213
0    8213
Name: count, dtype: int64


In [17]:
# Write the rebalanced dataset to the processed-data folder.
balanced_csv_path = '../data/processed/balanced_data.csv'
df_rebalanced.to_csv(balanced_csv_path)