In [70]:
import pandas as pd
from sklearn.utils import resample

In [71]:
# Input: the raw dataset, read as-is without any cleaning.
RAW_DATA_FILEPATH = '../data/raw/data.csv'

# Output: the dataset after the columns not needed for training are dropped.
MODEL_DATA_FILEPATH = '../data/processed/model_data.csv'

# Output: the rebalanced (undersampled) dataset.
BALANCED_DATA_FILEPATH = '../data/processed/balanced_data.csv'

# Output: the rebalanced dataset with the `type` column one-hot encoded.
ENCODED_DATA_FILEPATH = '../data/processed/encoded_data.csv'


In [72]:
# Load the raw CSV into a DataFrame; later cells derive new frames from `df`.
df = pd.read_csv(RAW_DATA_FILEPATH)

In [73]:
# List the column names available in the raw dataset.
df.columns

Index(['step', 'type', 'amount', 'nameOrig', 'oldbalanceOrg', 'newbalanceOrig',
       'nameDest', 'oldbalanceDest', 'newbalanceDest', 'isFraud',
       'isFlaggedFraud'],
      dtype='object')

In [74]:
# Count the rows per `isFraud` value (1 = fraud, 0 = non-fraud) to see how
# imbalanced the two classes are.
df['isFraud'].value_counts()


isFraud
0    6354407
1       8213
Name: count, dtype: int64

In [75]:
# Remove the columns that are not used for training, then persist the
# trimmed frame (row index is not written to the file).
clean_df = df.drop(columns=['step', 'nameDest', 'nameOrig', 'isFlaggedFraud'])
clean_df.to_csv(MODEL_DATA_FILEPATH, index=False)

In [76]:
# Split the cleaned frame by label: rows with isFraud == 1 and rows with
# isFraud == 0 (each selected by its own explicit equality test).
df_fraud = clean_df.loc[clean_df['isFraud'].eq(1)]
df_nonfraud = clean_df.loc[clean_df['isFraud'].eq(0)]

In [77]:
# Number of fraud rows; used as the sample size when undersampling the
# non-fraud rows.
NUMBER_OF_FRAUD_TRANSACTIONS = df_fraud.shape[0]
print(NUMBER_OF_FRAUD_TRANSACTIONS)

8213


## Drop Columns 

Drop 3 columns: step, nameDest and nameOrig, since we do not need these transaction columns (the step and the origin/destination names) for training our model. Another thing we will have to do is rebalance our dataset, because the vast majority of our data is non-fraudulent; otherwise the model might just label everything as not fraudulent and still have a high accuracy.

Additionally, the model does not need the naive-model column isFlaggedFraud, so it can be removed as well.

## Rebalance
There are two different approaches we can take to rebalance the dataset:

Oversampling or undersampling, each with different advantages and disadvantages. We will go with undersampling to avoid overfitting our model; this means we will be removing some of our non-fraudulent data.

In [78]:



# Draw, without replacement, as many non-fraud rows as there are fraud rows.
df_undersampled_nonfraud = resample(
    df_nonfraud,
    n_samples=NUMBER_OF_FRAUD_TRANSACTIONS,
    replace=False,
    random_state=42,  # fixed seed so the same rows are drawn on every run
)

# Stack the fraud rows followed by the sampled non-fraud rows.
df_rebalanced = pd.concat((df_fraud, df_undersampled_nonfraud))

# Print the per-class row counts to confirm both classes are the same size.
print(df_rebalanced['isFraud'].value_counts())

isFraud
1    8213
0    8213
Name: count, dtype: int64


In [79]:
# Display the rebalanced frame as the cell output.
df_rebalanced

Unnamed: 0,type,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud
2,TRANSFER,181.00,181.00,0.00,0.00,0.00,1
3,CASH_OUT,181.00,181.00,0.00,21182.00,0.00,1
251,TRANSFER,2806.00,2806.00,0.00,0.00,0.00,1
252,CASH_OUT,2806.00,2806.00,0.00,26202.00,0.00,1
680,TRANSFER,20128.00,20128.00,0.00,0.00,0.00,1
...,...,...,...,...,...,...,...
4779354,CASH_OUT,265820.49,0.00,0.00,899770.81,1165591.30,0
1913121,CASH_IN,186719.44,159.00,186878.44,0.00,0.00,0
1989584,CASH_OUT,136972.48,0.00,0.00,1140442.13,1277414.61,0
3085033,PAYMENT,9935.66,286287.65,276351.98,0.00,0.00,0


In [80]:
# Save the rebalanced dataset to CSV; index=False leaves the row index out of
# the file.

df_rebalanced.to_csv(BALANCED_DATA_FILEPATH, index=False)

We encode the categorical column using one-hot encoding instead of label encoding, since the payment types don't have an inherent order.

In [81]:
# One-hot encode the `type` column. Passing dtype='int64' makes get_dummies
# emit the indicator columns as 0/1 integers directly, so there is no need for
# a frame-wide True/False -> 1/0 `replace` pass, which would also scan the
# non-indicator columns and depends on `replace` downcasting behaviour that
# newer pandas versions deprecate.
df_encoded = pd.get_dummies(df_rebalanced, columns=['type'], dtype='int64')
print(df_encoded)

            amount  oldbalanceOrg  newbalanceOrig  oldbalanceDest  \
2           181.00         181.00            0.00            0.00   
3           181.00         181.00            0.00        21182.00   
251        2806.00        2806.00            0.00            0.00   
252        2806.00        2806.00            0.00        26202.00   
680       20128.00       20128.00            0.00            0.00   
...            ...            ...             ...             ...   
4779354  265820.49           0.00            0.00       899770.81   
1913121  186719.44         159.00       186878.44            0.00   
1989584  136972.48           0.00            0.00      1140442.13   
3085033    9935.66      286287.65       276351.98            0.00   
4712423     691.16           0.00            0.00            0.00   

         newbalanceDest  isFraud  type_CASH_IN  type_CASH_OUT  type_DEBIT  \
2                  0.00        1             0              0           0   
3                

In [82]:
# Save the one-hot encoded dataset to CSV; index=False leaves the row index
# out of the file.
df_encoded.to_csv(ENCODED_DATA_FILEPATH, index=False)