# **Phase 2**

In [108]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

In [109]:
# Location of the raw transactions CSV, relative to the notebook's working directory.
DATASET_PATH = 'financial_transactions.csv'

In [110]:
# Load the whole transactions CSV into a single in-memory DataFrame.
df = pd.read_csv(DATASET_PATH)

### Better Column Names

In [111]:
# Map the dataset's original column names to clearer, consistently cased ones.
column_renames = {
    'type': 'paymentType',
    'nameOrig': 'accSender',
    'oldbalanceOrg': 'oldBalanceSender',
    'newbalanceOrig': 'newBalanceSender',
    'nameDest': 'accRecipient',
    'oldbalanceDest': 'oldBalanceRecipient',
    'newbalanceDest': 'newBalanceRecipient',
}
df.rename(columns=column_renames, inplace=True)

### Print DF info

In [112]:
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the output.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6362620 entries, 0 to 6362619
Data columns (total 11 columns):
 #   Column               Dtype  
---  ------               -----  
 0   step                 int64  
 1   paymentType          object 
 2   amount               float64
 3   accSender            object 
 4   oldBalanceSender     float64
 5   newBalanceSender     float64
 6   accRecipient         object 
 7   oldBalanceRecipient  float64
 8   newBalanceRecipient  float64
 9   isFraud              int64  
 10  isFlaggedFraud       int64  
dtypes: float64(5), int64(3), object(3)
memory usage: 534.0+ MB
None


### Feature Engineering

Create two new columns showing how much the transaction changed the balances of the sender and the recipient, as a proportion of each account's balance before the transaction. These features show the magnitude of the transaction relative to the size of each account.

In [113]:
# Relative balance change caused by the transaction: (old - new) / (old + 1).
# The +1 in the denominator avoids a division by zero for accounts whose
# balance was 0 before the transaction. A positive rate means the balance
# decreased (for non-negative balances).
for party in ('Sender', 'Recipient'):
    before = df[f'oldBalance{party}']
    after = df[f'newBalance{party}']
    df[f'{party.lower()}AccChangeRate'] = (before - after) / (before + 1)



In [114]:
# Preview the first rows to check the two engineered rate columns.
df.head()

Unnamed: 0,step,paymentType,amount,accSender,oldBalanceSender,newBalanceSender,accRecipient,oldBalanceRecipient,newBalanceRecipient,isFraud,isFlaggedFraud,senderAccChangeRate,recipientAccChangeRate
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0,0.057834,0.0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0,0.087731,0.0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0,0.994505,0.0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0,0.994505,0.999953
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0,0.280788,0.0


## **Data Preprocessing**

### One-Hot Encoding

In [115]:
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [116]:
# One-hot encode the payment type. drop_first=True omits the indicator column
# of the first category, which is then represented by all-zero indicators.
# dtype='int64' produces 0/1 integer indicators directly, instead of
# multiplying the entire frame (string account columns included) by 1.
df = pd.get_dummies(df, columns=['paymentType'], drop_first=True, dtype='int64')
df.head()

Unnamed: 0,step,amount,accSender,oldBalanceSender,newBalanceSender,accRecipient,oldBalanceRecipient,newBalanceRecipient,isFraud,isFlaggedFraud,senderAccChangeRate,recipientAccChangeRate,paymentType_CASH_OUT,paymentType_DEBIT,paymentType_PAYMENT,paymentType_TRANSFER
0,1,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0,0.057834,0.0,0,0,1,0
1,1,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0,0.087731,0.0,0,0,1,0
2,1,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0,0.994505,0.0,0,0,0,1
3,1,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0,0.994505,0.999953,1,0,0,0
4,1,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0,0.280788,0.0,0,0,1,0


### Label Encoding
Label-encode the sender and recipient account columns. These are categorical columns with hundreds of thousands of distinct values, so we use label encoding rather than one-hot encoding. After encoding, drop the original columns to save some memory.

In [117]:
# Account identifier columns to replace with integer codes.
label_cols = ['accSender', 'accRecipient']

for col in label_cols:
    # A fresh encoder per column; it is fitted on the full frame, i.e. before
    # the train/test split performed later in the notebook.
    encoder = LabelEncoder()
    df[f'{col}Encoded'] = encoder.fit_transform(df[col])
    # Remove the original string column now that its encoded copy exists.
    del df[col]

### Splitting Data

In [118]:
from sklearn.model_selection import train_test_split

In [119]:
# Separate the target label (isFraud) from the feature matrix.
y = df['isFraud']
X = df.drop(columns='isFraud')

In [120]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=777
)

In [121]:
# Inspect the column dtypes and memory footprint of the training features.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5090096 entries, 4694331 to 1420926
Data columns (total 15 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   step                    int64  
 1   amount                  float64
 2   oldBalanceSender        float64
 3   newBalanceSender        float64
 4   oldBalanceRecipient     float64
 5   newBalanceRecipient     float64
 6   isFlaggedFraud          int64  
 7   senderAccChangeRate     float64
 8   recipientAccChangeRate  float64
 9   paymentType_CASH_OUT    int64  
 10  paymentType_DEBIT       int64  
 11  paymentType_PAYMENT     int64  
 12  paymentType_TRANSFER    int64  
 13  accSenderEncoded        int64  
 14  accRecipientEncoded     int64  
dtypes: float64(7), int64(8)
memory usage: 621.3 MB


In [124]:
# Collect the float64 feature columns (the ones scaled later) and list them.
numerical_columns = X_train.select_dtypes(include='float64').columns
print(numerical_columns)
for column_name in numerical_columns:
    print(column_name)

Index(['amount', 'oldBalanceSender', 'newBalanceSender', 'oldBalanceRecipient',
       'newBalanceRecipient', 'senderAccChangeRate', 'recipientAccChangeRate'],
      dtype='object')
amount
oldBalanceSender
newBalanceSender
oldBalanceRecipient
newBalanceRecipient
senderAccChangeRate
recipientAccChangeRate


### Scaling

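We picked RobustScaler because it is more appropriate for datasets that contain many outliers: it centers and scales each feature using the median and the interquartile range, which are far less affected by extreme values than the mean and standard deviation. This choice was recommended in our Literature Review.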

In [126]:
from sklearn.preprocessing import RobustScaler

# Fit ONE scaler on all numerical training columns at once. RobustScaler
# computes its centering/scaling statistics independently per column, so the
# scaled values are the same as with a column-by-column fit. The difference is
# that `scaler` now keeps the parameters of every column; re-fitting the same
# object inside a per-column loop left it holding only the last column's
# statistics, so it could not be reused on new data or for inverse_transform.
scaler = RobustScaler()
X_train[numerical_columns] = scaler.fit_transform(X_train[numerical_columns])

# Only transform the test data (no fitting), to prevent any data leakage.
X_test[numerical_columns] = scaler.transform(X_test[numerical_columns])

### Upsampling (SMOTE)

In [134]:
from imblearn.over_sampling import SMOTE

# Resample the training split only; the test split is left untouched.
smote = SMOTE(random_state=777)
X_train_up, y_train_up = smote.fit_resample(X_train, y_train)

# Re-attach the target column to each split. assign() works on a copy, so the
# feature frames themselves are not modified.
training_data = X_train_up.assign(isFraud=y_train_up)
testing_data = X_test.assign(isFraud=y_test)

# Write both splits to disk without the index column.
training_data.to_csv('training.csv', index=False)
testing_data.to_csv('testing.csv', index=False)

KeyboardInterrupt: 