# **Phase 2**

In [25]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

In [26]:
# Path to the transactions CSV (relative, so resolved against the working directory).
DATASET_PATH = "financial_transactions.csv"

In [27]:
# Read the CSV at DATASET_PATH into the working DataFrame `df`.
df = pd.read_csv(DATASET_PATH)

### Better Column Names

In [28]:
# Original column name -> clearer camelCase name used by the rest of the notebook.
column_renames = {
    'type': 'paymentType',
    'nameOrig': 'accSender',
    'oldbalanceOrg': 'oldBalanceSender',
    'newbalanceOrig': 'newBalanceSender',
    'nameDest': 'accRecipient',
    'oldbalanceDest': 'oldBalanceRecipient',
    'newbalanceDest': 'newBalanceRecipient',
}
df.rename(columns=column_renames, inplace=True)

### Print DF info

In [29]:
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line after the report.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6362620 entries, 0 to 6362619
Data columns (total 11 columns):
 #   Column               Dtype  
---  ------               -----  
 0   step                 int64  
 1   paymentType          object 
 2   amount               float64
 3   accSender            object 
 4   oldBalanceSender     float64
 5   newBalanceSender     float64
 6   accRecipient         object 
 7   oldBalanceRecipient  float64
 8   newBalanceRecipient  float64
 9   isFraud              int64  
 10  isFlaggedFraud       int64  
dtypes: float64(5), int64(3), object(3)
memory usage: 534.0+ MB
None


### Feature Engineering

Create two new columns showing how much the transaction changed the balances of the sender and the recipient, as a proportion of each account's balance before the transaction. These features show the magnitude of the transaction relative to the account's balance.

In [30]:
def _relative_balance_change(old_balance, new_balance):
    """Return (old - new) / old element-wise, with NaN where old is zero.

    A plain division by a zero opening balance produces +/-inf (or NaN for
    0/0). Masking the zero denominators first makes every undefined ratio a
    NaN, so the column has a single missing-value marker and no infinities.

    Parameters
    ----------
    old_balance : pandas.Series
        Balance before the transaction.
    new_balance : pandas.Series
        Balance after the transaction.

    Returns
    -------
    pandas.Series
        Relative change; NaN wherever ``old_balance`` is 0 (or already NaN).
    """
    # Series.where keeps values where the condition holds and puts NaN elsewhere.
    nonzero_old = old_balance.where(old_balance != 0)
    return (old_balance - new_balance) / nonzero_old


df['senderAccChangeRate'] = _relative_balance_change(
    df['oldBalanceSender'], df['newBalanceSender']
)

df['recipientAccChangeRate'] = _relative_balance_change(
    df['oldBalanceRecipient'], df['newBalanceRecipient']
)
# NOTE(review): rows with a zero opening balance are NaN in these columns;
# impute or drop them before fitting a model that cannot handle missing values.

In [31]:
# Re-check the schema now that the two change-rate columns have been added.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6362620 entries, 0 to 6362619
Data columns (total 13 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   step                    int64  
 1   paymentType             object 
 2   amount                  float64
 3   accSender               object 
 4   oldBalanceSender        float64
 5   newBalanceSender        float64
 6   accRecipient            object 
 7   oldBalanceRecipient     float64
 8   newBalanceRecipient     float64
 9   isFraud                 int64  
 10  isFlaggedFraud          int64  
 11  senderAccChangeRate     float64
 12  recipientAccChangeRate  float64
dtypes: float64(7), int64(3), object(3)
memory usage: 631.1+ MB


## **Data Preprocessing**

### One-Hot Encoding

In [32]:
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [33]:
# One-hot encode paymentType. drop_first=True drops the first category's
# indicator, leaving k-1 dummy columns for k payment types.
df = pd.get_dummies(
    data=df,
    columns=['paymentType'],
    drop_first=True,
)

In [34]:
# Inspect the schema after one-hot encoding paymentType.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6362620 entries, 0 to 6362619
Data columns (total 16 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   step                    int64  
 1   amount                  float64
 2   accSender               object 
 3   oldBalanceSender        float64
 4   newBalanceSender        float64
 5   accRecipient            object 
 6   oldBalanceRecipient     float64
 7   newBalanceRecipient     float64
 8   isFraud                 int64  
 9   isFlaggedFraud          int64  
 10  senderAccChangeRate     float64
 11  recipientAccChangeRate  float64
 12  paymentType_CASH_OUT    bool   
 13  paymentType_DEBIT       bool   
 14  paymentType_PAYMENT     bool   
 15  paymentType_TRANSFER    bool   
dtypes: bool(4), float64(7), int64(3), object(2)
memory usage: 606.8+ MB


### Label Encoding
Label-encode the sender and recipient account columns. These are categorical columns with hundreds of thousands of distinct values, so we have to use label encoding rather than one-hot encoding. After encoding, drop the original columns to save some memory.

In [35]:
label_cols = ['accSender', 'accRecipient']

# Replace each account-ID column with an integer-coded "<name>Encoded" column.
for col in label_cols:
    encoder = LabelEncoder()  # fresh encoder per column: each has its own ID set
    df[f'{col}Encoded'] = encoder.fit_transform(df[col])
    # The raw string IDs are no longer needed once encoded; remove them in place.
    del df[col]

# Show the resulting schema.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6362620 entries, 0 to 6362619
Data columns (total 16 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   step                    int64  
 1   amount                  float64
 2   oldBalanceSender        float64
 3   newBalanceSender        float64
 4   oldBalanceRecipient     float64
 5   newBalanceRecipient     float64
 6   isFraud                 int64  
 7   isFlaggedFraud          int64  
 8   senderAccChangeRate     float64
 9   recipientAccChangeRate  float64
 10  paymentType_CASH_OUT    bool   
 11  paymentType_DEBIT       bool   
 12  paymentType_PAYMENT     bool   
 13  paymentType_TRANSFER    bool   
 14  accSenderEncoded        int64  
 15  accRecipientEncoded     int64  
dtypes: bool(4), float64(7), int64(5)
memory usage: 606.8 MB


### Splitting Data

In [None]:
from sklearn.model_selection import train_test_split