In [8]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns
from scipy import stats 
from sklearn.decomposition import PCA

In [9]:
fraud = pd.read_csv('../data/fraud_condensed.csv')

In [10]:
fraud.head()

Unnamed: 0.1,Unnamed: 0,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud
0,2756117,CASH_OUT,55927.11,C91071084,0.0,0.0,C1746219350,1048520.2,1104447.31,0
1,4057875,PAYMENT,29122.09,C393762821,0.0,0.0,M2027704414,0.0,0.0,0
2,2280857,PAYMENT,3869.64,C2079721859,151596.28,147726.64,M1634590148,0.0,0.0,0
3,6254632,CASH_IN,174386.75,C181101377,4059153.48,4233540.23,C214774813,73993503.45,73819116.7,0
4,2191412,TRANSFER,303014.67,C57948388,11069.0,0.0,C1978381162,680884.12,983898.79,0


In [11]:
fraud.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Unnamed: 0      50000 non-null  int64  
 1   type            50000 non-null  object 
 2   amount          50000 non-null  float64
 3   nameOrig        50000 non-null  object 
 4   oldbalanceOrg   50000 non-null  float64
 5   newbalanceOrig  50000 non-null  float64
 6   nameDest        50000 non-null  object 
 7   oldbalanceDest  50000 non-null  float64
 8   newbalanceDest  50000 non-null  float64
 9   isFraud         50000 non-null  int64  
dtypes: float64(5), int64(2), object(3)
memory usage: 3.8+ MB


In [12]:
fraud.shape

(50000, 10)

In [13]:
fraud.isna().sum()

Unnamed: 0        0
type              0
amount            0
nameOrig          0
oldbalanceOrg     0
newbalanceOrig    0
nameDest          0
oldbalanceDest    0
newbalanceDest    0
isFraud           0
dtype: int64

In [14]:
# observe the amount of fraud counts this column 
fraud['isFraud'].value_counts()

isFraud
0    49940
1       60
Name: count, dtype: int64

Since the 'isFlaggedFraud' column had 8000+ missing fraud counts, we are going to drop this column.

# Forming of Dummy Variables

In [15]:
fraud_bool = pd.get_dummies(fraud, prefix =['nameOrig', 'nameDest', 'type'],drop_first=True)
fraud_bool

Unnamed: 0.1,Unnamed: 0,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,nameOrig_CASH_OUT,nameOrig_DEBIT,nameOrig_PAYMENT,...,type_M998988388,type_M999085705,type_M999176707,type_M999194256,type_M999310424,type_M999435593,type_M999548170,type_M999550644,type_M99966635,type_M999852366
0,2756117,55927.11,0.00,0.00,1048520.20,1104447.31,0,True,False,False,...,False,False,False,False,False,False,False,False,False,False
1,4057875,29122.09,0.00,0.00,0.00,0.00,0,False,False,True,...,False,False,False,False,False,False,False,False,False,False
2,2280857,3869.64,151596.28,147726.64,0.00,0.00,0,False,False,True,...,False,False,False,False,False,False,False,False,False,False
3,6254632,174386.75,4059153.48,4233540.23,73993503.45,73819116.70,0,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,2191412,303014.67,11069.00,0.00,680884.12,983898.79,0,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,4525983,1359482.83,40408.00,0.00,1728018.41,3087501.24,0,False,False,False,...,False,False,False,False,False,False,False,False,False,False
49996,5540802,6379.32,20060.36,13681.05,0.00,0.00,0,False,False,True,...,False,False,False,False,False,False,False,False,False,False
49997,4171646,10000000.00,19879.00,0.00,215647.80,18215626.13,0,False,False,False,...,False,False,False,False,False,False,False,False,False,False
49998,2457488,233730.40,3653993.47,3887723.87,4611574.01,4377843.61,0,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [16]:
fraud_encode = fraud_bool.astype(int)
fraud_encode

Unnamed: 0.1,Unnamed: 0,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,nameOrig_CASH_OUT,nameOrig_DEBIT,nameOrig_PAYMENT,...,type_M998988388,type_M999085705,type_M999176707,type_M999194256,type_M999310424,type_M999435593,type_M999548170,type_M999550644,type_M99966635,type_M999852366
0,2756117,55927,0,0,1048520,1104447,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,4057875,29122,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,2280857,3869,151596,147726,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,6254632,174386,4059153,4233540,73993503,73819116,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2191412,303014,11069,0,680884,983898,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,4525983,1359482,40408,0,1728018,3087501,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
49996,5540802,6379,20060,13681,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
49997,4171646,10000000,19879,0,215647,18215626,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
49998,2457488,233730,3653993,3887723,4611574,4377843,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
# the newly transformed dataset will be saved in this file name
fraud_encode.to_csv('../data/fraud_condensed_new.csv')

KeyboardInterrupt: 