In [7]:
import pandas as pd
import numpy as np

df = pd.read_csv(
    'C:/Users/Noela_tm/Downloads/creditcard.csv'
)

###What did you do with the data in the context of exploration?
###What did you find? Why does that matter?
###What would your proposed next steps be?
###What business problem are you intending to solve using ML with the data?


In [5]:
import seaborn as sns #visualisation
import matplotlib.pyplot as plt #visualisation
%matplotlib inline 

###What did you do with the data in the context of exploration?

#First, I wanted to see some general information about 
#the number of entries, columns I have available, and datatypes included
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6362620 entries, 0 to 6362619
Data columns (total 11 columns):
 #   Column          Dtype  
---  ------          -----  
 0   step            int64  
 1   type            object 
 2   amount          float64
 3   nameOrig        object 
 4   oldbalanceOrg   float64
 5   newbalanceOrig  float64
 6   nameDest        object 
 7   oldbalanceDest  float64
 8   newbalanceDest  float64
 9   isFraud         int64  
 10  isFlaggedFraud  int64  
dtypes: float64(5), int64(3), object(3)
memory usage: 534.0+ MB


In [9]:
#Next, I visualized the first 5 rows of the data and the last five rows of data
#to get a better understanding of what it looks like
df.head(5)

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


In [10]:
df.tail(5)

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
6362615,743,CASH_OUT,339682.13,C786484425,339682.13,0.0,C776919290,0.0,339682.13,1,0
6362616,743,TRANSFER,6311409.28,C1529008245,6311409.28,0.0,C1881841831,0.0,0.0,1,0
6362617,743,CASH_OUT,6311409.28,C1162922333,6311409.28,0.0,C1365125890,68488.84,6379898.11,1,0
6362618,743,TRANSFER,850002.52,C1685995037,850002.52,0.0,C2080388513,0.0,0.0,1,0
6362619,743,CASH_OUT,850002.52,C1280323807,850002.52,0.0,C873221189,6510099.11,7360101.63,1,0


In [11]:
#Next, I dropped the columns that will not be relevant to my analysis. 
#In this case, I don't think I will use the before and after balance information for the customer and recipient
df=df.drop(['oldbalanceOrg','newbalanceOrig','oldbalanceDest','newbalanceDest'], axis=1)
df.head()

Unnamed: 0,step,type,amount,nameOrig,nameDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,M1979787155,0,0
1,1,PAYMENT,1864.28,C1666544295,M2044282225,0,0
2,1,TRANSFER,181.0,C1305486145,C553264065,1,0
3,1,CASH_OUT,181.0,C840083671,C38997010,1,0
4,1,PAYMENT,11668.14,C2048537720,M1230701703,0,0


In [14]:
#To make it easier for me to understand, I am going to adjust the column headers. 
df=df.rename(columns={"step":"hours","nameOrig":"originator","nameDest":"recipient","isFlaggedFraud":"flagged"})
df.head()

Unnamed: 0,hours,type,amount,Originator,Recipient,isFraud,flagged
0,1,PAYMENT,9839.64,C1231006815,M1979787155,0,0
1,1,PAYMENT,1864.28,C1666544295,M2044282225,0,0
2,1,TRANSFER,181.0,C1305486145,C553264065,1,0
3,1,CASH_OUT,181.0,C840083671,C38997010,1,0
4,1,PAYMENT,11668.14,C2048537720,M1230701703,0,0


In [19]:
#Now I will check for duplicate rows to see if there is anything to remove from my dataset.
#I found that there were zero duplicates contained and therefore, I did not have to remove any.
duplicate_rows_df = df[df.duplicated()]
print("number of duplicate rows:",duplicate_rows_df.shape)

number of duplicate rows: (0, 7)


In [20]:
#Next I will check for null values, to see if there is anything to remove from my dataset.
#I found that there were no null values contained and therefore, I did not have to remove any.
print(df.isnull().sum())

hours         0
type          0
amount        0
Originator    0
Recipient     0
isFraud       0
flagged       0
dtype: int64


In [None]:
#I am also curious about the amount of Fraud that was both detected in the isFraud column
#and flagged as fraud, so I will sum each column to see the total amount. 
#This is allowing me to understand if I will have a Class Imbalance on my hands. A Class Imbalance is where the an item I am looking for, such as fraud, has an uneven distribution within the dataset. This can cause machine learning algorithms t have a low predicitve accuracy.
#To solve this, there are a variety of techniques to correct this imbalance, such as Over Sampling, Under Sampling, and SMOTE.
isFraudTotal=df['isFraud'].sum()
flaggedTotal=df['flagged'].sum()
print("Count of Fraud:",isFraudTotal)
print("Count of Flags:",flaggedTotal)

In [None]:
###What would your proposed next steps be?
###What business problem are you intending to solve using ML with the data?