In [1]:
# Importing Libraries
import pandas as pd
import numpy as np
import seaborn as sns
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score



In [2]:
# Importing the datset 
dataset = pd.read_csv('Synthetic_Financial_datasets_log.csv')
df = dataset # Making a copy of Original Dataset
df.info # Understanding dataset

<bound method DataFrame.info of          step      type      amount     nameOrig  oldbalanceOrg  \
0           1   PAYMENT     9839.64  C1231006815      170136.00   
1           1   PAYMENT     1864.28  C1666544295       21249.00   
2           1  TRANSFER      181.00  C1305486145         181.00   
3           1  CASH_OUT      181.00   C840083671         181.00   
4           1   PAYMENT    11668.14  C2048537720       41554.00   
...       ...       ...         ...          ...            ...   
6362615   743  CASH_OUT   339682.13   C786484425      339682.13   
6362616   743  TRANSFER  6311409.28  C1529008245     6311409.28   
6362617   743  CASH_OUT  6311409.28  C1162922333     6311409.28   
6362618   743  TRANSFER   850002.52  C1685995037      850002.52   
6362619   743  CASH_OUT   850002.52  C1280323807      850002.52   

         newbalanceOrig     nameDest  oldbalanceDest  newbalanceDest  isFraud  \
0             160296.36  M1979787155            0.00            0.00        0   
1

In [3]:
# Removing Uncessary Columns
df.drop(['step','isFlaggedFraud','nameOrig','nameDest'], inplace= True, axis = 1)

In [6]:
# Feature Engineering
encoder = OneHotEncoder()
a = encoder.fit_transform(df[['type']]).toarray() # Changing Categorical column in to numerical columns
encode_type = pd.DataFrame(a, columns= encoder.get_feature_names_out()) 
df['Cash_IN'],df['Cash_Out'],df['Debit'],df['Payment'],df['Transfer'] = encode_type['type_CASH_IN'],encode_type['type_CASH_OUT'],encode_type['type_DEBIT'],encode_type['type_PAYMENT'],encode_type['type_TRANSFER']
df.drop(['type'],axis=1)

Unnamed: 0,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,Cash_IN,Cash_Out,Debit,Payment,Transfer
0,9839.64,170136.00,160296.36,0.00,0.00,0,0.0,0.0,0.0,1.0,0.0
1,1864.28,21249.00,19384.72,0.00,0.00,0,0.0,0.0,0.0,1.0,0.0
2,181.00,181.00,0.00,0.00,0.00,1,0.0,0.0,0.0,0.0,1.0
3,181.00,181.00,0.00,21182.00,0.00,1,0.0,1.0,0.0,0.0,0.0
4,11668.14,41554.00,29885.86,0.00,0.00,0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
6362615,339682.13,339682.13,0.00,0.00,339682.13,1,0.0,1.0,0.0,0.0,0.0
6362616,6311409.28,6311409.28,0.00,0.00,0.00,1,0.0,0.0,0.0,0.0,1.0
6362617,6311409.28,6311409.28,0.00,68488.84,6379898.11,1,0.0,1.0,0.0,0.0,0.0
6362618,850002.52,850002.52,0.00,0.00,0.00,1,0.0,0.0,0.0,0.0,1.0


In [7]:
# Balancing the fraud and not fraud data
oversample = SMOTE()
x,y = oversample.fit_resample(df[['Cash_IN','Cash_Out','Debit','Payment','Transfer','amount','oldbalanceOrg','newbalanceOrig','oldbalanceDest','newbalanceDest']],df['isFraud'])
X = pd.DataFrame(x)
Y = pd.DataFrame((y))
Balanced_df = pd.concat([X,Y],axis= 1)

In [8]:
# Random Forest Model
fraud = Balanced_df['isFraud']
normal = Balanced_df.drop('isFraud',axis=1)
Norm_Train, Norm_Test,Fraud_Train,Fraud_Test = train_test_split(normal,fraud,train_size=0.3,random_state= 42,stratify= y)
rf_model = RandomForestClassifier(n_estimators= 100,n_jobs= -1 ,random_state=42)
rf_model.fit(Norm_Train,Fraud_Train)
Fraud_pred = rf_model.predict(Norm_Test)

In [9]:
# Testing the accuraqcy of the model
print("Accuracy",accuracy_score(Fraud_Test,Fraud_pred))
print("Confusion MAtrix\n|" , confusion_matrix(Fraud_Test,Fraud_pred))
print("Classification Report", classification_report(Fraud_Test,Fraud_pred)) 

Accuracy 0.9993407275265648
Confusion MAtrix
| [[4443447    4638]
 [   1227 4446858]]
Classification Report               precision    recall  f1-score   support

           0       1.00      1.00      1.00   4448085
           1       1.00      1.00      1.00   4448085

    accuracy                           1.00   8896170
   macro avg       1.00      1.00      1.00   8896170
weighted avg       1.00      1.00      1.00   8896170



In [10]:
importances = pd.Series(rf_model.feature_importances_, index=X.columns).sort_values(ascending=False)
print("Feature Importances:\n", importances) 


Feature Importances:
 oldbalanceOrg     0.350655
newbalanceOrig    0.181047
amount            0.169277
Payment           0.078177
Transfer          0.066118
newbalanceDest    0.053349
Cash_IN           0.038645
oldbalanceDest    0.032419
Cash_Out          0.029883
Debit             0.000428
dtype: float64
