## 1. Import All the necessary Libraries

In [1]:
import nltk
import re
import pandas as pd
import numpy as np
from numpy import asarray
from nltk.corpus import stopwords
from collections import Counter
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix,classification_report



## 2. Load the data from the CSV file into a DataFrame

In [2]:
# Read the bank-statement export into a DataFrame and display it.
df = pd.read_csv(filepath_or_buffer='/content/Bank_Statement_dataset.csv')
df

Unnamed: 0,Tran Date,Chq No,Particulars,Debit,Credit,Balance,Init. Br
0,18-01-2020,,BY CASH DEPOSIT- BNA/APRH38403/802/180120/BORIVAL,,40500.0,42914.04,572
1,18-01-2020,,NEFT/MB/AXMB200183005469/munna kumar,40000.0,,2914.04,572
2,18-01-2020,,GST @18% on Charge,56.7,,2857.34,572
3,18-01-2020,,Consolidated Charges for A/c,315.0,,2542.34,572
4,23-01-2020,,BY CASH DEPOSIT- BNA/APRH38403/2247/230120/BOR...,,37000.0,39542.34,572
5,23-01-2020,,BY CASH DEPOSIT- BNA/APRH38403/2249/230120/BOR...,,8900.0,48442.34,572
6,23-01-2020,,BY CASH DEPOSIT- BNA/APRH38403/2314/230120/BOR...,,85000.0,133442.34,572
7,23-01-2020,,NEFT/MB/AXMB200234602051/kamruddin,50000.0,,83442.34,572
8,23-01-2020,,NEFT/MB/AXMB200234602077/kamruddin,30000.0,,53442.34,572
9,24-01-2020,,NEFT/MB/AXMB200244625495/Rahul bhagvan,20000.0,,33442.34,572


## 3. Drop the non-contributing columns

In [3]:
# Remove the two columns that are not used as features; rebinding `df`
# gives the same frame as dropping in place.
df = df.drop(columns=['Chq No', 'Init. Br'])

## 4. Fill the empty values in the 'Debit' and 'Credit' columns

In [4]:
# A missing amount means no money moved on that side of the transaction,
# so both amount columns get 0 in place of NaN.
amount_columns = ['Debit', 'Credit']
df[amount_columns] = df[amount_columns].fillna(0)

## 5. Create a Fraud Column with empty values

In [5]:
# Start with an all-NaN label column; the following cells fill in 1 / 0.
df['isFraud'] = float('nan')

## 6. Flag Debit and Credit transactions above 50,000 as fraud (1); all other transactions are non-fraud (0)

In [6]:
# Flag every credit above 50,000 as fraud in a single vectorised assignment.
# The boolean mask selects rows directly, so this no longer relies on the
# index labels being exactly 0..len(df)-1 as the row-by-row loop did.
df.loc[df['Credit'] > 50000, 'isFraud'] = 1

In [7]:
# Flag every debit above 50,000 as fraud in a single vectorised assignment.
# The boolean mask selects rows directly, so this no longer relies on the
# index labels being exactly 0..len(df)-1 as the row-by-row loop did.
df.loc[df['Debit'] > 50000, 'isFraud'] = 1

In [8]:
# Rows that were not flagged above are still NaN; mark them as non-fraud (0).
df = df.fillna(value={'isFraud': 0})
df

Unnamed: 0,Tran Date,Particulars,Debit,Credit,Balance,isFraud
0,18-01-2020,BY CASH DEPOSIT- BNA/APRH38403/802/180120/BORIVAL,0.0,40500.0,42914.04,0.0
1,18-01-2020,NEFT/MB/AXMB200183005469/munna kumar,40000.0,0.0,2914.04,0.0
2,18-01-2020,GST @18% on Charge,56.7,0.0,2857.34,0.0
3,18-01-2020,Consolidated Charges for A/c,315.0,0.0,2542.34,0.0
4,23-01-2020,BY CASH DEPOSIT- BNA/APRH38403/2247/230120/BOR...,0.0,37000.0,39542.34,0.0
5,23-01-2020,BY CASH DEPOSIT- BNA/APRH38403/2249/230120/BOR...,0.0,8900.0,48442.34,0.0
6,23-01-2020,BY CASH DEPOSIT- BNA/APRH38403/2314/230120/BOR...,0.0,85000.0,133442.34,1.0
7,23-01-2020,NEFT/MB/AXMB200234602051/kamruddin,50000.0,0.0,83442.34,0.0
8,23-01-2020,NEFT/MB/AXMB200234602077/kamruddin,30000.0,0.0,53442.34,0.0
9,24-01-2020,NEFT/MB/AXMB200244625495/Rahul bhagvan,20000.0,0.0,33442.34,0.0


## 7. Clean the Particulars column data and create a corpus

In [9]:
nltk.download('stopwords')
ps = PorterStemmer()

# Load the stop-word list once and keep it as a set: the original re-read the
# list for every single token, and membership tests on a set are O(1).
stop_words = set(stopwords.words('english'))
non_letters = re.compile('[^a-zA-Z]')

corpus = []
# Iterate over the column values themselves (no label-based lookups), and
# treat a missing description as empty text instead of failing in the regex.
for particulars in df['Particulars'].fillna('').astype(str):
    # Keep letters only, lower-case, and split into tokens.
    tokens = non_letters.sub(' ', particulars).lower().split()
    # Drop stop words, stem what is left, and rebuild one string per row.
    stemmed = [ps.stem(token) for token in tokens if token not in stop_words]
    corpus.append(' '.join(stemmed))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
# Vectorise the cleaned text: uni- to tri-grams, keeping only the 10 terms
# the vectoriser ranks highest, then densify for concatenation later on.
tfidf_v = TfidfVectorizer(ngram_range=(1, 3), max_features=10)
tfidf_matrix = tfidf_v.fit_transform(corpus)
X = tfidf_matrix.toarray()

## 8. Perform Standardisation on Debit and Credit Columns

In [11]:
# Pull out the two numeric amount columns that will be standardised.
X_Trans = df[['Debit', 'Credit']]

In [12]:
# Convert the amounts to a plain array, then learn the scaling parameters
# and apply them to the same data.
X_Trans = np.asarray(X_Trans)
scaler = StandardScaler().fit(X_Trans)
scaled = scaler.transform(X_Trans)

## 9. Concatenate the independent features from the Particulars, Debit and Credit columns

In [13]:
# Append the two scaled amount columns to the right of the TF-IDF columns.
# Note that X is overwritten, so running this cell twice appends them again.
feature_parts = [X, scaled]
X = np.concatenate(feature_parts, axis=1)

## 10. Obtain the dependent and independent features

In [14]:
# The dependent variable is the fraud label built earlier.
Y = df.loc[:, 'isFraud']

In [15]:
# Show the feature and target shapes, one per line.
print(X.shape, Y.shape, sep='\n')

(54, 12)
(54,)


## 11. Apply Sampling to get the balanced dataset

In [16]:
# Class counts before resampling.
print(f'Original dataset shape {Counter(Y)}')

Original dataset shape Counter({0.0: 47, 1.0: 7})


In [17]:
# Oversample the minority class until it is 60% the size of the majority.
# A fixed random_state makes the duplicated rows, and therefore every
# downstream score, repeatable from run to run.
# NOTE(review): resampling is done before the train/test split, so copies of
# the same fraud row can land in both splits and inflate the test scores;
# consider splitting first and resampling only the training portion.
# NOTE(review): the name `os` would shadow the standard-library module if it
# is ever imported in this notebook.
os = RandomOverSampler(sampling_strategy=0.6, random_state=0)
X_train_res, y_train_res = os.fit_resample(X, Y)



In [18]:
# Class counts after resampling.
print(f'Sampled dataset shape {Counter(y_train_res)}')

Sampled dataset shape Counter({0.0: 47, 1.0: 28})


## 12. Split into train and test dataset

In [19]:
# Hold out 30% of the resampled rows for evaluation; the fixed seed keeps
# the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X_train_res,
    y_train_res,
    test_size=0.3,
    random_state=1,
)

In [20]:
# Show the train and test feature shapes, one per line.
print(X_train.shape, X_test.shape, sep='\n')

(52, 12)
(23, 12)


## 13. Create models on the Training Dataset

In [21]:
def models(X_train, Y_train):
    """Fit three classifiers on the training data and print their training accuracy.

    Parameters
    ----------
    X_train : feature matrix used for fitting and for the printed scores.
    Y_train : labels matching ``X_train``.

    Returns
    -------
    tuple
        The fitted estimators in the order
        ``(logistic regression, decision tree, random forest)``.
    """
    # Each entry pairs the caption printed for an estimator with the
    # (not yet fitted) estimator itself.
    candidates = (
        ("Logistic Regression model result ",
         LogisticRegression(random_state=0)),
        ("Decision Tree classifier result ",
         DecisionTreeClassifier(criterion='entropy', random_state=0)),
        ("Random Forest cassifier result ",
         RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=0)),
    )

    # Fit everything first, then report, so all fitting precedes any output.
    for _, estimator in candidates:
        estimator.fit(X_train, Y_train)

    # These scores are measured on the same data the models were fitted on.
    for caption, estimator in candidates:
        print(caption, estimator.score(X_train, Y_train))

    log, tree, forest = (estimator for _, estimator in candidates)
    return log, tree, forest

## 14. Get the training-set accuracy of the Logistic Regression, Decision Tree and Random Forest models

In [22]:
# `model` holds the three fitted estimators as a tuple:
# (logistic regression, decision tree, random forest).
model = models(X_train=X_train, Y_train=Y_train)

Logistic Regression model result  0.9807692307692307
Decision Tree classifier result  1.0
Random Forest cassifier result  0.9807692307692307


## 15. Create a function that evaluates performance metrics such as
Accuracy of model

Confusion Matrix

Classification Report

In [23]:
def Perf_matrix(model_no):
    """Evaluate one of the fitted classifiers on the held-out test split.

    Reads the notebook-level names ``model`` (the tuple returned by
    ``models``), ``X_test`` and ``Y_test``.

    Parameters
    ----------
    model_no : int
        Position of the classifier inside ``model``.

    Returns
    -------
    tuple
        ``(cm, Accuracy, Class_Report)``: the confusion matrix, the fraction
        of test rows classified correctly, and the text report produced by
        ``classification_report``.
    """
    # Predict once and reuse the result for both metrics.
    preds = model[model_no].predict(X_test)
    cm = confusion_matrix(Y_test, preds)

    # Rows of `cm` are true labels and columns are predicted labels, so the
    # diagonal counts every correct prediction. The previous version indexed
    # fixed 2x2 cells (and named the [0][0] cell "TP" although it holds the
    # true negatives for labels 0/1); trace / sum gives the same accuracy
    # without assuming the matrix size, so it also works when only one class
    # is present in the test split.
    Accuracy = np.trace(cm) / cm.sum()
    Class_Report = classification_report(Y_test, preds)

    return cm, Accuracy, Class_Report

## 16. Performance Matrix of Logistic Regression Model

In [24]:
# Evaluate the first fitted estimator (logistic regression) on the test split.
cm_log, Acc_log, Report_log = Perf_matrix(0)

log_results = (
    ("The Accuracy of unkown dataset for Logistic Regression Model: ", Acc_log),
    ("The Classification Report for Logistic Regression Model: \n", Report_log),
    ("The Confusion Matrix for Logistic Regression Model: \n", cm_log),
)
for position, (caption, value) in enumerate(log_results):
    # Separate consecutive sections with the same blank-line spacer.
    if position:
        print('\n')
    print(caption, value)

The Accuracy of unkown dataset for Logistic Regression Model:  0.9130434782608695


The Classification Report for Logistic Regression Model: 
               precision    recall  f1-score   support

         0.0       0.87      1.00      0.93        13
         1.0       1.00      0.80      0.89        10

    accuracy                           0.91        23
   macro avg       0.93      0.90      0.91        23
weighted avg       0.92      0.91      0.91        23



The Confusion Matrix for Logistic Regression Model: 
 [[13  0]
 [ 2  8]]


## 17. Performance Matrix of Decision Tree Model

In [25]:
# Evaluate the second fitted estimator (decision tree) on the test split.
cm_tree, Acc_tree, Report_tree = Perf_matrix(1)

tree_results = (
    ("The Accuracy of unkown dataset for Decision Tree Model: ", Acc_tree),
    ("The Classification Report for Decision Tree Model: \n", Report_tree),
    ("The Confusion Matrix for Decision Tree Model: \n", cm_tree),
)
for position, (caption, value) in enumerate(tree_results):
    # Separate consecutive sections with the same blank-line spacer.
    if position:
        print('\n')
    print(caption, value)

The Accuracy of unkown dataset for Decision Tree Model:  1.0


The Classification Report for Decision Tree Model: 
               precision    recall  f1-score   support

         0.0       1.00      1.00      1.00        13
         1.0       1.00      1.00      1.00        10

    accuracy                           1.00        23
   macro avg       1.00      1.00      1.00        23
weighted avg       1.00      1.00      1.00        23



The Confusion Matrix for Decision Tree Model: 
 [[13  0]
 [ 0 10]]


## 18. Performance Matrix of Random Forest Model

In [26]:
# Evaluate the third fitted estimator (random forest) on the test split.
cm_forest, Acc_forest, Report_forest = Perf_matrix(2)

forest_results = (
    ("The Accuracy of unkown dataset for Random Forest Model: ", Acc_forest),
    ("The Classification Report for Random Forest Model: \n", Report_forest),
    ("The Confusion Matrix for Random Forest Model: \n", cm_forest),
)
for position, (caption, value) in enumerate(forest_results):
    # Separate consecutive sections with the same blank-line spacer.
    if position:
        print('\n')
    print(caption, value)

The Accuracy of unkown dataset for Random Forest Model:  0.9130434782608695


The Classification Report for Random Forest Model: 
               precision    recall  f1-score   support

         0.0       0.87      1.00      0.93        13
         1.0       1.00      0.80      0.89        10

    accuracy                           0.91        23
   macro avg       0.93      0.90      0.91        23
weighted avg       0.92      0.91      0.91        23



The Confusion Matrix for Random Forest Model: 
 [[13  0]
 [ 2  8]]
