In [1]:
import csv
import json
import re
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression
from datetime import datetime
from preprocessing_func import process_corpus

[nltk_data] Downloading package stopwords to /Users/james/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/james/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /Users/james/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/james/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [2]:
#Load the corpus and do the preprocessing
print("Loading Corpus and do the preprocessing")
Corpus=pd.read_json('./data/corpus.json')
Corpus=process_corpus(Corpus)
print("Successfully Loaded the corpus")

Loading Corpus and do the preprocessing
Successfully Loaded the corpus


In [3]:
#obtain the training and testing dataset
Train_X, Test_X, Train_Y, Test_Y=model_selection.train_test_split(Corpus['Tokenized_text'],Corpus['isFraud'],test_size=0.3)

#Encoding the dataset
Encoder=LabelEncoder()
Train_Y=Encoder.fit_transform(Train_Y)
Test_Y=Encoder.fit_transform(Test_Y)

In [4]:
#Word Vectorization using TF-IDF
BOW_vect=CountVectorizer(max_features=5000)
BOW_vect.fit(Corpus['Tokenized_text'])

Train_X_BOW=BOW_vect.transform(Train_X)
Test_X_BOW=BOW_vect.transform(Test_X)

In [5]:
#Using Naive Bayes Classifier to predict the outcome
Naive=naive_bayes.MultinomialNB()
Naive.fit(Train_X_BOW,Train_Y)

predictions_NB=Naive.predict(Test_X_BOW)
print("Naive Bayes Accuracy Score -> ",accuracy_score(predictions_NB, Test_Y)*100)
print("Confusion matrix is")
print(confusion_matrix(predictions_NB, Test_Y))
print("classification report is")
print(classification_report(predictions_NB, Test_Y))

Naive Bayes Accuracy Score ->  84.92462311557789
Confusion matrix is
[[264  39]
 [ 51 243]]
classification report is
              precision    recall  f1-score   support

           0       0.84      0.87      0.85       303
           1       0.86      0.83      0.84       294

    accuracy                           0.85       597
   macro avg       0.85      0.85      0.85       597
weighted avg       0.85      0.85      0.85       597



In [6]:
#Using SVM to predict the outcome
SVM=svm.SVC(C=1.0,kernel='linear',degree=3,gamma='auto')
SVM.fit(Train_X_BOW,Train_Y)

predictions_SVM=SVM.predict(Test_X_BOW)
print("SVM Accuracy Score -> ",accuracy_score(predictions_SVM, Test_Y)*100)
print("Confusion matrix is")
print(confusion_matrix(predictions_SVM, Test_Y))
print("classification report is")
print(classification_report(predictions_SVM, Test_Y))

SVM Accuracy Score ->  89.7822445561139
Confusion matrix is
[[274  20]
 [ 41 262]]
classification report is
              precision    recall  f1-score   support

           0       0.87      0.93      0.90       294
           1       0.93      0.86      0.90       303

    accuracy                           0.90       597
   macro avg       0.90      0.90      0.90       597
weighted avg       0.90      0.90      0.90       597



In [7]:
LR=LogisticRegression(C=100, random_state=0, max_iter=1000)
LR.fit(Train_X_BOW,Train_Y)
predictions_LR=LR.predict(Test_X_BOW)
print("LR Accuracy Score -> ",accuracy_score(predictions_LR, Test_Y)*100)
print("Confusion matrix is")
print(confusion_matrix(predictions_LR, Test_Y))
print("classification report is")
print(classification_report(predictions_LR, Test_Y))

LR Accuracy Score ->  92.12730318257957
Confusion matrix is
[[283  15]
 [ 32 267]]
classification report is
              precision    recall  f1-score   support

           0       0.90      0.95      0.92       298
           1       0.95      0.89      0.92       299

    accuracy                           0.92       597
   macro avg       0.92      0.92      0.92       597
weighted avg       0.92      0.92      0.92       597

