### Anomaly Detection - Paytm Data

In [None]:
#Importing the Required Libraries
import pandas as pd 
import numpy as np
from collections import Counter

from sklearn.preprocessing import scale 

#Applying the SMOTE
import imblearn
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

#Models
from sklearn.model_selection import train_test_split
from sklearn.tree import  DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier


from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
#Martix library
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.metrics import recall_score

In [None]:
#Loading The Dataset
df = pd.read_excel("/content/AD_data.xlsx")
df.head()

Unnamed: 0,id,registration_time,cnt1,cnt2,cnt3,cnt4,cnt5,cnt6,cnt7,cnt8,...,txn_amount_business_5,txn_cnt_business_6,txn_amount_business_6,txn_cnt_business_7,txn_amount_business_7,txn_cnt_business_8,txn_amount_business_8,txn_cnt_business_9,txn_amount_business_9,isFraud
0,26854.0,2021-12-01 04:35:08,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,...,0.0,1.0,8142.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,28389.0,2021-12-01 09:28:21,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,...,0.0,1.0,200.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,34757.0,2021-12-01 14:08:47,1.0,1.0,1.0,2.0,1.0,1.0,0.0,0.0,...,0.0,2.0,90000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,21694.0,2021-12-01 22:59:25,8.0,8.0,1.0,10.0,1.0,1.0,0.0,0.0,...,0.0,1.0,2254.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,33145.0,2021-12-02 10:59:26,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
#Checking the Shape
df.shape

(20784, 40)

In [None]:
#Checking for NA Value
df.isnull().sum()

id                       0
registration_time        0
cnt1                     0
cnt2                     0
cnt3                     0
cnt4                     0
cnt5                     0
cnt6                     0
cnt7                     0
cnt8                     0
cnt9                     0
cnt10                    0
cnt11                    0
cnt12                    0
cnt13                    0
txn_amount1              0
txn_amount2              0
txn_amount3              0
txn_amount4              0
txn_amount5              0
txn_amount6              0
txn_cnt_business_1       0
txn_amount_business_1    0
txn_cnt_business_2       0
txn_amount_business_2    0
txn_cnt_business_3       0
txn_amount_business_3    0
txn_cnt_business_4       0
txn_amount_business_4    0
txn_cnt_business_5       0
txn_amount_business_5    0
txn_cnt_business_6       0
txn_amount_business_6    0
txn_cnt_business_7       0
txn_amount_business_7    0
txn_cnt_business_8       0
txn_amount_business_8    0
t

In [None]:
#Checking Count of Distinct Value of Y Variable
df['isFraud'].value_counts()

0.0    20551
1.0      233
Name: isFraud, dtype: int64

In [None]:
#Drop features that are not required to build our model ('id', 'registration_time')
df1 = df.drop(['id', 'registration_time'],axis='columns')
df1.shape

(20784, 38)

In [None]:
#Separating the X & Y Variable
x = df1.iloc[:,:37]
y = df1['isFraud']
print(x.shape)
print(y.shape)

(20784, 37)
(20784,)


##### Implementing Model on the Imbalance Data (Baseline Model)

In [None]:
#Train Test Split 
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.20,random_state=7)

In [None]:
#With Default parameter
kfold = KFold(n_splits=5, random_state=10, shuffle=True)
model =  XGBClassifier()
scores = cross_val_score(model,x_train,y_train,scoring ='roc_auc', cv = kfold)
print('Mean ROC AUC: %.3f' % np.mean(scores))

Mean ROC AUC: 0.976


In [None]:
#Recall of Baseline Model
model.fit(x_train,y_train)
print(model.score(x_test,y_test))
print(recall_score(y_test,model.predict(x_test)))

0.9877315371662256
0.2903225806451613


#####Removing Scale

In [None]:
#Converting into numpy array -- as scale() requires array format
data = x_train.values
#Normalizing the numerical data  by using scale()
x_train_scaled  = scale(data)

data_test = x_test.values
x_test_scaled = scale(data_test)

#### SMOTE

In [None]:
#Initializing the SMOTE for Minority Class & undersampler for Majority Class
over = SMOTE(sampling_strategy=0.1)
under = RandomUnderSampler(sampling_strategy=0.5)
#Initializing the pipeline to Tranform
steps = [('o', over), ('u', under)]
pipeline = Pipeline(steps=steps)

# transform the dataset
X, Y = pipeline.fit_resample(x_train_scaled, y_train)
# summarize the new class distribution
counter = Counter(Y)
print(counter)

Counter({0.0: 3290, 1.0: 1645})


##### Find best Algorithms using GridSearchCV

In [None]:
#GridSearchCV

def find_best_model_using_gridsearchcv(x,y):
    algos = {
        'Decision Tree': {'Classifier': DecisionTreeClassifier(),'params': {'criterion': ['entropy'],'max_depth': [3,4,5]}
        },
        'Random Forest': {'Classifier': RandomForestClassifier(),'params':{'n_estimators':[100,125,150,175,200],'max_features': [4,6,8,10,20,25,35],'random_state':[7]}
        },
        'AdaBoost': {'Classifier': AdaBoostClassifier(),'params': {'n_estimators' : [10,15,20],'random_state':[7]}
        },
        'Gradient Boosting': {'Classifier': GradientBoostingClassifier(),'params': {'learning_rate':[0.1,0.01,0.2],'random_state':[7]}
        },
        'XGBM': {'Classifier': XGBClassifier(),'params': {'n_estimators' : [70,80,90,100],'max_depth': [3,4,5,7],
                                                          'learning_rate':[0.1,0.2],'random_state':[7]}
        },
        'SVM': {'Classifier': SVC(),'params': {'kernel':['rbf'],'gamma':[50,100],'C':[10,15,20]}
        },
        'KNN': {'Classifier': KNeighborsClassifier(),'params': {'n_neighbors':[5,10,15,20]}
        }
    }
    scores = []
    kfold = KFold(n_splits=5, random_state=10, shuffle=True)
    for algo_name, config in algos.items():
        gs =  GridSearchCV(config['Classifier'], config['params'], cv=kfold)
        gs.fit(x,y)
        scores.append({
            'Classifier': algo_name,
            'best_score': gs.best_score_,
            'best_params': gs.best_params_
        })

    return pd.DataFrame(scores,columns=['Classifier','best_score','best_params'])

find_best_model_using_gridsearchcv(X,Y)

Unnamed: 0,Classifier,best_score,best_params
0,Decision Tree,0.946505,"{'criterion': 'entropy', 'max_depth': 5}"
1,Random Forest,0.977305,"{'max_features': 6, 'n_estimators': 100, 'rand..."
2,AdaBoost,0.941641,"{'n_estimators': 20, 'random_state': 7}"
3,Gradient Boosting,0.974063,"{'learning_rate': 0.2, 'random_state': 7}"
4,XGBM,0.978318,"{'learning_rate': 0.2, 'max_depth': 3, 'n_esti..."
5,SVM,0.780952,"{'C': 10, 'gamma': 50, 'kernel': 'rbf'}"
6,KNN,0.956839,{'n_neighbors': 5}


#### Model Building - On Train & Test Data

In [None]:
#Train Test Split 
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.20,random_state=7)

In [None]:
#SVM Model
svm = SVC(kernel = 'rbf',gamma = 50, C = 10)
svm.fit(X_train,Y_train)
svm.score(X_test, Y_test)*100

77.9128672745694

In [None]:
#XGBM Model
xgbm = XGBClassifier(n_estimators = 100, max_depth = 3,learning_rate = 0.2,random_state = 7)
xgbm.fit(X_train,Y_train)
xgbm.score(X_test, Y_test)*100

97.56838905775076

In [None]:
#Random Forest Model
rfc = RandomForestClassifier(n_estimators=100, max_features=6,random_state=7)
rfc.fit(X_train,Y_train)
rfc.score(X_test, Y_test)*100

97.36575481256332

In [None]:
#KNN Model
knn = KNeighborsClassifier(n_neighbors = 5)
knn.fit(X_train,Y_train)
knn.score(X_test, Y_test)*100

95.13677811550151

#### Model (XGBM) (Train Accuarcy)

In [None]:
#XGBM Classifier and fit the model (On Smote Data)
classifier = XGBClassifier(n_estimators = 100, max_depth = 3,learning_rate = 0.2,random_state = 7)
classifier.fit(X,Y)
#Predict for X dataset
y_pred = classifier.predict(X)

# Confusion Matrix for the model accuracy
confusion_matrix = confusion_matrix(Y,y_pred)
print (confusion_matrix)

[[3265   25]
 [  29 1616]]


In [None]:
#Calculating Accuracy 
#TP+TN / TP + FP + FN + TN
((3265+1616)/(3265+25+29+1616))*100

98.90577507598785

In [None]:
#Cross Validation on Entire Data (Smote Data)
kfold = KFold(n_splits=5, random_state=10, shuffle=True)
scores = cross_val_score(classifier,X,Y,scoring ='roc_auc', cv = kfold)
print('Mean ROC AUC: %.3f' % np.mean(scores))

Mean ROC AUC: 0.996


In [None]:
#Recall Score on Smote Data of Final Model
recall_score(Y,xgbm.predict(X))

0.982370820668693

##### Inference: Our Final Model Accuracy is 98.90%

#####  Model Test Accuarcy on (Imbalanced Data)

In [None]:
#Test of Imbalanced Data on Smote Trained Final Model (Imbalanced Data)
classifier.score(x_test_scaled,y_test)*100

88.78999278325715

In [None]:
#Cross Validation on Test Data by the Final Model
kfold = KFold(n_splits=5, random_state=10, shuffle=True)
scores = cross_val_score(classifier,x_test_scaled,y_test,scoring ='roc_auc', cv = kfold)
print('Mean ROC AUC: %.3f' % np.mean(scores))

Mean ROC AUC: 0.928


In [None]:
#Recall Score on Test Data (Imbalanced Data) of Smote Trained Final Model
recall_score(y_test,classifier.predict(x_test_scaled))

0.8548387096774194

#### Inference: So XGBM Gave the Best Test Accuracy of 97.56 % On the Smote Data Lets test now this model on Imbalanced Data 

In [None]:
#Test on Imbalanced Data of Smote Trained Model (Imbalanced Data)
xgbm.score(x_test_scaled,y_test)*100

90.8587923983642

In [None]:
#Cross Validation on Test Data  (Imbalanced Data)
kfold = KFold(n_splits=5, random_state=10, shuffle=True)
scores = cross_val_score(xgbm,x_test_scaled,y_test,scoring ='roc_auc', cv = kfold)
print('Mean ROC AUC: %.3f' % np.mean(scores))

Mean ROC AUC: 0.928


In [None]:
#Recall Score on Test Data (Imbalanced Data) of Smote Trained Model
recall_score(y_test,xgbm.predict(x_test_scaled))

0.8548387096774194

##### Saving the Trained Model

In [None]:
# .pickle file
import pickle
with open('detections_model.pickle','wb') as f:
    pickle.dump(classifier,f)

Summary:

**BaseLine Model (Imbalanced Data),**

Accuracy = 98.7%
ROC = 97.6%,
Recall = 29.05% .


**Xgbm Model (Built On Entire Smote Data) Training Result,**

Accuracy = 98.90%,
ROC = 99.6%,
Recall = 98.2% .


**Xgbm Model (Built On Entire Smote Data) Test Result on Imbalanced Data,**

Accuracy = 88.78%,
ROC = 92.8%,
Recall = 85.4% .


**Xgbm Model (Built on Smote Data)(FINAL) Test Result on Imbalanced Data,**

Accuracy = 90.85%,
ROC = 92.8%,
Recall = 85.4% .