### In this notebook we will leverage Watson Studio's Python environment to build out a fraud detection model programmatically. As can be seen in the diagram below, this is the approach that is most reliant on data science/coding skills as compared to SPSS Modeler and AutoAI.
#### Notebook authored by Elliott Botwick - elliott.botwick@ibm.com

![image.png](attachment:image.png)

# Importing data with built in functionality!

In [1]:
from IPython.display import IFrame    

IFrame("https://giphy.com/embed/S9i7ABVxaXsYcgkT6d" ,width="750", height="437")

In [2]:
# The code was removed by Watson Studio for sharing.

Unnamed: 0,Amount,Cardholder_Country,CatMcc,Fraud_Flag,Int_Amount_Log,LastRespCode,LastTrxAmount,Merchant_Country_Code,MostOccMCC,NumFirstTimeCards20min,...,RespCode51in12h,Response_Code,Shopping,SP_Timestamp,SumHighAmount24h,SumTrx24h,TimeLastATM,Trx_Timestamp,Large Purchase,Cardholder_Age_BIN
0,100.0,Germany,Cash,1,4.61,0,200.0,ES,6011,2,...,0,0,0,2/28/19 11:53,0,700,0.0,7/31/14 15:47,T,1
1,2200.0,Hungary,Shops,1,7.7,0,94.5,IT,5812,0,...,0,0,1,2/28/19 11:52,0,994,12.7,7/31/14 14:35,T,3
2,300.0,Germany,Cash,1,5.7,0,0.0,ES,0,0,...,0,0,0,2/28/19 11:53,0,0,12.7,7/31/14 15:42,T,5
3,390.0,Chech Republ,Shops,1,5.97,0,800.0,IT,5511,0,...,0,0,0,2/28/19 11:53,4050,4050,12.7,7/31/14 15:52,T,4
4,300.0,Germany,Cash,1,5.7,0,0.0,ES,0,1,...,0,0,0,2/28/19 11:53,0,0,12.7,7/31/14 15:44,T,1


In [3]:
#understand data types
data.dtypes

Amount                    float64
Cardholder_Country         object
CatMcc                     object
Fraud_Flag                  int64
Int_Amount_Log            float64
LastRespCode                int64
LastTrxAmount             float64
Merchant_Country_Code      object
MostOccMCC                  int64
NumFirstTimeCards20min      int64
NumHighAmount24h            int64
NumSameTerminal30d          int64
NumTrx24h                   int64
NumTrx5Min                  int64
NumTrxUS4d                  int64
POS_Entry_Mode              int64
PrimaryInstanceId           int64
PrimaryUrid                 int64
ProfFreqLast4d              int64
ProfFreqYesterday           int64
ProfTotalLast4d             int64
ProfTotalYesterday          int64
RespCode3h                  int64
RespCode51in12h             int64
Response_Code               int64
Shopping                    int64
SP_Timestamp               object
SumHighAmount24h            int64
SumTrx24h                   int64
TimeLastATM   

In [4]:
#understand data better utilizing pandas profiling package
!pip install pandas-profiling
import pandas as pd
import pandas_profiling

data.profile_report(style={'full_width':True})

Collecting pandas-profiling
[?25l  Downloading https://files.pythonhosted.org/packages/2c/2f/aae19e2173c10a9bb7fee5f5cad35dbe53a393960fc91abc477dcc4661e8/pandas-profiling-2.3.0.tar.gz (127kB)
[K     |████████████████████████████████| 133kB 18.5MB/s eta 0:00:01
Collecting missingno>=0.4.2 (from pandas-profiling)
  Downloading https://files.pythonhosted.org/packages/2b/de/6e4dd6d720c49939544352155dc06a08c9f7e4271aa631a559dfbeaaf9d4/missingno-0.4.2-py3-none-any.whl
Collecting htmlmin>=0.1.12 (from pandas-profiling)
  Downloading https://files.pythonhosted.org/packages/b3/e7/fcd59e12169de19f0131ff2812077f964c6b960e7c09804d30a7bf2ab461/htmlmin-0.1.12.tar.gz
Collecting phik>=0.9.8 (from pandas-profiling)
[?25l  Downloading https://files.pythonhosted.org/packages/45/ad/24a16fa4ba612fb96a3c4bb115a5b9741483f53b66d3d3afd987f20fa227/phik-0.9.8-py3-none-any.whl (606kB)
[K     |████████████████████████████████| 614kB 25.0MB/s eta 0:00:01
[?25hCollecting confuse>=1.0.0 (from pandas-profiling)
 



In [5]:
#identify categorical variables and print # of unique categories for each
categoricalvars = data.dtypes[data.dtypes==object]

cat_feats = list(categoricalvars.index)
for cat in cat_feats:
    print(cat, " - Unique Categories: ", data[cat].value_counts().shape[0])

Cardholder_Country  - Unique Categories:  13
CatMcc  - Unique Categories:  8
Merchant_Country_Code  - Unique Categories:  99
SP_Timestamp  - Unique Categories:  32
Trx_Timestamp  - Unique Categories:  1228
Large_Purchase  - Unique Categories:  2


In [6]:
#drop categorical features with > 25 unique classes
for cat in cat_feats:
    if(data[cat].value_counts().shape[0]) > 25:
        print ("Removing ", cat)
        cat_feats.remove(cat)

for cat in cat_feats:
    if(data[cat].value_counts().shape[0]) > 25:
        print ("Removing ", cat)
        cat_feats.remove(cat)
        
cat_feats

Removing  Merchant_Country_Code
Removing  Trx_Timestamp
Removing  SP_Timestamp


['Cardholder_Country', 'CatMcc', 'Large_Purchase']

In [7]:
#create dummy variables for the categorial features with < 25 unique classes
datawdummies =  pd.get_dummies(data,columns=cat_feats,drop_first=True)
datawdummies = datawdummies.drop(['PrimaryInstanceId'], axis = 1)

In [8]:
#create a correlation matrix of the data and show all features with at least a .10 correlation with Fraud Flag
datacorrs = datawdummies.corr()
datacorrs['Fraud_Flag'].loc[abs(datacorrs['Fraud_Flag'])>0.1].sort_values(ascending=False)

Fraud_Flag            1.000000
Shopping              0.331049
SumTrx24h             0.329857
NumHighAmount24h      0.324559
ProfTotalLast4d       0.299923
NumTrx24h             0.280348
SumHighAmount24h      0.273643
LastTrxAmount         0.211968
Int_Amount_Log        0.203679
Amount                0.202969
ProfFreqLast4d        0.166711
Large_Purchase_T      0.140047
ProfTotalYesterday    0.138935
MostOccMCC            0.108071
Name: Fraud_Flag, dtype: float64

In [9]:
#Take list of factors with at least a .1 correlation and create new df with just those features
final_factors = list(datacorrs['Fraud_Flag'].loc[abs(datacorrs['Fraud_Flag'])>0.1].sort_values(ascending=False).index)
final_data = datawdummies[final_factors]

In [10]:
final_data = final_data.rename(columns = {'Large Purchase_T': 'LargePurchaseT'})
final_data.head()

Unnamed: 0,Fraud_Flag,Shopping,SumTrx24h,NumHighAmount24h,ProfTotalLast4d,NumTrx24h,SumHighAmount24h,LastTrxAmount,Int_Amount_Log,Amount,ProfFreqLast4d,Large_Purchase_T,ProfTotalYesterday,MostOccMCC
0,1,0,700,0,800,4,0,200.0,4.61,100.0,5,1,0,6011
1,1,1,994,0,3194,4,0,94.5,7.7,2200.0,5,1,659,5812
2,1,0,0,0,300,0,0,0.0,5.7,300.0,1,1,0,0
3,1,0,4050,5,4440,5,4050,800.0,5.97,390.0,6,1,900,5511
4,1,0,0,0,300,0,0,0.0,5.7,300.0,1,1,0,0


In [11]:
#Partition data

X = final_data.drop('Fraud_Flag', axis = 1)
Y = final_data['Fraud_Flag']

from sklearn.model_selection import train_test_split 
xtrain, xtest, ytrain, ytest = train_test_split(X,Y, test_size = .25)

In [12]:
#import library and create decision tree classifier object
from sklearn.tree import DecisionTreeClassifier

dtc = DecisionTreeClassifier()

In [13]:
#train decision tree model
dtc.fit(xtrain, ytrain)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [14]:
#score model
yhat = dtc.predict(xtest)

In [15]:
#evaluate model
from sklearn.metrics import classification_report,confusion_matrix
print(confusion_matrix(ytest,yhat))

[[2079  153]
 [ 157  111]]


In [16]:
print(classification_report(ytest,yhat))

              precision    recall  f1-score   support

           0       0.93      0.93      0.93      2232
           1       0.42      0.41      0.42       268

   micro avg       0.88      0.88      0.88      2500
   macro avg       0.68      0.67      0.67      2500
weighted avg       0.88      0.88      0.88      2500



#### .38 precision and .43 recall not great... Let's try a random forest

In [17]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier()
rfc.fit(xtrain,ytrain)
rfc_yhat= rfc.predict(xtest)

In [18]:
confusion_matrix(ytest,rfc_yhat)

array([[2197,   35],
       [ 173,   95]])

In [19]:
print(classification_report(ytest,rfc_yhat))

              precision    recall  f1-score   support

           0       0.93      0.98      0.95      2232
           1       0.73      0.35      0.48       268

   micro avg       0.92      0.92      0.92      2500
   macro avg       0.83      0.67      0.72      2500
weighted avg       0.91      0.92      0.90      2500



In [20]:
#Random forest is better - can we improve more with an xgboost?
#!pip install xgboost
from xgboost import XGBClassifier

xgbmodel = XGBClassifier(booster='gbtree')
xgbmodel.fit(xtrain, ytrain)
xgb_yhat = xgbmodel.predict(xtest)

In [21]:
confusion_matrix(ytest,xgb_yhat)

array([[2208,   24],
       [ 174,   94]])

In [22]:
print(classification_report(ytest,xgb_yhat))

              precision    recall  f1-score   support

           0       0.93      0.99      0.96      2232
           1       0.80      0.35      0.49       268

   micro avg       0.92      0.92      0.92      2500
   macro avg       0.86      0.67      0.72      2500
weighted avg       0.91      0.92      0.91      2500



In [23]:
#Xgboost does outperform the decision tree and random forest models - now lets apply some HPO to the xgboost to see if we can further increase the performance
#First build a pipeline
from sklearn.pipeline import Pipeline
pipe = Pipeline([('xgbclass', XGBClassifier())])

In [24]:
#Use grid search to optimize hyperparamaters in xgb classifier
from sklearn.model_selection import GridSearchCV

params = {
        'xgbclass__min_child_weight': [1, 5, 10],
        'xgbclass__subsample': [0.6, 0.8, 1.0],
        'xgbclass__max_depth': [3, 4, 5],
        "xgbclass__learning_rate": [0.1, 0.5, 1],
        "xgbclass__n_estimators": [10, 50, 100, 500],
        }

fit_params = {"xgbclass__eval_set": [(xtest, ytest)], 
              "xgbclass__early_stopping_rounds": 10, 
              "xgbclass__verbose": False} 

searchCV = GridSearchCV(pipe, cv=5, param_grid=params, fit_params=fit_params)
searchCV.fit(xtrain, ytrain)  



GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('xgbclass', XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1))]),
       fit_params={'xgbclass__eval_set': [(      Shopping  SumTrx24h  NumHighAmount24h  ProfTotalLast4d  NumTrx24h  \
7782         0          0                 0               81          0
3023         0        188                 0             2271          2
5717         0          0              ...ag, Length: 2500, dtype: int64)], 'xgbclass__early_stopping_rounds': 10, 'xgbclass__verbose': False},
       iid='warn', n_jobs=None,
       param_grid={'xgbclass__min_child_weight': [1, 5

In [25]:
#describe the 'best estimator"from the grid search
bestxgbcv = searchCV.best_estimator_
bestxgbcv

Pipeline(memory=None,
     steps=[('xgbclass', XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.5, max_delta_step=0,
       max_depth=5, min_child_weight=1, missing=None, n_estimators=50,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=0.8))])

In [26]:
#use the best estimator to create a new and *hopefully* improved xgb classifier
bestxgbcv.fit(xtrain,ytrain)
gridscores = bestxgbcv.predict(xtest)

In [27]:
#evaluate
confusion_matrix(ytest, gridscores)

array([[2195,   37],
       [ 159,  109]])

In [28]:
print ("best estimator grid")
print(classification_report(ytest, gridscores))

best estimator grid
              precision    recall  f1-score   support

           0       0.93      0.98      0.96      2232
           1       0.75      0.41      0.53       268

   micro avg       0.92      0.92      0.92      2500
   macro avg       0.84      0.70      0.74      2500
weighted avg       0.91      0.92      0.91      2500



In [29]:
from sklearn.metrics import f1_score
modelf1 = f1_score(ytest, gridscores)
modelf1

0.5265700483091788

In [None]:
#deploy model into watson machine learning repo

from watson_machine_learning_client import WatsonMachineLearningAPIClient


wml_creds = {
  "apikey": "###",
  "instance_id": "###",
  "password": "###",
  "url": "https://us-south.ml.cloud.ibm.com",
  "username": "###"
}
client = WatsonMachineLearningAPIClient( wml_creds )

In [None]:
model_details = client.repository.store_model(bestxgbcv, 'PyFraudModelxgbv3', training_data=xtrain, training_target = ytrain, pipeline = pipe)

In [None]:
model_details

### Authored by Elliott Botwick - elliott.botwick@ibm.com