In [2]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import RobustScaler, StandardScaler, MinMaxScaler
from sklearn.compose import ColumnTransformer
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from catboost import CatBoostClassifier

from sklearn.metrics import classification_report, confusion_matrix

In [3]:
import os
# The data folder is expected to sit beside the folder the notebook is launched from:
# <parent of current working directory>/data
parent_dir = os.path.dirname(os.getcwd())
data_path = os.path.join(parent_dir, 'data')

In [4]:
# Read the training transactions and discard the first CSV column
# (presumably a saved row index -- verify against the raw file).
train_csv = os.path.join(data_path, "fraudTrain.csv")
fraudtrain = pd.read_csv(train_csv).iloc[:, 1:]

fraudtrain.head()

Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,city,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,Moravian Falls,...,36.0788,-81.1781,3495,"Psychologist, counselling",1988-03-09,0b242abb623afc578575680df30655b9,1325376018,36.011293,-82.048315,0
1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,Orient,...,48.8878,-118.2105,149,Special educational needs teacher,1978-06-21,1f76529f8574734946361c461b024d99,1325376044,49.159047,-118.186462,0
2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,Malad City,...,42.1808,-112.262,4154,Nature conservation officer,1962-01-19,a1a22d70485983eac12b5b88dad1cf95,1325376051,43.150704,-112.154481,0
3,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,9443 Cynthia Court Apt. 038,Boulder,...,46.2306,-112.1138,1939,Patent attorney,1967-01-12,6b849c168bdad6f867558c3793159a81,1325376076,47.034331,-112.561071,0
4,2019-01-01 00:03:06,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,Doe Hill,...,38.4207,-79.4629,99,Dance movement psychotherapist,1986-03-28,a41d7549acf90789359a9aa5346dcb46,1325376186,38.674999,-78.632459,0


In [5]:
# Sanity-check the dimensions (rows, columns) of the loaded training frame.
fraudtrain.shape

(1296675, 22)

In [6]:
# List every column with its dtype and non-null count to spot missing values.
fraudtrain.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1296675 entries, 0 to 1296674
Data columns (total 22 columns):
 #   Column                 Non-Null Count    Dtype  
---  ------                 --------------    -----  
 0   trans_date_trans_time  1296675 non-null  object 
 1   cc_num                 1296675 non-null  int64  
 2   merchant               1296675 non-null  object 
 3   category               1296675 non-null  object 
 4   amt                    1296675 non-null  float64
 5   first                  1296675 non-null  object 
 6   last                   1296675 non-null  object 
 7   gender                 1296675 non-null  object 
 8   street                 1296675 non-null  object 
 9   city                   1296675 non-null  object 
 10  state                  1296675 non-null  object 
 11  zip                    1296675 non-null  int64  
 12  lat                    1296675 non-null  float64
 13  long                   1296675 non-null  float64
 14  city_pop          

In [7]:
# Keep only the columns that look potentially useful for a first model, plus the label.
selectedcolumns = ['category','amt','gender','job','merchant','is_fraud']
# .copy() makes the subset an independent frame, so later modifications of
# `fraudtrain_reduced` can neither touch `fraudtrain` nor raise SettingWithCopyWarning.
fraudtrain_reduced = fraudtrain[selectedcolumns].copy()

fraudtrain_reduced.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1296675 entries, 0 to 1296674
Data columns (total 6 columns):
 #   Column    Non-Null Count    Dtype  
---  ------    --------------    -----  
 0   category  1296675 non-null  object 
 1   amt       1296675 non-null  float64
 2   gender    1296675 non-null  object 
 3   job       1296675 non-null  object 
 4   merchant  1296675 non-null  object 
 5   is_fraud  1296675 non-null  int64  
dtypes: float64(1), int64(1), object(4)
memory usage: 59.4+ MB


I am curious about the breakdown of some of the categorical features, especially gender and the merchant categories. I believe the distributions of some of these categorical fields are highly skewed.

In [8]:
# Row counts per level for the low-cardinality fields, including the label itself.
for field in ('category', 'is_fraud', 'gender'):
    print(fraudtrain_reduced.groupby([field]).size())

category
entertainment      94014
food_dining        91461
gas_transport     131659
grocery_net        45452
grocery_pos       123638
health_fitness     85879
home              123115
kids_pets         113035
misc_net           63287
misc_pos           79655
personal_care      90758
shopping_net       97543
shopping_pos      116672
travel             40507
dtype: int64
is_fraud
0    1289169
1       7506
dtype: int64
gender
F    709863
M    586812
dtype: int64


In [9]:
# Separate the label vector from the feature matrix.
y = fraudtrain_reduced['is_fraud']
X = fraudtrain_reduced.drop(columns='is_fraud')

for part in (X, y):
    print(part.shape)

(1296675, 5)
(1296675,)


In [10]:
# Partition the feature columns by dtype: numeric columns and object (string) columns.
num_columns = X.select_dtypes(include=['float64','int64']).columns
cat_columns = X.select_dtypes(include=['object']).columns

In [12]:
# Show which columns landed in the numeric and the categorical group.
print(num_columns, cat_columns, sep='\n')

Index(['amt'], dtype='object')
Index(['category', 'gender', 'job', 'merchant'], dtype='object')


### creating the pre processing pipelines - these are important!

In [13]:
# Categorical columns are one-hot encoded; categories unseen during fit are encoded as all zeros
# (handle_unknown='ignore') instead of raising at transform time.
# NOTE(review): `sparse=False` yields a dense matrix (1204 columns after fitting, per the shapes
# printed further down) -- memory heavy. The keyword was renamed `sparse_output` in
# scikit-learn 1.2; confirm the installed version still accepts `sparse`.
cat_transformer = make_pipeline(OneHotEncoder(handle_unknown='ignore',sparse=False))
# Numeric columns are rescaled with MinMaxScaler.
num_transformer = make_pipeline(MinMaxScaler())

# Route each column group to its transformer; any column not listed is passed through unchanged.
preprocessor = make_pipeline(ColumnTransformer([('num_transformer',num_transformer,num_columns),('cat_transformer',cat_transformer,cat_columns)],
                                               remainder='passthrough'))

In [14]:

# Hold out 30% of the rows for evaluation; the seed is fixed so the split is repeatable.
# NOTE(review): the split is not stratified on y even though fraud is rare -- consider `stratify=y`.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=123, test_size=0.3
)
for split in (X_train, X_test, y_train, y_test):
    print(split.shape)

(907672, 5)
(389003, 5)
(907672,)
(389003,)


In [15]:
# Fit the preprocessor on the training split only, then apply the fitted transform to the test split.
# NOTE(review): X_train / X_test are rebound from DataFrames to the encoded output, so re-running
# this cell without first re-running the split feeds already-encoded data back into the preprocessor.
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

print(X_train.shape)
print(X_test.shape)

(907672, 1204)
(389003, 1204)


In [17]:
from sklearn.linear_model import LogisticRegression
# Baseline: an XGBoost classifier with default hyperparameters (LogisticRegression is imported
# above but not used in this cell). The seed is fixed and all CPU cores are used.

XGBmodel = XGBClassifier(random_state=123, n_jobs=-1)
XGBmodel.fit(X_train, y_train)

XGB_pred = XGBmodel.predict(X_test)

In [16]:
## Reload the previously trained XGB model in case the kernel crashes, so it need not be refitted.
import pickle

# Resolve the file relative to the project root (same convention as `data_path`)
# instead of a machine-specific absolute path.
model_path = os.path.join(os.path.dirname(os.getcwd()), 'objects', 'XGB_classifier.pkl')

# SECURITY: pickle.load can execute arbitrary code -- only load files you created yourself.
# The context manager guarantees the file handle is closed (the handle was previously leaked).
with open(model_path, 'rb') as model_in:
    XGBmodel = pickle.load(model_in)

XGB_pred = XGBmodel.predict(X_test)

In [15]:

# Confusion matrix of true labels (rows) against the XGBoost predictions (columns).
cf_matrix = confusion_matrix(y_test, XGB_pred)
print(cf_matrix)

[[386404    346]
 [   777   1476]]


In [17]:
# Recall for the fraud class, read from the confusion matrix instead of retyping its counts,
# so the figure stays correct when the model or the split changes.
# scikit-learn lays a binary confusion matrix out as [[tn, fp], [fn, tp]].
tn, fp, fn, tp = cf_matrix.ravel()

tp / (tp + fn)


0.6551264980026631

In [18]:
# Per-class precision / recall / F1 for the default predictions.
print(classification_report(y_test,XGB_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00    386750
           1       0.81      0.66      0.72      2253

    accuracy                           1.00    389003
   macro avg       0.90      0.83      0.86    389003
weighted avg       1.00      1.00      1.00    389003



The recall metric (tp / (tp + fn)) is probably the more relevant one here, because a *false negative* (a fraud that goes undetected) is likely to cost the organisation more than a *false positive* (a legitimate transaction that gets flagged).

In [19]:
# Class probabilities for the test rows, so a custom decision threshold can be applied.
XGBproba = XGBmodel.predict_proba(X_test)

In [20]:
# Flag a transaction as fraud when its class-1 probability (column 1) exceeds 0.4,
# then re-evaluate precision / recall at that threshold.
XGBproba_boolean = (XGBproba[:,1] > 0.4)
print(classification_report(y_test,XGBproba_boolean))


              precision    recall  f1-score   support

           0       1.00      1.00      1.00    386750
           1       0.76      0.73      0.74      2253

    accuracy                           1.00    389003
   macro avg       0.88      0.86      0.87    389003
weighted avg       1.00      1.00      1.00    389003



Technically, I feel more comfortable with this threshold, as the precision and recall for the fraud class are now more balanced.

In [33]:
import pickle

# Persist the fitted model to the current working directory. The context manager closes the
# file even if pickling fails part-way (the explicit close() was skipped on error).
# NOTE(review): the reload cell reads this file from the project's `objects/` folder -- confirm
# the file is moved there after saving.
with open('XGB_classifier.pkl','wb') as model_out:
    pickle.dump(XGBmodel, model_out)

In [35]:
# Persist the fitted preprocessing pipeline alongside the model, so new records can be encoded
# exactly as the training data was. The context manager closes the file even if pickling fails.
with open('preprocessor.pkl', 'wb') as preprocessor_out:
    pickle.dump(preprocessor, preprocessor_out)

In [37]:
def create_preprocessor(input_df: pd.DataFrame):
    """Build the unfitted preprocessing pipeline for the fraud model.

    The numeric column ``amt`` is min-max scaled and the categorical columns
    ``category``, ``gender``, ``job`` and ``merchant`` are one-hot encoded
    (categories unseen at fit time are encoded as all zeros). Any other column
    is passed through unchanged.

    Parameters
    ----------
    input_df : pd.DataFrame
        Currently unused; the column names are hard-coded below. Kept so
        existing callers keep working.

    Returns
    -------
    sklearn.pipeline.Pipeline
        A one-step pipeline wrapping the ColumnTransformer. It is not fitted;
        call ``fit`` / ``fit_transform`` on training data before ``transform``.
    """
    # Must be a list: a list selector hands the transformer a 2-D frame, whereas a bare
    # string ('amt') selects a 1-D column, which MinMaxScaler rejects.
    num_columns = ['amt']
    cat_columns = ['category', 'gender', 'job', 'merchant']

    # NOTE(review): `sparse` was renamed `sparse_output` in scikit-learn 1.2 -- confirm
    # the installed version still accepts this keyword.
    cat_transformer = make_pipeline(OneHotEncoder(handle_unknown='ignore',sparse=False))
    num_transformer = make_pipeline(MinMaxScaler())

    column_transformer = ColumnTransformer(
        [('num_transformer', num_transformer, num_columns),
         ('cat_transformer', cat_transformer, cat_columns)],
        remainder='passthrough',
    )
    preprocessor = make_pipeline(column_transformer)

    return preprocessor
    
    
    

In [16]:
# Preview the raw (un-encoded) feature frame.
X.head()

Unnamed: 0,category,amt,gender,job,merchant
0,misc_net,4.97,F,"Psychologist, counselling","fraud_Rippin, Kub and Mann"
1,grocery_pos,107.23,F,Special educational needs teacher,"fraud_Heller, Gutmann and Zieme"
2,entertainment,220.11,M,Nature conservation officer,fraud_Lind-Buckridge
3,gas_transport,45.0,M,Patent attorney,"fraud_Kutch, Hermiston and Farrell"
4,misc_pos,41.96,M,Dance movement psychotherapist,fraud_Keeling-Crist


In [40]:
# Write the raw feature frame into the project's data folder (`data_path`) rather than to a
# machine-specific absolute path, so the notebook runs on other machines.
X.to_csv(os.path.join(data_path, 'X_data.csv'), index=False)

In [17]:
# Show only the fraudulent training transactions.
fraudtrain_reduced[fraudtrain_reduced['is_fraud']==1]

Unnamed: 0,category,amt,gender,job,merchant,is_fraud
2449,grocery_pos,281.06,M,Soil scientist,fraud_Rutherford-Mertz,1
2472,gas_transport,11.52,F,Horticultural consultant,"fraud_Jenkins, Hauck and Friesen",1
2523,grocery_pos,276.31,F,Horticultural consultant,fraud_Goodwin-Nitzsche,1
2546,gas_transport,7.03,M,Soil scientist,fraud_Erdman-Kertzmann,1
2553,grocery_pos,275.73,F,Horticultural consultant,fraud_Koepp-Parker,1
...,...,...,...,...,...,...
1295399,shopping_net,977.01,F,"Librarian, public",fraud_Kassulke PLC,1
1295491,shopping_net,1210.91,F,"Librarian, public",fraud_Schumm PLC,1
1295532,gas_transport,10.24,M,Herbalist,"fraud_Tillman, Dickinson and Labadie",1
1295666,gas_transport,21.69,F,Cytogeneticist,fraud_Corwin-Collins,1


In [18]:
# Simulate a single user-submitted record: encode the first row of X with the fitted
# preprocessor and check the encoded width.

input_df = X[:1]
X_processed = preprocessor.transform(input_df)

X_processed.shape





(1, 1204)

In [19]:
# Score the single encoded row with the trained model.
new_pred = XGBmodel.predict(X_processed)

new_pred

array([0])

Lets look at the actual test data

In [17]:
# Read the held-out test transactions, discarding the first CSV column as for the training file.
test_csv = os.path.join(data_path, "fraudTest.csv")
fraudtest = pd.read_csv(test_csv).iloc[:, 1:]

fraudtest.head()

Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,city,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,2020-06-21 12:14:25,2291163933867244,fraud_Kirlin and Sons,personal_care,2.86,Jeff,Elliott,M,351 Darlene Green,Columbia,...,33.9659,-80.9355,333497,Mechanical engineer,1968-03-19,2da90c7d74bd46a0caf3777415b3ebd3,1371816865,33.986391,-81.200714,0
1,2020-06-21 12:14:33,3573030041201292,fraud_Sporer-Keebler,personal_care,29.84,Joanne,Williams,F,3638 Marsh Union,Altonah,...,40.3207,-110.436,302,"Sales professional, IT",1990-01-17,324cc204407e99f51b0d6ca0055005e7,1371816873,39.450498,-109.960431,0
2,2020-06-21 12:14:53,3598215285024754,"fraud_Swaniawski, Nitzsche and Welch",health_fitness,41.28,Ashley,Lopez,F,9333 Valentine Point,Bellmore,...,40.6729,-73.5365,34496,"Librarian, public",1970-10-21,c81755dbbbea9d5c77f094348a7579be,1371816893,40.49581,-74.196111,0
3,2020-06-21 12:15:15,3591919803438423,fraud_Haley Group,misc_pos,60.05,Brian,Williams,M,32941 Krystal Mill Apt. 552,Titusville,...,28.5697,-80.8191,54767,Set designer,1987-07-25,2159175b9efe66dc301f149d3d5abf8c,1371816915,28.812398,-80.883061,0
4,2020-06-21 12:15:17,3526826139003047,fraud_Johnston-Casper,travel,3.19,Nathan,Massey,M,5783 Evan Roads Apt. 465,Falmouth,...,44.2529,-85.017,1126,Furniture designer,1955-07-06,57ff021bd3f328f8738bb535c302a31b,1371816917,44.959148,-85.884734,0


In [18]:
## Peek at the first few fraudulent transactions in the held-out test file.

fraudtest[fraudtest['is_fraud']==1].head()

Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,city,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
1685,2020-06-21 22:06:39,3560725013359375,fraud_Hamill-D'Amore,health_fitness,24.84,Brooke,Smith,F,63542 Luna Brook Apt. 012,Notrees,...,31.8599,-102.7413,23,Cytogeneticist,1969-09-15,16bf2e46c54369a8eab2214649506425,1371852399,32.575873,-102.60429,1
1767,2020-06-21 22:32:22,6564459919350820,"fraud_Rodriguez, Yost and Jenkins",misc_net,780.52,Douglas,Willis,M,619 Jeremy Garden Apt. 681,Benton,...,42.5545,-90.3508,1306,Public relations officer,1958-09-10,ab4b379d2c0c9c667d46508d4e126d72,1371853942,42.461127,-91.147148,1
1781,2020-06-21 22:37:27,6564459919350820,fraud_Nienow PLC,entertainment,620.33,Douglas,Willis,M,619 Jeremy Garden Apt. 681,Benton,...,42.5545,-90.3508,1306,Public relations officer,1958-09-10,47a9987ae81d99f7832a54b29a77bf4b,1371854247,42.771834,-90.158365,1
1784,2020-06-21 22:38:55,4005676619255478,"fraud_Heathcote, Yost and Kertzmann",shopping_net,1077.69,William,Perry,M,458 Phillips Island Apt. 768,Denham Springs,...,30.459,-90.9027,71335,Herbalist,1994-05-31,fe956c7e4a253c437c18918bf96f7b62,1371854335,31.204974,-90.261595,1
1857,2020-06-21 23:02:16,3560725013359375,fraud_Hermann and Sons,shopping_pos,842.65,Brooke,Smith,F,63542 Luna Brook Apt. 012,Notrees,...,31.8599,-102.7413,23,Cytogeneticist,1969-09-15,f6838c01f5d2262006e6b71d33ba7c6d,1371855736,31.315782,-102.73639,1


Let us also try with data from the **real** test set with a record that was actually a fraud!

In [19]:
# Take the second fraudulent row (position 1) of the test file as a one-row frame ...
realfraud = fraudtest.loc[fraudtest['is_fraud']==1].iloc[1:2]

# ... and reduce it to the model's input columns, without the label.
realfraud_reduced = realfraud[selectedcolumns].drop(columns='is_fraud')

realfraud_reduced



Unnamed: 0,category,amt,gender,job,merchant
1767,misc_net,780.52,M,Public relations officer,"fraud_Rodriguez, Yost and Jenkins"


In [23]:
# Encode the reduced record prepared above (feature columns only), not the full raw row:
# this keeps the label and the unused columns away from the preprocessor and matches the
# frame the model was trained on.
X_preprocessed = preprocessor.transform(realfraud_reduced)

XGBmodel.predict(X_preprocessed)

array([1])

Oh the model actually predicted a fraud on the actual testing set!

### Areas of improvement.

* I took a leap of faith and simply based on intuition, I selected a few fields that I thought would be relevant to the model and went for it. There could be other features that could be a potentially better predictor of fraud? I can't affirm this possibility.

* Just running the base model without any hyperparameter tuning or cross-validation already took me 3 hours. I believe that if the hyperparameters were better optimized, we could have obtained better precision/recall scores.

* I could have looked at SHAP/LIME scores for model explainability, given that XGBoost is something of a black-box model to senior stakeholders.

### ML Ops portion

In the interest of time, I will focus on target drift and data drift, which are both key signals to **monitor** for model drift. I will use the `evidently` package to help with this. The two key things to take note of are:
* Concept drift
* Data drift

In [21]:
from evidently.dashboard import Dashboard
from evidently.tabs import DataDriftTab, CatTargetDriftTab

In [22]:
# Preview the reduced training frame that will serve as the drift reference.
fraudtrain_reduced.head()

Unnamed: 0,category,amt,gender,job,merchant,is_fraud
0,misc_net,4.97,F,"Psychologist, counselling","fraud_Rippin, Kub and Mann",0
1,grocery_pos,107.23,F,Special educational needs teacher,"fraud_Heller, Gutmann and Zieme",0
2,entertainment,220.11,M,Nature conservation officer,fraud_Lind-Buckridge,0
3,gas_transport,45.0,M,Patent attorney,"fraud_Kutch, Hermiston and Farrell",0
4,misc_pos,41.96,M,Dance movement psychotherapist,fraud_Keeling-Crist,0


In [23]:
# Reduce the test file to the same columns as the training frame. .copy() makes the subset an
# independent frame, so later modifications cannot touch `fraudtest` or raise
# SettingWithCopyWarning.
fraudtest_reduced = fraudtest[selectedcolumns].copy()
fraudtest_reduced.head()

Unnamed: 0,category,amt,gender,job,merchant,is_fraud
0,personal_care,2.86,M,Mechanical engineer,fraud_Kirlin and Sons,0
1,personal_care,29.84,F,"Sales professional, IT",fraud_Sporer-Keebler,0
2,health_fitness,41.28,F,"Librarian, public","fraud_Swaniawski, Nitzsche and Welch",0
3,misc_pos,60.05,M,Set designer,fraud_Haley Group,0
4,travel,3.19,M,Furniture designer,fraud_Johnston-Casper,0


In [24]:
# Evidently needs the label column to be named "target".
# Rebind to the renamed frame instead of renaming in place: `inplace=True` on a frame derived
# by column selection can raise SettingWithCopyWarning and brings no performance benefit.
fraudtrain_reduced = fraudtrain_reduced.rename(columns={'is_fraud': 'target'})
print(fraudtrain_reduced.shape)
fraudtrain_reduced.head()

(1296675, 6)


Unnamed: 0,category,amt,gender,job,merchant,target
0,misc_net,4.97,F,"Psychologist, counselling","fraud_Rippin, Kub and Mann",0
1,grocery_pos,107.23,F,Special educational needs teacher,"fraud_Heller, Gutmann and Zieme",0
2,entertainment,220.11,M,Nature conservation officer,fraud_Lind-Buckridge,0
3,gas_transport,45.0,M,Patent attorney,"fraud_Kutch, Hermiston and Farrell",0
4,misc_pos,41.96,M,Dance movement psychotherapist,fraud_Keeling-Crist,0


In [25]:
# Same label rename for the test frame, so reference and current data share one schema.
# Rebinding (not `inplace=True`) avoids SettingWithCopyWarning on the column-selected frame.
fraudtest_reduced = fraudtest_reduced.rename(columns={'is_fraud': 'target'})
print(fraudtest_reduced.shape)
fraudtest_reduced.head()

(555719, 6)


Unnamed: 0,category,amt,gender,job,merchant,target
0,personal_care,2.86,M,Mechanical engineer,fraud_Kirlin and Sons,0
1,personal_care,29.84,F,"Sales professional, IT",fraud_Sporer-Keebler,0
2,health_fitness,41.28,F,"Librarian, public","fraud_Swaniawski, Nitzsche and Welch",0
3,misc_pos,60.05,M,Set designer,fraud_Haley Group,0
4,travel,3.19,M,Furniture designer,fraud_Johnston-Casper,0


Let me just take a sample, because running this on the full dataset would crash my machine. In the real world it is also practical to periodically take a subset of the data to check for model drift.

In [27]:
# Draw equally sized samples without replacement for the drift comparison. A fixed seed (the
# same 123 used for the split and the model) makes the drift report reproducible between runs.
reference = fraudtrain_reduced.sample(n=300000, replace=False, random_state=123)
current = fraudtest_reduced.sample(n=300000, replace=False, random_state=123)

In [28]:
# Build a dashboard with a data-drift tab and a categorical-target-drift tab.
fraud_datadrift_dashboard = Dashboard(tabs=[DataDriftTab(), CatTargetDriftTab()])

# Compare the training sample (reference) with the test sample (current).
# NOTE(review): column_mapping=None leaves column roles to Evidently's defaults -- confirm the
# string columns are picked up as categorical features.
fraud_datadrift_dashboard.calculate(reference, current, column_mapping=None)

In [29]:
# Write the dashboard to an HTML file in the current working directory.
fraud_datadrift_dashboard.save('fraud_datadrift_report.html')