# Exercises - Fraud Detection

## Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time
import warnings
import datetime
from sklearn import preprocessing
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb
from collections import Counter
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler

# Show every column when a DataFrame is displayed.
pd.set_option('display.max_columns', None)
# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8*'
# and newer releases no longer ship the old names, so prefer the new name
# and only fall back to the legacy one on older installations.
plt.style.use(style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')
%matplotlib inline
warnings.filterwarnings('ignore')

runCVflag = False # If False then optimal values are used

## Import Data

In [None]:
%%time
## You can retrieve the data at https://www.kaggle.com/c/ieee-fraud-detection/data 
# Directory prefix that is prepended to each of the four CSV file names below.
data_path = "./"
# Transaction and identity tables are stored separately for train and test.
train_tr = pd.read_csv(data_path + "train_transaction.csv")
train_id = pd.read_csv(data_path + "train_identity.csv") 
test_tr = pd.read_csv(data_path + "test_transaction.csv")
test_id = pd.read_csv(data_path + "test_identity.csv")

# Report (rows, columns) of every table that was just loaded.
print('train_transaction shape is: {}'.format(train_tr.shape))
print('train_identity shape is: {}'.format(train_id.shape))

print('test_transaction shape is: {}'.format(test_tr.shape))
print('test_identity shape is: {}'.format(test_id.shape))

In [None]:
train_tr.head()

In [None]:
train_id.head()

In [None]:
test_tr.head()

In [None]:
test_id.head()

## Data Preparation

In [None]:
%%time

# Left-join the identity table onto the transaction table so that every
# transaction row is kept, whether or not it has a matching identity row.
train = train_tr.merge(train_id, how = 'left', on = 'TransactionID')
test = test_tr.merge(test_id, how = 'left', on = 'TransactionID')

# The four source tables are not used after the join; release them.
del train_tr, train_id, test_tr, test_id

print('train set shape is: {}'.format(train.shape))
print('test set shape is: {}'.format(test.shape))

In [None]:
train.head()

In [None]:
test.head()

In [None]:
def different_columns(traincols, testcols):
    """Return the entries of ``traincols`` that are absent from ``testcols``.

    The result is a list that keeps the iteration order of ``traincols``
    (duplicates included). Entries that exist only in ``testcols`` are not
    reported.
    """
    return [col for col in traincols if col not in testcols]
            
print(different_columns(train.columns, test.columns))
# train and test sets should have the same columns (apart from the target variable 'isFraud')

In [None]:
# Map the hyphenated names "id-01" ... "id-38" to the underscore form
# "id_01" ... "id_38" so the test columns can be compared with the train ones.
test = test.rename(columns = {
    'id-{:02d}'.format(n): 'id_{:02d}'.format(n) for n in range(1, 39)
})

print(different_columns(train.columns, test.columns))
# now test and train have the same column names

In [None]:
fig = plt.figure(figsize = (5, 5))
# value_counts() orders its result by frequency, not by class label, so the
# x labels are taken from the (label-sorted) index instead of being hard-coded.
class_counts = train['isFraud'].value_counts().sort_index()
sns.barplot(x = class_counts.index.tolist(), y = class_counts.values)
plt.show()

In [None]:
fraud_ratio = train['isFraud'].sum()/len(train['isFraud'])
print(fraud_ratio) # percentage of frauds in the train set
del fraud_ratio 

In [None]:
tot_missing_value = train.isnull().sum().sum()
print(tot_missing_value) # missing values in the train set
del tot_missing_value

In [None]:
# Number of missing values in each column of the train set.
column_missing_value = train.isnull().sum()

# Print the counts 60 columns at a time (presumably to stay within pandas'
# default display row limit). The range is derived from the actual number of
# columns instead of hard-coded bounds, so nothing is skipped if it changes.
chunk_size = 60
for start in range(0, len(column_missing_value), chunk_size):
    print(column_missing_value.iloc[start : start + chunk_size])
del column_missing_value

In [None]:
## PLOT TRANSACTION DATES (THEY DON'T OVERLAP) 

fig = plt.figure(figsize = (10, 5))
plt.hist(train['TransactionDT'], label = 'Train', color = 'red')
plt.hist(test['TransactionDT'], label = 'Test', color = 'yellow')
plt.legend()
plt.title('Train vs. Test TransactionDT Distribution')

## Variables Encoding

In [None]:
%%time

# Fitted LabelEncoder per encoded column, keyed by the original column name.
encoder_dict = {}

# Each encoder is fitted on train and test together so that a category that
# appears in only one of the two sets still receives a code.
# (No reset_index() here: it only added an unused 'index' column and forced
# an extra copy of the combined frame.)
complete_labelset_temp = pd.concat([train.drop(['isFraud'], axis=1), test], axis=0)
for k in complete_labelset_temp.columns:
    if complete_labelset_temp[k].dtype != object:
        continue  # only object-typed columns are encoded
    # NOTE(review): object columns that contain NaN are handed to LabelEncoder
    # unchanged -- confirm the installed scikit-learn accepts mixed str/NaN.
    le_fit = preprocessing.LabelEncoder().fit(complete_labelset_temp[k])
    encoder_dict[k] = le_fit
    # Replace the raw column by '<name>_encoded' in both frames.
    train[k + '_encoded'] = le_fit.transform(train[k])
    train = train.drop([k], axis=1)
    test[k + '_encoded'] = le_fit.transform(test[k])
    test = test.drop([k], axis=1)

del complete_labelset_temp

In [None]:
train.head()

In [None]:
test.head()

In [None]:
print('train_set shape is: {}'.format(train.shape))
print('test_set shape is: {}'.format(test.shape))

In [None]:
fig = plt.figure(figsize = (5, 5))
# value_counts() orders its result by frequency, not by class label, so the
# x labels are taken from the (label-sorted) index instead of being hard-coded.
class_counts = train['isFraud'].value_counts().sort_index()
sns.barplot(x = class_counts.index.tolist(), y = class_counts.values)
plt.show()

In [None]:
fraud_ratio = train['isFraud'].sum()/len(train['isFraud'])
print(fraud_ratio) # percentage of frauds in the train set
del fraud_ratio 

In [None]:
stats_df = pd.DataFrame(columns = ['train_time', 'train_precision', 'train_accuracy', 'train_recall', 'train_roc_auc',
                                   'test_precision', 'test_accuracy', 'test_recall', 'test_roc_auc'])

In [None]:
def print_metric_stats(experiment_name, y_train, y_train_pred, y_test, y_pred):
    """Print train/test scores and confusion matrices for one experiment.

    Args:
        experiment_name: Text prepended to every score line.
        y_train: True labels of the training split.
        y_train_pred: Predictions for the training split.
        y_test: True labels of the held-out split.
        y_pred: Predictions for the held-out split.

    The "auc" lines pass the given predictions straight to roc_auc_score.
    """
    divider = '-----------------------------------------------------'

    # One entry per split: (true labels, predictions, score lines to print).
    score_sections = (
        (y_train, y_train_pred, (
            (' train precision score is {}', precision_score),
            (' train accuracy score is {}', accuracy_score),
            (' train recall score is {}', recall_score),
            (' train auc score is {}', roc_auc_score),
        )),
        (y_test, y_pred, (
            (' test precision score is {}', precision_score),
            (' test accuracy score is {}', accuracy_score),
            (' test recall score is {}', recall_score),
            (' test auc score is {}', roc_auc_score),
        )),
    )
    matrix_sections = (
        (' Train confusion matrix', y_train, y_train_pred),
        (' Test confusion matrix', y_test, y_pred),
    )

    print(divider)
    for y_true, y_hat, score_lines in score_sections:
        for template, scorer in score_lines:
            print(experiment_name + template.format(scorer(y_true, y_hat)))
        print(divider)

    for title, y_true, y_hat in matrix_sections:
        print(title)
        print(confusion_matrix(y_true, y_hat))
        print(divider)

## Unbalanced Approach - Decision Tree

In [None]:
%%time
# Start the timer for this experiment; it is stopped after the model is fitted.
start_time = datetime.datetime.now()
experiment_name = 'UNBALANCED APPROACH - DECISION TREE'

# Separate the target from the features (drop() already returns a new frame).
y = train['isFraud'].copy()
X = train.drop(['isFraud'], axis=1)

# shuffle=False keeps the row order: the last 20% of rows become the held-out split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=17, shuffle=False)

# Learn the per-column means on the training split only, then fill its NaNs.
imp = SimpleImputer(missing_values=np.nan, strategy='mean').fit(X_train)
X_train_imp = imp.transform(X_train)

### Random Search Tuning

In [None]:
%%time

if runCVflag:
    clf_model = DecisionTreeClassifier(criterion="gini")
    distrib = dict(max_depth = [10,100,500], min_samples_leaf=[5,10,20,50])
    clf = RandomizedSearchCV(clf_model, distrib, random_state=17)
    search = clf.fit(X_train_imp,y_train)
    best_min_samples_leaf = search.best_params_['min_samples_leaf']
    best_max_depth = search.best_params_['max_depth']
else:
    # results ====> optimal values are: 'min_samples_leaf': 20, 'max_depth': 10
    best_min_samples_leaf = 20
    best_max_depth = 10

### Training

In [None]:
%%time
clf_model = DecisionTreeClassifier(criterion="gini", max_depth = best_max_depth, min_samples_leaf = best_min_samples_leaf)
clf_model.fit(X_train_imp,y_train)
end_time = datetime.datetime.now() - start_time

### Predict

In [None]:
%%time
y_pred = []
X_test_imp = imp.transform(X_test)
y_pred = clf_model.predict(X_test_imp)
y_train_pred = clf_model.predict(X_train_imp)

stats_df.loc[experiment_name] = ([end_time,
                                  precision_score(y_train, y_train_pred),
                                  accuracy_score(y_train, y_train_pred),
                                  recall_score(y_train, y_train_pred),
                                  roc_auc_score(y_train, y_train_pred),
                                  precision_score(y_test, y_pred),
                                  accuracy_score(y_test, y_pred),
                                  recall_score(y_test, y_pred),
                                  roc_auc_score(y_test, y_pred),
                                 ])

print_metric_stats(experiment_name, y_train, y_train_pred, y_test, y_pred)

## Unbalanced Approach - Xgboost

In [None]:
%%time
# Start the timer for this experiment; it is stopped after the model is fitted.
start_time = datetime.datetime.now()
experiment_name = 'UNBALANCED APPROACH - XGBOOST'

# Separate the target from the features (drop() already returns a new frame).
y = train['isFraud'].copy()
X = train.drop(['isFraud'], axis=1)

# shuffle=False keeps the row order: the last 20% of rows become the held-out split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=17, shuffle=False)

# Learn the per-column means on the training split only, then fill its NaNs.
imp = SimpleImputer(missing_values=np.nan, strategy='mean').fit(X_train)
X_train_imp = imp.transform(X_train)

### Random Search Tuning

In [None]:
%%time

if runCVflag:
    clf_model = xgb.XGBClassifier(tree_method = 'gpu_hist')
    distrib = dict(max_depth = [5,10], n_estimators = [50, 100], learning_rate=[0.02, 0.1, 0.2])
    clf = RandomizedSearchCV(clf_model, distrib, random_state=17)
    search = clf.fit(X_train_imp,y_train)
    best_n_estimators = search.best_params_['n_estimators']
    best_max_depth = search.best_params_['max_depth']
    best_learning_rate = search.best_params_['learning_rate']
else:
    # results ====> optimal values are: 'n_estimators': 100, 'max_depth': 10, 'learning_rate': 0.02
    best_max_depth = 10
    best_n_estimators = 100
    best_learning_rate = 0.02

### Training

In [None]:
%%time
xgmodel = xgb.XGBClassifier(tree_method = 'gpu_hist',
                            max_depth = best_max_depth,
                            n_estimators = best_n_estimators,
                            learning_rate = best_learning_rate)

xgmodel.fit(X_train_imp,y_train)

end_time = datetime.datetime.now() - start_time

### Predict

In [None]:
%%time
y_pred = []
X_test_imp = imp.transform(X_test)
y_pred = xgmodel.predict(X_test_imp)
y_train_pred = xgmodel.predict(X_train_imp)


stats_df.loc[experiment_name] = ([end_time,
                                  precision_score(y_train, y_train_pred),
                                  accuracy_score(y_train, y_train_pred),
                                  recall_score(y_train, y_train_pred),
                                  roc_auc_score(y_train, y_train_pred),
                                  precision_score(y_test, y_pred),
                                  accuracy_score(y_test, y_pred),
                                  recall_score(y_test, y_pred),
                                  roc_auc_score(y_test, y_pred),
                                 ])

print_metric_stats(experiment_name, y_train, y_train_pred, y_test, y_pred)

## Undersampling Approach - Decision Tree

In [None]:
%%time
# Start the timer for this experiment; it is stopped after the model is fitted.
start_time = datetime.datetime.now()

experiment_name = 'UNDERSAMPLING APPROACH - DECISION TREE'

# Separate the target from the features (drop() already returns a new frame).
y = train['isFraud'].copy()
X = train.drop(['isFraud'], axis=1)

# shuffle=False keeps the row order: the last 20% of rows become the held-out split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=17, shuffle=False)

# Learn the per-column means on the training split only, then fill its NaNs.
imp = SimpleImputer(missing_values=np.nan, strategy='mean').fit(X_train)
X_train_imp = imp.transform(X_train)

# Undersample the training split only; the held-out split is left untouched.
ros = RandomUnderSampler(random_state=17)
X_train_imp, y_train = ros.fit_resample(X_train_imp, y_train)

print('Resampled dataset shape {}'.format(Counter(y_train)))

### Random Search Tuning

In [None]:
%%time

if runCVflag:
    clf_model = DecisionTreeClassifier(criterion="gini")
    distrib = dict(max_depth = [10,100,500], min_samples_leaf=[5,10,20,50])
    clf = RandomizedSearchCV(clf_model, distrib, random_state=17)
    search = clf.fit(X_train_imp,y_train)
    best_min_samples_leaf = search.best_params_['min_samples_leaf']
    best_max_depth = search.best_params_['max_depth']
else:
    # results ====> optimal values are: 'min_samples_leaf': 50, 'max_depth': 10
    best_min_samples_leaf = 50
    best_max_depth = 10

### Training

In [None]:
%%time
clf_model = DecisionTreeClassifier(criterion="gini", 
                                   max_depth = best_max_depth, 
                                   min_samples_leaf = best_min_samples_leaf)

clf_model.fit(X_train_imp,y_train)

end_time = datetime.datetime.now() - start_time

### Predict

In [None]:
%%time
y_pred = []
X_test_imp = imp.transform(X_test)
y_pred = clf_model.predict(X_test_imp)
y_train_pred = clf_model.predict(X_train_imp)



stats_df.loc[experiment_name] = ([end_time,
                                  precision_score(y_train, y_train_pred),
                                  accuracy_score(y_train, y_train_pred),
                                  recall_score(y_train, y_train_pred),
                                  roc_auc_score(y_train, y_train_pred),
                                  precision_score(y_test, y_pred),
                                  accuracy_score(y_test, y_pred),
                                  recall_score(y_test, y_pred),
                                  roc_auc_score(y_test, y_pred),
                                 ])

print_metric_stats(experiment_name, y_train, y_train_pred, y_test, y_pred)

## Undersampling Approach - Xgboost

In [None]:
%%time
# Start the timer for this experiment; it is stopped after the model is fitted.
start_time = datetime.datetime.now()

experiment_name = 'UNDERSAMPLING APPROACH - XGBOOST'

# Separate the target from the features (drop() already returns a new frame).
y = train['isFraud'].copy()
X = train.drop(['isFraud'], axis=1)

# shuffle=False keeps the row order: the last 20% of rows become the held-out split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=17, shuffle=False)

# Learn the per-column means on the training split only, then fill its NaNs.
imp = SimpleImputer(missing_values=np.nan, strategy='mean').fit(X_train)
X_train_imp = imp.transform(X_train)

# Undersample the training split only; the held-out split is left untouched.
ros = RandomUnderSampler(random_state=17)
X_train_imp, y_train = ros.fit_resample(X_train_imp, y_train)

print('Resampled dataset shape {}'.format(Counter(y_train)))

### Random Search Tuning

In [None]:
%%time

# Either run the randomized hyper-parameter search or reuse the values that
# an earlier search produced (see runCVflag).
if runCVflag:
    clf_model = xgb.XGBClassifier(tree_method = 'gpu_hist')
    distrib = dict(max_depth = [5,10], n_estimators = [50, 100], learning_rate=[0.02, 0.1, 0.2])
    clf = RandomizedSearchCV(clf_model, distrib, random_state=17)
    search = clf.fit(X_train_imp,y_train)
    best_n_estimators = search.best_params_['n_estimators']
    best_max_depth = search.best_params_['max_depth']
    best_learning_rate = search.best_params_['learning_rate']
else:
    # results ====> optimal values are: 'n_estimators': 50, 'max_depth': 5, 'learning_rate': 0.02
    # (The two values used to be assigned the wrong way round, which gave a
    # 5-tree / depth-50 model that is not even part of the search grid above.)
    best_max_depth = 5
    best_n_estimators = 50
    best_learning_rate = 0.02

### Training

In [None]:
%%time
xgmodel = xgb.XGBClassifier(tree_method = 'gpu_hist',
                            max_depth = best_max_depth,
                            n_estimators = best_n_estimators,
                            learning_rate = best_learning_rate)

xgmodel.fit(X_train_imp,y_train)

end_time = datetime.datetime.now() - start_time

### Predict

In [None]:
%%time
y_pred = []
X_test_imp = imp.transform(X_test)
y_pred = xgmodel.predict(X_test_imp)
y_train_pred = xgmodel.predict(X_train_imp)


stats_df.loc[experiment_name] = ([end_time,
                                  precision_score(y_train, y_train_pred),
                                  accuracy_score(y_train, y_train_pred),
                                  recall_score(y_train, y_train_pred),
                                  roc_auc_score(y_train, y_train_pred),
                                  precision_score(y_test, y_pred),
                                  accuracy_score(y_test, y_pred),
                                  recall_score(y_test, y_pred),
                                  roc_auc_score(y_test, y_pred),
                                 ])

print_metric_stats(experiment_name, y_train, y_train_pred, y_test, y_pred)

## Oversampling Approach - Decision Tree

In [None]:
%%time
# Start the timer for this experiment; it is stopped after the model is fitted.
start_time = datetime.datetime.now()

experiment_name = 'OVERSAMPLING APPROACH - DECISION TREE'

# Separate the target from the features. drop() already returns a new frame,
# so no separate full copy of `train` is needed first.
y = train['isFraud'].copy()
X = train.drop(['isFraud'], axis=1)
# shuffle=False keeps the row order: the last 20% of rows become the held-out split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=17, shuffle=False)

# Create our imputer to replace missing values with the column mean,
# learned from the training split only.
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
imp = imp.fit(X_train)

# Impute our data, then train
X_train_imp = imp.transform(X_train)

# Oversample the train dataset (only the training split is resampled; the
# held-out split is left untouched).
ros = RandomOverSampler(random_state=17)

X_train_imp, y_train = ros.fit_resample(X_train_imp, y_train)
print('Resampled dataset shape {}'.format(Counter(y_train)))

### Random Search Tuning

In [None]:
%%time

if runCVflag:
    clf_model = DecisionTreeClassifier(criterion="gini")
    distrib = dict(max_depth = [10,100,500], min_samples_leaf=[5,10,20,50])
    clf = RandomizedSearchCV(clf_model, distrib, random_state=17)
    search = clf.fit(X_train_imp,y_train)
    best_min_samples_leaf = search.best_params_['min_samples_leaf']
    best_max_depth = search.best_params_['max_depth']
else:
    # results ====> optimal values are: 'min_samples_leaf': 5, 'max_depth': 100
    best_min_samples_leaf = 5
    best_max_depth = 100

### Training

In [None]:
%%time
clf_model = DecisionTreeClassifier(criterion="gini", 
                                   max_depth = best_max_depth, 
                                   min_samples_leaf = best_min_samples_leaf)

clf_model.fit(X_train_imp,y_train)

end_time = datetime.datetime.now() - start_time

### Predict

In [None]:
%%time
y_pred = []
X_test_imp = imp.transform(X_test)
y_pred = clf_model.predict(X_test_imp)
y_train_pred = clf_model.predict(X_train_imp)


stats_df.loc[experiment_name] = ([end_time,
                                  precision_score(y_train, y_train_pred),
                                  accuracy_score(y_train, y_train_pred),
                                  recall_score(y_train, y_train_pred),
                                  roc_auc_score(y_train, y_train_pred),
                                  precision_score(y_test, y_pred),
                                  accuracy_score(y_test, y_pred),
                                  recall_score(y_test, y_pred),
                                  roc_auc_score(y_test, y_pred),
                                 ])

print_metric_stats(experiment_name, y_train, y_train_pred, y_test, y_pred)

## Oversampling Approach - Xgboost

In [None]:
%%time
# Start the timer for this experiment; it is stopped after the model is fitted.
start_time = datetime.datetime.now()

experiment_name = 'OVERSAMPLING APPROACH - XGBOOST'

# Separate the target from the features. drop() already returns a new frame,
# so no separate full copy of `train` is needed first.
y = train['isFraud'].copy()
X = train.drop(['isFraud'], axis=1)
# shuffle=False keeps the row order: the last 20% of rows become the held-out split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=17, shuffle=False)

# Create our imputer to replace missing values with the column mean,
# learned from the training split only.
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
imp = imp.fit(X_train)

# Impute our data, then train
X_train_imp = imp.transform(X_train)

# Oversample the train dataset (only the training split is resampled; the
# held-out split is left untouched).
ros = RandomOverSampler(random_state=17)

X_train_imp, y_train = ros.fit_resample(X_train_imp, y_train)
print('Resampled dataset shape {}'.format(Counter(y_train)))

### Random Search Tuning

In [None]:
%%time

# Either run the randomized hyper-parameter search or reuse the values that
# an earlier search produced (see runCVflag).
if runCVflag:
    clf_model = xgb.XGBClassifier(tree_method = 'gpu_hist')
    # Candidate values the randomized search samples from.
    distrib = dict(max_depth = [5,10], n_estimators = [50, 100], learning_rate=[0.02, 0.1, 0.2])
    clf = RandomizedSearchCV(clf_model, distrib, random_state=17)
    search = clf.fit(X_train_imp,y_train)
    best_n_estimators = search.best_params_['n_estimators']
    best_max_depth = search.best_params_['max_depth']
    best_learning_rate = search.best_params_['learning_rate']
else:
    # results ====> optimal values are: 'n_estimators': 50, 'max_depth': 5, 'learning_rate': 0.02
    best_max_depth = 5
    best_n_estimators = 50
    best_learning_rate = 0.02

### Training

In [None]:
%%time
xgmodel = xgb.XGBClassifier(tree_method = 'gpu_hist',
                           max_depth = best_max_depth,
                            n_estimators = best_n_estimators,
                            learning_rate = best_learning_rate)

xgmodel.fit(X_train_imp,y_train)

end_time = datetime.datetime.now() - start_time

### Predict

In [None]:
%%time
y_pred = []
X_test_imp = imp.transform(X_test)
y_pred = xgmodel.predict(X_test_imp)
y_train_pred = xgmodel.predict(X_train_imp)


stats_df.loc[experiment_name] = ([end_time,
                                  precision_score(y_train, y_train_pred),
                                  accuracy_score(y_train, y_train_pred),
                                  recall_score(y_train, y_train_pred),
                                  roc_auc_score(y_train, y_train_pred),
                                  precision_score(y_test, y_pred),
                                  accuracy_score(y_test, y_pred),
                                  recall_score(y_test, y_pred),
                                  roc_auc_score(y_test, y_pred),
                                 ])

print_metric_stats(experiment_name, y_train, y_train_pred, y_test, y_pred)

## Results

In [None]:
display(stats_df)