In [1]:
!pip install catboost



In [2]:
import pandas as pd
import time
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier
from catboost import CatBoostClassifier, metrics
from sklearn.model_selection import cross_val_score
from xgboost import XGBClassifier

Define a helper that evaluates a model with cross-validation (F1 scoring) and records how long the evaluation takes

In [3]:
def validate(model, X_train, y_train, cv = 5, scoring = 'f1'):
    """Cross-validate ``model`` and time how long the evaluation takes.

    Parameters
    ----------
    model
        Estimator handed unchanged to ``cross_val_score``.
    X_train, y_train
        Features and labels handed unchanged to ``cross_val_score``.
    cv : default 5
        Forwarded to ``cross_val_score`` as its ``cv`` argument.
    scoring : default 'f1'
        Forwarded to ``cross_val_score`` as its ``scoring`` argument.

    Returns
    -------
    tuple
        ``(mean score, standard deviation of the scores, elapsed seconds,
        per-fold scores)`` in that order.
    """
    # perf_counter() is a monotonic, high-resolution clock; unlike time.time()
    # the measured duration cannot be distorted by system clock adjustments.
    start = time.perf_counter()
    scores = cross_val_score(model, X_train, y_train, cv=cv, scoring=scoring)
    ex_time = time.perf_counter() - start
    return scores.mean(), scores.std(), ex_time, scores

Read the data

In [4]:
# Download the training and test CSV files from the project repository,
# in that order, and bind them to `train` and `test`.
train, test = (
    pd.read_csv(csv_url)
    for csv_url in (
        'https://raw.githubusercontent.com/chimaobi-okite/DSML-Projects/main/FraudDetection/pr_train.csv',
        'https://raw.githubusercontent.com/chimaobi-okite/DSML-Projects/main/FraudDetection/pr_test.csv',
    )
)

In [5]:
# Separate the 'Unusual' label column from the remaining feature columns.
X = train.drop(columns='Unusual')
y = train['Unusual']

In [6]:
## hold out 20% of the rows for a final check; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

Check validation scores across several models

In [7]:
# Seed the tree so its cross-validation scores are reproducible between runs;
# without random_state the fitted tree can differ from one run to the next.
tree_model = DecisionTreeClassifier(random_state=42)
tree_mean, tree_std, tree_time, _ = validate(tree_model, X_train, y_train, cv=3)

In [8]:
# XGBoost classifier with max_depth=150, evaluated through validate() with cv=3.
xg_model = XGBClassifier(max_depth=150)
xg_mean, xg_std, xg_time, _ = validate(
    model=xg_model,
    X_train=X_train,
    y_train=y_train,
    cv=3,
)

In [9]:
# Seed the forest so its cross-validation scores are reproducible between
# runs; bootstrap sampling and feature sub-sampling are otherwise random.
rand_model = RandomForestClassifier(max_depth=200, random_state=42)
rand_mean, rand_std, rand_time, _ = validate(rand_model, X_train, y_train, cv=3)

In [10]:
# CatBoost classifier: silent logging, fixed seed, F1 as its evaluation metric.
cat_model = CatBoostClassifier(
    logging_level='Silent',
    random_seed=42,
    eval_metric=metrics.F1(),
)

# Evaluate it through validate() with cv=3.
cat_mean, cat_std, cat_time, _ = validate(
    model=cat_model,
    X_train=X_train,
    y_train=y_train,
    cv=3,
)

In [11]:
# Stack the CatBoost and XGBoost models as base estimators, with a
# logistic regression as the final estimator.
lr = LogisticRegression()
clf = [
    ('cat', cat_model),
    ('xgb', xg_model),
]  # list of (name, estimator) pairs
stack_model = StackingClassifier(estimators=clf, final_estimator=lr)
stack_mean, stack_std, stack_time, _ = validate(
    model=stack_model,
    X_train=X_train,
    y_train=y_train,
    cv=3,
)

In [12]:
# Collect the per-model cross-validation results into one comparison table;
# every list below follows the same model order as `index`.
index = ['tree_model', 'xgboost', 'random_forest', 'catboost', 'stack_ensemble']
mean_F1 = [tree_mean, xg_mean, rand_mean, cat_mean, stack_mean]
std = [tree_std, xg_std, rand_std, cat_std, stack_std]
exe_time = [tree_time, xg_time, rand_time, cat_time, stack_time]
result_df = pd.DataFrame(
    data={
        'MeanF1': mean_F1,
        'StandardDeviation': std,
        'ExecutionTime': exe_time,
    },
    index=index,
)

In [13]:
# Display the comparison table: mean F1, standard deviation of F1, and execution time per model.
result_df

Unnamed: 0,MeanF1,StandardDeviation,ExecutionTime
tree_model,0.877862,0.009098,0.680952
xgboost,0.978869,0.000724,62.073441
random_forest,0.805793,0.003507,10.867603
catboost,0.979159,0.001229,26.664298
stack_ensemble,0.98529,0.001167,396.318407


## Train Model

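From the above dataframe, taking both execution time and F1 score into consideration, CatBoost offers the best trade-off: its mean F1 (0.979) is close to the stacked ensemble's (0.985, the highest), while its execution time is far lower (about 27 vs. about 396).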

In [14]:
# NOTE(review): this import looks redundant — CatBoostClassifier is already
# imported in the import cell and the bare name `catboost` is not used in the
# visible cells. Confirm before removing.
import catboost
# Fit the CatBoost model on the training split (X_train, y_train).
cat_model.fit(X_train, y_train)

<catboost.core.CatBoostClassifier at 0x7fdc60bc8050>

In [15]:
## predict on the held-out split (X_test) and report the F1 score against y_test
y_pred = cat_model.predict(X_test)
f1_score(y_true=y_test, y_pred=y_pred)

0.9831265508684864

In [16]:
## make predictions on the separately provided test data (the `test` frame read from pr_test.csv)
# NOTE(review): assumes `test` has the same feature columns as X — confirm.
test_pred = cat_model.predict(test)

## Saving the model

In [17]:
import pickle

# Serialize the fitted CatBoost model to classifier.pkl in the working directory.
# The context manager closes the file even if pickle.dump raises, so the
# handle is never leaked and a partial write is flushed and closed properly.
with open("classifier.pkl", mode="wb") as pickle_out:
    pickle.dump(cat_model, pickle_out)