In [1]:
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Gather every file path beneath the Kaggle input directory, then list them one per line
input_paths = [
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
]
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
# importing libraries
# Data handling
import pandas as pd
import numpy as np
# Model explainability (eli5's PermutationImportance is used in the feature-importance section)
import shap
import eli5
from eli5.sklearn import PermutationImportance
# Preprocessing, resampling and train/test splitting
from sklearn.feature_selection import chi2
from sklearn.preprocessing import RobustScaler
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
# Classifiers and evaluation metrics
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score
from catboost import CatBoostClassifier
# Plotting: render figures inline and apply the notebook-wide matplotlib/seaborn styles
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
plt.style.use("fivethirtyeight")
sns.set_style("darkgrid")
# NOTE(review): the filter below ignores every warning for the rest of the session,
# including deprecation notices raised by the libraries imported above
import warnings
warnings.filterwarnings(action="ignore")

In [3]:
# Load the credit-card transactions and preview the first ten rows
csv_path = "/kaggle/input/creditcardfraud/creditcard.csv"
df = pd.read_csv(csv_path)
df.head(n=10)

### **We don't know the meaning of each feature.**
### Classes
    - Fraud = 1
    - Not Fraud = 0

In [4]:
# Report the dataset dimensions (columns first, then rows)
n_rows, n_cols = df.shape
print(f"Dataset has {n_cols} columns and {n_rows} rows")

In [5]:
# Column dtypes, non-null counts and memory usage of the loaded frame
df.info()

In [6]:
# Fraction of null values per column (0.0 means the column has no missing entries)
df.isna().mean()

In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
df.describe()

In [8]:
# Compare the number of samples in each class (0 = not fraud, 1 = fraud).
# The column is passed by keyword: recent seaborn releases accept only `data`
# positionally, so a bare Series would not be interpreted as the x variable.
sns.countplot(x="Class", data=df, palette="Set1")
plt.title("Countplot of Target Variable");

### The classes are heavily imbalanced, so the training data must be rebalanced before fitting the models.

In [9]:
# Scale Data with RobustScaler
rbst = RobustScaler()
df['Amount'] = rbst.fit_transform(df['Amount'].values.reshape(-1,1))
df['Time'] = rbst.fit_transform(df['Time'].values.reshape(-1,1))

In [10]:
# Split features and Target
X = df.drop("Class",axis=1)
y = df.Class

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.25,random_state=24)

In [11]:
# Rebalance the classes with SMOTE. Only the training split is resampled;
# the test split is left as it is.
smote = SMOTE(random_state=24)
resampled = smote.fit_resample(X_train, y_train)
X_smote, y_smote = resampled

# **XGBoost**

In [12]:
# Train XGBoost on the SMOTE-balanced training data.
# Seeded with the same value as the split and SMOTE so the run is reproducible.
xgb = XGBClassifier(random_state=24)
xgb.fit(X_smote, y_smote)

In [14]:
# Evaluate XGBoost on the held-out test split.
# scikit-learn metrics take (y_true, y_pred) in that order; swapping them
# exchanges precision with recall and reports the wrong support.
y_pred_xgb = xgb.predict(X_test)
report_xgb = classification_report(y_test, y_pred_xgb)
print(report_xgb)

# ROC AUC is a ranking metric, so score the predicted fraud probability
# (second column of predict_proba) instead of the thresholded 0/1 labels.
y_proba_xgb = xgb.predict_proba(X_test)[:, 1]
print(f"ROC AUC Score for XGBoost Classifier: {roc_auc_score(y_test, y_proba_xgb)}")

# **CatBoost**

In [15]:
# Train CatBoost (250 boosting iterations, silent) on the SMOTE-balanced training data.
# Seeded with the same value as the split and SMOTE so the run is reproducible.
cat_classifier = CatBoostClassifier(iterations=250, verbose=False, random_seed=24)
cat_classifier.fit(X_smote, y_smote)

In [16]:
# Evaluate CatBoost on the held-out test split.
# scikit-learn metrics take (y_true, y_pred) in that order; swapping them
# exchanges precision with recall and reports the wrong support.
y_pred_cat = cat_classifier.predict(X_test)
report_cat = classification_report(y_test, y_pred_cat)
print(report_cat)

# ROC AUC is a ranking metric, so score the predicted fraud probability
# (second column of predict_proba) instead of the thresholded 0/1 labels.
y_proba_cat = cat_classifier.predict_proba(X_test)[:, 1]
print(f"ROC AUC Score for Cat Boost Classifier: {roc_auc_score(y_test, y_proba_cat)}")

# **Random Forest**

In [None]:
# Train a Random Forest on the SMOTE-balanced training data.
# random_state makes the bootstrap/feature sampling reproducible (same seed as the
# split and SMOTE); n_jobs=-1 builds the trees on all cores without changing results.
rf = RandomForestClassifier(random_state=24, n_jobs=-1)
rf.fit(X_smote, y_smote)

In [None]:
# Evaluate the Random Forest on the held-out test split.
# scikit-learn metrics take (y_true, y_pred) in that order; swapping them
# exchanges precision with recall and reports the wrong support.
y_pred_rf = rf.predict(X_test)
report_rf = classification_report(y_test, y_pred_rf)
print(report_rf)

# ROC AUC is a ranking metric, so score the predicted fraud probability
# (second column of predict_proba) instead of the thresholded 0/1 labels.
y_proba_rf = rf.predict_proba(X_test)[:, 1]
print(f"ROC AUC Score for Random Forest Classifier: {roc_auc_score(y_test, y_proba_rf)}")

# **Feature Importance**

In [None]:
# Feature importance (Random Forest)
# Permutation importance shuffles one feature at a time on the test split and
# records how much the score drops. Scoring uses ROC AUC, the metric reported for
# the models in this notebook, rather than the estimator's default accuracy, which
# barely moves on a target this imbalanced.
perm = PermutationImportance(rf, scoring="roc_auc", random_state=24).fit(X_test, y_test)
eli5.show_weights(perm, feature_names=X_test.columns.tolist())