In [202]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, roc_curve
from sklearn.preprocessing import RobustScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, train_test_split

In [None]:
# Data source: https://www.kaggle.com/datasets/mlg-ulb/creditcardfraud
# Read the CSV into `df` and show (rows, columns) as the cell output.
df = pd.read_csv(filepath_or_buffer='dataset/creditcard.csv')
df.shape

In [None]:
# Preview the first five rows of the raw frame.
df.head(n=5)

In [None]:
# Amount against Time, coloured by Class, drawn on an explicitly created wide axes.
scatter_fig, scatter_ax = plt.subplots(figsize=(18, 5))
sns.scatterplot(data=df, x='Time', y='Amount', hue='Class', ax=scatter_ax)

In [None]:
# Percentage of rows per Class label (0 and 1), counted once and reused.
class_counts = df['Class'].value_counts()
total_rows = len(df)
print('No Frauds', round(class_counts[0] / total_rows * 100, 2), '%')
print('Frauds', round(class_counts[1] / total_rows * 100, 2), '%')

In [None]:
# Bar chart of row counts per Class in the raw data; trailing ';' hides the Axes repr.
sns.countplot(data=df, x='Class');

In [None]:
# Distribution of transaction Amount (top panel) and Time (bottom panel).
fig, ax = plt.subplots(2, 1, figsize=(12,5))

amount_val = df['Amount'].values
time_val = df['Time'].values

# sns.distplot is deprecated; histplot with kde=True and stat='density' is the
# documented replacement for its histogram-plus-KDE output. A fixed bin count
# keeps the histogram cost predictable regardless of how spread out the data is.
sns.histplot(amount_val, ax=ax[0], color='r', kde=True, stat='density', bins=50)
ax[0].set_title('Distribution of Transaction Amount', fontsize=14)
# ndarray.min()/.max() are vectorized; the builtin min()/max() iterate element by element.
ax[0].set_xlim([amount_val.min(), amount_val.max()])

sns.histplot(time_val, ax=ax[1], color='b', kde=True, stat='density', bins=50)
ax[1].set_title('Distribution of Transaction Time', fontsize=14)
ax[1].set_xlim([time_val.min(), time_val.max()])

plt.show()

In [None]:
# Work on an independent (deep) copy so the raw `df` stays untouched.
df_sub = df.copy(deep=True)

In [None]:
# Show Amount as an (n, 1) column array.
df_sub['Amount'].to_numpy().reshape(-1, 1)

In [None]:
# Show Time as an (n, 1) column array.
df_sub['Time'].to_numpy().reshape(-1, 1)

In [None]:
# Replace the raw Amount and Time columns with RobustScaler-scaled versions.
rob_scaler = RobustScaler()

# Build df_sub from the untouched `df` instead of dropping columns in place:
# the in-place version raised KeyError('Amount') when the cell was run a second
# time, because the first run had already removed the source columns.
# The scaler is refit for each column, so after this cell `rob_scaler` holds
# the statistics of Time (the last column fitted).
# NOTE(review): the scaler is fit on all rows before any train/test split.
df_sub = (
    df.assign(
        scaled_amount=rob_scaler.fit_transform(df['Amount'].values.reshape(-1, 1)).ravel(),
        scaled_time=rob_scaler.fit_transform(df['Time'].values.reshape(-1, 1)).ravel(),
    )
    .drop(columns=['Time', 'Amount'])
)

df_sub.head()

In [None]:
# Rows labelled Class == 1, and how many there are.
is_fraud = df_sub['Class'] == 1
fraud_df = df_sub.loc[is_fraud]
len(fraud_df)

In [None]:
# Build a balanced frame: every Class == 1 row plus an equally sized set of Class == 0 rows.
fraud_df = df_sub.loc[df_sub['Class'] == 1]

# Draw the Class == 0 rows at random. Slicing the first len(fraud_df) rows
# (the previous behaviour) always picked the rows at the top of the file, which
# is not a representative sample. A fixed seed keeps the draw reproducible.
non_fraud_df = df_sub.loc[df_sub['Class'] == 0].sample(n=len(fraud_df), random_state=42)

normal_distributed_df = pd.concat([fraud_df, non_fraud_df])
# Shuffle so the two classes are interleaved; seeded so re-runs give the same order.
new_df = normal_distributed_df.sample(frac=1, random_state=42)

new_df.head()

In [None]:
# Bar chart of row counts per Class in the undersampled frame.
sns.countplot(data=new_df, x='Class')

In [None]:
# Features are every column except the label; hold out 20% of rows for testing.
X = new_df.drop('Class', axis=1)
y = new_df['Class']
# random_state makes the split reproducible across kernel restarts;
# stratify=y keeps the class ratio the same in the train and test parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# Logistic regression with default settings, fitted on the training split.
clf = LogisticRegression()
clf.fit(X=X_train, y=y_train)

In [None]:
# Cross-validated score on the training split. cross_val_score fits fresh
# clones of `clf` per fold; the already-fitted `clf` is not modified.
train_score = cross_val_score(clf, X_train, y_train)
# Scale to percent before rounding: round(x, 2) * 100 can print float
# artifacts such as 56.99999999999999.
print("Regression: ", round(train_score.mean() * 100, 2), "% accuracy score")

In [None]:
# Hard class predictions on the test split (also used by the confusion-matrix cell).
y_pred = clf.predict(X_test)
# roc_curve needs continuous scores to sweep the decision threshold; feeding it
# 0/1 predictions yields a single operating point. Use the predicted
# probability of the positive class (column 1 of predict_proba) instead.
y_score = clf.predict_proba(X_test)[:, 1]
log_fpr, log_tpr, _ = roc_curve(y_test, y_score)
plt.figure(figsize=(12,8))
plt.title('Regression ROC Curve', fontsize=16)
plt.plot(log_fpr, log_tpr, 'b-', linewidth=2)
# Diagonal reference line from (0, 0) to (1, 1).
plt.plot([0, 1], [0, 1], 'r--')
plt.xlabel('False Positive Rate', fontsize=16)
plt.ylabel('True Positive Rate', fontsize=16)
plt.show()

In [None]:
# Confusion matrix of the test-split predictions, drawn as an annotated heatmap.
log_reg_cf = confusion_matrix(y_true=y_test, y_pred=y_pred)
s = sns.heatmap(data=log_reg_cf, annot=True, fmt='g')
s.set(xlabel='Predict Label', ylabel='True Label')

In [None]:
# Rows of df_sub that were not selected into new_df.
# new_df keeps the original row labels of df_sub, so an index-based drop is an
# exact set difference. The previous concat + drop_duplicates(keep=False) also
# discarded any rows that are duplicated within df_sub itself.
# NOTE(review): assumes df_sub's index labels are unique (e.g. the default
# integer index produced by read_csv) -- confirm.
df_diff = df_sub.drop(index=new_df.index)
print(df_diff.shape)
df_diff.head()

In [None]:
# Features and labels for the full scaled dataset.
# Kept as a DataFrame (no .to_numpy()): `clf` was fitted on a DataFrame, and
# predicting on a bare array later triggers sklearn's "X does not have valid
# feature names" warning.
X_diff = df_sub.drop('Class', axis=1)
y_diff = df_sub['Class']

# cross_val_score refits clones of `clf` on this data; it does not score the
# model trained on the undersampled split.
# NOTE(review): the classes look heavily imbalanced here (hence the
# undersampling above), so plain accuracy may be uninformative -- confirm.
score = cross_val_score(clf, X_diff, y_diff)
# Scale to percent before rounding to avoid float artifacts in the printout.
print("Regression: ", round(score.mean() * 100, 2), "% accuracy score")

In [None]:
# Predict with the fitted `clf` on X_diff and plot the confusion matrix against y_diff.
pred = clf.predict(X_diff)

cm = confusion_matrix(y_diff, pred)
# fmt='g' prints the counts as plain numbers; the closing quote was missing,
# which made the whole cell a SyntaxError.
s = sns.heatmap(cm, annot=True, fmt='g')
s.set(xlabel='Predict Label', ylabel='True Label')