In [17]:
import pandas as pd
import numpy as np

# Load dataset
df = pd.read_csv("Fraud.csv")

# Dataset overview
print(f"Dataset Shape: {df.shape}")
print("\nData Types:\n")
print(df.dtypes)

df.head()

Dataset Shape: (6362620, 11)

Data Types:

step                int64
type               object
amount            float64
nameOrig           object
oldbalanceOrg     float64
newbalanceOrig    float64
nameDest           object
oldbalanceDest    float64
newbalanceDest    float64
isFraud             int64
isFlaggedFraud      int64
dtype: object


Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


In [18]:
# Missing values
missing_values = df.isnull().sum().sum()

# Duplicate rows
duplicate_rows = df.duplicated().sum()

# Fraud distribution
fraud_counts = df['isFraud'].value_counts()
fraud_percentage = df['isFraud'].value_counts(normalize=True) * 100

print(f"Total Missing Values: {missing_values}")
print(f"Total Duplicate Rows: {duplicate_rows}\n")

print("Fraud Distribution:")
print(fraud_counts)

print("\nFraud Percentage:")
print(fraud_percentage.round(4))

Total Missing Values: 0
Total Duplicate Rows: 0

Fraud Distribution:
isFraud
0    6354407
1       8213
Name: count, dtype: int64

Fraud Percentage:
isFraud
0   99.87
1    0.13
Name: proportion, dtype: float64


In [19]:
# Transaction distribution
type_distribution = df['type'].value_counts()

# Fraud counts by type
fraud_by_type = pd.crosstab(df['type'], df['isFraud'])

# Fraud percentage by type
fraud_percentage_by_type = (
    pd.crosstab(df['type'], df['isFraud'], normalize='index') * 100
).round(4)

print("Transaction Type Distribution:\n")
print(type_distribution)

print("\nFraud Count by Transaction Type:\n")
print(fraud_by_type)

print("\nFraud Percentage by Transaction Type:\n")
print(fraud_percentage_by_type)

Transaction Type Distribution:

type
CASH_OUT    2237500
PAYMENT     2151495
CASH_IN     1399284
TRANSFER     532909
DEBIT         41432
Name: count, dtype: int64

Fraud Count by Transaction Type:

isFraud         0     1
type                   
CASH_IN   1399284     0
CASH_OUT  2233384  4116
DEBIT       41432     0
PAYMENT   2151495     0
TRANSFER   528812  4097

Fraud Percentage by Transaction Type:

isFraud       0    1
type                
CASH_IN  100.00 0.00
CASH_OUT  99.82 0.18
DEBIT    100.00 0.00
PAYMENT  100.00 0.00
TRANSFER  99.23 0.77


In [20]:
# Keep only transaction types where fraud occurs
df = df[df['type'].isin(['TRANSFER', 'CASH_OUT'])]

# Updated fraud distribution
fraud_counts_filtered = df['isFraud'].value_counts()
fraud_percentage_filtered = df['isFraud'].value_counts(normalize=True) * 100

print(f"Filtered Dataset Shape: {df.shape}\n")

print("Fraud Distribution After Filtering:")
print(fraud_counts_filtered)

print("\nFraud Percentage After Filtering:")
print(fraud_percentage_filtered.round(4))

Filtered Dataset Shape: (2770409, 11)

Fraud Distribution After Filtering:
isFraud
0    2762196
1       8213
Name: count, dtype: int64

Fraud Percentage After Filtering:
isFraud
0   99.70
1    0.30
Name: proportion, dtype: float64


In [21]:
# Create balance difference features
df['orig_balance_diff'] = df['oldbalanceOrg'] - df['newbalanceOrig']
df['dest_balance_diff'] = df['newbalanceDest'] - df['oldbalanceDest']

# Create balance inconsistency (error) features
df['orig_balance_error'] = df['amount'] - df['orig_balance_diff']
df['dest_balance_error'] = df['amount'] - df['dest_balance_diff']

# Preview new features
new_features = [
    'orig_balance_diff',
    'dest_balance_diff',
    'orig_balance_error',
    'dest_balance_error'
]

df[new_features].head()

Unnamed: 0,orig_balance_diff,dest_balance_diff,orig_balance_error,dest_balance_error
2,181.0,0.0,0.0,181.0
3,181.0,-21182.0,0.0,21363.0
15,15325.0,46430.44,213808.94,182703.5
19,705.0,-22425.0,214605.3,237735.3
24,10835.0,2712905.89,300850.89,-2401220.0


In [22]:
# Encode transaction type
df['type'] = df['type'].map({'TRANSFER': 1, 'CASH_OUT': 0})

# Drop non-informative ID columns
df = df.drop(['nameOrig', 'nameDest'], axis=1)

# Define feature matrix and target variable
X = df.drop('isFraud', axis=1)
y = df['isFraud']

print(f"Feature Matrix Shape: {X.shape}")
print(f"Target Vector Shape: {y.shape}\n")

print("Final Feature Columns:")
print(list(X.columns))

Feature Matrix Shape: (2770409, 12)
Target Vector Shape: (2770409,)

Final Feature Columns:
['step', 'type', 'amount', 'oldbalanceOrg', 'newbalanceOrig', 'oldbalanceDest', 'newbalanceDest', 'isFlaggedFraud', 'orig_balance_diff', 'dest_balance_diff', 'orig_balance_error', 'dest_balance_error']


In [23]:
from sklearn.model_selection import train_test_split

# Split dataset (stratified due to class imbalance)
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

print(f"Training Set Shape: {X_train.shape}")
print(f"Test Set Shape: {X_test.shape}\n")

print("Fraud Percentage in Training Set:")
print((y_train.value_counts(normalize=True) * 100).round(4))

print("\nFraud Percentage in Test Set:")
print((y_test.value_counts(normalize=True) * 100).round(4))

Training Set Shape: (2216327, 12)
Test Set Shape: (554082, 12)

Fraud Percentage in Training Set:
isFraud
0   99.70
1    0.30
Name: proportion, dtype: float64

Fraud Percentage in Test Set:
isFraud
0   99.70
1    0.30
Name: proportion, dtype: float64


In [24]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

# Initialize model
model = LogisticRegression(
    max_iter=1000,
    class_weight='balanced',
    n_jobs=-1
)

# Train model (clean feature set)
model.fit(X_train2, y_train)

# Predict probabilities
y_prob = model.predict_proba(X_test2)[:, 1]

# Evaluate using ROC-AUC
roc_score = roc_auc_score(y_test, y_prob)

print(f"ROC-AUC Score: {roc_score:.4f}")

ROC-AUC Score: 0.9736


In [25]:
from sklearn.metrics import precision_recall_curve
import numpy as np

# Compute precision-recall values
precisions, recalls, thresholds = precision_recall_curve(y_test, y_prob)

# Compute F1 scores
f1_scores = 2 * (precisions * recalls) / (precisions + recalls + 1e-10)

# Identify best threshold
best_index = np.argmax(f1_scores)
best_threshold = thresholds[best_index]

print(f"Optimal Threshold: {best_threshold:.6f}")
print(f"Precision at Optimal Threshold: {precisions[best_index]:.4f}")
print(f"Recall at Optimal Threshold: {recalls[best_index]:.4f}")
print(f"F1 Score at Optimal Threshold: {f1_scores[best_index]:.4f}")

Optimal Threshold: 0.999532
Precision at Optimal Threshold: 0.7996
Recall at Optimal Threshold: 0.4760
F1 Score at Optimal Threshold: 0.5967


In [26]:
from sklearn.metrics import confusion_matrix, classification_report

y_pred_final = (y_prob > best_threshold).astype(int)

print("Confusion Matrix:\n")
print(confusion_matrix(y_test, y_pred_final))

print("\nClassification Report:\n")
print(classification_report(y_test, y_pred_final))

Confusion Matrix:

[[552243    196]
 [   862    781]]

Classification Report:

              precision    recall  f1-score   support

           0       1.00      1.00      1.00    552439
           1       0.80      0.48      0.60      1643

    accuracy                           1.00    554082
   macro avg       0.90      0.74      0.80    554082
weighted avg       1.00      1.00      1.00    554082

