<a href="https://colab.research.google.com/github/gussgary/Financial-Fraud-Detection/blob/main/IEEE_FraudDetection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
warnings.filterwarnings('ignore')
import os

# Machine Learning Libraries
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder, RobustScaler
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, f_classif, RFE
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

# Advanced ML Libraries
import xgboost as xgb
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTETomek

# Metrics and Evaluation
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    roc_auc_score,
    roc_curve,
    precision_recall_curve,
    average_precision_score,
    f1_score,
    precision_score,
    recall_score,
    accuracy_score
    )

# Statistical Libraries
from scipy import stats
from scipy.stats import chi2_contingency

#Progress Bar
from tqdm.auto import tqdm

In [11]:
from google.colab import drive

# Attach Google Drive at /content/drive (force_remount=True is passed on every run).
drive.mount(mountpoint='/content/drive', force_remount=True)

Mounted at /content/drive


In [12]:
# Folder on the mounted Drive that holds the dataset CSV files.
data_dir = '/content/drive/MyDrive/Master Project/ieee-fraud-detection'
# List the folder contents before anything is loaded from it.
print("Files in directory:", os.listdir(path=data_dir))

Files in directory: ['test_identity.csv', 'sample_submission.csv', 'test_transaction.csv', 'train_identity.csv', 'train_transaction.csv']


In [13]:
# Folder on the mounted Drive that holds the dataset CSV files.
data_dir = '/content/drive/MyDrive/Master Project/ieee-fraud-detection'
# isdir() rather than exists(): a regular file at this path would pass exists()
# and then make os.listdir() raise NotADirectoryError.
if os.path.isdir(data_dir):
  print('Directory Exist')
  print('Files:', os.listdir(data_dir))
else:
  print('Directory not found!')

Directory Exist
Files: ['test_identity.csv', 'sample_submission.csv', 'test_transaction.csv', 'train_identity.csv', 'train_transaction.csv']


In [18]:
class FraudDetectionPipeline:
  """Loads the fraud-detection CSV files and holds the merged training frame.

  Attributes:
    df: Training transactions left-joined with the training identity table on
      ``TransactionID``. ``None`` until ``load_data`` has succeeded.
  """

  def __init__(self):
    # Filled in by load_data(); stays None when loading fails.
    self.df = None

  def load_data(self, data_dir):
    """Read the train/test CSVs from ``data_dir`` and build the merged training frame.

    Reads ``train_transaction.csv``, ``train_identity.csv``,
    ``test_transaction.csv`` and ``test_identity.csv``, prints their shapes,
    left-joins the two training tables on ``TransactionID`` and prints a short
    overview (row/column counts, fraud count and rate, memory usage).

    ``self.df`` is replaced only when every step succeeds, so a ``False``
    return never leaves a half-validated frame on the instance.

    Args:
      data_dir: Directory containing the four CSV files.

    Returns:
      bool: ``True`` when the merged frame was stored in ``self.df``;
      ``False`` when any step raised (the error message is printed).
    """
    try:
      train_transaction = pd.read_csv(os.path.join(data_dir, 'train_transaction.csv'))
      train_identity = pd.read_csv(os.path.join(data_dir, 'train_identity.csv'))
      # The test tables are read only to report their shapes; they are not stored.
      test_transaction = pd.read_csv(os.path.join(data_dir, 'test_transaction.csv'))
      test_identity = pd.read_csv(os.path.join(data_dir, 'test_identity.csv'))

      print(f'Train Transaction Data Shape: {train_transaction.shape}')
      print(f'Train Identity Data Shape: {train_identity.shape}')
      print(f'Test Transaction Data Shape: {test_transaction.shape}')
      print(f'Test Identity Data Shape: {test_identity.shape}')

      # Merge the training tables. validate='many_to_one' makes pandas raise if
      # TransactionID is duplicated in the identity table, which would otherwise
      # silently duplicate transaction rows and distort the fraud statistics.
      merged = train_transaction.merge(
        train_identity,
        how='left',
        on='TransactionID',
        validate='many_to_one',
      )
      print(f'Merged Data Shape: {merged.shape}')

      print("Dataset Overview:")
      print(f"- Total transactions: {len(merged):,}")
      print(f"- Total features: {merged.shape[1]}")
      print(f"- Fraud transactions: {merged['isFraud'].sum():,}")
      print(f"- Fraud rate: {merged['isFraud'].mean():.3%}")
      print(f"- Memory usage: {merged.memory_usage(deep=True).sum() / 1024**2:.1f} MB")

    except Exception as exc:
      # Best-effort loader: report the problem and signal failure to the caller.
      print(f'Error: {exc}')
      return False

    # Publish the frame only after reading, merging and summarising all succeeded.
    self.df = merged
    return True


# Build the pipeline and load the CSVs from data_dir;
# `success` receives the True/False result returned by load_data().
pipeline = FraudDetectionPipeline()
success = pipeline.load_data(data_dir=data_dir)

Train Transaction Data Shape: (590540, 394)
Train Identity Data Shape: (144233, 41)
Test Transaction Data Shape: (506691, 393)
Test Identity Data Shape: (141907, 41)
Merged Data Shape: (590540, 434)
Dataset Overview:
- Total transactions: 590,540
- Total features: 434
- Fraud transactions: 20,663
- Fraud rate: 3.499%
- Memory usage: 2567.1 MB
