<a href="https://colab.research.google.com/github/gitswathig/ML-Projects/blob/main/fraud.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Importing Libraries and Configuration

In [None]:
# import libraries
import os
import gc
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, RobustScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, classification_report
import joblib

# Optional imports
try:
    import lightgbm as lgb
except Exception:
    lgb = None

try:
    from imblearn.over_sampling import SMOTE
except Exception:
    SMOTE = None

# CONFIG
# Tunable settings for the whole notebook.
# Default CSV path read by load_data().
DATA_PATH = "Fraud.csv"
# Name of the label column; None means it is auto-detected after the data is loaded.
TARGET = None
# Fraction of rows kept by load_data(); sampling only happens when 0 < value < 1.
SAMPLE_FRAC = 0.05
# Seed passed to the row sampling, the train/val/test splits and LightGBM.
RANDOM_STATE = 42
# NOTE(review): not referenced by any cell visible in this notebook — confirm whether SMOTE is still planned.
USE_SMOTE = True
# File name the fitted pipeline is written to with joblib.dump().
MODEL_OUTPUT = "best_model.joblib"


Load and optimize data


In [None]:
def mem_optimize(df):
    """Downcast numeric columns of ``df`` to reduce its memory footprint.

    Integer columns are passed through ``pd.to_numeric(..., downcast='integer')``
    and float columns through ``downcast='float'``; columns that are neither
    integer nor float are left alone. The frame is modified in place and also
    returned. The memory usage before and after is printed in MB.
    """
    bytes_per_mb = 1024 ** 2
    before_mb = df.memory_usage(deep=True).sum() / bytes_per_mb

    for name in df.columns:
        dtype = df[name].dtype
        if pd.api.types.is_integer_dtype(dtype):
            downcast_kind = 'integer'
        elif pd.api.types.is_float_dtype(dtype):
            downcast_kind = 'float'
        else:
            # Neither integer nor float: nothing to shrink here.
            continue
        df[name] = pd.to_numeric(df[name], downcast=downcast_kind)

    after_mb = df.memory_usage(deep=True).sum() / bytes_per_mb
    print(f"Memory: {before_mb:.2f}MB → {after_mb:.2f}MB")
    return df

def detect_target_column(df):
    """Return the first known label-column name found in ``df``, or None.

    Candidates are checked in a fixed priority order; when one is present in
    ``df.columns`` its name is printed and returned.
    """
    candidates = ('isFraud', 'fraud', 'Class', 'label')
    found = next((name for name in candidates if name in df.columns), None)
    if found is not None:
        print(f"Auto-detected TARGET column: {found}")
    return found

def load_data(path=DATA_PATH, sample_frac=SAMPLE_FRAC):
    """Read the CSV at ``path``, shrink its dtypes and optionally subsample rows.

    Parameters
    ----------
    path : str
        Location of the CSV file handed to ``pd.read_csv``.
    sample_frac : float
        Fraction of rows to keep. Sampling is applied only when
        ``0 < sample_frac < 1``; any other value keeps every row.

    Returns
    -------
    pandas.DataFrame
        The memory-optimized (and possibly sampled) frame with a fresh
        0..n-1 index when sampling was applied.

    Raises
    ------
    FileNotFoundError
        If the file does not exist; the message says how to fix it.
    """
    try:
        df = pd.read_csv(path)
    except FileNotFoundError as err:
        # Same exception type as before, but with a hint on how to recover
        # (e.g. on Colab the CSV has to be uploaded to the session first).
        raise FileNotFoundError(
            f"Could not find the dataset at {path!r}. Upload the CSV to the "
            "working directory or point DATA_PATH at its location."
        ) from err
    print("Original shape:", df.shape)
    df = mem_optimize(df)
    if 0 < sample_frac < 1.0:
        # Seeded so that the same rows are drawn on every run.
        df = df.sample(frac=sample_frac, random_state=RANDOM_STATE).reset_index(drop=True)
    print("Sample shape:", df.shape)
    return df

# Load data
df = load_data()

# Respect a TARGET chosen in the config cell; only auto-detect when it was left as None.
if TARGET is None:
    TARGET = detect_target_column(df)

# Fail here with a clear message instead of later at df.drop(columns=[TARGET]).
if TARGET is None or TARGET not in df.columns:
    raise ValueError(
        f"Target column {TARGET!r} not found in the data. "
        f"Set TARGET in the config cell to one of: {list(df.columns)}"
    )

print("Target:", TARGET)
df.head()


FileNotFoundError: [Errno 2] No such file or directory: 'Fraud.csv'

Handling Missing Values

In [None]:
def summarize_missing(df):
    """Compute the share of missing values per column, largest first.

    Prints only the columns that have at least one missing value and returns
    the full sorted Series (including columns with no missing values).
    """
    missing_share = df.isnull().mean()
    miss = missing_share.sort_values(ascending=False)
    has_gaps = miss.gt(0)
    print(miss.loc[has_gaps])
    return miss

def winsorize(df, cols, lower=0.01, upper=0.99):
    """Clip each column in ``cols`` to its ``lower``/``upper`` quantiles.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify. It is changed in place and also returned.
    cols : iterable of str
        Names of the numeric columns to clip.
    lower, upper : float
        Quantiles used as clipping bounds (defaults keep the original
        1st / 99th percentile behavior).

    Notes
    -----
    Columns with at most two distinct values are skipped. For a rare binary
    indicator both quantiles equal the majority value, so clipping would turn
    the column into a constant and erase the signal.
    """
    for c in cols:
        series = df[c]
        if series.nunique(dropna=True) <= 2:
            # Binary / constant column: clipping could only destroy information.
            continue
        low, high = series.quantile(lower), series.quantile(upper)
        df[c] = series.clip(lower=low, upper=high)
    return df

# Basic cleaning
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() only adds a stray "None" line to the output.
df.info()
print("Missing values:")
summarize_missing(df)

# Split the feature columns by dtype; the label must never be treated as a
# feature, whichever dtype it happens to have.
numeric_cols = [
    c for c in df.select_dtypes(include=[np.number]).columns.tolist() if c != TARGET
]
cat_cols = [
    c for c in df.select_dtypes(exclude=[np.number]).columns.tolist() if c != TARGET
]

print("Numeric cols:", len(numeric_cols))
print("Categorical cols:", len(cat_cols))

# NOTE(review): the clipping bounds are computed on the full frame, before the
# train/validation/test split — confirm this is acceptable for the evaluation.
df = winsorize(df, numeric_cols)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5541 entries, 0 to 5540
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   step            5541 non-null   int8   
 1   type            5541 non-null   object 
 2   amount          5541 non-null   float64
 3   nameOrig        5541 non-null   object 
 4   oldbalanceOrg   5541 non-null   float64
 5   newbalanceOrig  5541 non-null   float64
 6   nameDest        5541 non-null   object 
 7   oldbalanceDest  5541 non-null   float64
 8   newbalanceDest  5541 non-null   float64
 9   isFraud         5541 non-null   float32
 10  isFlaggedFraud  5541 non-null   float32
dtypes: float32(2), float64(5), int8(1), object(3)
memory usage: 395.1+ KB
None
Missing values:
Series([], dtype: float64)
Numeric cols: 7
Categorical cols: 3


Basic Feature Engineering


In [None]:
def feature_engineering(df, numeric_cols):
    """Add a ``<col>_log`` feature for every column in ``numeric_cols``.

    Each new column is ``log1p`` of the original values. ``log1p`` already
    handles zeros (``log1p(0) == 0``), so a column is shifted only when it
    contains negative values, and then by exactly ``-min`` so that its
    smallest value maps to 0. Non-negative columns are therefore all
    transformed the same way, whether or not they contain zeros.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend. It is modified in place and also returned.
    numeric_cols : iterable of str
        Names of the numeric columns to transform.
    """
    for c in numeric_cols:
        # Work in float64: small integer dtypes would otherwise overflow when
        # shifted and make np.log1p return a low-precision float16 result.
        values = df[c].astype("float64")
        col_min = values.min()
        shift = -col_min if col_min < 0 else 0.0
        df[c + '_log'] = np.log1p(values + shift)
    return df

# Append the *_log features for every numeric feature column.
df = feature_engineering(df=df, numeric_cols=numeric_cols)


Splitting the Data and Building the Preprocessing Pipeline

In [None]:
# Separate the features from the label column.
X = df.drop(columns=[TARGET])
y = df[TARGET]

# 70% train; the remaining 30% is split evenly into validation and test.
# Every split is stratified on the label and seeded for reproducibility.
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, stratify=y, random_state=RANDOM_STATE)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=RANDOM_STATE)

print(f"Train: {len(X_train)} | Val: {len(X_val)} | Test: {len(X_test)}")

# Numeric features: median imputation, then scaling with RobustScaler.
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", RobustScaler())
])
# Categorical features: most-frequent imputation, then dense one-hot encoding.
# Categories unseen during fit are ignored at transform time.
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
])

# Take the column lists from the training frame itself. The lists built during
# cleaning predate feature engineering, so using them here would let
# ColumnTransformer's default remainder="drop" silently discard every *_log column.
model_numeric_cols = X_train.select_dtypes(include=[np.number]).columns.tolist()
model_cat_cols = X_train.select_dtypes(exclude=[np.number]).columns.tolist()
# NOTE(review): the object columns include nameOrig / nameDest, which look like
# per-account identifiers — confirm they should be one-hot encoded at all.

preprocessor = ColumnTransformer([
    ("num", numeric_transformer, model_numeric_cols),
    ("cat", categorical_transformer, model_cat_cols)
])


Train: 3878 | Val: 831 | Test: 832


Training Logistic Regression

In [None]:
# L1-regularised logistic regression on top of the shared preprocessor.
# The saga solver shuffles the data, so it is seeded with RANDOM_STATE like
# every other stochastic step in this notebook; without it the fitted
# coefficients can differ from run to run.
log_reg = Pipeline([
    ("pre", preprocessor),
    ("clf", LogisticRegression(max_iter=2000, solver="saga", penalty="l1",
                               class_weight="balanced", random_state=RANDOM_STATE))
])

log_reg.fit(X_train, y_train)
# Probability of the positive class for each validation row.
y_pred_proba = log_reg.predict_proba(X_val)[:, 1]

auc = roc_auc_score(y_val, y_pred_proba)
print("Validation AUC (Logistic Regression):", auc)
# Hard labels at the 0.5 cut-off for the per-class report.
# NOTE(review): with very few positive rows in the validation split, both the
# AUC and this report are unstable — interpret them with care.
print(classification_report(y_val, (y_pred_proba >= 0.5).astype(int)))


Validation AUC (Logistic Regression): 1.0
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00       830
         1.0       0.00      0.00      0.00         1

    accuracy                           1.00       831
   macro avg       0.50      0.50      0.50       831
weighted avg       1.00      1.00      1.00       831



Training LightGBM

In [None]:
# LightGBM is an optional dependency: the import cell sets `lgb` to None when
# it is unavailable, in which case this model is skipped.
if lgb is None:
    print("LightGBM not installed. Skipping...")
else:
    lgb_model = Pipeline(steps=[
        ("pre", preprocessor),
        (
            "clf",
            lgb.LGBMClassifier(
                objective="binary",
                n_estimators=150,
                class_weight="balanced",
                random_state=RANDOM_STATE,
            ),
        ),
    ])
    lgb_model.fit(X_train, y_train)
    # Keep only the positive-class probability column.
    y_pred_lgb = lgb_model.predict_proba(X_val)[:, 1]
    print("Validation AUC (LightGBM):", roc_auc_score(y_val, y_pred_lgb))


[LightGBM] [Info] Number of positive: 3, number of negative: 3875
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000402 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1296
[LightGBM] [Info] Number of data points in the train set: 3878, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
Validation AUC (LightGBM): 0.7313253012048193


Saving the Model and Backtesting

In [None]:
# Persist the fitted logistic-regression pipeline.
joblib.dump(log_reg, MODEL_OUTPUT)
print(f"Model saved as {MODEL_OUTPUT}")

# Simple backtest idea: attach the logistic-regression scores and the true
# labels to the validation rows, then count what a fixed cut-off would flag.
df_val = X_val.assign(score=y_pred_proba).assign(**{TARGET: y_val.values})

threshold = 0.9
is_flagged = df_val["score"].ge(threshold)
flagged = df_val.loc[is_flagged]
print(f"Flagged {flagged.shape[0]} transactions (threshold {threshold})")
print("Detected frauds:", flagged[TARGET].sum())


Model saved as best_model.joblib
Flagged 0 transactions (threshold 0.9)
Detected frauds: 0.0
