In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/ieee-fraud-detection/sample_submission.csv
/kaggle/input/ieee-fraud-detection/test_identity.csv
/kaggle/input/ieee-fraud-detection/train_identity.csv
/kaggle/input/ieee-fraud-detection/test_transaction.csv
/kaggle/input/ieee-fraud-detection/train_transaction.csv


In [None]:
!pip install --upgrade scikit-learn==1.3.2 imbalanced-learn==0.11.0
import sklearn
print("Version:", sklearn.__version__)
print("Location:", sklearn.__file__)

In [None]:
!pip install dagshub mlflow
import dagshub
import mlflow
dagshub.init(repo_owner='dimna21', repo_name='ML_Assignment2', mlflow=True)

In [5]:
identity_train = pd.read_csv('/kaggle/input/ieee-fraud-detection/train_identity.csv')
transaction_train = pd.read_csv('/kaggle/input/ieee-fraud-detection/train_transaction.csv')

In [3]:
def print_stats(df):
    print('Columns:')
    print(df.columns)
    print('----------------------------')
    print(f'Missing counts:')
    print(df.isna().sum())
    print('----------------------------')

In [None]:
print_stats(transaction_train)

In [None]:
print_stats(identity_train)

In [6]:
print(transaction_train.shape)
print(identity_train.shape)
print(transaction_test.shape)
print(identity_test.shape)

(590540, 394)
(144233, 41)
(506691, 393)
(141907, 41)


In [7]:
df = transaction_train.merge(identity_train, on='TransactionID', how='left')

In [8]:
bad_cols = ['id_33','id_31','id_30','P_emaildomain','R_emaildomain', 'DeviceInfo']
cat_cols = [col for col in df.columns if df[col].dtype == 'object' and col not in bad_cols]
num_cols = [col for col in df.columns if df[col].dtype != 'object' and col not in ['TransactionID', 'TransactionDT','isFraud']]

print(len(cat_cols))
print(len(num_cols))

25
400


# Undersampling

In [76]:
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler

X = df.drop(columns='isFraud')
y = df['isFraud']

rus = RandomUnderSampler(sampling_strategy=0.3, random_state=42)
X_resampled, y_resampled = rus.fit_resample(X, y)

X_train, X_test, y_train, y_test = train_test_split(
    X_resampled, y_resampled, test_size=0.2, random_state=42, stratify=y_resampled
)


# NA filler

In [77]:
from sklearn.base import BaseEstimator, TransformerMixin

class NAfiller(BaseEstimator, TransformerMixin):
    def __init__(self, cat_cols, num_cols):
        self.cat_cols = cat_cols
        self.num_cols = num_cols
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X_copy = X.copy()
        #drop bad columns before encoding
        X_copy.drop(['id_33','id_31','id_30','P_emaildomain','R_emaildomain', 'DeviceInfo','TransactionID', 'TransactionDT'], axis=1, inplace=True)
        for cat in self.cat_cols:
            X_copy[cat] = X_copy[cat].fillna(f'no_{cat}')
        for num in self.num_cols:
            X_copy[num] = X_copy[num].fillna(0)
        return X_copy

In [78]:
na_filler = NAfiller(cat_cols, num_cols)
X_train = na_filler.fit_transform(X_train)
X_train.isna().sum().sum()

0

# WOE mappings

In [81]:
na_filler = NAfiller(cat_cols, num_cols)
df_filled = na_filler.transform(X_resampled)
woe_mappings = {}

for category in cat_cols:
    category_mapping = {}
    distinct_values = df_filled[category].unique()
    
    total_positive = y_resampled[y_resampled == 1].shape[0]
    total_negative = y_resampled[y_resampled == 0].shape[0]
    
    for value in distinct_values:
        times_positive = X_resampled[(X_resampled[category] == value) & (y_resampled == 1)].shape[0] + 1
        times_negative = X_resampled[(X_resampled[category] == value) & (y_resampled == 0)].shape[0] + 1
        
        weight = np.log((times_positive / total_positive) / (times_negative / total_negative))
        category_mapping[value] = weight
        
    woe_mappings[category] = category_mapping


In [82]:
from sklearn.base import BaseEstimator, TransformerMixin
import mlflow

class WoeEncoder(BaseEstimator, TransformerMixin):
    
    def __init__(self, woe_mappings):
        self.woe_mappings = woe_mappings
        
    def fit(self, X, y=None):

        mlflow.log_dict(self.woe_mappings, artifact_file="woe_mappings.json")

        return self
    
    def transform(self, X):
        X_copy = X.copy()
        for col in self.woe_mappings:
            mapping = self.woe_mappings[col]
            X_copy[col] = X_copy[col].replace(mapping)
        return X_copy

In [83]:
encoder = WoeEncoder(woe_mappings)
X_train = encoder.fit_transform(X_train)

Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`


# Correlation filter

In [84]:
threshold=0.8
corr_matrix = X_train.corr().abs()
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
high_corr_pairs = []

for i in range(len(corr_matrix.columns)):
    for j in range(i + 1, len(corr_matrix.columns)):
        if corr_matrix.iloc[i, j] > threshold:
            high_corr_pairs.append((corr_matrix.columns[i], corr_matrix.columns[j], corr_matrix.iloc[i, j]))

features_to_drop = []

for feat1, feat2, _ in high_corr_pairs:
    if abs(X_train[feat1].corr(y)) < abs(X_train[feat2].corr(y_train)):
        features_to_drop.append(feat1)
    else:
        features_to_drop.append(feat2)

features_to_drop = list(set(features_to_drop))

In [88]:
class CorrelationFilter(BaseEstimator, TransformerMixin):
    def __init__(self, features_to_drop):
        self.threshold = 0.8
        self.features_to_drop = features_to_drop

    def fit(self, X, y=None):
        mlflow.log_dict({"features_to_drop": self.features_to_drop}, artifact_file="features_to_drop.json")
        return self

    def transform(self, X):
        return X.drop(columns=self.features_to_drop)

In [89]:
corr_filt = CorrelationFilter(features_to_drop)
X_train = corr_filt.fit_transform(X_train)

# RFE

In [None]:
from xgboost import XGBRegressor
from sklearn.feature_selection import RFE

estimator = XGBRegressor(
    n_estimators=200,
    max_depth=8,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    n_jobs=-1,
    random_state=42,
    verbosity=0
)

rfe = RFE(estimator=estimator, n_features_to_select=100)
rfe.fit(X_train, y_train)

selected_features = X_train.columns[rfe.support_].tolist()

In [92]:
class FeatureSelectorRFE(BaseEstimator, TransformerMixin):
    def __init__(self, selected_features):
        self.selected_features = selected_features

    def fit(self, X, y=None):
        mlflow.log_dict({"selected_features": self.selected_features}, artifact_file="selected_features.json")
        return self

    def transform(self, X):
        return X[self.selected_features]


In [93]:
rfe = FeatureSelectorRFE(selected_features)
X_train = rfe.fit_transform(X_train)

In [123]:
import shap
import tempfile
from sklearn.metrics import (
    precision_score, recall_score, f1_score,
    precision_recall_curve, roc_auc_score, roc_curve, confusion_matrix
)
import matplotlib.pyplot as plt
import pandas as pd
import mlflow.sklearn
import os
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.under_sampling import RandomUnderSampler
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
import shap


X_train, X_test, y_train, y_test = train_test_split(
    X_resampled, y_resampled, test_size=0.2, random_state=42, stratify=y_resampled
)

mlflow.set_experiment("iter_2")
T = 0.5  # Prediction threshold

# Imbalanced pipeline with RandomUnderSampler
imb_pipeline = ImbPipeline([
    ('undersampler', RandomUnderSampler(random_state=42, sampling_strategy=0.3)),
    ('na_fill', NAfiller(cat_cols=cat_cols, num_cols=num_cols)),
    ('woe', WoeEncoder(woe_mappings=woe_mappings)),
    ('corr_filter', CorrelationFilter(features_to_drop=features_to_drop)),
    ('feature_select', FeatureSelectorRFE(selected_features=selected_features)),
    
    ('clf', AdaBoostClassifier(
    base_estimator=DecisionTreeClassifier(max_depth=1),
    n_estimators=1500, 
    learning_rate=1.5,
    random_state=42
    ))

])

with mlflow.start_run(run_name="iter2_AdaBoost6", nested=True) as run:
    imb_pipeline.fit(X_train, y_train)

    mlflow.sklearn.log_model(imb_pipeline, "model")
    ada_model = imb_pipeline.named_steps["clf"]
    mlflow.log_params({
        "sampling_strategy": 0.3,
        "n_estimators": ada_model.n_estimators,
        "learning_rate": ada_model.learning_rate,
        "random_state": ada_model.random_state,
    })

    def log_metrics_and_curves(X, y, split, threshold=0.25):
        probas = imb_pipeline.predict_proba(X)[:, 1]
        preds = (probas >= threshold).astype(int)

        prec = precision_score(y, preds)
        rec = recall_score(y, preds)
        f1 = f1_score(y, preds)
        auc = roc_auc_score(y, probas)

        mlflow.log_metrics({
            f"{split}_precision": prec,
            f"{split}_recall": rec,
            f"{split}_f1_score": f1,
            f"{split}_auc": auc,
            f"{split}_threshold": T
        })

        # Precision-Recall curve
        precs, recalls, _ = precision_recall_curve(y, probas)
        plt.figure()
        plt.plot(recalls, precs, label="PR Curve")
        plt.axvline(x=rec, linestyle='--', color='r', label=f"Recall@thresh={rec:.2f}")
        plt.xlabel("Recall")
        plt.ylabel("Precision")
        plt.title(f"{split.capitalize()} Precision-Recall Curve")
        plt.legend()
        pr_path = f"{split}_pr_curve.png"
        plt.savefig(pr_path)
        plt.close()
        mlflow.log_artifact(pr_path)

        # ROC curve
        fpr, tpr, _ = roc_curve(y, probas)
        plt.figure()
        plt.plot(fpr, tpr, label=f"AUC = {auc:.3f}")
        plt.plot([0, 1], [0, 1], linestyle='--')
        plt.xlabel("False Positive Rate")
        plt.ylabel("True Positive Rate")
        plt.title(f"{split.capitalize()} ROC Curve")
        plt.legend()
        roc_path = f"{split}_roc_curve.png"
        plt.savefig(roc_path)
        plt.close()
        mlflow.log_artifact(roc_path)

        # Confusion matrix
        cm = confusion_matrix(y, preds)
        plt.figure(figsize=(6, 6))
        plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
        plt.title(f"{split.capitalize()} Confusion Matrix")
        plt.colorbar()
        tick_marks = range(2)
        plt.xticks(tick_marks, ["Non-Fraud", "Fraud"])
        plt.yticks(tick_marks, ["Non-Fraud", "Fraud"])
        plt.ylabel('True label')
        plt.xlabel('Predicted label')
        
        # Plot the numbers inside the matrix
        thresh = cm.max() / 2.
        for i in range(cm.shape[0]):
            for j in range(cm.shape[1]):
                plt.text(j, i, format(cm[i, j], 'd'),
                         horizontalalignment="center",
                         color="white" if cm[i, j] > thresh else "black",
                         fontsize=16)

        cm_path = f"{split}_confusion_matrix.png"
        plt.savefig(cm_path)
        plt.close()
        mlflow.log_artifact(cm_path)


    # Log train and test metrics
    log_metrics_and_curves(X_train, y_train, split="train", threshold=T)
    log_metrics_and_curves(X_test, y_test, split="test", threshold=T)
    print(f"Run ID: {run.info.run_id}")


Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`
`base_estimator` was renamed to `estimator` in version 1.2 and will be removed in 1.4.
Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`
Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`


Run ID: e0ad2a58e3ae4b09a5650d483573a33e
🏃 View run iter2_AdaBoost6 at: https://dagshub.com/dimna21/ML_Assignment2.mlflow/#/experiments/15/runs/e0ad2a58e3ae4b09a5650d483573a33e
🧪 View experiment at: https://dagshub.com/dimna21/ML_Assignment2.mlflow/#/experiments/15
