In [4]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/ieee-fraud-detection/sample_submission.csv
/kaggle/input/ieee-fraud-detection/test_identity.csv
/kaggle/input/ieee-fraud-detection/train_identity.csv
/kaggle/input/ieee-fraud-detection/test_transaction.csv
/kaggle/input/ieee-fraud-detection/train_transaction.csv


In [6]:
df_train_tran = pd.read_csv('/kaggle/input/ieee-fraud-detection/train_transaction.csv')


In [7]:
df_train_id = pd.read_csv('/kaggle/input/ieee-fraud-detection/train_identity.csv')


In [8]:
df_test_tran = pd.read_csv('/kaggle/input/ieee-fraud-detection/test_transaction.csv')


In [9]:
df_test_id = pd.read_csv('/kaggle/input/ieee-fraud-detection/test_identity.csv')


In [10]:
df_train = pd.merge(df_train_tran, df_train_id, on='TransactionID', how='left')
df_test = pd.merge(df_test_tran, df_test_id, on='TransactionID', how='left')


In [11]:
X=df_train.drop(columns=['isFraud'])
y=df_train['isFraud']

In [12]:
from sklearn.model_selection import train_test_split
#same ratio
X_train, X_test, y_train, y_test = train_test_split(
    X, y, 
    test_size=0.2, 
    random_state=42, 
    stratify=y
)


In [13]:
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
import category_encoders as ce
#if null values>= 80percent -> drop
class DropNulls(BaseEstimator, TransformerMixin):
    def __init__(self, threshold=0.8):
        self.threshold = threshold
        self.columns_to_drop_ = None

    def fit(self, X, y=None):
        null_frac = X.isnull().mean()
        self.columns_to_drop_ = null_frac[null_frac > self.threshold].index.tolist()
        return self

    def transform(self, X):
        return X.drop(columns=self.columns_to_drop_, errors='ignore')

#correlated numvalues -> drop
class DropCorr(BaseEstimator, TransformerMixin):
    def __init__(self, threshold=0.9):
        self.threshold = threshold
        self.columns_to_drop_ = None

    def fit(self, X, y=None):
        X_num = X.select_dtypes(include=[np.number])
        corr_matrix = X_num.corr().abs()
        upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
        self.columns_to_drop_ = [column for column in upper.columns if any(upper[column] > self.threshold)]
        return self

    def transform(self, X):
        return X.drop(columns=self.columns_to_drop_, errors='ignore')


#cat handling
class SplitCatCols(BaseEstimator, TransformerMixin):
    def __init__(self, max_unique_for_onehot=3):
        self.max_unique_for_onehot = max_unique_for_onehot
        self.low_cardinality_cols_ = []
        self.high_cardinality_cols_ = []

    def fit(self, X, y=None):
        cat_cols = X.select_dtypes(include=['object', 'category']).columns
        for col in cat_cols:
            n_unique = X[col].nunique(dropna=False)
            if n_unique <= self.max_unique_for_onehot:
                self.low_cardinality_cols_.append(col)
            else:
                self.high_cardinality_cols_.append(col)
        return self

    def transform(self, X):
        return X  


In [14]:
def droptoomanynulls(threshold=0.8):
    return DropNulls(threshold=threshold)


In [16]:
from sklearn.compose import make_column_selector 

def woeandcathandling(max_unique_for_onehot=3):
    numerical_transformer = SimpleImputer(strategy='median')
    onehot_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    woe_encoder = ce.WOEEncoder()

    splitter = SplitCatCols(max_unique_for_onehot=max_unique_for_onehot)
    return Pipeline(steps=[
        ('split_cat', splitter),
        ('encode', ColumnTransformer(transformers=[
            ('num', numerical_transformer, make_column_selector(dtype_include=['int64', 'float64'])),
            ('onehot', onehot_encoder, make_column_selector(dtype_include=['object', 'category'])),
            ('woe', woe_encoder, make_column_selector(dtype_include=['object', 'category']))  
        ], remainder='drop'))
    ])

In [17]:
def corelation(threshold=0.9):
    return DropCorr(threshold=threshold)


In [18]:
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier

model_pipeline = Pipeline(steps=[
    ('droptoomanynulls', droptoomanynulls(threshold=0.8)),
    ('corelation', corelation(threshold=0.9)), 
    ('woeandcathandling', woeandcathandling(max_unique_for_onehot=3)),
    ('scaler', StandardScaler()),  # still keeping scaler, fine for RF even if not necessary
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))
])



In [19]:
%pip install mlflow
%pip install dagshub

Collecting mlflow
  Downloading mlflow-2.22.0-py3-none-any.whl.metadata (30 kB)
Collecting mlflow-skinny==2.22.0 (from mlflow)
  Downloading mlflow_skinny-2.22.0-py3-none-any.whl.metadata (31 kB)
Collecting graphene<4 (from mlflow)
  Downloading graphene-3.4.3-py2.py3-none-any.whl.metadata (6.9 kB)
Collecting gunicorn<24 (from mlflow)
  Downloading gunicorn-23.0.0-py3-none-any.whl.metadata (4.4 kB)
Collecting databricks-sdk<1,>=0.20.0 (from mlflow-skinny==2.22.0->mlflow)
  Downloading databricks_sdk-0.50.0-py3-none-any.whl.metadata (38 kB)
Collecting fastapi<1 (from mlflow-skinny==2.22.0->mlflow)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn<1 (from mlflow-skinny==2.22.0->mlflow)
  Downloading uvicorn-0.34.2-py3-none-any.whl.metadata (6.5 kB)
Collecting graphql-core<3.3,>=3.1 (from graphene<4->mlflow)
  Downloading graphql_core-3.2.6-py3-none-any.whl.metadata (11 kB)
Collecting graphql-relay<3.3,>=3.1 (from graphene<4->mlflow)
  Downloading graphql

In [21]:
import dagshub
dagshub.init(repo_owner='electrolizzys', repo_name='Fraud_Detection', mlflow=True)


Output()



Open the following link in your browser to authorize the client:
https://dagshub.com/login/oauth/authorize?state=06a66953-84b2-496f-a5d2-d772e97e67bb&client_id=32b60ba385aa7cecf24046d8195a71c07dd345d9657977863b52e7748e0f0f28&middleman_request_id=09fa092c6c8bd648e522a99cbfc497b29039087d14e67cd5e157cda7794461ea




In [22]:
import mlflow
import mlflow.sklearn

with mlflow.start_run(run_name="random_forest_run"):
    
    model_pipeline.fit(X_train, y_train)
    
    y_pred_proba = model_pipeline.predict_proba(X_test)[:, 1]
    
    from sklearn.metrics import roc_auc_score
    roc_auc = roc_auc_score(y_test, y_pred_proba)
    
    mlflow.log_metric("roc_auc", roc_auc)
    
    mlflow.sklearn.log_model(model_pipeline, artifact_path="model")

    print(f"Logged model with ROC AUC: {roc_auc:.4f}")


  return op(a, b)


Logged model with ROC AUC: 0.9370
🏃 View run random_forest_run at: https://dagshub.com/electrolizzys/Fraud_Detection.mlflow/#/experiments/0/runs/350cdeb5bbb24580ba3415f18adab99e
🧪 View experiment at: https://dagshub.com/electrolizzys/Fraud_Detection.mlflow/#/experiments/0


In [23]:
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier

model_pipeline2 = Pipeline(steps=[
    ('droptoomanynulls', droptoomanynulls(threshold=0.8)),
    ('corelation', corelation(threshold=0.9)), 
    ('woeandcathandling', woeandcathandling(max_unique_for_onehot=3)),
    ('scaler', StandardScaler()),
    ('classifier', RandomForestClassifier(
        n_estimators=300, 
        max_depth=10, 
        min_samples_split=5,
        random_state=42
    ))
])

import mlflow
import mlflow.sklearn

with mlflow.start_run(run_name="random_forest_ext"):
    
    model_pipeline2.fit(X_train, y_train)
    
    y_pred_proba = model_pipeline2.predict_proba(X_test)[:, 1]
    
    from sklearn.metrics import roc_auc_score
    roc_auc = roc_auc_score(y_test, y_pred_proba)
    
    mlflow.log_metric("roc_auc", roc_auc)
    
    mlflow.sklearn.log_model(model_pipeline2, artifact_path="model")

    print(f"Logged model with ROC AUC: {roc_auc:.4f}")



  return op(a, b)


Logged model with ROC AUC: 0.8625
🏃 View run random_forest_ext at: https://dagshub.com/electrolizzys/Fraud_Detection.mlflow/#/experiments/0/runs/ff366e752d9247c18e832ff63f144077
🧪 View experiment at: https://dagshub.com/electrolizzys/Fraud_Detection.mlflow/#/experiments/0
