# In [1]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score,
    roc_auc_score,
    precision_score,
    recall_score,
    f1_score,
    matthews_corrcoef
)
from modelPY.featuresp import data_process
# Load the raw CSV, then let the project helper split it into features/target.
# NOTE(review): data_process is defined in modelPY.featuresp; what it does when
# target_column=None is not visible here — confirm against that module.
df = pd.read_csv("data/UCI_Credit_Card.csv")
processed = data_process(df, target_column=None)
X, y, target_column = processed

def rf_classifier(X, y, threshold=0.45):
    """Train a random-forest pipeline and score it on a held-out split.

    Columns of dtype int64/float64 are mean-imputed and standardized; columns
    of dtype object are imputed with the most frequent value and one-hot
    encoded. Columns of any other dtype are not listed in the
    ColumnTransformer and therefore fall under its default ``remainder``
    handling.

    Args:
        X: Feature table; must support ``select_dtypes`` (e.g. a pandas
            DataFrame).
        y: Target labels, also used to stratify the split. The thresholded
            predictions are 0/1 integers, so the metrics assume labels
            encoded as 0/1 with 1 as the positive class — confirm with the
            caller.
        threshold: Probability cutoff for predicting the positive class.
            The default of 0.45 reproduces the previous behavior: the old
            code looped over [0.25, 0.3, 0.35, 0.4, 0.45] but overwrote
            ``y_pred`` on every pass, so only the last value ever reached
            the metrics.

    Returns:
        A ``(metrics, y_test, y_pred)`` tuple where ``metrics`` is a dict
        with keys "accuracy", "auc", "precision", "recall", "f1" and "mcc",
        ``y_test`` is the held-out target and ``y_pred`` the thresholded
        0/1 predictions for it.
    """
    # Column groups that receive dtype-specific imputation/encoding.
    numerical_cols = X.select_dtypes(include=["int64", "float64"]).columns
    categorical_cols = X.select_dtypes(include=["object"]).columns

    # 80% train / 20% test, stratified on y; random_state=40 fixes the split.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=40, stratify=y
    )

    # -------------------------------------------------
    # Pipelines
    # -------------------------------------------------
    num_pipeline = Pipeline([
        ("imputer", SimpleImputer(strategy="mean")),
        ("scaler", StandardScaler()),
    ])

    cat_pipeline = Pipeline([
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(handle_unknown="ignore")),
    ])

    preprocessor = ColumnTransformer([
        ("num", num_pipeline, numerical_cols),
        ("cat", cat_pipeline, categorical_cols),
    ])

    # -------------------------------------------------
    # Final Model Pipeline
    # -------------------------------------------------
    model = Pipeline([
        ("prep", preprocessor),
        ("rf", RandomForestClassifier(
            n_estimators=300,
            max_depth=20,
            class_weight="balanced",
            random_state=42,
            n_jobs=-1,
        )),
    ])

    # Preprocessing is fitted inside the pipeline, i.e. on training rows only.
    model.fit(X_train, y_train)

    # Predictions: probability of the second class, cut at `threshold`.
    y_prob = model.predict_proba(X_test)[:, 1]
    y_pred = (y_prob >= threshold).astype(int)

    # Evaluation metrics: AUC uses the raw probabilities, the rest use the
    # thresholded labels.
    baseline_metrics = {
        "accuracy": accuracy_score(y_test, y_pred),
        "auc": float(roc_auc_score(y_test, y_prob)),
        "precision": precision_score(y_test, y_pred),
        "recall": recall_score(y_test, y_pred),
        "f1": f1_score(y_test, y_pred),
        "mcc": float(matthews_corrcoef(y_test, y_pred)),
    }
    return baseline_metrics, y_test, y_pred

# Fit and evaluate the model, then show the metric dictionary.
# xab / yab receive the held-out targets and the predicted labels.
evaluation = rf_classifier(X, y)
result, xab, yab = evaluation

print(result)


# Output of the cell above:
# {'accuracy': 0.8153333333333334, 'auc': 0.7804247846863872, 'precision': 0.6026241799437676, 'recall': 0.4845516201959307, 'f1': 0.5371762740183793, 'mcc': 0.4274553171783341}
