### Imporing Libraries

In [4]:
import pandas as pd
import numpy as np
import os
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier, IsolationForest
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.optimizers import Adam
import tensorflow as tf

# Config
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)
tf.random.set_seed(RANDOM_STATE)
import warnings
warnings.filterwarnings("ignore")  #
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3" 



print("Environment ready for CDR ensemble training (warnings suppressed).")





### Load & Prepare Data

In [5]:
data_path = "../dataset/CDR_Dataset.csv"
df = pd.read_csv(data_path, parse_dates=["call_start_time"])

# Convert label column
df["label"] = df["label"].map({"Fraud": 1, "Not Fraud": 0})

# Drop irrelevant IDs and convert to numeric
drop_cols = ["call_id", "caller_id", "callee_id", "imei", "imsi", "cell_tower_id", "call_start_time"]
X = df.drop(columns=drop_cols + ["label"], errors="ignore")
y = df["label"]

# One-hot encode call_type if present
if "call_type" in X.columns:
    X = pd.get_dummies(X, columns=["call_type"], prefix="type")

# Fill missing values
X = X.fillna(0)

# Train-test split + SMOTE
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=RANDOM_STATE
)
sm = SMOTE(random_state=RANDOM_STATE, sampling_strategy=0.3)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)

print("Data loaded and balanced:")
print("Train shape:", X_train_res.shape, "| Test shape:", X_test.shape)
print("Fraud ratio:", y_train_res.mean().round(4))


Data loaded and balanced:
Train shape: (49429, 11) | Test shape: (10000, 11)
Fraud ratio: 0.2308


### Training Supervised Models 

### 1. XGBoost

In [6]:
xgb = XGBClassifier(
    colsample_bytree=0.8,
    learning_rate=0.2,
    max_depth=4,
    n_estimators=200,
    subsample=1.0,
    objective="binary:logistic",
    eval_metric="auc",
    tree_method="hist",
    random_state=RANDOM_STATE,
    use_label_encoder=False,
    n_jobs=-1
)

### 2. RandomForest 

In [7]:
rf = RandomForestClassifier(
    max_depth=10,
    min_samples_leaf=2,
    min_samples_split=10,
    n_estimators=200,
    class_weight="balanced",
    random_state=RANDOM_STATE,
    n_jobs=-1
)

### 3.  Logistic Regression

In [8]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_res)
X_test_scaled = scaler.transform(X_test)

lr_model = LogisticRegression(max_iter=1000, class_weight="balanced", random_state=RANDOM_STATE)

### Fittng Models 

In [9]:
xgb.fit(X_train_res, y_train_res)
rf.fit(X_train_res, y_train_res)
lr_model.fit(X_train_scaled, y_train_res)
print(" Supervised models trained successfully.")

 Supervised models trained successfully.


### Evaluating Supervised Models

In [10]:
def evaluate_model(model, X, y, name, scaled=False):
    preds = model.predict_proba(X)[:, 1]
    auc = roc_auc_score(y, preds)
    print(f"\nðŸ”¹ {name} ROC-AUC: {auc:.4f}")
    print(classification_report(y, preds > 0.5, digits=4))
    print("Confusion Matrix:\n", confusion_matrix(y, preds > 0.5))
    return preds

p_xgb = evaluate_model(xgb, X_test, y_test, "XGBoost")
p_rf = evaluate_model(rf, X_test, y_test, "RandomForest")
p_lr = evaluate_model(lr_model, X_test_scaled, y_test, "LogisticRegression", scaled=True)



ðŸ”¹ XGBoost ROC-AUC: 0.9944
              precision    recall  f1-score   support

           0     0.9963    0.9975    0.9969      9506
           1     0.9503    0.9291    0.9396       494

    accuracy                         0.9941     10000
   macro avg     0.9733    0.9633    0.9683     10000
weighted avg     0.9940    0.9941    0.9941     10000

Confusion Matrix:
 [[9482   24]
 [  35  459]]

ðŸ”¹ RandomForest ROC-AUC: 0.9947
              precision    recall  f1-score   support

           0     0.9966    0.9943    0.9955      9506
           1     0.8953    0.9352    0.9149       494

    accuracy                         0.9914     10000
   macro avg     0.9460    0.9648    0.9552     10000
weighted avg     0.9916    0.9914    0.9915     10000

Confusion Matrix:
 [[9452   54]
 [  32  462]]

ðŸ”¹ LogisticRegression ROC-AUC: 0.9275
              precision    recall  f1-score   support

           0     0.9901    0.9648    0.9772      9506
           1     0.5455    0.8138    0.

### Training Unsupervised Models 

### 1. Isolation Forest 

In [11]:
iso_train = X_train[y_train == 0]  # Only normal calls
iso_model = IsolationForest(contamination=0.02, random_state=RANDOM_STATE)
iso_model.fit(iso_train)

# Get anomaly scores (negative = anomaly)
s_iso = -iso_model.score_samples(X_test)
scaler_iso = MinMaxScaler()
p_iso = scaler_iso.fit_transform(s_iso.reshape(-1, 1)).ravel()

print("IsolationForest trained and anomaly scores normalized.")


IsolationForest trained and anomaly scores normalized.


### 2. Autoencoder

In [12]:
X_train_clean = X_train.select_dtypes(include=[np.number]).fillna(0)
X_test_clean = X_test.select_dtypes(include=[np.number]).fillna(0)

ae_train = X_train_clean[y_train == 0].values.astype("float32")
input_dim = ae_train.shape[1]
encoding_dim = int(input_dim / 2)

input_layer = Input(shape=(input_dim,))
encoder = Dense(encoding_dim, activation="relu")(input_layer)
decoder = Dense(input_dim, activation="sigmoid")(encoder)
autoencoder = Model(inputs=input_layer, outputs=decoder)

autoencoder.compile(optimizer=Adam(learning_rate=0.001), loss="mse")
autoencoder.fit(ae_train, ae_train, epochs=10, batch_size=256, shuffle=True, verbose=0)

X_test_np = X_test_clean.values.astype("float32")
reconstructions = autoencoder.predict(X_test_np, verbose=0)
reconstruction_error = np.mean(np.square(X_test_np - reconstructions), axis=1)

scaler_ae = MinMaxScaler()
p_ae = scaler_ae.fit_transform(reconstruction_error.reshape(-1, 1)).ravel()

print("Autoencoder trained and reconstruction errors normalized.")


Autoencoder trained and reconstruction errors normalized.


### Combine Ensemble Scores

In [13]:
results_df = pd.DataFrame({
    "p_xgb": p_xgb,
    "p_rf": p_rf,
    "p_lr": p_lr,
    "p_iso": p_iso,
    "p_ae": p_ae,
    "label": y_test.values
})

results_df["p_sup"] = results_df[["p_xgb", "p_rf", "p_lr"]].mean(axis=1)
results_df["p_unsup"] = results_df[["p_iso", "p_ae"]].mean(axis=1)
results_df["p_final"] = (0.7 * results_df["p_sup"]) + (0.3 * results_df["p_unsup"])

print("Combined ensemble scores computed.")
results_df.head()


Combined ensemble scores computed.


Unnamed: 0,p_xgb,p_rf,p_lr,p_iso,p_ae,label,p_sup,p_unsup,p_final
0,2.4e-05,0.021229,0.151931,0.101352,0.000143,0,0.057728,0.050748,0.055634
1,1e-06,0.001276,0.17423,0.083388,0.00115,0,0.058502,0.042269,0.053632
2,9.4e-05,0.012445,0.127224,0.091309,9.7e-05,0,0.046588,0.045703,0.046322
3,3e-06,0.001602,0.148108,0.137375,0.000647,0,0.049904,0.069011,0.055636
4,0.002255,0.015044,0.06306,0.444,4.3e-05,0,0.026786,0.222021,0.085357


### Risk Buckets 

In [15]:
def risk_bucket(p):
    if p >= 0.7:
        return "High"
    elif p >= 0.3:
        return "Medium"
    return "Low"

results_df["risk_bucket"] = results_df["p_final"].apply(risk_bucket)
print(" Risk buckets assigned:")
print(results_df["risk_bucket"].value_counts())

roc_final = roc_auc_score(results_df["label"], results_df["p_final"])
print(f"\n Final Ensemble ROC-AUC: {roc_final:.4f}")


 Risk buckets assigned:
risk_bucket
Low       9431
High       395
Medium     174
Name: count, dtype: int64

 Final Ensemble ROC-AUC: 0.9878


### Saving Models 

In [16]:
model_dir = "../models/cdr_ensemble"
os.makedirs(model_dir, exist_ok=True)

joblib.dump(xgb, f"{model_dir}/cdr_xgb.joblib")
joblib.dump(rf, f"{model_dir}/cdr_rf.joblib")
joblib.dump(lr_model, f"{model_dir}/cdr_lr.joblib")
joblib.dump(iso_model, f"{model_dir}/cdr_iso.joblib")
autoencoder.save(f"{model_dir}/cdr_autoencoder.keras")
joblib.dump(scaler, f"{model_dir}/scaler_lr.joblib")
joblib.dump(scaler_iso, f"{model_dir}/scaler_iso.joblib")
joblib.dump(scaler_ae, f"{model_dir}/scaler_ae.joblib")

results_path = "../dataset/cdr_ensemble_predictions.csv"
results_df.to_csv(results_path, index=False)

print(f" All models and predictions saved successfully to: {model_dir}")


 All models and predictions saved successfully to: ../models/cdr_ensemble
