In [1]:
import subprocess, os, time
import cml.workers_v1 as workers

DASHBOARD_PORT = os.environ['CDSW_READONLY_PORT']
DASHBOARD_IP = os.environ['CDSW_IP_ADDRESS']

# use num-cpus=0 when start a head node to prevent this node from performing task/actor computation.
command = "ray start --head --block --include-dashboard=true --dashboard-port=$CDSW_READONLY_PORT --num-cpus=0 --num-gpus=0 &" 
subprocess.run(command, shell = True, executable="/bin/bash")

with open("RAY_HEAD_IP", 'w') as output_file:
    output_file.write(DASHBOARD_IP)
            
ray_head_addr = DASHBOARD_IP + ':6379'
ray_url = f"ray://{DASHBOARD_IP}:10001" 
worker_start_cmd = f"!ray start --block --address={ray_head_addr}"

time.sleep(7)
ray_workers = workers.launch_workers(
    n=5, 
    cpu=1, 
    memory=20,
    nvidia_gpu=0,
    code=worker_start_cmd,
)

Skipping addon with invalid or excluded ID: {'type': 'cmladdon', 'path': '/runtime-addons/cmladdon-2.0.49-b279', 'spec': '\nenv:\n  MLFLOW_TRACKING_URI: cml://localhost\n  MLFLOW_REGISTRY_URI: cml://localhost\n  PYTHONPATH: ${PYTHONPATH}:/opt/cmladdons/python/site-customize\n  R_LIBS_SITE: ${R_LIBS_SITE}:/opt/cmladdons/r/libs\npaths:\n  - /opt/cmladdons', 'version': '', 'id': -1}
2025-07-16 06:09:36,850	INFO usage_lib.py:467 -- Usage stats collection is enabled by default without user confirmation because this terminal is detected to be non-interactive. To disable this, add `--disable-usage-stats` to the command that starts the cluster, or run the following command: `ray disable-usage-stats` before starting the cluster. See https://docs.ray.io/en/master/cluster/usage-stats.html for more details.
2025-07-16 06:09:36,850	INFO scripts.py:971 -- [37mLocal node IP[39m: [1m10.42.1.212[22m
2025-07-16 06:09:51,998	SUCC scripts.py:1007 -- [32m--------------------[39m
2025-07-16 06:09:51,9

In [2]:
import ray
from ray.data import Dataset
from ray.train import ScalingConfig
from ray.train.xgboost import XGBoostTrainer
import xgboost as xgb
import pandas as pd
from sklearn.metrics import classification_report, confusion_matrix
import time
import os

def calculate_all_features_for_group(group_df: pd.DataFrame) -> pd.DataFrame:
    """Calculates aggregated features for a single user (a group of records)."""
    group_df['is_fraud'] = group_df['is_fraud'].astype(bool)
    nocturnal_hours = (group_df['hour_of_day'] >= 22) | (group_df['hour_of_day'] <= 6)
    features = {
        'total_calls': len(group_df),
        'outgoing_call_ratio': (group_df['call_direction'] == 'outgoing').mean(),
        'avg_duration': group_df['duration'].mean(),
        'std_duration': group_df['duration'].std(),
        'nocturnal_call_ratio': nocturnal_hours.mean(),
        'mobility': group_df['cell_tower'].nunique(),
        'is_fraud': group_df['is_fraud'].iloc[0]
    }
    return pd.DataFrame([features], index=[group_df['msisdn'].iloc[0]])

def feature_engineering_ray(ds: ray.data.Dataset) -> ray.data.Dataset:
    """Performs feature engineering on the raw call data using Ray Data."""
    print("Performing feature engineering on the entire dataset...")
    # Group by 'msisdn' and apply the feature calculation function to each group.
    # This creates one row of features for each unique msisdn.
    user_features_ds = ds.groupby('msisdn').map_groups(
        calculate_all_features_for_group
    )
    return user_features_ds

def prepare_data(dataset: Dataset) -> tuple[Dataset, Dataset, Dataset]:
    """Splits the dataset into train (70%), validation (15%), and test (15%) sets."""
    print("\nSplitting engineered data into training, validation, and test sets...")
    seed = 42
    # First, split into training (70%) and the rest (30%)
    train_dataset, rest_dataset = dataset.train_test_split(test_size=0.3, shuffle=True, seed=seed)
    # Split the rest (30%) evenly into validation (15%) and test (15%)
    valid_dataset, test_dataset = rest_dataset.train_test_split(test_size=0.5, shuffle=True, seed=seed)
    
    print(f"Train set size: {train_dataset.count()}")
    print(f"Validation set size: {valid_dataset.count()}")
    print(f"Test set size: {test_dataset.count()}")
    
    return train_dataset, valid_dataset, test_dataset

def train_fraud_detection_model_xgb_ray(train_ds: ray.data.Dataset, valid_ds: ray.data.Dataset):
    print("\nTraining the XGBoost model with Ray Train...")

    # Fill any potential NaN values that resulted from feature engineering (e.g., std_duration for a single call)
    train_ds = train_ds.map_batches(lambda df: df.fillna(0), batch_format="pandas")
    valid_ds = valid_ds.map_batches(lambda df: df.fillna(0), batch_format="pandas")

    print("Calculating scale_pos_weight from training data for class imbalance...")
    num_fraud = train_ds.filter(lambda row: row["is_fraud"] == True).count()
    num_non_fraud = train_ds.filter(lambda row: row["is_fraud"] == False).count()

    if num_fraud > 0 and num_non_fraud > 0:
        scale_pos_weight = num_non_fraud / num_fraud
        print(f"scale_pos_weight determined to be: {scale_pos_weight:.2f}")
    else:
        scale_pos_weight = 1.0
        print("Warning: Insufficient classes to calculate scale_pos_weight. Defaulting to 1.0.")

    xgb_params = {
        "objective": "binary:logistic",
        "eval_metric": ["logloss", "error"],
        "tree_method": "hist",
        "scale_pos_weight": scale_pos_weight,
        "random_state": 42,
    }

    label_column = 'is_fraud'
    
    trainer = XGBoostTrainer(
        scaling_config=ScalingConfig(num_workers=5, use_gpu=False),
        label_column=label_column,
        params=xgb_params,
        datasets={"train": train_ds, "valid": valid_ds},
        # Add num_boost_round to be comparable with Dask's n_estimators
        num_boost_round=100 
    )

    result = trainer.fit()
    print("\nModel Training Complete.")
    
    # Use the best checkpoint based on validation loss to get the best model
    best_checkpoint = result.get_best_checkpoint(metric="valid-logloss", mode="min")
    
    if best_checkpoint:
        print("Loading model from the best checkpoint...")
        with best_checkpoint.as_directory() as checkpoint_dir:
            # The model file might be model.xgb or model.ubj depending on the XGBoost version
            model_path_ubj = os.path.join(checkpoint_dir, "model.ubj")
            model_path_xgb = os.path.join(checkpoint_dir, "model.xgb")
            
            model_path = model_path_ubj if os.path.exists(model_path_ubj) else model_path_xgb

            if os.path.exists(model_path):
                booster = xgb.Booster()
                booster.load_model(model_path)
                return booster
    
    print("Could not load a model from checkpoint.")
    return None

def evaluate_model(booster: xgb.Booster, test_ds: ray.data.Dataset):
    """Evaluates the trained model on the unseen test dataset."""
    print("\n--- Model Evaluation on Unseen Test Data ---")

    test_ds = test_ds.map_batches(lambda df: df.fillna(0), batch_format="pandas")

    feature_columns = [col for col in test_ds.columns() if col != 'is_fraud']
    label_column = 'is_fraud'
    
    # Convert the test set to a pandas DataFrame for evaluation
    test_df = test_ds.to_pandas()
    
    if test_df.empty:
        print("Test dataset is empty. Cannot evaluate.")
        return

    X_test = test_df[feature_columns]
    y_test = test_df[label_column]
    dmatrix_test = xgb.DMatrix(X_test)
    
    # Get predictions
    y_pred_proba = booster.predict(dmatrix_test)
    y_pred = (y_pred_proba > 0.5).astype(int)

    print("\nConfusion Matrix (Test Data):")
    print(confusion_matrix(y_test, y_pred))
    print("\nClassification Report (Test Data):")
    print(classification_report(y_test, y_pred))

    # Display feature importances from the final model
    feature_scores = booster.get_score(importance_type='weight')
    if feature_scores:
        feature_importances = pd.Series(feature_scores).sort_values(ascending=False)
        print("\nFeature Importances:")
        print(feature_importances)


if __name__ == '__main__':
    # Initialize Ray
    # ray.init() # Use this if not connecting to an existing cluster

    raw_data_filename = '3G_cdr_data.csv'
    model_output_filename = 'fraud_detection_model_xgb_ray.json'

    try:
        print(f"\nReading '{raw_data_filename}' with Ray Data...")
        raw_ds = ray.data.read_csv(raw_data_filename)
    except Exception as e:
        print(f"Error reading raw data file: {e}")
        exit()

    start_time = time.time()

    # 1. Perform feature engineering on the *entire* dataset first.
    features_ds = feature_engineering_ray(raw_ds)

    # 2. Split the *engineered* data into three sets.
    train_ds, valid_ds, test_ds = prepare_data(features_ds)

    # 3. Train the model on the correctly engineered training and validation sets.
    fraud_model_booster = train_fraud_detection_model_xgb_ray(train_ds, valid_ds)

    # 4. Evaluate the model and save the final result.
    if fraud_model_booster:
        evaluate_model(fraud_model_booster, test_ds)
        
        fraud_model_booster.save_model(model_output_filename)
        print(f"\nTrained XGBoost model saved to '{model_output_filename}'")
    else:
        print("Model training failed, so no evaluation or saving was performed.")

    print(f"\nProcess complete in {time.time() - start_time:.2f} seconds.")
    
    # ray.shutdown()

2025-07-16 06:11:13,708	INFO worker.py:1723 -- Connecting to existing Ray cluster at address: 10.42.1.212:6379...
2025-07-16 06:11:13,744	INFO worker.py:1908 -- Connected to Ray cluster. View the dashboard at [1m[32m127.0.0.1:8100 [39m[22m



Reading '3G_cdr_data.csv' with Ray Data...


2025-07-16 06:11:17,345	INFO logging.py:295 -- Registered dataset logger for dataset dataset_5_0


Performing feature engineering on the entire dataset...

Splitting engineered data into training, validation, and test sets...


2025-07-16 06:11:17,549	INFO streaming_executor.py:117 -- Starting execution of Dataset dataset_5_0. Full logs are in /tmp/ray/session_2025-07-16_06-09-36_851382_143/logs/ray-data
2025-07-16 06:11:17,551	INFO streaming_executor.py:118 -- Execution plan of Dataset dataset_5_0: InputDataBuffer[Input] -> TaskPoolMapOperator[ReadCSV] -> AllToAllOperator[Sort] -> AllToAllOperator[MapBatches(calculate_all_features_for_group)->RandomShuffle] -> AggregateNumRows[AggregateNumRows]


Running 0:   0%|          | 0.00/1.00 [00:00<?, ? row/s]

- ReadCSV->SplitBlocks(107) 1: 0.00 row [00:00, ? row/s]

- Sort 2: 0.00 row [00:00, ? row/s]

Sort Sample 3:   0%|          | 0.00/1.00 [00:00<?, ? row/s]

Shuffle Map 4:   0%|          | 0.00/1.00 [00:00<?, ? row/s]

Shuffle Reduce 5:   0%|          | 0.00/1.00 [00:00<?, ? row/s]

- MapBatches(calculate_all_features_for_group)->RandomShuffle 6: 0.00 row [00:00, ? row/s]

Shuffle Map 7:   0%|          | 0.00/1.00 [00:00<?, ? row/s]

Shuffle Reduce 8:   0%|          | 0.00/1.00 [00:00<?, ? row/s]

- AggregateNumRows 9:   0%|          | 0.00/1.00 [00:00<?, ? row/s]

2025-07-16 06:12:58,133	INFO streaming_executor.py:227 -- ✔️  Dataset dataset_5_0 execution finished in 100.57 seconds
2025-07-16 06:12:58,181	INFO logging.py:295 -- Registered dataset logger for dataset dataset_4_0
2025-07-16 06:12:58,202	INFO streaming_executor.py:117 -- Starting execution of Dataset dataset_4_0. Full logs are in /tmp/ray/session_2025-07-16_06-09-36_851382_143/logs/ray-data
2025-07-16 06:12:58,204	INFO streaming_executor.py:118 -- Execution plan of Dataset dataset_4_0: InputDataBuffer[Input] -> TaskPoolMapOperator[ReadCSV] -> AllToAllOperator[Sort] -> AllToAllOperator[MapBatches(calculate_all_features_for_group)->RandomShuffle]


Running 0: 0.00 row [00:00, ? row/s]

- ReadCSV->SplitBlocks(107) 1: 0.00 row [00:00, ? row/s]

- Sort 2: 0.00 row [00:00, ? row/s]

Sort Sample 3:   0%|          | 0.00/1.00 [00:00<?, ? row/s]

Shuffle Map 4:   0%|          | 0.00/1.00 [00:00<?, ? row/s]

Shuffle Reduce 5:   0%|          | 0.00/1.00 [00:00<?, ? row/s]

- MapBatches(calculate_all_features_for_group)->RandomShuffle 6: 0.00 row [00:00, ? row/s]

Shuffle Map 7:   0%|          | 0.00/1.00 [00:00<?, ? row/s]

Shuffle Reduce 8:   0%|          | 0.00/1.00 [00:00<?, ? row/s]

2025-07-16 06:13:51,403	INFO streaming_executor.py:227 -- ✔️  Dataset dataset_4_0 execution finished in 53.19 seconds
2025-07-16 06:13:51,722	INFO logging.py:295 -- Registered dataset logger for dataset dataset_8_0
2025-07-16 06:13:51,728	INFO streaming_executor.py:117 -- Starting execution of Dataset dataset_8_0. Full logs are in /tmp/ray/session_2025-07-16_06-09-36_851382_143/logs/ray-data
2025-07-16 06:13:51,730	INFO streaming_executor.py:118 -- Execution plan of Dataset dataset_8_0: InputDataBuffer[Input] -> AllToAllOperator[RandomShuffle]


Running 0: 0.00 row [00:00, ? row/s]

- RandomShuffle 1: 0.00 row [00:00, ? row/s]

Shuffle Map 2:   0%|          | 0.00/1.00 [00:00<?, ? row/s]

Shuffle Reduce 3:   0%|          | 0.00/1.00 [00:00<?, ? row/s]

2025-07-16 06:13:52,510	INFO streaming_executor.py:227 -- ✔️  Dataset dataset_8_0 execution finished in 0.77 seconds
2025-07-16 06:13:52,627	INFO logging.py:295 -- Registered dataset logger for dataset dataset_14_0
2025-07-16 06:13:52,634	INFO streaming_executor.py:117 -- Starting execution of Dataset dataset_14_0. Full logs are in /tmp/ray/session_2025-07-16_06-09-36_851382_143/logs/ray-data
2025-07-16 06:13:52,635	INFO streaming_executor.py:118 -- Execution plan of Dataset dataset_14_0: InputDataBuffer[Input] -> TaskPoolMapOperator[MapBatches(<lambda>)->Filter(<lambda>)] -> AggregateNumRows[AggregateNumRows]


Train set size: 140000
Validation set size: 30000
Test set size: 30000

Training the XGBoost model with Ray Train...
Calculating scale_pos_weight from training data for class imbalance...


Running 0:   0%|          | 0.00/1.00 [00:00<?, ? row/s]

- MapBatches(<lambda>)->Filter(<lambda>) 1: 0.00 row [00:00, ? row/s]

- AggregateNumRows 2:   0%|          | 0.00/1.00 [00:00<?, ? row/s]

2025-07-16 06:13:59,599	INFO streaming_executor.py:227 -- ✔️  Dataset dataset_14_0 execution finished in 6.94 seconds
2025-07-16 06:13:59,627	INFO logging.py:295 -- Registered dataset logger for dataset dataset_16_0
2025-07-16 06:13:59,636	INFO streaming_executor.py:117 -- Starting execution of Dataset dataset_16_0. Full logs are in /tmp/ray/session_2025-07-16_06-09-36_851382_143/logs/ray-data
2025-07-16 06:13:59,638	INFO streaming_executor.py:118 -- Execution plan of Dataset dataset_16_0: InputDataBuffer[Input] -> TaskPoolMapOperator[MapBatches(<lambda>)->Filter(<lambda>)] -> AggregateNumRows[AggregateNumRows]


Running 0:   0%|          | 0.00/1.00 [00:00<?, ? row/s]

- MapBatches(<lambda>)->Filter(<lambda>) 1: 0.00 row [00:00, ? row/s]

- AggregateNumRows 2:   0%|          | 0.00/1.00 [00:00<?, ? row/s]

2025-07-16 06:14:06,058	INFO streaming_executor.py:227 -- ✔️  Dataset dataset_16_0 execution finished in 6.41 seconds
2025-07-16 06:14:06,240	INFO tune.py:616 -- [output] This uses the legacy output and progress reporter, as Jupyter notebooks are not supported by the new engine, yet. For more information, please see https://github.com/ray-project/ray/issues/36949


scale_pos_weight determined to be: 19.00
== Status ==
Current time: 2025-07-16 06:14:06 (running for 00:00:00.15)
Using FIFO scheduling algorithm.
Logical resource usage: 6.0/160 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/session_2025-07-16_06-09-36_851382_143/artifacts/2025-07-16_06-14-06/XGBoostTrainer_2025-07-16_06-14-06/driver_artifacts
Number of trials: 1/1 (1 PENDING)


== Status ==
Current time: 2025-07-16 06:14:11 (running for 00:00:05.20)
Using FIFO scheduling algorithm.
Logical resource usage: 6.0/160 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/session_2025-07-16_06-09-36_851382_143/artifacts/2025-07-16_06-14-06/XGBoostTrainer_2025-07-16_06-14-06/driver_artifacts
Number of trials: 1/1 (1 PENDING)


== Status ==
Current time: 2025-07-16 06:14:16 (running for 00:00:10.27)
Using FIFO scheduling algorithm.
Logical resource usage: 6.0/160 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/session_2025-07-16_06-09-36_851382_143/artifacts/2025-07-16_06-14-06/XGBoostTrainer_2025-07-16_06-14-06/driver_art

[36m(XGBoostTrainer pid=14109, ip=10.42.3.24)[0m Started distributed worker processes: 
[36m(XGBoostTrainer pid=14109, ip=10.42.3.24)[0m - (node_id=ab45681e33ef7beafd12c0910f2ddab578d6fdbf4bd1a9c9e1a44719, ip=10.42.3.24, pid=14183) world_rank=0, local_rank=0, node_rank=0
[36m(XGBoostTrainer pid=14109, ip=10.42.3.24)[0m - (node_id=ab45681e33ef7beafd12c0910f2ddab578d6fdbf4bd1a9c9e1a44719, ip=10.42.3.24, pid=14182) world_rank=1, local_rank=1, node_rank=0
[36m(XGBoostTrainer pid=14109, ip=10.42.3.24)[0m - (node_id=ab45681e33ef7beafd12c0910f2ddab578d6fdbf4bd1a9c9e1a44719, ip=10.42.3.24, pid=14184) world_rank=2, local_rank=2, node_rank=0
[36m(XGBoostTrainer pid=14109, ip=10.42.3.24)[0m - (node_id=ab45681e33ef7beafd12c0910f2ddab578d6fdbf4bd1a9c9e1a44719, ip=10.42.3.24, pid=14185) world_rank=3, local_rank=3, node_rank=0
[36m(XGBoostTrainer pid=14109, ip=10.42.3.24)[0m - (node_id=ab45681e33ef7beafd12c0910f2ddab578d6fdbf4bd1a9c9e1a44719, ip=10.42.3.24, pid=14186) world_rank=4, local_

== Status ==
Current time: 2025-07-16 06:14:37 (running for 00:00:30.48)
Using FIFO scheduling algorithm.
Logical resource usage: 6.0/160 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/session_2025-07-16_06-09-36_851382_143/artifacts/2025-07-16_06-14-06/XGBoostTrainer_2025-07-16_06-14-06/driver_artifacts
Number of trials: 1/1 (1 RUNNING)


== Status ==
Current time: 2025-07-16 06:14:42 (running for 00:00:35.54)
Using FIFO scheduling algorithm.
Logical resource usage: 6.0/160 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/session_2025-07-16_06-09-36_851382_143/artifacts/2025-07-16_06-14-06/XGBoostTrainer_2025-07-16_06-14-06/driver_artifacts
Number of trials: 1/1 (1 RUNNING)




[36m(RayTrainWorker pid=14183, ip=10.42.3.24)[0m [06:14:45] Task [xgboost.ray-rank=00000000]:1e86880754d823b1d300eb9501000000 got rank 0
[36m(SplitCoordinator pid=14527, ip=10.42.3.24)[0m Registered dataset logger for dataset train_17_0
[36m(SplitCoordinator pid=14527, ip=10.42.3.24)[0m Starting execution of Dataset train_17_0. Full logs are in /tmp/ray/session_2025-07-16_06-09-36_851382_143/logs/ray-data
[36m(SplitCoordinator pid=14527, ip=10.42.3.24)[0m Execution plan of Dataset train_17_0: InputDataBuffer[Input] -> TaskPoolMapOperator[MapBatches(<lambda>)] -> OutputSplitter[split(5, equal=True)]


(pid=14527, ip=10.42.3.24) Running 0: 0.00 row [00:00, ? row/s]

(pid=14527, ip=10.42.3.24) - MapBatches(<lambda>) 1: 0.00 row [00:00, ? row/s]

(pid=14527, ip=10.42.3.24) - split(5, equal=True) 2: 0.00 row [00:00, ? row/s]

== Status ==
Current time: 2025-07-16 06:14:47 (running for 00:00:40.60)
Using FIFO scheduling algorithm.
Logical resource usage: 6.0/160 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/session_2025-07-16_06-09-36_851382_143/artifacts/2025-07-16_06-14-06/XGBoostTrainer_2025-07-16_06-14-06/driver_artifacts
Number of trials: 1/1 (1 RUNNING)


== Status ==
Current time: 2025-07-16 06:14:52 (running for 00:00:45.66)
Using FIFO scheduling algorithm.
Logical resource usage: 6.0/160 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/session_2025-07-16_06-09-36_851382_143/artifacts/2025-07-16_06-14-06/XGBoostTrainer_2025-07-16_06-14-06/driver_artifacts
Number of trials: 1/1 (1 RUNNING)




[36m(SplitCoordinator pid=14527, ip=10.42.3.24)[0m ✔️  Dataset train_17_0 execution finished in 7.22 seconds
[36m(RayTrainWorker pid=14186, ip=10.42.3.24)[0m [06:14:45] Task [xgboost.ray-rank=00000004]:e714213032229a6ca85e65ac01000000 got rank 4[32m [repeated 4x across cluster] (Ray deduplicates logs by default. Set RAY_DEDUP_LOGS=0 to disable log deduplication, or see https://docs.ray.io/en/master/ray-observability/user-guides/configure-logging.html#log-deduplication for more options.)[0m
[36m(RayTrainWorker pid=14183, ip=10.42.3.24)[0m Registered dataset logger for dataset dataset_19_0
[36m(SplitCoordinator pid=14528, ip=10.42.3.24)[0m Starting execution of Dataset valid_18_0. Full logs are in /tmp/ray/session_2025-07-16_06-09-36_851382_143/logs/ray-data
[36m(SplitCoordinator pid=14528, ip=10.42.3.24)[0m Execution plan of Dataset valid_18_0: InputDataBuffer[Input] -> TaskPoolMapOperator[MapBatches(<lambda>)] -> OutputSplitter[split(5, equal=True)]


(pid=14528, ip=10.42.3.24) Running 0: 0.00 row [00:00, ? row/s]

(pid=14528, ip=10.42.3.24) - MapBatches(<lambda>) 1: 0.00 row [00:00, ? row/s]

(pid=14528, ip=10.42.3.24) - split(5, equal=True) 2: 0.00 row [00:00, ? row/s]

== Status ==
Current time: 2025-07-16 06:14:57 (running for 00:00:50.72)
Using FIFO scheduling algorithm.
Logical resource usage: 6.0/160 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/session_2025-07-16_06-09-36_851382_143/artifacts/2025-07-16_06-14-06/XGBoostTrainer_2025-07-16_06-14-06/driver_artifacts
Number of trials: 1/1 (1 RUNNING)


== Status ==
Current time: 2025-07-16 06:15:02 (running for 00:00:55.77)
Using FIFO scheduling algorithm.
Logical resource usage: 6.0/160 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/session_2025-07-16_06-09-36_851382_143/artifacts/2025-07-16_06-14-06/XGBoostTrainer_2025-07-16_06-14-06/driver_artifacts
Number of trials: 1/1 (1 RUNNING)




[36m(RayTrainWorker pid=14183, ip=10.42.3.24)[0m Registered dataset logger for dataset dataset_24_0[32m [repeated 6x across cluster][0m
[36m(SplitCoordinator pid=14528, ip=10.42.3.24)[0m ✔️  Dataset valid_18_0 execution finished in 8.87 seconds
[36m(XGBoostTrainer pid=14109, ip=10.42.3.24)[0m [06:15:04] [0]	train-logloss:0.43750	train-error:0.00000	valid-logloss:0.43750	valid-error:0.00000
[36m(XGBoostTrainer pid=14109, ip=10.42.3.24)[0m [06:15:04] [1]	train-logloss:0.29630	train-error:0.00000	valid-logloss:0.29630	valid-error:0.00000
[36m(XGBoostTrainer pid=14109, ip=10.42.3.24)[0m [06:15:04] [2]	train-logloss:0.20732	train-error:0.00000	valid-logloss:0.20732	valid-error:0.00000
[36m(XGBoostTrainer pid=14109, ip=10.42.3.24)[0m [06:15:04] [3]	train-logloss:0.14780	train-error:0.00000	valid-logloss:0.14780	valid-error:0.00000
[36m(XGBoostTrainer pid=14109, ip=10.42.3.24)[0m [06:15:04] [4]	train-logloss:0.10661	train-error:0.00000	valid-logloss:0.10661	valid-error:0.00000

== Status ==
Current time: 2025-07-16 06:15:07 (running for 00:01:00.81)
Using FIFO scheduling algorithm.
Logical resource usage: 6.0/160 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/session_2025-07-16_06-09-36_851382_143/artifacts/2025-07-16_06-14-06/XGBoostTrainer_2025-07-16_06-14-06/driver_artifacts
Number of trials: 1/1 (1 RUNNING)




[36m(XGBoostTrainer pid=14109, ip=10.42.3.24)[0m [06:15:07] [66]	train-logloss:0.00001	train-error:0.00000	valid-logloss:0.00001	valid-error:0.00000
[36m(XGBoostTrainer pid=14109, ip=10.42.3.24)[0m [06:15:07] [67]	train-logloss:0.00001	train-error:0.00000	valid-logloss:0.00001	valid-error:0.00000
[36m(XGBoostTrainer pid=14109, ip=10.42.3.24)[0m [06:15:07] [68]	train-logloss:0.00001	train-error:0.00000	valid-logloss:0.00001	valid-error:0.00000
[36m(XGBoostTrainer pid=14109, ip=10.42.3.24)[0m [06:15:07] [69]	train-logloss:0.00001	train-error:0.00000	valid-logloss:0.00001	valid-error:0.00000
[36m(XGBoostTrainer pid=14109, ip=10.42.3.24)[0m [06:15:07] [70]	train-logloss:0.00001	train-error:0.00000	valid-logloss:0.00001	valid-error:0.00000
[36m(XGBoostTrainer pid=14109, ip=10.42.3.24)[0m [06:15:07] [71]	train-logloss:0.00001	train-error:0.00000	valid-logloss:0.00001	valid-error:0.00000
[36m(XGBoostTrainer pid=14109, ip=10.42.3.24)[0m [06:15:07] [72]	train-logloss:0.00001	train

== Status ==
Current time: 2025-07-16 06:15:09 (running for 00:01:03.39)
Using FIFO scheduling algorithm.
Logical resource usage: 6.0/160 CPUs, 0/0 GPUs
Result logdir: /tmp/ray/session_2025-07-16_06-09-36_851382_143/artifacts/2025-07-16_06-14-06/XGBoostTrainer_2025-07-16_06-14-06/driver_artifacts
Number of trials: 1/1 (1 TERMINATED)



Model Training Complete.
Loading model from the best checkpoint...

--- Model Evaluation on Unseen Test Data ---


Running 0: 0.00 row [00:00, ? row/s]

- MapBatches(<lambda>) 1: 0.00 row [00:00, ? row/s]

- limit=1 2: 0.00 row [00:00, ? row/s]

2025-07-16 06:15:13,839	INFO streaming_executor.py:227 -- ✔️  Dataset dataset_30_0 execution finished in 3.83 seconds
2025-07-16 06:15:13,848	INFO logging.py:295 -- Registered dataset logger for dataset dataset_29_0
2025-07-16 06:15:13,855	INFO streaming_executor.py:117 -- Starting execution of Dataset dataset_29_0. Full logs are in /tmp/ray/session_2025-07-16_06-09-36_851382_143/logs/ray-data
2025-07-16 06:15:13,856	INFO streaming_executor.py:118 -- Execution plan of Dataset dataset_29_0: InputDataBuffer[Input] -> TaskPoolMapOperator[MapBatches(<lambda>)]


Running 0: 0.00 row [00:00, ? row/s]

- MapBatches(<lambda>) 1: 0.00 row [00:00, ? row/s]

2025-07-16 06:15:14,185	INFO streaming_executor.py:227 -- ✔️  Dataset dataset_29_0 execution finished in 0.32 seconds



Confusion Matrix (Test Data):
[[28562     0]
 [    0  1438]]

Classification Report (Test Data):
              precision    recall  f1-score   support

       False       1.00      1.00      1.00     28562
        True       1.00      1.00      1.00      1438

    accuracy                           1.00     30000
   macro avg       1.00      1.00      1.00     30000
weighted avg       1.00      1.00      1.00     30000


Feature Importances:
total_calls             20.0
outgoing_call_ratio     13.0
nocturnal_call_ratio     8.0
avg_duration             7.0
std_duration             7.0
dtype: float64

Trained XGBoost model saved to 'fraud_detection_model_xgb_ray.json'

Process complete in 237.07 seconds.
