In [6]:
# Disable dask_expr by setting an environment variable
import os
os.environ['DASK_EXPRESSION'] = '0'

# Test for Dask with XGBoost integration

# Step 1: Import necessary modules
try:
    from dask import array as da
    import dask.dataframe as dd
    import dask.distributed
    import xgboost as xgb
    from xgboost.dask import DaskDMatrix, train as dask_xgboost_train
    from dask.dataframe.utils import make_meta
    print("Step 1: Importing modules passed.")
except Exception as e:
    print(f"Step 1 failed: {str(e)}")
    raise e

# Step 2: Create or initialize necessary objects
try:
    # Set up a Dask client
    client = dask.distributed.Client()

    # Create a small Dask DataFrame
    df = dd.demo.make_timeseries(start='2000', end='2001', freq='1D', 
                                 partition_freq='1ME', seed=42)

    # Define metadata explicitly
    meta = make_meta({'x': 'f8', 'y': 'f8', 'id': 'i8'})

    # Use map_partitions with an explicit columns= argument and defined meta
    def select_columns(df, columns=None):
        return df[columns]

    df = df.map_partitions(select_columns, columns=['x', 'y', 'id'], meta=meta)

    # Check initial partition sizes
    initial_partition_sizes = df.map_partitions(len).compute()
    print("Initial Partition sizes:", initial_partition_sizes)

    # Repartition the DataFrame to ensure better distribution
    df = df.repartition(npartitions=16)

    # Check repartitioned sizes
    repartitioned_sizes = df.map_partitions(len).compute()
    print("Repartitioned sizes:", repartitioned_sizes)

    print("Step 2: Initialization passed.")
except Exception as e:
    print(f"Step 2 failed: {str(e)}")
    raise e

# Step 3: Perform basic operations
try:
    # Define features and labels
    X_dask = df[['x', 'y']].to_dask_array(lengths=True)
    y_dask = df['id'].to_dask_array(lengths=True)

    # Verify array shapes
    print(f"X_dask shape: {X_dask.shape}, y_dask shape: {y_dask.shape}")

    # Train a Dask-XGBoost model
    params = {'objective': 'reg:squarederror', 'max_depth': 5}
    dtrain = DaskDMatrix(client, X_dask, y_dask)
    output = dask_xgboost_train(client, params, dtrain, num_boost_round=10)

    print("Step 3: Basic operations passed.")
except Exception as e:
    print(f"Step 3 failed: {str(e)}")
    raise e

# Step 4: Error handling and edge cases
try:
    try:
        # Attempt to train on empty data
        empty_X = da.empty((0, 2))
        empty_y = da.empty(0)
        empty_dtrain = DaskDMatrix(client, empty_X, empty_y)
        output = dask_xgboost_train(client, params, empty_dtrain, num_boost_round=10)
    except ValueError as e:
        print(f"Step 4: Correctly handled error: {str(e)}")
except Exception as e:
    print(f"Step 4 failed: {str(e)}")
    raise e

# Step 5: Final Confirmation
print("All tests completed successfully.")


Step 1: Importing modules passed.


Perhaps you already have a cluster running?
Hosting the HTTP server on port 33245 instead


Initial Partition sizes: 0     29
1     31
2     30
3     31
4     30
5     31
6     31
7     30
8     31
9     30
10    31
dtype: int64
Repartitioned sizes: 0     20
1     21
2     21
3     21
4     21
5     21
6     21
7     21
8     21
9     22
10    20
11    21
12    22
13    20
14    21
15    21
dtype: int64
Step 2: Initialization passed.
X_dask shape: (335, 2), y_dask shape: (335,)


[12:26:43] Task [xgboost.dask-0]:tcp://127.0.0.1:45305 got rank 0
[12:26:43] Task [xgboost.dask-2]:tcp://127.0.0.1:40773 got rank 2
[12:26:43] Task [xgboost.dask-1]:tcp://127.0.0.1:33763 got rank 1
[12:26:43] Task [xgboost.dask-3]:tcp://127.0.0.1:44953 got rank 3
[12:26:43] Task [xgboost.dask-1]:tcp://127.0.0.1:33763 got rank 0


Step 3: Basic operations passed.
All tests completed successfully.
