XGBoost 1.1.0 SNAPSHOT gpu_hist still numerically unstable #5632

Closed
Zethson opened this issue May 5, 2020 · 37 comments
@Zethson

Zethson commented May 5, 2020

Dear everyone,

According to #5023, a model trained with gpu_hist should be reproducible with 1.1.0.

The version that I was using for these experiments is:
https://s3-us-west-2.amazonaws.com/xgboost-nightly-builds/xgboost-1.1.0_SNAPSHOT%2B86beb68ce8ffa73a90def6a9862f0e8a917b58c3-py2.py3-none-manylinux1_x86_64.whl

The code that I am using is:

#!/home/user/miniconda/envs/xgboost-1.0.2-cuda-10.1/bin/python
import click
import xgboost as xgb
import numpy as np
from sklearn.datasets import fetch_covtype, load_boston
from sklearn.model_selection import train_test_split
import time
import random
import os


@click.command()
@click.option('--seed', type=int, default=0)
@click.option('--epochs', type=int, default=25)
@click.option('--no-cuda', type=bool, default=False)
@click.option('--dataset', type=str, default='covertype')
def train(seed, epochs, no_cuda, dataset):
    # Fetch dataset using sklearn
    if dataset == 'covertype':
        dataset = fetch_covtype()
    param = {
        'objective': 'multi:softmax',
        'num_class': 8
        # 'single_precision_histogram': True
    }

    X = dataset.data
    y = dataset.target

    # Create 0.75/0.25 train/test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, train_size=0.75, random_state=0)

    # Set random seeds
    random_seed(seed, param)
    param['subsample'] = 0.5
    param['colsample_bytree'] = 0.5
    param['colsample_bylevel'] = 0.5

    # Convert input data from numpy to XGBoost format
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_test, label=y_test)

    # Set CPU or GPU as training device
    if no_cuda:
        param['tree_method'] = 'hist'
    else:
        param['tree_method'] = 'gpu_hist'

    # Train on the chosen device
    results = {}
    gpu_runtime = time.time()
    xgb.train(param, dtrain, epochs, evals=[(dtest, 'test')], evals_result=results)
    if not no_cuda:
        print(f'GPU Run Time: {str(time.time() - gpu_runtime)} seconds')
    else:
        print(f'CPU Run Time: {str(time.time() - gpu_runtime)} seconds')

def random_seed(seed, param):
    os.environ['PYTHONHASHSEED'] = str(seed) # Python general
    np.random.seed(seed)
    random.seed(seed) # Python random
    param['seed'] = seed


if __name__ == '__main__':
    train()

When training on the covertype dataset for 1000 epochs, I get the following results:

  1. [999] test-merror:0.05985
    GPU Run Time: 470.45096254348755 seconds
  2. [999] test-merror:0.05978
    GPU Run Time: 493.8201196193695 seconds
  3. [999] test-merror:0.05894
    GPU Run Time: 484.21098017692566 seconds
  4. [999] test-merror:0.05962
    GPU Run Time: 477.36872577667236 seconds
  5. [999] test-merror:0.06025
    GPU Run Time: 487.09354853630066 seconds

Is this still to be expected, or why am I not getting perfectly reproducible results?

Thank you very much!

@trivialfis
Member

trivialfis commented May 6, 2020

I used a slightly modified script: fixed the syntax errors, changed epochs to 999 to match your result, and loaded the data without sklearn (to avoid downloading it again). I obtained reproducible results:

import click
import xgboost as xgb
import numpy as np
from sklearn.model_selection import train_test_split
import time
import random
import os


def fetch_covtype():
    if not os.path.exists('covertype.npy'):
        Xy = np.genfromtxt('covtype.data',
                           delimiter=',')
        np.save('covertype', Xy)
    Xy = np.load('covertype.npy')

    X = Xy[:, :-1]
    y = Xy[:, -1].astype(np.int32, copy=False)
    return X, y


@click.command()
@click.option('--seed', type=int, default=0)
@click.option('--epochs', type=int, default=999)
@click.option('--no-cuda', type=bool, default=False)
def train(seed, epochs, no_cuda):
    # Load the dataset from the local copy
    X, y = fetch_covtype()
    param = {
        'objective': 'multi:softmax',
        'num_class': 8,
        'single_precision_histogram': True
    }

    # Create 0.75/0.25 train/test split
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.25,
                                                        train_size=0.75,
                                                        random_state=0)

    # Set random seeds
    random_seed(seed, param)
    param['subsample'] = 0.5
    param['colsample_bytree'] = 0.5
    param['colsample_bylevel'] = 0.5

    # Convert input data from numpy to XGBoost format
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_test, label=y_test)

    # Set CPU or GPU as training device
    if no_cuda:
        param['tree_method'] = 'hist'
    else:
        param['tree_method'] = 'gpu_hist'

    # Train on the chosen device
    results = {}
    gpu_runtime = time.time()
    booster = xgb.train(param,
                        dtrain,
                        epochs,
                        evals=[(dtest, 'test')],
                        evals_result=results)

    model = 'model.json'
    i = 0
    path = str(i) + '-' + model
    while os.path.exists(path):
        i += 1
        path = str(i) + '-' + model

    booster.save_model(path)

    if not no_cuda:
        print(f'GPU Run Time: {str(time.time() - gpu_runtime)} seconds')
    else:
        print(f'CPU Run Time: {str(time.time() - gpu_runtime)} seconds')


def random_seed(seed, param):
    os.environ['PYTHONHASHSEED'] = str(seed)  # Python general
    np.random.seed(seed)
    random.seed(seed)  # Python random
    param['seed'] = seed


if __name__ == '__main__':
    train()

Hash values from training for 2 runs with single precision:

77db9129e8b7e4cd19fac7173a32d590ead1920f83f3043c845e12a2b4a5ed67  0-model.json
77db9129e8b7e4cd19fac7173a32d590ead1920f83f3043c845e12a2b4a5ed67  1-model.json

Hash values from training for 2 runs with double precision:

ac5fb1be1f40e37349c69c4ba27572298d6fd3402eb307f233935d13b7c9ccd1  2-model.json
ac5fb1be1f40e37349c69c4ba27572298d6fd3402eb307f233935d13b7c9ccd1  3-model.json
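
For reference, a minimal sketch of how such a check can be done in Python, assuming the models were saved as 0-model.json, 1-model.json, ... by the script above:

import glob
import hashlib

# Hash every saved model file; identical digests mean the runs produced
# byte-identical models.
for path in sorted(glob.glob('*-model.json')):
    with open(path, 'rb') as f:
        print(hashlib.sha256(f.read()).hexdigest(), path)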

@trivialfis
Member

Could you try using the RC build (#5593)?

@Zethson
Author

Zethson commented May 6, 2020

@trivialfis Yes, I will try and update you.

Sorry for the syntax errors; I basically copied it from my own code and modified it in the markdown editor here, haha.

@trivialfis
Member

@Zethson No problem. Let us know the result. ;-)

@Zethson
Author

Zethson commented May 6, 2020

Ain't working for me!
I am even running everything inside a Docker container.

[999]	test-merror:0.05960
GPU Run Time: 395.1967947483063 seconds

[999]	test-merror:0.05989
GPU Run Time: 394.52636194229126 seconds

Edit:

same with single_precision_histogram:

[999]	test-merror:0.06774
GPU Run Time: 204.32505822181702 seconds

[999]	test-merror:0.06780
GPU Run Time: 209.27868032455444 seconds

@Zethson
Author

Zethson commented May 6, 2020

Corresponding repository:

https://github.com/Zethson/nextflow_machine_learning

Python script is under /bin.

run command: nextflow run main.nf -with-docker --platform single_gpu --xgboost --dataset covertype --epochs 1000

Running on an NVIDIA 1050M.

@trivialfis
Member

@Zethson Are you using dask?

@Zethson
Author

Zethson commented May 6, 2020

No

@Zethson
Author

Zethson commented May 6, 2020

The same issue appears on my system with a Tesla K80.

@trivialfis
Member

trivialfis commented May 6, 2020

Let me try running on different cards tomorrow.

@Zethson
Author

Zethson commented May 6, 2020

If this doesn't help, then you may need to try Docker with the NVIDIA Container Toolkit (and maybe even Nextflow).

@trivialfis
Member

@Zethson Can you confirm the result on CPU is reproducible?

@Zethson
Author

Zethson commented May 6, 2020

I can.

@trivialfis
Member

Will try other cards. Tested with a 1080 and a 1080 Ti; couldn't reproduce.

@RAMitchell
Member

Please double-check your installed xgboost version with print(xgb.__version__)

@Zethson
Author

Zethson commented May 7, 2020

>>> print(xgb.__version__)
1.1.0rc2

The Conda environment
https://github.com/Zethson/nextflow_machine_learning/blob/master/xgboost_environment.yml

@trivialfis
Member

trivialfis commented May 7, 2020

Tried 3 other cards today, including 1 mobile card (960M); still couldn't reproduce. The XGBoost binary pip distribution is built with CUDA 9, so I also tried that; still no luck.

@Zethson
Author

Zethson commented May 7, 2020

@trivialfis Could you maybe try it with Docker and the NVIDIA Container Toolkit using my containers?

I can give you more detailed instructions if you require them.

I can reproduce it on 2 systems, both of which are running the models inside Docker via Nextflow...

@Zethson
Author

Zethson commented May 7, 2020

I will try to run it outside of Docker and see what happens on my system.

Edit: Well, even 25 epochs are enough to show that I get different results between runs on my system.

@Zethson
Author

Zethson commented May 7, 2020

I can offer to give you access to one of my machines if you send me your SSH key.

@trivialfis
Member

@Zethson Thanks for the offer, might not be needed after all.

@hcho3 Can I have temporary direct access to your build environment over the next few days?

@trivialfis
Member

Noticed that when compiling with CUDA 9.0 there's a warning that might be relevant:

xgboost/src/tree/gpu_hist/histogram.cu(56): warning: function "xgboost::tree::<unnamed>::operator+" was declared but never referenced

Trying 9.2 again.

@trivialfis
Member

@hcho3 @RAMitchell Can we upgrade the default build to CUDA 9.2? I think there's an issue during compilation. Thus far I can only reproduce the problem with the 9.0 build.

@RAMitchell
Member

Seems reasonable. We could do 9.2 for this release and move to 10 after.

@hcho3
Collaborator

hcho3 commented May 8, 2020

@trivialfis 9.2 seems reasonable. Did you verify the reproducibility with 9.2?

@trivialfis
Member

@hcho3 Yes, tried 9.0 all the way to 10.2.

@hcho3
Collaborator

hcho3 commented May 9, 2020

@trivialfis So the bug disappeared when you upgraded the CUDA version to 9.2? Okay, I will file a PR to upgrade CUDA to 9.2.

@hcho3
Collaborator

hcho3 commented May 9, 2020

#5649

@Zethson
Author

Zethson commented May 9, 2020

I am using CUDA 10.2 by the way, not 9.2.

Why do you expect this to solve my reproducibility issue?

@hcho3
Collaborator

hcho3 commented May 9, 2020

@Zethson Our server was using CUDA 9.0 to build the binary wheel (that's what you get when you run pip install xgboost). If you compile XGBoost from source on your machine, the issue will not occur.

@Zethson
Author

Zethson commented May 9, 2020

Ah, I see. Cool! Looking forward to trying RC3 out :)

@hcho3
Collaborator

hcho3 commented May 10, 2020

I tried the repro script on a new EC2 instance (g4dn.12xlarge), which has an NVIDIA T4 GPU. The new wheel runs deterministically, whereas the current RC2 does not.

Two runs with the current RC2:

[997]	test-merror:0.06849
[998]	test-merror:0.06852
GPU Run Time: 75.64691066741943 seconds

[997]	test-merror:0.06854
[998]	test-merror:0.06859
GPU Run Time: 75.401864528656 seconds

95df2bdea70967718a51e980dd6eb89a11685cf0  0-model.json
48644a8ce353d20a60a00a0513d9759ee59389a1  1-model.json

Two runs with the new wheel:

[997]	test-merror:0.06801
[998]	test-merror:0.06804
GPU Run Time: 74.65184926986694 seconds

[997]	test-merror:0.06801
[998]	test-merror:0.06804
GPU Run Time: 75.95206379890442 seconds

b2f7d449c2bfd19a74e29f6b00dce99672088300  2-model.json
b2f7d449c2bfd19a74e29f6b00dce99672088300  3-model.json
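
For a quicker check that avoids writing model files, one can also train twice in the same process and compare the serialized boosters directly. A minimal sketch, using a small synthetic dataset purely for illustration (the data, parameters, and train_once helper below are assumptions, not taken from the repro script):

import numpy as np
import xgboost as xgb

# Small synthetic multi-class dataset, purely for illustration.
rng = np.random.RandomState(0)
X = rng.randn(1000, 20)
y = rng.randint(0, 3, size=1000)
dtrain = xgb.DMatrix(X, label=y)

params = {'tree_method': 'gpu_hist', 'objective': 'multi:softmax',
          'num_class': 3, 'subsample': 0.5, 'seed': 0}

def train_once():
    booster = xgb.train(params, dtrain, num_boost_round=50)
    return bytes(booster.save_raw())  # serialized model as raw bytes

# A deterministic build should produce byte-identical models across runs.
print(train_once() == train_once())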

@Zethson
Author

Zethson commented May 11, 2020

I can confirm your finding, and it is now indeed reproducible!

Thank you very much for your very swift and competent help.

I will also conduct some experiments on reproducibility with Dask soon, so be prepared for more issues related to reproducibility :)

(Let's hope for the best!)

Zethson closed this as completed May 11, 2020
@trivialfis
Member

trivialfis commented May 11, 2020

Dask data partitioning is not reproducible in some cases. You are more than welcome to join the investigation.

@Zethson
Author

Zethson commented May 11, 2020

@trivialfis Do you by chance have any links where I could read up on those cases?

@trivialfis
Member

trivialfis commented May 11, 2020

@Zethson No, I made the PR for deterministic single-node GPU training. Dask is the next step, but these are baby steps...
