# Tune Multithreading Support for XGBoost


### Impact of the Number of Threads


In [2]:
# Otto, tune number of threads
from pandas import read_csv
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
import time
from matplotlib import pyplot

# load data
data = read_csv('train.csv')
dataset = data.values

# split data into X (first 94 columns) and y (column 94)
# NOTE(review): if train.csv is the Kaggle Otto file, column 0 is presumably
# a row id and would be used as a feature here -- confirm against the CSV.
X = dataset[:,0:94]
y = dataset[:,94]

# encode string class values as integers
label_encoded_y = LabelEncoder().fit_transform(y)

# evaluate the effect of the number of threads on wall-clock training time
results = []
num_threads = [1, 2, 3, 4]

for n in num_threads:
    start = time.time()
    # n_jobs is the supported scikit-learn-style name for the thread count;
    # the older nthread argument is deprecated in the XGBoost sklearn wrapper.
    model = XGBClassifier(n_jobs=n)
    model.fit(X, label_encoded_y)
    elapsed = time.time() - start
    print(n, elapsed)
    results.append(elapsed)

# plot results: elapsed seconds per thread count (lower is faster)
pyplot.plot(num_threads, results)
pyplot.ylabel('Training Time (seconds)')
pyplot.xlabel('Number of Threads')
pyplot.title('XGBoost Training Speed vs Number of Threads')
pyplot.show()




(1, 150.69700717926025)
(2, 90.86278009414673)
(3, 90.99121499061584)
(4, 93.34687280654907)


### Parallelism When Cross Validating XGBoost Models


In [None]:
# Otto, parallel cross validation
# Times three ways of spreading work over CPU cores: parallel CV folds only,
# parallel XGBoost training only, and both at once.
from pandas import read_csv
from xgboost import XGBClassifier
# sklearn.cross_validation was removed from scikit-learn; the same utilities
# live in sklearn.model_selection.
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
import time

# load data
data = read_csv('train.csv')
dataset = data.values

# split data into X (first 94 columns) and y (column 94)
# NOTE(review): if train.csv is the Kaggle Otto file, column 0 is presumably
# a row id and would be used as a feature here -- confirm against the CSV.
X = dataset[:,0:94]
y = dataset[:,94]

# encode string class values as integers
label_encoded_y = LabelEncoder().fit_transform(y)

# prepare cross validation
# The model_selection splitter takes n_splits and no labels up front; the
# labels are supplied by cross_val_score when it calls split().
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=7)

# Single Thread XGBoost, Parallel Thread CV
start = time.time()
model = XGBClassifier(n_jobs=1)
# 'neg_log_loss' is the current scorer name (the old 'log_loss' was removed);
# scores are negated log loss so that greater is better.
results = cross_val_score(model, X, label_encoded_y, cv=kfold, scoring='neg_log_loss', n_jobs=-1)
elapsed = time.time() - start
print("Single Thread XGBoost, Parallel Thread CV: %f" % (elapsed))

# Parallel Thread XGBoost, Single Thread CV
start = time.time()
model = XGBClassifier(n_jobs=-1)
results = cross_val_score(model, X, label_encoded_y, cv=kfold, scoring='neg_log_loss', n_jobs=1)
elapsed = time.time() - start
print("Parallel Thread XGBoost, Single Thread CV: %f" % (elapsed))

# Parallel Thread XGBoost and CV
start = time.time()
model = XGBClassifier(n_jobs=-1)
results = cross_val_score(model, X, label_encoded_y, cv=kfold, scoring='neg_log_loss', n_jobs=-1)
elapsed = time.time() - start
print("Parallel Thread XGBoost and CV: %f" % (elapsed))

