In [27]:
import pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn import datasets
import numpy as np

In [6]:
# multiclass classification
# load data
data = datasets.load_iris()
dataset = data.values
# split data into X and y
X = data.data
Y = data.target 
# encode string class values as integers
label_encoder = LabelEncoder()
label_encoder = label_encoder.fit(Y)
label_encoded_y = label_encoder.transform(Y)
seed = 7
test_size = 0.33
X_train, X_test, y_train, y_test = train_test_split(X, label_encoded_y,
test_size=test_size, random_state=seed)
# fit model on training data
model = XGBClassifier()
model.fit(X_train, y_train)
print(model)
# make predictions for test data
predictions = model.predict(X_test)
# evaluate predictions
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=None, n_jobs=None,
              num_parallel_tree=None, objective='multi:softprob', ...)
Accuracy: 90.00%


In [10]:
cancer = pd.read_csv('course-data/breast-cancer.csv')

In [11]:
cancer.head(5)

Unnamed: 0,no-recurrence-events,30-39,premeno,30-34,0-2,no,3,left,left_low,no.1
0,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,right,right_up,no
1,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,left,left_low,no
2,no-recurrence-events,60-69,ge40,15-19,0-2,no,2,right,left_up,no
3,no-recurrence-events,40-49,premeno,0-4,0-2,no,2,right,right_low,no
4,no-recurrence-events,60-69,ge40,15-19,0-2,no,2,left,left_low,no


In [21]:
# binary classification, breast cancer dataset, label and one hot encoded

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from xgboost import XGBClassifier

# Load the dataset
cancer = pd.read_csv('course-data/breast-cancer.csv')

# Split data into X and y
X = cancer.iloc[:, 1:]  # Features, excluding the first column (ID)
Y = cancer.iloc[:, 0]   # Labels, first column as label (diagnosis)

# Handle categorical features with string values using one-hot encoding
categorical_columns = X.select_dtypes(include=['object']).columns
column_transformer = ColumnTransformer(
    [('onehot_encoder', OneHotEncoder(), categorical_columns)],
    remainder='passthrough'
)
X_encoded = column_transformer.fit_transform(X)

# Encode string class values as integers for labels
label_encoder = LabelEncoder()
label_encoded_y = label_encoder.fit_transform(Y)

# Split data into train and test sets
seed = 7
test_size = 0.33
X_train, X_test, y_train, y_test = train_test_split(X_encoded, label_encoded_y, test_size=test_size, random_state=seed)

# Fit model on training data
model = XGBClassifier()
model.fit(X_train, y_train)

# Make predictions for test data
predictions = model.predict(X_test)

# Evaluate predictions
print(model)
print("Chosen Booster:", model.booster)
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))


XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=None, n_jobs=None,
              num_parallel_tree=None, random_state=None, ...)
Chosen Booster: None
Accuracy: 67.37%


It seems that some parameters of the XGBClassifier are not set correctly, resulting in None values. This can happen if you're using the default constructor for XGBClassifier without specifying any parameters.

In [24]:
# Load the dataset
cancer = pd.read_csv('course-data/breast-cancer.csv')

# Split data into X and y
X = cancer.iloc[:, 1:]  # Features, excluding the first column (ID)
Y = cancer.iloc[:, 0]   # Labels, first column as label (diagnosis)

# Handle categorical features with string values using one-hot encoding
categorical_columns = X.select_dtypes(include=['object']).columns
column_transformer = ColumnTransformer(
    [('onehot_encoder', OneHotEncoder(), categorical_columns)],
    remainder='passthrough'
)
X_encoded = column_transformer.fit_transform(X)

# Encode string class values as integers for labels
label_encoder = LabelEncoder()
label_encoded_y = label_encoder.fit_transform(Y)

# Split data into train and test sets
seed = 7
test_size = 0.33
X_train, X_test, y_train, y_test = train_test_split(X_encoded, label_encoded_y, test_size=test_size, random_state=seed)

# Fit model on training data
model = XGBClassifier(
    booster='gbtree',  # The booster type to use, default is 'gbtree'
    objective='binary:logistic',  # Specify the learning task and the corresponding objective function
    learning_rate=0.1,  # Learning rate, default is 0.3
    max_depth=3,  # Maximum depth of a tree, default is 6
    n_estimators=100,  # Number of trees to fit, default is 100
    random_state=42  # Random seed for reproducibility
)
model.fit(X_train, y_train)

# Make predictions for test data
predictions = model.predict(X_test)

# Evaluate predictions
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))


Accuracy: 71.58%


Run the horse data set several time using:

* X[X == '?'] = 0
* X[X == '?'] = 1
* X[X == '?'] = numpy.nan

In [28]:
# Load the dataset
horse = pd.read_csv('course-data/horse-colic.csv')
# Split data into X and y
X = horse.iloc[:, 1:]  # Features, excluding the first column (ID)
Y = horse.iloc[:, 0]   # Labels, first column as label (diagnosis)
# set missing values to 0
X[X == '?'] = np.nan
# convert to numeric
X = X.astype('float32')
# encode Y class values as integers
label_encoder = LabelEncoder()
label_encoder = label_encoder.fit(Y)
label_encoded_y = label_encoder.transform(Y)
# split data into train and test sets
seed = 7
test_size = 0.33
X_train, X_test, y_train, y_test = train_test_split(X, label_encoded_y,
test_size=test_size, random_state=seed)
# fit model on training data
model = XGBClassifier()
model.fit(X_train, y_train)
print(model)
# make predictions for test data
predictions = model.predict(X_test)
# evaluate predictions
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=None, n_jobs=None,
              num_parallel_tree=None, random_state=None, ...)
Accuracy: 85.86%


Use imputer for missing values

In [31]:
# binary classification, missing data, impute with mean
import numpy
from pandas import read_csv
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer as Imputer
# Load the dataset
horse = pd.read_csv('course-data/horse-colic.csv')
# Split data into X and y
X = horse.iloc[:, 1:]  # Features, excluding the first column (ID)
Y = horse.iloc[:, 0]   # Labels, first column as label (diagnosis)
# set missing values to 0
X[X == '?'] = np.nan
# convert to numeric
X = X.astype('float32')
# impute missing values as the mean
imputer = Imputer()
imputed_x = imputer.fit_transform(X)
# encode Y class values as integers
label_encoder = LabelEncoder()
label_encoder = label_encoder.fit(Y)
label_encoded_y = label_encoder.transform(Y)
# split data into train and test sets
seed = 7
test_size = 0.33
X_train, X_test, y_train, y_test = train_test_split(imputed_x, label_encoded_y,
test_size=test_size, random_state=seed)
# fit model on training data
model = XGBClassifier()
model.fit(X_train, y_train)
print(model)
# make predictions for test data
predictions = model.predict(X_test)
# evaluate predictions
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=None, n_jobs=None,
              num_parallel_tree=None, random_state=None, ...)
Accuracy: 84.85%
