# Gradient Boosting is (pretty much) All You Need

Use the power of Gradient Boosted Decision Trees (GBDT) in your browser.

Predict:

- Whether a person makes $50,000 a year or not
- The price of a house in California
- What forest cover type is in a geographic zone

Sources:

- Paper [link](https://arxiv.org/pdf/2110.01889.pdf)
- UCI ML [link](https://archive.ics.uci.edu/ml/index.php)
  - Adult Names and Covertype Data Sets
- California Housing Data Set [link](https://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.html)

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import HistGradientBoostingClassifier, HistGradientBoostingRegressor
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.compose import make_column_selector
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, mean_absolute_percentage_error

import sklearn

f"{sklearn.__version__ = }, {np.__version__ = }, {pd.__version__ = }"

"sklearn.__version__ = '1.0.2', np.__version__ = '1.21.5', pd.__version__ = '1.4.1'"

## Predict Adult Income

- `age`: continuous.
- `workclass`: Private, Self-emp-not-inc, Self-emp-inc, Federal-gov, Local-gov, State-gov, Without-pay, Never-worked.
- `fnlwgt`: continuous.
- `education`: Bachelors, Some-college, 11th, HS-grad, Prof-school, Assoc-acdm, Assoc-voc, 9th, 7th-8th, 12th, Masters, 1st-4th, 10th, Doctorate, 5th-6th, Preschool.
- `education-num`: continuous.
- `marital-status`: Married-civ-spouse, Divorced, Never-married, Separated, Widowed, Married-spouse-absent, Married-AF-spouse.
- `occupation`: Tech-support, Craft-repair, Other-service, Sales, Exec-managerial, Prof-specialty, Handlers-cleaners, Machine-op-inspct, Adm-clerical, Farming-fishing, Transport-moving, Priv-house-serv, Protective-serv, Armed-Forces.
- `relationship`: Wife, Own-child, Husband, Not-in-family, Other-relative, Unmarried.
- `race`: White, Asian-Pac-Islander, Amer-Indian-Eskimo, Other, Black.
- `sex`: Female, Male.
- `capital-gain`: continuous.
- `capital-loss`: continuous.
- `hours-per-week`: continuous.
- `native-country`: United-States, Cambodia, England, Puerto-Rico, Canada, Germany, Outlying-US(Guam-USVI-etc), India, Japan, Greece, South, China, Cuba, Iran, Honduras, Philippines, Italy, Poland, Jamaica, Vietnam, Mexico, Portugal, Ireland, France, Dominican-Republic, Laos, Ecuador, Taiwan, Haiti, Columbia, Hungary, Guatemala, Nicaragua, Scotland, Thailand, Yugoslavia, El-Salvador, Trinadad&Tobago, Peru, Hong, Holand-Netherlands.


In [12]:
columns = [
    "age",
    "workclass",
    "fnlwgt",
    "education",
    "education-num",
    "marital-status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "capital-gain",
    "capital-loss",
    "hours-per-week",
    "native-country",
    "less-than-50k"
]
categorical_columns = [
    "workclass",
    "education",
    "marital-status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "native-country",
    "less-than-50k"
]
continuous_columns = [col for col in columns if col not in categorical_columns]
features = [col for col in columns if col != 'less-than-50k']

adult_income_train = pd.read_csv("data/adult.data", names=columns)
adult_income_train[categorical_columns] = adult_income_train[categorical_columns].astype("category")
adult_income_train = adult_income_train[[*categorical_columns, *continuous_columns]]
adult_income_test = pd.read_csv("data/adult.test", names=columns)
adult_income_test[categorical_columns] = adult_income_train[categorical_columns].astype("category")
adult_income_train.iloc[:3]

Unnamed: 0,workclass,education,marital-status,occupation,relationship,race,sex,native-country,less-than-50k,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week
0,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,United-States,<=50K,39,77516,13,2174,0,40
1,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States,<=50K,50,83311,13,0,0,13
2,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,United-States,<=50K,38,215646,9,0,0,40


In [13]:
adult_train_features = adult_income_train[features]
label_encoder = LabelEncoder()
adult_train_labels = label_encoder.fit_transform(adult_income_train['less-than-50k'])
adult_train_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype   
---  ------          --------------  -----   
 0   age             32561 non-null  int64   
 1   workclass       32561 non-null  category
 2   fnlwgt          32561 non-null  int64   
 3   education       32561 non-null  category
 4   education-num   32561 non-null  int64   
 5   marital-status  32561 non-null  category
 6   occupation      32561 non-null  category
 7   relationship    32561 non-null  category
 8   race            32561 non-null  category
 9   sex             32561 non-null  category
 10  capital-gain    32561 non-null  int64   
 11  capital-loss    32561 non-null  int64   
 12  hours-per-week  32561 non-null  int64   
 13  native-country  32561 non-null  category
dtypes: category(8), int64(6)
memory usage: 1.7 MB


In [14]:
adult_test_features = adult_income_test[features]
adult_test_labels = label_encoder.transform(adult_income_test['less-than-50k'])
adult_test_features, adult_test_labels

(       age          workclass  fnlwgt      education  education-num  \
 0       25          State-gov  226802      Bachelors              7   
 1       38   Self-emp-not-inc   89814      Bachelors              9   
 2       28            Private  336951        HS-grad             12   
 3       44            Private  160323           11th             10   
 4       18            Private  103497      Bachelors             10   
 ...    ...                ...     ...            ...            ...   
 16276   39            Private  215419        HS-grad             13   
 16277   64            Private  321403   Some-college              9   
 16278   38            Private  374983      Bachelors             13   
 16279   44            Private   83891      Assoc-voc             13   
 16280   35            Private  182148        HS-grad             13   
 
             marital-status          occupation    relationship    race  \
 0            Never-married        Adm-clerical   Not-in-fa

In [15]:
categorical_mask = [True] * (len(categorical_columns) - 1) + [False] * len(continuous_columns)
ordinal_encoder = make_column_transformer(
    (
        OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=np.nan),
        make_column_selector(dtype_include="category"),
    ),
    remainder="passthrough",
)
model = make_pipeline(ordinal_encoder, HistGradientBoostingClassifier(random_state=47, categorical_features=categorical_mask))
model.fit(adult_train_features, adult_train_labels)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('ordinalencoder',
                                                  OrdinalEncoder(handle_unknown='use_encoded_value',
                                                                 unknown_value=nan),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x157bd5b50>)])),
                ('histgradientboostingclassifier',
                 HistGradientBoostingClassifier(categorical_features=[True,
                                                                      True,
                                                                      True,
                                                                      True,
                                                                      True,
                                                                      True,
     

In [16]:
predictions = model.predict(adult_test_features)
predictions

array([0, 0, 0, ..., 0, 0, 0])

In [17]:
accuracy_score(adult_test_labels, predictions)

0.7720041766476261

In [18]:
california_housing = pd.read_csv('data/cal_housing.tgz', compression='gzip')
covertypes = pd.read_csv('data/covtype.data.gz')

In [None]:



iris = datasets.load_iris()
X = iris.data
y = iris.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)


datasets.dump_svmlight_file(X_train, y_train, 'dtrain.svm', zero_based=True)
datasets.dump_svmlight_file(X_test, y_test, 'dtest.svm', zero_based=True)
dtrain_svm = xgb.DMatrix('dtrain.svm')
dtest_svm = xgb.DMatrix('dtest.svm')

param = {
    'max_depth': 3,  # the maximum depth of each tree
    'eta': 0.3,  # the training step for each iteration
    'silent': 1,  # logging mode - quiet
    'objective': 'multi:softprob',  # error evaluation for multiclass training
    'num_class': 3}  # the number of classes that exist in this datset
num_round = 20  # the number of training iterations

bst = xgb.train(param, dtrain, num_round)

st.write(bst)
bst.dump_model('dump.raw.txt')

preds = bst.predict(dtest)
st.write(preds)

best_preds = np.asarray([np.argmax(line) for line in preds])
st.write(best_preds)