# `chefboost` - an alternative Python library for tree-based models

## Setup

In [None]:
from chefboost import Chefboost as chef
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
import pandas as pd

## Loading the data and preprocessing

In [None]:
# Read the CSV and rename the "income" column to "Decision" in one chained
# expression; "Decision" is the column name later used for stratification.
X = pd.read_csv("../data/adult.csv").rename(columns={"income": "Decision"})

# Show the first rows as the cell output.
X.head()

In [None]:
# Hold out 20% of the rows for testing; the split is seeded and stratified
# on the "Decision" column.
X_train, X_test = train_test_split(
    X,
    test_size=0.2,
    random_state=42,
    stratify=X["Decision"],
)

## Fitting a `chefboost` model

In [None]:
# Fit a chefboost model on the training frame using the CART algorithm.
config = dict(algorithm="CART")
model = chef.fit(X_train, config=config)

In [None]:
# Predict for the first test row.
# The row is passed as a plain Python list of feature values (label column
# removed) rather than as a pandas Series: integer-position lookups on a
# string-labelled Series are deprecated in recent pandas, and the label is
# not an input feature.
# NOTE(review): assumes chefboost indexes the instance by integer position in
# training-column order - confirm against the generated rules file.
first_row_features = X_test.iloc[0].drop("Decision").tolist()
prediction = chef.predict(model, first_row_features)
prediction

In [None]:
# Evaluate the fitted model on the held-out frame; `task` is passed as "test".
evaluation = chef.evaluate(
    model,
    X_test,
    task="test",
)

In [None]:
# Path of the rules file passed to chefboost's feature_importance.
rules = "outputs/rules/rules.py"

# Compute the importances, then index the resulting frame by feature name.
importance_frame = chef.feature_importance(rules)
fi = importance_frame.set_index("feature")
fi

In [None]:
# Horizontal bar chart of the importances; the trailing semicolon suppresses
# the textual repr of the returned object in the notebook output.
fi.plot(
    kind="barh",
    title="Feature Importance",
);

## Comparison with `scikit-learn`

In [None]:
# One-hot encode the frame for scikit-learn, dropping the first level of
# every encoded column.
X_sk = pd.get_dummies(X, drop_first=True)

# Remove the encoded target column from the feature matrix and keep it as y.
y_sk = X_sk.pop("Decision_>50K")

# Same split settings as the chefboost split: 20% test, seed 42, stratified
# on the target.
X_train_sk, X_test_sk, y_train_sk, y_test_sk = train_test_split(
    X_sk,
    y_sk,
    test_size=0.2,
    random_state=42,
    stratify=y_sk,
)

In [None]:
%time

tree = DecisionTreeClassifier()
tree.fit(X_train_sk, y_train_sk)

In [None]:
# Display the `max_depth` attribute of the fitted estimator's underlying
# `tree_` object as the cell output.
tree.tree_.max_depth

## Speed up the training with parallel execution

In [None]:
# Refit the CART model with the `enableParallelism` option switched on.
config = dict(algorithm="CART", enableParallelism=True)
model = chef.fit(X_train, config=config)