In [None]:
%reload_ext autoreload
%autoreload 2

In [None]:
#| default_exp core

# Core

> A simple API for automating tabular data analysis.

In [None]:
#|hide
#|export
from nbdev.showdoc import *
from fastcore.all import *
from sklearn.datasets import load_breast_cancer, load_diabetes
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
import numpy as np
import xgboost as xgb

## Regression
The base object in this section is `Regressor`. Its basic methods (modelled on the scikit-learn API) are `train`, `predict`, and `score`.

In [None]:
#|export
class Regressor:
    "Base regressor class: holds out a test split of `X`/`y`, fits `model` on the rest and scores it by RMSE"
    
    def __init__(self, 
                 X: 'numpy.ndarray', # Predictor variables
                 y: 'numpy.ndarray', # Target variable
                 model=None, # Model form to train; a fresh `xgb.XGBRegressor()` when None
                 fixed_seed=False, # Random or fixed state variable
                 test_size=0.2): # Train-test split portion
        self.X = X
        self.y = y
        # honour the caller's split portion instead of pinning it to 0.2
        self.test_size = test_size
        # set seed: 42 when a fixed seed is requested, otherwise a random integer in [0, 100)
        if fixed_seed: self.seed = 42
        else: self.seed = np.random.randint(low=0, high=100)
        # split data
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(X, y, 
                                                                                test_size=self.test_size,
                                                                                random_state=self.seed)
        # instantiate model: build the default here, per instance, because a default
        # created in the signature is evaluated once and shared by every Regressor
        self.model = xgb.XGBRegressor() if model is None else model
            
    def rmse(self, pred, true):
        "Root mean squared error between predictions `pred` and ground truth `true`"
        # the metric takes (y_true, y_pred); the squared error is the same either way round
        return np.sqrt(mse(true, pred))
        
    def train(self):
        "Train the model on the training split and return whatever `fit` returns"
        return self.model.fit(self.X_train, self.y_train)
    
    def predict(self):
        "Return predictions for model on the held-out test split"
        return self.model.predict(self.X_test)
    
    def score(self):
        "Score the model according to chosen evaluation metric (test-split RMSE); retrains on every call"
        # NOTE(review): relies on `fit` returning the fitted estimator (sklearn convention)
        self.model = self.train()
        preds = self.predict()
        return self.rmse(preds, self.y_test)
        
    def __str__(self): 
        return f"Regressor: {len(self.X_train)} training, {len(self.X_test)} testing. \nModel: {self.model}"

The majority of work we would normally need to do (creating an evaluation set, training the model, making predictions, and scoring the performance) is handled in one step with `Regressor`:

In [None]:
# fetch the diabetes dataset that ships with scikit-learn
data = load_diabetes()

# predictors go in X, the target in y
X = data.data
y = data.target

# building the Regressor performs the train/test split; score() then trains the
# model and returns RMSE on the held-out split (the split seed is random here,
# so the number changes from run to run)
reg = Regressor(X, y)
reg.score()

61.37451110412363

You can also print the model out, providing information about the model type, and the number of train/test examples:

In [None]:
# print() goes through Regressor.__str__, which reports the split sizes and the model
print(reg)

Regressor: 353 training, 89 testing. 
Model: XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
             early_stopping_rounds=None, enable_categorical=False,
             eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
             importance_type=None, interaction_constraints='',
             learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
             max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
             missing=nan, monotone_constraints='()', n_estimators=100, n_jobs=0,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, ...)


You can check that a fixed seed gives you the same performance on every run, for reproducibility:

In [None]:
# both regressors split the data with the same fixed random_state (42)
reg1 = Regressor(X, y, fixed_seed=True)
reg2 = Regressor(X, y, fixed_seed=True)

# score each one in turn; the two RMSE values must be equal
score_1 = reg1.score()
score_2 = reg2.score()
test_eq(score_1, score_2)

## Classification
Same idea and methods as `Regressor`.

In [None]:
#|export
class Classifier:
    "Base classifier class: holds out a test split of `X`/`y`, fits `model` on the rest and scores it by accuracy and ROC AUC"
    
    def __init__(self, 
                 X: 'numpy.ndarray', # Predictor variables
                 y: 'numpy.ndarray', # Target variable
                 test_size=0.2, # Train-test split portion
                 model=None, # Model form to train; a fresh `xgb.XGBClassifier()` when None
                 fixed_seed=False): # Random or fixed state variable
        self.X = X
        self.y = y
        # honour the caller's split portion instead of pinning it to 0.2
        self.test_size = test_size
        # set seed (same scheme as `Regressor`): 42 when fixed, otherwise a random integer in [0, 100)
        if fixed_seed: self.seed = 42
        else: self.seed = np.random.randint(low=0, high=100)
        # split data
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(X, y, 
                                                                                test_size=self.test_size,
                                                                                random_state=self.seed)
        # instantiate model: each instance gets its own default estimator
        self.model = xgb.XGBClassifier() if model is None else model
        
    def train(self):
        "Train the model on the training split and return whatever `fit` returns"
        return self.model.fit(self.X_train, self.y_train)
    
    def predict(self):
        "Return `(predicted labels, column-1 probabilities)` for the held-out test split"
        # NOTE(review): taking column 1 of predict_proba assumes a binary target
        # (presumably the positive class) -- confirm before using with more classes
        return self.model.predict(self.X_test), self.model.predict_proba(self.X_test)[:,1]
    
    def score(self):
        "Score the model according to chosen evaluation metric: returns `(accuracy, ROC AUC)`; retrains on every call"
        # NOTE(review): relies on `fit` returning the fitted estimator (sklearn convention)
        self.model = self.train()
        preds, preds_proba = self.predict()
        # both metrics take (y_true, y_pred / y_score)
        return accuracy_score(self.y_test, preds), roc_auc_score(self.y_test, preds_proba)
        
    def __str__(self): 
        return f"Classifier: {len(self.X_train)} training, {len(self.X_test)} testing. \nModel: {self.model}"

It takes just two lines of code to get a high-accuracy classifier: build it, then call `cls.score()`:

In [None]:
# fetch the breast-cancer dataset that ships with scikit-learn
data = load_breast_cancer()

# predictors go in X, the target in y
X = data.data
y = data.target

# use the classifier class: score() trains the model and returns (accuracy, ROC AUC)
cls = Classifier(X, y)
accuracy, auc = cls.score()

# report both metrics rounded to four decimals
print(f"Accuracy = {np.round(accuracy, 4)}")
print(f"AUC = {np.round(auc, 4)}")

Accuracy = 0.9649
AUC = 0.9974
