In [None]:
# Reload the autoreload extension; mode 2 re-imports all modules before each cell executes,
# so edits to imported library code are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

In [None]:
#| default_exp ML

# Traditional ML

> Using traditional machine learning techniques to solve regression and classification problems.

In [None]:
#|export
from teburu.core import *
from nbdev.showdoc import *
from fastcore.all import *
from sklearn.datasets import load_breast_cancer, load_diabetes
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import accuracy_score
import numpy as np
import pandas as pd
# classification models
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
# regression models
import xgboost as xgb
from sklearn import linear_model
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, VotingRegressor

## Regression Results

In [None]:
#|export
class RegressionResults:
    "Score a fixed set of regression models on a single dataset"

    def __init__(self, 
                 X, # Predictor variables
                 y): # Target variables
        self.X = X
        self.y = y
        # The four base learners.
        self.lr = linear_model.LinearRegression()
        self.dt = DecisionTreeRegressor()
        self.rf = RandomForestRegressor()
        self.xgboost = xgb.XGBRegressor()
        # Voting ensemble assembled from the four base learners above.
        base_learners = [('lr', self.lr), ('dt', self.dt),
                         ('rf', self.rf), ('xgboost', self.xgboost)]
        self.ensemble = VotingRegressor(estimators=base_learners)
        # Order matters: `df_report` pairs these positionally with its name list.
        self.models_list = [self.lr, self.dt, self.rf, self.xgboost, self.ensemble]

    def report(self):
        "Return one `Regressor(...).score()` value (the RMSE) per model, in `models_list` order"
        return [Regressor(self.X, self.y, candidate).score() for candidate in self.models_list]

    def df_report(self):
        "Return the `report` scores as a dataframe with `models` and `RMSE` columns"
        scores = self.report()
        model_names = ["linear_regression", "decision_tree", "random_forest", "xgboost", "ensemble"]
        return pd.DataFrame({'models': model_names, 'RMSE': scores})

With this, it's easy to compare models in increasing order of complexity:

In [None]:
# fetch the diabetes regression dataset
data = load_diabetes()

# separate the predictor matrix from the target vector
X = data.data
y = data.target

# build the model set and collect one score per model
results = RegressionResults(X, y)
rmses = results.report()
rmses

[51.4879171646226,
 71.42395972271427,
 59.789747909287755,
 59.629068259985466,
 53.0686961154079]

We can also get the results as a pandas `DataFrame`:

In [None]:
# df_report() calls report() again internally, so this table is a separate run
# from the `rmses` list shown above and its values can differ from it.
results.df_report()

Unnamed: 0,models,RMSE
0,linear_regression,56.599953
1,decision_tree,81.259537
2,random_forest,57.796375
3,xgboost,62.734929
4,ensemble,54.323364


## Classification Results
Instead of a single metric, classification performance is described here by two metrics: accuracy and ROC AUC score.

In [None]:
#|export
class ClassificationResults:
    "Classification results from model set"
    
    def __init__(self, 
                 X, # Predictor variables
                 y): # Target variables
        self.X, self.y = X, y
        self.lr = linear_model.LogisticRegression()
        self.dt = DecisionTreeClassifier()
        self.rf = RandomForestClassifier()
        self.xgboost = xgb.XGBClassifier()
        # NOTE(review): with voting='hard' scikit-learn's VotingClassifier does not expose
        # predict_proba -- confirm how `Classifier.score` derives the ensemble's AUC.
        self.ensemble = VotingClassifier(estimators=[('lr', self.lr), ('dt', self.dt), 
                                             ('rf', self.rf), ('xgboost', self.xgboost)], voting='hard')
        # Order matters: `df_report` pairs these positionally with its name list.
        self.models_list = [self.lr, self.dt, self.rf, self.xgboost, self.ensemble]
        
    def report(self):
        "Return `(accuracies, aucs)`: two lists with one entry per model, in `models_list` order"
        accuracies, aucs = [], []
        for model in self.models_list:
            clf = Classifier(self.X, self.y, model)
            accuracy, auc = clf.score()
            accuracies.append(accuracy)
            aucs.append(auc)
        return accuracies, aucs
    
    def df_report(self):
        "Return the `report` metrics as a dataframe with `models`, `accuracy` and `auc` columns"
        accuracies, aucs = self.report()
        # The first model is `LogisticRegression`, so it is labelled accordingly
        # (it was previously mislabelled "linear_regression").
        models = ["logistic_regression", "decision_tree", "random_forest", "xgboost", "ensemble"]
        df_dict = {'models': models, 'accuracy': accuracies, 'auc': aucs}
        return pd.DataFrame(df_dict)

We call `df_report()` again to get a dataframe of results:

In [None]:
# fetch the breast-cancer classification dataset
data = load_breast_cancer()

# separate the predictor matrix from the target vector
X = data.data
y = data.target

# build the model set and tabulate each model's metrics
results = ClassificationResults(X, y)
results.df_report()

Unnamed: 0,models,accuracy,auc
0,linear_regression,0.982456,0.998701
1,decision_tree,0.991228,1.0
2,random_forest,0.973684,0.974017
3,xgboost,0.964912,0.998291
4,ensemble,0.982456,1.0
