This notebook walks through the MIP API on a classification project to determine how we might use it.

We want to provide sensible defaults as well. We'll make some assumptions around cut points etc.

https://www.berkustun.com/docs/ustun_2017_optimized_risk_scores.pdf

To build a proper benchmark, all of the input features need to be binary!
The paper provides pre-processed datasets here: https://github.com/ustunb/risk-slim/tree/master/examples/data

The question really is how do we learn these automatically without custom preprocessing?

In [235]:
from sklearn.base import ClassifierMixin
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import SGDClassifier
import tabulate
import math
import numpy as np

In [3]:
# Baseline linear classifier trained with SGD, using an elastic-net penalty.
# NOTE(review): no random_state is passed, so the scores below presumably
# change from run to run -- confirm and consider fixing a seed.
svm_mod = SGDClassifier(penalty="elasticnet")

In [4]:
# Load the breast cancer dataset; the cells below read the feature matrix from
# `data.data` and the labels from `data.target`.
data = load_breast_cancer()

In [5]:
# Fit on the full dataset and score on the very same data: there is no
# hold-out split, so this number is a training-set score.
svm_mod.fit(data.data, data.target)
svm_mod.score(data.data, data.target)

0.8014059753954306

In [6]:
# Try rescaling and rounding all of the coefficients (and the intercept).

In [7]:
# Rescale so the largest-magnitude coefficient maps to +/-5, then round every
# weight (and the intercept) to the nearest integer and re-score.
# The scale is max |coef|, not max coef: with a plain max, a dominant negative
# weight would land outside the +/-5 range, and a non-positive max would flip
# every sign (or divide by zero).
coef_scaling = np.max(np.abs(svm_mod.coef_))

svm_mod.intercept_ = np.round((svm_mod.intercept_ / coef_scaling) * 5)
svm_mod.coef_ = np.round((svm_mod.coef_ / coef_scaling) * 5)
svm_mod.score(data.data, data.target)

0.8084358523725835

In [8]:
# Repeat the experiment with the default SGDClassifier settings, printing the
# score before and after the weights are rescaled and rounded.
svm_mod = SGDClassifier()
svm_mod.fit(data.data, data.target)
print(svm_mod.score(data.data, data.target))
# Scale by max |coef| rather than max coef so that a dominant negative weight
# still maps into the +/-5 range and the signs can never be flipped.
coef_scaling = np.max(np.abs(svm_mod.coef_))

svm_mod.intercept_ = np.round((svm_mod.intercept_ / coef_scaling) * 5)
svm_mod.coef_ = np.round((svm_mod.coef_ / coef_scaling) * 5)
print(svm_mod.score(data.data, data.target))

0.9226713532513181
0.8541300527240774


In [9]:
from sklearn.base import ClassifierMixin

In [18]:
class RescaledRounding(ClassifierMixin):
    """SGDClassifier wrapper whose weights are rescaled and rounded in place.

    After every (partial) fit the wrapped model's ``coef_`` and ``intercept_``
    are divided by the largest absolute coefficient, multiplied by ``kambda``
    and rounded, so every coefficient becomes an integer value in
    ``[-kambda, kambda]``. The rounded values overwrite the wrapped model's
    weights, so a later ``partial_fit`` continues from the rounded weights.

    Parameters
    ----------
    kambda : int, default=5
        Largest absolute value a rounded coefficient can take.
    **kwargs
        Forwarded unchanged to ``SGDClassifier``.
    """

    def __init__(self, kambda=5, **kwargs):
        self.kambda = kambda
        self.model = SGDClassifier(**kwargs)

    def rescale(self):
        """Overwrite the wrapped model's weights with rescaled, rounded values."""
        coef_scaling = np.max(np.abs(self.model.coef_))
        if coef_scaling == 0:
            # Every coefficient is zero: dividing would turn the weights into
            # NaN/inf. Fall back to a scale of 1 so they stay finite.
            coef_scaling = 1.0
        self.model.coef_ = np.round((self.model.coef_ / coef_scaling) * self.kambda)
        self.model.intercept_ = np.round((self.model.intercept_ / coef_scaling) * self.kambda)

    def fit(self, X, y):
        """Fit the wrapped model on ``(X, y)``, round its weights, return ``self``."""
        self.model.fit(X, y)
        self.rescale()
        return self

    def partial_fit(self, X, y, classes=None):
        """Run one incremental update, round the weights, return ``self``.

        ``classes`` is forwarded to ``SGDClassifier.partial_fit``, which needs
        it on the first call of a model that has not been fitted yet.
        """
        self.model.partial_fit(X, y, classes=classes)
        self.rescale()
        return self

    def predict(self, X):
        """Predict labels for ``X`` with the wrapped (rounded) model."""
        return self.model.predict(X)
               

In [19]:
class NaiveRounding(ClassifierMixin):
    """SGDClassifier wrapper whose weights are clipped and rounded in place.

    After every (partial) fit the wrapped model's ``coef_`` and ``intercept_``
    are clipped to ``[-kambda, kambda]`` and rounded to the nearest integer
    value, without any rescaling. The rounded values overwrite the wrapped
    model's weights, so a later ``partial_fit`` continues from them.

    Parameters
    ----------
    kambda : int, default=5
        Clipping bound applied to every weight before rounding.
    **kwargs
        Forwarded unchanged to ``SGDClassifier``.
    """

    def __init__(self, kambda=5, **kwargs):
        self.kambda = kambda
        self.model = SGDClassifier(**kwargs)

    def rescale(self):
        """Clip the wrapped model's weights to ``[-kambda, kambda]`` and round them."""
        bound = self.kambda
        self.model.coef_ = np.round(np.clip(self.model.coef_, -bound, bound))
        self.model.intercept_ = np.round(np.clip(self.model.intercept_, -bound, bound))

    def fit(self, X, y):
        """Fit the wrapped model on ``(X, y)``, round its weights, return ``self``."""
        self.model.fit(X, y)
        self.rescale()
        return self

    def partial_fit(self, X, y, classes=None):
        """Run one incremental update, round the weights, return ``self``.

        ``classes`` is forwarded to ``SGDClassifier.partial_fit``, which needs
        it on the first call of a model that has not been fitted yet.
        """
        self.model.partial_fit(X, y, classes=classes)
        self.rescale()
        return self

    def predict(self, X):
        """Predict labels for ``X`` with the wrapped (rounded) model."""
        return self.model.predict(X)

In [167]:
# Compare the two rounding strategies: fit each wrapper on the full dataset and
# print its score on that same data (rescaled rounding first, then naive).
mod1 = RescaledRounding()
mod2 = NaiveRounding()

for rounding_model in (mod1, mod2):
    rounding_model.fit(data.data, data.target)
    print(rounding_model.score(data.data, data.target))


0.6836555360281195
0.8875219683655536


In [234]:
# Inspect the layout of the wrapped model's coefficient array
# (the output below shows one row of 30 weights).
mod1.model.coef_.shape

(1, 30)

In [256]:
class IntegerSGDClassifier(ClassifierMixin):
    """
    Converts SGDClassifier to integer weights, using a naive/simple
    rescaling scheme. This may not work optimally when updating
    using minibatches, especially if it is just one point at a time.

    The wrapped ``SGDClassifier`` keeps its floating-point weights for
    training. After every (partial) fit an integer-valued version is derived
    by ``rescale`` and stored in ``integer_coef_`` / ``integer_intercept``;
    ``predict`` uses that integer version.

    Parameters
    ----------
    kambda : int, default=5
        Largest absolute value of an integer coefficient (rescaling mode), or
        the clipping bound applied to every weight (naive mode).
    kambda_naive : bool, default=False
        If True, clip-and-round the weights instead of rescaling them.
    column_names : list of str, optional
        Feature names used by ``score_table``.
    **kwargs
        Forwarded unchanged to ``SGDClassifier``.
    """

    def __init__(self, kambda=5, kambda_naive=False, column_names=None, **kwargs):
        self.kambda = kambda
        self.kambda_naive = kambda_naive
        self.column_names = column_names
        self.model = SGDClassifier(**kwargs)

    def score_table(self, column_names=None):
        """Print the non-zero integer weights followed by the negated intercept.

        Parameters
        ----------
        column_names : list of str, optional
            If given, replaces the stored ``column_names``. When no names are
            available, placeholders ``c0, c1, ...`` are used for this call.

        Raises
        ------
        AssertionError
            If the wrapped model was not fitted on exactly two classes.
        """
        assert len(self.model.classes_) == 2, "Scoring Table only implemented for Binary Classification"
        if column_names is not None:
            self.column_names = column_names

        names = self.column_names
        if names is None:
            # Placeholders are generated per call and not stored, so they
            # cannot go stale if the model is refit on a different width.
            names = [f"c{i}" for i in range(self.model.coef_.shape[1])]

        table = [
            [names[idx], val]
            for idx, val in enumerate(self.integer_coef_.flatten())
            if val != 0
        ]

        # Last, unlabeled row: the negated intercept. Adding 0.0 turns a
        # negative zero into a plain zero so it is not printed as "-0".
        table.append(["", -self.integer_intercept[0] + 0.0])
        print(tabulate.tabulate(table))

    def rescale(self, naive=False):
        """Derive integer weights from the wrapped model's current weights.

        The wrapped model is left untouched; results are stored in
        ``integer_coef_`` and ``integer_intercept``.

        Parameters
        ----------
        naive : bool, default=False
            If True, clip the weights to ``[-kambda, kambda]`` and round.
            Otherwise divide by the largest absolute coefficient, multiply by
            ``kambda`` and round.
        """
        coef_ = self.model.coef_
        intercept_ = self.model.intercept_
        if naive:
            coef_ = np.round(np.clip(coef_, -self.kambda, self.kambda))
            intercept_ = np.round(np.clip(intercept_, -self.kambda, self.kambda))
        else:
            coef_scaling = np.max(np.abs(coef_))
            if coef_scaling == 0:
                # Every coefficient is zero: dividing would turn the weights
                # into NaN/inf. Fall back to a scale of 1 to keep them finite.
                coef_scaling = 1.0
            coef_ = np.round((coef_ / coef_scaling) * self.kambda)
            intercept_ = np.round((intercept_ / coef_scaling) * self.kambda)

        self.integer_coef_ = coef_
        self.integer_intercept = intercept_

    def fit(self, X, y):
        """Fit the wrapped model on ``(X, y)``, refresh the integer weights, return ``self``."""
        self.model.fit(X, y)
        self.rescale(self.kambda_naive)
        return self

    def partial_fit(self, X, y, classes=None):
        """Run one incremental update, refresh the integer weights, return ``self``.

        ``classes`` is forwarded to ``SGDClassifier.partial_fit``, which needs
        it on the first call of a model that has not been fitted yet.
        """
        self.model.partial_fit(X, y, classes=classes)
        self.rescale(self.kambda_naive)
        return self

    def predict(self, X):
        """Predict labels for ``X`` using the integer weights.

        The wrapped model's floating-point weights are swapped out for the
        duration of the call and always restored afterwards, even if the
        wrapped ``predict`` raises.
        """
        float_coef = self.model.coef_
        float_intercept = self.model.intercept_
        # Kept for backward compatibility: earlier versions exposed copies of
        # the floating-point weights under these names after predicting.
        self.coef_ = float_coef.copy()
        self.intercept_ = float_intercept.copy()

        self.model.coef_ = self.integer_coef_
        self.model.intercept_ = self.integer_intercept
        try:
            return self.model.predict(X)
        finally:
            # Restore the trainable weights so further (partial) fitting is
            # unaffected by prediction.
            self.model.coef_ = float_coef
            self.model.intercept_ = float_intercept
               

In [257]:
# Fit the integer-weight wrapper with its defaults (kambda=5, rescaling mode)
# and print its score on the training data. `predict` swaps in the integer
# weights, so the score reflects the rounded model.
mod1 = IntegerSGDClassifier()
mod1.fit(data.data, data.target)
print(mod1.score(data.data, data.target))


0.9138840070298769


In [258]:
# Print the non-zero integer weight of each column (placeholder names c0, c1,
# ... since no column names were given); the unlabeled last row is the negated
# integer intercept.
mod1.score_table()

---  --
c0    1
c1    1
c2    5
c3    1
c13  -2
c20   1
c22   5
c23  -2
     -1
---  --
