In [1]:
from metrics import *
from bootstrap import *
from metric_classes import *

<function condition_number_xtx at 0x7f1c540d6048>


In [2]:
str(Cp())

"Mallow's Cp"

In [3]:
c = Condition_Number()
#c.evaluate(model)

In [109]:
class EvaluateInfo():
    '''
    Abstract class.
    Use Static EvaluateInfo or DynamicEvaluateInfo.
    '''
    def __init__(self, model, metrics = [], comparisons = [], characteristics = []):
        '''
        Creates an EvaluateInfo object. This object is used to calculate the quality of a model 
        and the distribution of the quality using bootstrap.
        model - the model to use
        metrics - desired metrics are objects of class Metric or with the same interface(see Metric)
        comparisons - desired comparisons are objects of class Comparison or with the same interface(see Comparison)
        characteristics - desired characteristics are objects of class Сharacteristic
        or with the same interface(see Characteristic)
        '''
        self.model = model
        self.characteristics = characteristics # characteristic of the fitted model itself
        self.metrics = metrics # compares y and y_pred
        self.comparisons = comparisons #compares y, y_pred, y_pred_with_reduced_features
        
        all_names = [str(el) for el in metrics] +\
                     [str(el) for el in comparisons] +\
                     [str(el) for el in characteristics]
        
        self.names = dict(zip(all_names, [i for i in range(len(all_names))]))
        #print(self.names)
        
    def _calcQuality(self):
        '''
        Calculates the value of all metrics on test sample
        '''
        
        model = self.model
        for (m, mask) in enumerate(self.masks):
            reduced_X_test = self.X_test[:, mask]
            model.fit(self.X_train.T[mask].T, self.y_train)
            for (i, metric) in enumerate(self.metrics):
                self.quality[i][m] = metric.evaluate(model, reduced_X_test, self.y_test)

            for (i, comp) in enumerate(self.comparisons):
                ind = i + len(self.metrics)
                self.quality[ind][m] = comp.evaluate(self.full, model, self.X_test,
                                                                 reduced_X_test, self.y_test)
            for (i, char) in enumerate(self.characteristics):
                ind = i + len(self.metrics) + len(self.comparisons)
                self.quality[ind][m] = char.evaluate(model)
        
    def fit(self, X_train, y_train, X_test, y_test, masks = None, n_samples=20, len_sample=None):
        '''
        X_train - train features. 2D numpy array or list
        
        y_train - train answers. 2D numpy array or list
        
        X_test - test features. 2D numpy array or list
        
        y_test - test answers. 2D numpy array or list
        
        masks - sets of desired features. List or numpy array of lists or numpy arays. 
        Each list/numpy array represents a boolean mask of taken features.
        If None, one mask containing all features will be used.
        
        n_samples - number of bootstrap samples to generate. Default is 20.
        
        len_sample - length of each bootstrap sample
        If None, it will be equal to the number of features.
        '''
        
        self.len_sample = len_sample
        
        self.n_samples = n_samples

        self.X_train = np.array(X_train)
        self.y_train = np.array(y_train)

        self.X_test = np.array(X_test)
        self.y_test = np.array(y_test)
        
        if masks is None:
            masks = np.ones((1, self.X_train.shape[1]), dtype=bool)
        
        self.masks = np.array(masks, dtype=bool)
        
        # model with all features
        self.full = deepcopy(self.model.fit(self.X_train, self.y_train))
        
        self.result = np.zeros((len(self.metrics) + len(self.comparisons) 
                                + len(self.characteristics), len(self.masks), self.n_samples))
        
        self.quality = np.zeros((len(self.metrics) + len(self.comparisons) 
                                + len(self.characteristics), len(self.masks)))
        self._calcQuality()
        
    
    
    def get_quality(self):
        '''Returns the value of metrics'''
        return self.quality
    
    def get_result(self):
        '''Returns the value of metrics generated with bootstrap'''
        return self.result
    
    def _get_mapped(self, array):
        '''Returns the value of something mapped to names'''
        return dict(zip(self.names.keys(), list(array)))
    
    def get_mapped_quality(self):
        '''Returns the value of metrics mapped to their names'''
        return self._get_mapped(self.quality)
    
    def get_mapped_result(self):
        '''Returns the value of metrics generated with bootstrap mapped to their names'''
        return self._get_mapped(self.result)
    
    def _by_name(self, name, array):
        '''Return element of array corresponding to the name'''
        if name in self.names:
            return array[self.names[name]]
        else:
            raise NameError('Invalid argument')
        
    def quality_by_name(self, name):
        '''Return quality of a particular metric'''
        return self._by_name(name, self.quality)
            
    def result_by_name(self, name):
        '''Return bootstrap quality of a particular metric'''
        return self._by_name(name, self.result)
            
    

In [110]:
from copy import deepcopy
class EvaluateStaticInfo(EvaluateInfo):
    '''
    Computes the values and distributions of metrics using bootstrap.
    A model is fit on one train sample, but the quality is evaluated on bootstrap test samples
    '''
    def __init__(self, model, metrics = [], comparisons = []):
        super(EvaluateStaticInfo, self).__init__(model, metrics, comparisons, [])
    
    def __eval(self):
        '''
        Computes the bootstrap result
        '''
        model = self.model
        
        sample_X, sample_y = self.boot.values()
        
        self.models = []
        for mask in self.masks:
            self.models += [deepcopy(model.fit(self.X_train[:, mask], self.y_train))]
        
        for (m, mask) in enumerate(self.masks):
            for it in range(self.n_samples):
                reduced_X_cur = (sample_X[it].T[mask]).T
                for (i, metric) in enumerate(self.metrics):
                    self.result[i][m][it] = metric.evaluate(self.models[m], reduced_X_cur, sample_y[it])
                for(i, comp) in enumerate(self.comparisons):
                    ind = i + len(self.metrics)
                    self.result[ind][m][it] = comp.evaluate(self.full, self.models[m], sample_X[it], 
                                                              reduced_X_cur, sample_y[it])

    def fit(self, X_train, y_train, X_test, y_test, masks = None, n_samples=20, len_sample=None):
        '''See EvaluateInfo - fit'''
        super(EvaluateStaticInfo, self).fit(X_train, y_train, X_test, y_test, masks, n_samples, len_sample)
        if self.len_sample is None:
            self.len_sample = len(X_test)
        self.boot = Bootstrap(self.X_test, self.y_test, self.n_samples, self.len_sample)
        self.__eval()
        

In [111]:
from copy import deepcopy
class EvaluateDynamicInfo(EvaluateInfo):
    '''
    Computes the values and distributions of metrics using bootstrap.
    A model is fit on bootstrap train samples, but the quality is evaluated on one test sample
    '''
    def __init__(self, model, metrics = [], comparisons = [], characteristics = []):
        super(EvaluateDynamicInfo, self).__init__(model, metrics, comparisons, characteristics)
    
    def __eval(self):
        '''
        Computes the bootstrap result
        '''
        
        model = self.model
        
        sample_X, sample_y = self.boot.values()
            
        for (m, mask) in enumerate(self.masks):
            reduced_X_test = (self.X_test.T[mask]).T
            for it in range(self.n_samples):
                model.fit((sample_X[it].T[mask]).T, sample_y[it])
                for (i, metric) in enumerate(self.metrics):
                    #print(i)
                    self.result[i][m][it] = metric.evaluate(model, reduced_X_test, self.y_test)
                
                for (i, comp) in enumerate(self.comparisons):
                    ind = i + len(self.metrics) 
                    #print(i)
                    self.result[ind][m][it] = comp.evaluate(self.full, model, self.X_test,
                                                              reduced_X_test, self.y_test)
                for (i, char) in enumerate(self.characteristics):
                    ind = i + len(self.metrics) + len(self.comparisons)
                    #print(i)
                    self.result[ind][m][it] = char.evaluate(model)

                
                

    def fit(self, X_train, y_train, X_test, y_test, masks = None, n_samples=20, len_sample=None):
        '''See EvaluateInfo - fit'''
        super(EvaluateDynamicInfo, self).fit(X_train, y_train, X_test, y_test, masks, n_samples, len_sample)
        if self.len_sample is None:
            self.len_sample = len(X_test)
        self.boot = Bootstrap(self.X_test, self.y_test, self.n_samples, self.len_sample)
        self.__eval()
    

In [112]:
from sklearn.linear_model import LinearRegression
mo = LinearRegression()

In [113]:
import scipy.stats as sps

XX = sps.norm.rvs(size=(10, 5))
X2 = sps.norm.rvs(size=(10, 5))
zz = sps.uniform.rvs(size=(5, 3))
yy = XX @ zz + sps.norm.rvs(scale=0.1, size=(10, 3))
y2 = X2 @ zz + sps.norm.rvs(scale=0.1, size=(10, 3))
maskk = np.ones(XX.shape[1])
maskk[2]=0
maskk[1] = 0
maskk[3] = 0
print(maskk)

metric1 = RSS()
metric2 = Adjusted_Determination_Coefficient()
metric3 = Determination_Coefficient()
metric4 = VIF()
metric5 = BIC()
char1 = Condition_Number()
comp1 = Cp()

[ 1.  0.  0.  0.  1.]


In [114]:
info = EvaluateDynamicInfo(mo, [metric1, metric2, metric3, metric4, metric5],
                           [comp1])#, [char1])

In [115]:
info.fit(list(XX), list(yy), list(X2), list(y2))#, masks=[np.ones(len(XX.T)), maskk])

In [116]:
info.get_result()

array([[[  7.58032500e-01,   3.53191860e+00,   1.03386878e+01,
           1.13917138e-01,   2.32691077e-01,   4.32210875e-01,
           1.64686009e-01,   3.87500582e+04,   1.90631139e-01,
           1.12072021e-01,   1.04633536e-01,   1.56497193e+00,
           5.74458519e+00,   1.46676425e+00,   4.40350527e-01,
           5.95452971e+00,   1.56497193e+00,   3.08889312e+02,
           1.79306552e-01,   2.17911436e-01]],

       [[  9.88312261e-01,   9.45543044e-01,   8.40592741e-01,
           9.98243566e-01,   9.96412248e-01,   9.93335948e-01,
           9.97460786e-01,  -5.96468528e+02,   9.97060750e-01,
           9.98272015e-01,   9.98386706e-01,   9.75870450e-01,
           9.11427001e-01,   9.77384667e-01,   9.93210447e-01,
           9.08189967e-01,   9.75870450e-01,  -3.76261588e+00,
           9.97235358e-01,   9.96640128e-01]],

       [[  9.84972907e-01,   9.29983914e-01,   7.95047810e-01,
           9.97741728e-01,   9.95387176e-01,   9.91431933e-01,
           9.96735296e

In [117]:
info.get_quality()

array([[  0.76805396],
       [  0.98815774],
       [  0.98477424],
       [ 84.44337652],
       [ 12.28097943],
       [  1.        ]])

In [118]:
info.result_by_name("Mallow's Cp")

array([[  9.86952136e-01,   4.59852923e+00,   1.34608872e+01,
          1.48319185e-01,   3.02961886e-01,   5.62735036e-01,
          2.14419842e-01,   5.04522599e+04,   2.48200189e-01,
          1.45916858e-01,   1.36232011e-01,   2.03758069e+00,
          7.47940309e+00,   1.90971510e+00,   5.73332797e-01,
          7.75274915e+00,   2.03758069e+00,   4.02171366e+02,
          2.33455669e-01,   2.83718914e-01]])

In [119]:
info.get_mapped_result()

{'Adj Det Coef': array([[  7.58032500e-01,   3.53191860e+00,   1.03386878e+01,
           1.13917138e-01,   2.32691077e-01,   4.32210875e-01,
           1.64686009e-01,   3.87500582e+04,   1.90631139e-01,
           1.12072021e-01,   1.04633536e-01,   1.56497193e+00,
           5.74458519e+00,   1.46676425e+00,   4.40350527e-01,
           5.95452971e+00,   1.56497193e+00,   3.08889312e+02,
           1.79306552e-01,   2.17911436e-01]]),
 'BIC': array([[  9.86952136e-01,   4.59852923e+00,   1.34608872e+01,
           1.48319185e-01,   3.02961886e-01,   5.62735036e-01,
           2.14419842e-01,   5.04522599e+04,   2.48200189e-01,
           1.45916858e-01,   1.36232011e-01,   2.03758069e+00,
           7.47940309e+00,   1.90971510e+00,   5.73332797e-01,
           7.75274915e+00,   2.03758069e+00,   4.02171366e+02,
           2.33455669e-01,   2.83718914e-01]]),
 'Det Coef': array([[   0.98497291,    0.92998391,    0.79504781,    0.99774173,
            0.99538718,    0.99143193,    0.