In [1]:
import warnings
import numpy as np
import pandas as pd
import scipy.stats as st
import statsmodels.api as sm
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns


# Notebook-wide plotting defaults: 16 x 12 figure size and the ggplot theme.
plt.rcParams['figure.figsize'] = (16.0, 12.0)
plt.style.use('ggplot')


### Statistical Consensus Engine

In [None]:
# scipy.stats distribution families considered as candidate
# discriminative statistical models (DSM).
discriminative_statistical_models = [
    st.powerlaw,
    st.gamma,
    st.pareto,
    st.powerlognorm,
]

In [6]:
def freedman_diaconis_rule(data):
    """Suggest a histogram bin count with the Freedman-Diaconis rule.

    The bin width is ``h = 2 * IQR * n ** (-1/3)`` and the bin count is
    ``ceil((max - min) / h)``.

    Parameters
    ----------
    data : array-like
        Numeric sample. It is converted to a float array and flattened, so a
        sample stored as a row or column vector is treated as one sample.

    Returns
    -------
    int
        ``1`` when fewer than two observations are given, the number of
        observations when the bin width ``h`` is zero (zero inter-quartile
        range), and ``ceil((max - min) / h)`` otherwise.
    """
    values = np.asarray(data, dtype=float).ravel()
    n = values.size
    if n < 2:
        return 1

    q75, q25 = np.percentile(values, [75, 25])
    # Float literals keep the exponent at -1/3 even under integer division.
    h = 2 * (q75 - q25) * n ** (-1.0 / 3.0)

    if h == 0:
        # Zero inter-quartile range: the rule gives no usable width, so fall
        # back to one bin per observation.
        return n
    return int(np.ceil((values.max() - values.min()) / h))

In [14]:
def DSM_selection(data,
                  distributions,
                  bins):
    """Select the candidate distribution whose fitted PDF best matches the data.

    A density-normalised histogram of ``data`` is built, every candidate is
    fitted through ``maximum_likelyhood_estimation`` and its PDF is evaluated
    at the bin centres. The candidate with the smallest sum of squared errors
    (SSE) against the histogram wins.

    Parameters
    ----------
    data : array-like
        Sample to model.
    distributions : iterable
        Candidate distributions; each must expose
        ``pdf(x, *shape_params, loc=..., scale=...)``.
    bins : int or sequence
        Forwarded to ``numpy.histogram``.

    Returns
    -------
    tuple
        ``(distribution, arg, loc, scale)`` of the best candidate. When no
        candidate produces a finite SSE the Gaussian default
        ``(st.norm, None, 0.0, 1.0)`` is returned.

    Notes
    -----
    A candidate whose fit or PDF evaluation raises is skipped with a
    ``RuntimeWarning`` instead of being silently ignored.
    """
    density, edges = np.histogram(a=data,
                                  bins=bins,
                                  density=True)
    # Evaluate the PDFs at the centre of each histogram bin.
    centers = (edges[:-1] + edges[1:]) / 2.0

    # By default, we suppose that the data fits a gaussian model
    best_distribution = st.norm
    best_arg, best_loc, best_scale = (None, 0.0, 1.0)
    best_sse = np.inf

    for distribution in distributions:
        try:
            arg, loc, scale = maximum_likelyhood_estimation(data, distribution)
            pdf = distribution.pdf(centers, *arg, loc=loc, scale=scale)
            # Sum of squared errors between the histogram and the fitted PDF.
            sse = np.sum(np.power(density - pdf, 2.0))
        except Exception as error:
            # Best effort: one failing candidate must not abort the selection,
            # but the failure is reported so real bugs are not hidden.
            warnings.warn(
                "DSM_selection: skipping %s (%s: %s)"
                % (getattr(distribution, 'name', distribution),
                   type(error).__name__, error),
                RuntimeWarning,
                stacklevel=2)
            continue

        # A NaN or infinite SSE never satisfies this comparison, so such
        # candidates are never selected.
        if best_sse > sse:
            best_distribution = distribution
            best_arg = arg
            best_loc = loc
            best_scale = scale
            best_sse = sse

    return best_distribution, best_arg, best_loc, best_scale
        
        
        

In [2]:
def maximum_likelyhood_estimation(data,
                                  distribution):
    """Fit ``distribution`` to ``data`` and split the fitted parameters.

    Parameters
    ----------
    data : array-like
        Sample passed unchanged to ``distribution.fit``.
    distribution : object with a ``fit(data)`` method
        ``fit`` must return a flat parameter sequence whose last two entries
        are the location and the scale, as ``scipy.stats`` continuous
        distributions do.

    Returns
    -------
    tuple
        ``(arg, loc, scale)`` where ``arg`` holds every fitted parameter
        before the location (empty when the distribution has no shape
        parameter).
    """
    params = distribution.fit(data)

    arg = params[:-2]   # distribution parameter vector
    loc = params[-2]    # location parameter
    scale = params[-1]  # scale parameter

    return arg, loc, scale

In [16]:
def pack_params(arg,loc,scale):
    """Flatten ``(arg, loc, scale)`` into one positional-argument tuple.

    Parameters
    ----------
    arg : tuple, list, numpy.ndarray, scalar or None
        Shape parameter(s). ``None`` means "no shape parameter" and
        contributes nothing; sequences are flattened; a scalar is kept as a
        single entry.
    loc, scale : scalar
        Appended after the shape parameters (a tuple or list given here is
        flattened as well).

    Returns
    -------
    tuple
        ``(*arg, loc, scale)``.
    """
    params = []
    for item in (() if arg is None else arg, loc, scale):
        if isinstance(item, np.ndarray):
            # Arrays are unpacked into plain Python scalars.
            item = item.ravel().tolist()
        if isinstance(item, (tuple, list)):
            params.extend(item)
        else:
            params.append(item)
    return tuple(params)

In [5]:
def kolmogorov_smirnov_test(distribution,
                            data,
                            arg,
                            loc,
                            scale, alpha):
    """Run a two-sided one-sample Kolmogorov-Smirnov test of ``data``.

    Parameters
    ----------
    distribution : scipy.stats distribution, str or callable
        Reference distribution. An object exposing a ``cdf`` method is
        tested through that method; a distribution name or a plain CDF
        callable is forwarded to ``scipy.stats.kstest`` unchanged.
    data : array-like
        Observed sample.
    arg : tuple or None
        Shape parameters of ``distribution``; ``None`` is treated as "no
        shape parameter".
    loc, scale : float
        Location and scale of ``distribution``.
    alpha : float
        Significance level.

    Returns
    -------
    bool
        ``True`` when ``p_value >= alpha``, i.e. H0 (the score distribution
        is discriminative) is not rejected; ``False`` otherwise.
    """
    args = pack_params(() if arg is None else arg, loc, scale)

    # kstest evaluates ``cdf(x, *args)``. A scipy.stats distribution object is
    # callable, but calling it freezes the distribution instead of evaluating
    # the CDF, so the bound ``cdf`` method has to be handed over.
    cdf = getattr(distribution, 'cdf', distribution)

    _, p_value = st.kstest(data,
                           cdf,
                           args=args,
                           alternative='two-sided')

    return bool(p_value >= alpha)


In [10]:
def DSM_params_check(distribution,
                     arg):
    """Check a fitted shape parameter against the range accepted per family.

    Rules, tried in this order:

    * ``st.powerlaw``     -- ``0 < arg[0] < 1``
    * ``st.gamma``        -- ``0 < arg[0] < 15``
    * ``st.pareto``       -- always accepted
    * ``st.powerlognorm`` -- ``0 < arg[1] < 1``

    Returns ``True`` or ``False`` for the families above and ``None`` for any
    other distribution.
    """
    # (family, index of the constrained shape parameter, exclusive upper bound);
    # an index of None means the family is accepted unconditionally.
    rules = (
        (st.powerlaw, 0, 1),
        (st.gamma, 0, 15),
        (st.pareto, None, None),
        (st.powerlognorm, 1, 1),
    )

    for family, position, upper in rules:
        if distribution == family:
            if position is None:
                return True
            return bool(0 < arg[position] < upper)

    return None