In [1]:
import numpy as np
import scipy.stats

In [2]:
# Raw report table that the cells below parse and annotate.
# Declared as a raw string (r'''...''') because the rows contain the LaTeX
# marker `\pm`; in a normal string literal `\p` is an invalid escape sequence.
# The resulting string value is the same as before.
text_in = r'''

LightGBM:                                     mean t_int_l t_int_h    std                           values
    Designed features:                                                                                    
        Age group   |   0.626 \pm 0.004   | 0.6323  0.6268  0.6379 0.0040  [0.628 0.630 0.630 0.635 0.638]
        Gender      |   0.875 \pm 0.004   | 0.8798  0.8751  0.8844 0.0033  [0.874 0.879 0.881 0.882 0.882]
        Assessment  |   0.591 \pm 0.003   | 0.5909  0.5872  0.5946 0.0027  [0.588 0.588 0.591 0.593 0.594]
        Retail      |   0.545 \pm 0.001   | 0.5454  0.5439  0.5469 0.0011  [0.544 0.545 0.545 0.546 0.547]
    CPC embeddings:         
        Age group   |   0.595 \pm 0.004   | 0.5939  0.5900  0.5977 0.0028  [0.590 0.593 0.594 0.595 0.598]
        Gender      |   0.848 \pm 0.004   | 0.8572  0.8545  0.8600 0.0020  [0.855 0.856 0.857 0.858 0.860]
        Assessment  |   0.584 \pm 0.004   | 0.5838  0.5790  0.5886 0.0035  [0.578 0.584 0.585 0.586 0.587]
        Retail      |   0.527 \pm 0.001   | 0.5265  0.5252  0.5278 0.0009  [0.525 0.526 0.527 0.527 0.528]
    MeLES embeddings:       
        Age group   |   0.639 \pm 0.006   | 0.6419  0.6372  0.6466 0.0034  [0.636 0.642 0.643 0.644 0.645]
        Gender      |   0.872 \pm 0.005   | 0.8821  0.8801  0.8840 0.0014  [0.880 0.882 0.882 0.882 0.884]
        Assessment  |   0.604 \pm 0.003   | 0.6041  0.5994  0.6088 0.0034  [0.598 0.604 0.605 0.606 0.607]
        Retail      |   0.544 \pm 0.001   | 0.5439  0.5421  0.5457 0.0013  [0.542 0.544 0.544 0.545 0.545]

Scores:                       
    Supervised learning:    
        Age group   |   0.631 \pm 0.010   | 0.6285  0.6172  0.6399 0.0082  [0.619 0.625 0.626 0.631 0.641]
        Gender      |   0.871 \pm 0.007   | 0.8741  0.8654  0.8828 0.0063  [0.865 0.872 0.873 0.879 0.881]
        Assessment  |   0.601 \pm 0.006   | 0.6010  0.5948  0.6072 0.0045  [0.596 0.600 0.600 0.601 0.608]
        Retail      |   0.543 \pm 0.002   | 0.5425  0.5403  0.5447 0.0016  [0.540 0.542 0.542 0.544 0.544]
    CPC fine-tuning:        
        Age group   |   0.621 \pm 0.007   | 0.6210  0.6166  0.6254 0.0032  [0.617 0.618 0.622 0.623 0.625]
        Gender      |   0.873 \pm 0.007   | 0.8777  0.8726  0.8829 0.0037  [0.874 0.875 0.878 0.879 0.883]
        Assessment  |   0.611 \pm 0.005   | 0.6115  0.6047  0.6184 0.0049  [0.603 0.611 0.613 0.615 0.616]
        Retail      |   0.546 \pm 0.003   | 0.5461  0.5429  0.5492 0.0023  [0.542 0.546 0.546 0.547 0.548]
    MeLES fine-tuning:      
        Age group   |   0.643 \pm 0.007   | 0.6383  0.6331  0.6435 0.0037  [0.632 0.637 0.641 0.641 0.641]
        Gender      |   0.888 \pm 0.002   | 0.8959  0.8910  0.9009 0.0036  [0.890 0.896 0.898 0.898 0.898]
        Assessment  |   0.614 \pm 0.003   | 0.6135  0.6095  0.6176 0.0029  [0.608 0.614 0.615 0.615 0.616]
        Retail      |   0.549 \pm 0.001   | 0.5490  0.5479  0.5500 0.0008  [0.548 0.549 0.549 0.550 0.550]
'''


In [3]:
def get_scores(l):
    """Extract the dataset name and the raw score values from one report line.

    A data row is indented by at least eight spaces, carries the dataset name
    before the first ``|`` and lists the individual scores inside the last
    ``[...]`` group after it.

    Parameters
    ----------
    l : str
        One line of the report text.

    Returns
    -------
    tuple
        ``(dataset, scores)`` where ``dataset`` is the stripped name and
        ``scores`` is a 1-D float ``numpy`` array, or ``(None, None)`` when the
        line is not a data row (wrong indentation, no ``|``, or no bracketed
        value list).

    Raises
    ------
    ValueError
        If a token inside the brackets cannot be converted to ``float``.
    """
    # Section headers and blank lines are indented by fewer than eight spaces.
    if l[:8] != ' ' * 8:
        return (None, None)

    dataset, sep, rest = l.partition('|')

    # Locate the bracketed list itself instead of slicing a fixed number of
    # characters, so any number of values with any precision is accepted.
    start = rest.rfind('[')
    end = rest.find(']', start + 1)
    if not sep or start < 0 or end < 0:
        return (None, None)

    scores = np.array([float(x) for x in rest[start + 1:end].split()])
    return dataset.strip(), scores

In [4]:
def t_interval(x, p=0.95, eps=1e-9):
    """Student-t confidence interval around the mean of a sample.

    Parameters
    ----------
    x : numpy.ndarray
        Sample values; must support ``len``, ``.mean()`` and ``.std(ddof=1)``.
    p : float, optional
        Confidence level passed to ``scipy.stats.t.interval``.
    eps : float, optional
        Small constant added to the sample standard deviation, so the scale
        handed to scipy is not exactly zero when the deviation is zero.

    Returns
    -------
    tuple
        ``(low, high)`` bounds of the interval.
    """
    sample_size = len(x)
    degrees_of_freedom = sample_size - 1

    # standard error of the mean, built from the unbiased (ddof=1) deviation
    std_error = (x.std(ddof=1) + eps) / (sample_size ** 0.5)

    return scipy.stats.t.interval(
        p,
        degrees_of_freedom,
        loc=x.mean(),
        scale=std_error,
    )

In [5]:
def get_deltas(dataset, scores_x, baselines):
    """Relative uplift of a row's scores over the baseline of the same dataset.

    Parameters
    ----------
    dataset : str or None
        Dataset name of the row; a falsy value marks a line that is not a
        data row.
    scores_x : numpy.ndarray
        Scores of the row being estimated.
    baselines : mapping
        Maps a dataset name to the baseline score array (``scores_y``) for
        that dataset.

    Returns
    -------
    tuple
        ``(uplift, delta)``, both expressed in percent of the baseline mean:
        ``uplift`` is the mean difference row - baseline and ``delta`` is the
        half-width of its confidence interval. ``(None, None)`` is returned
        when ``dataset`` is falsy.

    Raises
    ------
    KeyError
        If ``dataset`` is truthy but missing from ``baselines``.
    """
    if not dataset:
        return None, None

    scores_y = baselines[dataset]

    # differences for all possible (row score, baseline score) pairs
    # NOTE(review): these pairwise differences share operands, so they are not
    # independent samples; the interval below is presumably narrower than a
    # two-sample interval would be - confirm this is the intended estimate.
    pairs = (scores_x.reshape(-1, 1) - scores_y.reshape(1, -1)).flatten()

    # upper bound of the t interval (t_interval's default confidence level)
    _, t_h = t_interval(pairs)

    # mean uplift for row vs. baseline
    x = pairs.mean()
    # half-width of the confidence interval around `x`
    delta = t_h - x

    # express both numbers in percent of the baseline mean
    baseline_mean = scores_y.mean()
    return x / baseline_mean * 100, delta / baseline_mean * 100

In [6]:
def print_deltas(o):
    """Format the extra columns that are appended to one report line.

    Parameters
    ----------
    o : tuple
        ``(dataset, scores, (uplift_mean, uplift_delta))`` for the line.
        ``dataset`` is falsy for lines that are not data rows.

    Returns
    -------
    str
        ``''`` when ``dataset`` is falsy. Otherwise a string starting with
        ``' | '`` that holds the mean of ``scores`` with its confidence
        half-width (three decimals), followed by the uplift and its
        half-width (percent, one decimal).
    """
    dataset, scores, (uplift_mean, uplift_delta) = o
    if not dataset:
        return ''

    # only the upper bound is needed: half-width = upper bound - mean
    _, t_h = t_interval(scores)
    x = scores.mean()
    d = t_h - x

    # The backslash is escaped explicitly: a bare `\p` is an invalid escape
    # sequence in a non-raw literal. The emitted text is unchanged.
    return ' | '.join([
        '',
        f'{x:.3f} \\pm {d:.3f}',
        f'{uplift_mean:4.1f}% \\pm {uplift_delta:4.1f}%'
    ])

In [7]:
# Parse every input line; lines that are not data rows yield (None, None).
parsed_lines = [(line, get_scores(line)) for line in text_in.split('\n')]

# Baselines come from the data rows found within the first 8 lines of the input.
baselines = {
    dataset: scores
    for _, (dataset, scores) in parsed_lines[:8]
    if dataset
}

# Attach the uplift against the matching baseline to every parsed line.
annotated_lines = [
    (line, (dataset, scores, get_deltas(dataset, scores, baselines)))
    for line, (dataset, scores) in parsed_lines
]

# Append the formatted columns; lines without a dataset are left as they are.
text = [line + print_deltas(row) for line, row in annotated_lines]
text_out = '\n'.join(text)

print(text_out)



LightGBM:                                     mean t_int_l t_int_h    std                           values
    Designed features:                                                                                    
        Age group   |   0.626 \pm 0.004   | 0.6323  0.6268  0.6379 0.0040  [0.628 0.630 0.630 0.635 0.638] | 0.632 \pm 0.005 |  0.0% \pm  0.3%
        Gender      |   0.875 \pm 0.004   | 0.8798  0.8751  0.8844 0.0033  [0.874 0.879 0.881 0.882 0.882] | 0.880 \pm 0.004 |  0.0% \pm  0.2%
        Assessment  |   0.591 \pm 0.003   | 0.5909  0.5872  0.5946 0.0027  [0.588 0.588 0.591 0.593 0.594] | 0.591 \pm 0.003 |  0.0% \pm  0.3%
        Retail      |   0.545 \pm 0.001   | 0.5454  0.5439  0.5469 0.0011  [0.544 0.545 0.545 0.546 0.547] | 0.545 \pm 0.001 |  0.0% \pm  0.1%
    CPC embeddings:         
        Age group   |   0.595 \pm 0.004   | 0.5939  0.5900  0.5977 0.0028  [0.590 0.593 0.594 0.595 0.598] | 0.594 \pm 0.004 | -6.0% \pm  0.3%
        Gender      |   0.848 \pm 0.004 

In [8]:
# Show the per-dataset baseline score arrays collected from the first lines of text_in.
baselines

{'Age group': array([0.628, 0.63 , 0.63 , 0.635, 0.638]),
 'Gender': array([0.874, 0.879, 0.881, 0.882, 0.882]),
 'Assessment': array([0.588, 0.588, 0.591, 0.593, 0.594]),
 'Retail': array([0.544, 0.545, 0.545, 0.546, 0.547])}