In [1]:
import pandas as pd
from statistics import median
from projects import project_list

In [2]:
# Read the results table and run the builds_saved column through pd.to_numeric
# so that the cells below can compare and subtract its values.
data = pd.read_csv('ci_skip_all_results.csv')
data = data.assign(builds_saved=pd.to_numeric(data['builds_saved']))

In [3]:
# Keep only the rows whose update_method is not 'half_exp'; every frame derived
# from `data` below therefore excludes that update method.
data = data[data['update_method'] != 'half_exp']

In [4]:
# Boolean masks over `data`: one per value of the ci_skip column, one per method.
skip_off = data['ci_skip'] == 0
skip_on = data['ci_skip'] == 1
is_new_dynamic = data['method'] == 'new_dynamic'
is_static = data['method'] == 'baseline_static'
is_base_dynamic = data['method'] == 'baseline_dynamic'

# Rows with ci_skip == 0, one frame per method.
new_d = data[is_new_dynamic & skip_off]
static = data[is_static & skip_off]
base_d = data[is_base_dynamic & skip_off]

# Rows with ci_skip == 1, one frame per method.
ci_skip_lwd = data[is_new_dynamic & skip_on]
ci_skip_bd = data[is_base_dynamic & skip_on]
ci_skip_bs = data[is_static & skip_on]

In [5]:
# Project names, algorithm names and update methods iterated over by the cells below.
#
# NOTE(review): this cell used to read a frame called `ci_skip`, which no cell
# defines (the split cell creates ci_skip_lwd / ci_skip_bd / ci_skip_bs), so it
# raised NameError on a fresh kernel. ci_skip_lwd (method == 'new_dynamic',
# ci_skip == 1) is used here because `types` is later applied to the new_dynamic
# rows -- confirm this is the intended source.
projects = list(set(ci_skip_lwd['project'].tolist()))
algorithms = ['BATCHBISECT', 'BATCHSTOP4', 'BATCHDIVIDE4']
types = set(ci_skip_lwd['update_method'].tolist())

In [6]:
# Accumulator for the builds_saved gains gathered by the comparison cells below.
higher_ci_skip = list()

In [18]:
# For the new_dynamic rows, collect how much larger builds_saved is with
# ci_skip == 1 (ci_skip_lwd) than with ci_skip == 0 (new_d). Only the rows
# where the ci_skip == 1 value is strictly larger contribute.
#
# NOTE(review): the cell used to read an undefined name `ci_skip` (NameError on
# a fresh kernel). ci_skip_lwd is used because it is the ci_skip == 1
# counterpart of new_d -- confirm this is the intended comparison.
higher_ci_skip = []  # reset here so re-running the cell does not append duplicates

for alg in algorithms:
    wo_alg = new_d[new_d['algorithm'] == alg]
    w_alg = ci_skip_lwd[ci_skip_lwd['algorithm'] == alg]

    for t in types:
        # Sort on factor as well as project: an update method can have several
        # factors, and sorting on project alone does not fix the relative order
        # of rows that share a project, so positional pairing could mix factors.
        wo_t = wo_alg[wo_alg['update_method'] == t].sort_values(by=['project', 'factor'])
        w_t = w_alg[w_alg['update_method'] == t].sort_values(by=['project', 'factor'])

        # Rows are paired by position, so both sides must list the same projects
        # in the same order; otherwise the differences would be meaningless.
        if wo_t['project'].tolist() != w_t['project'].tolist():
            raise ValueError(
                'rows with and without ci_skip do not line up for '
                'algorithm={!r}, update_method={!r}'.format(alg, t)
            )

        paired = zip(wo_t['builds_saved'].tolist(), w_t['builds_saved'].tolist())
        for wo_saved, w_saved in paired:
            if w_saved > wo_saved:
                higher_ci_skip.append(w_saved - wo_saved)

In [None]:
# For the baseline_dynamic rows, collect how much larger builds_saved is with
# ci_skip == 1 (ci_skip_bd) than with ci_skip == 0 (base_d). Only the rows
# where the ci_skip == 1 value is strictly larger contribute.
#
# NOTE(review): the cell used to read an undefined name `ci_skip` (NameError on
# a fresh kernel). ci_skip_bd is used because it is the ci_skip == 1
# counterpart of base_d -- confirm this is the intended comparison.
higher_ci_skip = []

for alg in algorithms:
    wo_alg = base_d[base_d['algorithm'] == alg]
    w_alg = ci_skip_bd[ci_skip_bd['algorithm'] == alg]

    for t in types:
        # Sort on factor as well as project so that rows sharing a project are
        # ordered the same way on both sides before they are paired by position.
        wo_t = wo_alg[wo_alg['update_method'] == t].sort_values(by=['project', 'factor'])
        w_t = w_alg[w_alg['update_method'] == t].sort_values(by=['project', 'factor'])

        # Rows are paired by position, so both sides must list the same projects
        # in the same order; otherwise the differences would be meaningless.
        if wo_t['project'].tolist() != w_t['project'].tolist():
            raise ValueError(
                'rows with and without ci_skip do not line up for '
                'algorithm={!r}, update_method={!r}'.format(alg, t)
            )

        paired = zip(wo_t['builds_saved'].tolist(), w_t['builds_saved'].tolist())
        for wo_saved, w_saved in paired:
            if w_saved > wo_saved:
                higher_ci_skip.append(w_saved - wo_saved)
                

In [20]:
# Median of the positive builds_saved gains currently held in higher_ci_skip
# (filled by the comparison cell above). statistics.median raises
# StatisticsError when the list is empty.
median(higher_ci_skip)

0.6743365310014369

In [8]:
# Maps each algorithm name to its list of builds_saved differences.
differences = dict()

In [9]:
# For each algorithm, gather builds_saved differences (new_d minus static) for
# every static batch size combined with every update method, split per factor
# when an update method has more than one. Both sides are sorted by project and
# paired by position.
for alg in algorithms:
    differences[alg] = []
    static_rows = static[static['algorithm'] == alg]
    dynamic_rows = new_d[new_d['algorithm'] == alg]
    batch_sizes = set(static_rows['batch_size'].tolist())

    for b in batch_sizes:
        # The static side depends only on the batch size, so sort it once here.
        static_for_batch = static_rows[static_rows['batch_size'] == b]
        static_saved = static_for_batch.sort_values(by='project', ascending=True)['builds_saved'].tolist()

        for t in types:
            method_rows = dynamic_rows[dynamic_rows['update_method'] == t]
            factors = set(method_rows['factor'].tolist())

            # Several factors: compare each factor's rows on their own.
            # Otherwise: compare all rows of the update method in one go.
            if len(factors) > 1:
                groups = [method_rows[method_rows['factor'] == f] for f in factors]
            else:
                groups = [method_rows]

            for group in groups:
                dynamic_saved = group.sort_values(by='project', ascending=True)['builds_saved'].tolist()
                diff = [saved - static_saved[pos] for pos, saved in enumerate(dynamic_saved)]
                differences[alg].extend(diff)
                
        

In [10]:
# Print, for each algorithm, the median of the differences collected above.
for name in algorithms:
    median_difference = median(differences[name])
    print(name, median_difference)

BATCHBISECT 11.945096280638339
BATCHSTOP4 7.758870011821394
BATCHDIVIDE4 4.342687602737854


In [11]:
# Median, across algorithms, of the per-algorithm median differences.
per_algorithm_medians = [median(differences[alg]) for alg in algorithms]
print(median(per_algorithm_medians))

7.758870011821394


In [12]:
# NOTE(review): no visible cell reads or fills this dict; the next cell writes
# to `differences` instead.
bd_differences = dict()

In [13]:
# For each algorithm, gather builds_saved differences between the new_dynamic
# rows with ci_skip == 0 (new_d) and with ci_skip == 1 (ci_skip_lwd), computed
# as new_d minus ci_skip_lwd. Rows are compared per update method, split per
# factor when an update method has more than one.
#
# NOTE(review): the cell used to read an undefined name `ci_skip` (NameError on
# a fresh kernel). ci_skip_lwd is used because it is the ci_skip == 1
# counterpart of new_d -- confirm this is the intended comparison.
# NOTE(review): results go into `differences`, replacing the lists filled by the
# earlier static-baseline cell; the printing cell that follows reads them from
# there, so the target dict is left unchanged.
for alg in algorithms:
    differences[alg] = []
    new_alg = new_d[new_d['algorithm'] == alg]
    ci_alg = ci_skip_lwd[ci_skip_lwd['algorithm'] == alg]

    for t in types:
        new_t = new_alg[new_alg['update_method'] == t]
        ci_t = ci_alg[ci_alg['update_method'] == t]

        factors = set(new_t['factor'].tolist())

        # Several factors: compare each factor's rows on their own.
        # Otherwise: compare all rows of the update method in one go.
        if len(factors) > 1:
            pairs = [(new_t[new_t['factor'] == f], ci_t[ci_t['factor'] == f]) for f in factors]
        else:
            pairs = [(new_t, ci_t)]

        for new_rows, ci_rows in pairs:
            new_rows = new_rows.sort_values(by='project', ascending=True)
            ci_rows = ci_rows.sort_values(by='project', ascending=True)

            # Rows are paired by position, so both sides must list the same
            # projects; fail loudly instead of truncating or mis-pairing.
            if new_rows['project'].tolist() != ci_rows['project'].tolist():
                raise ValueError(
                    'rows with and without ci_skip do not line up for '
                    'algorithm={!r}, update_method={!r}'.format(alg, t)
                )

            new_saved = new_rows['builds_saved'].tolist()
            ci_saved = ci_rows['builds_saved'].tolist()
            differences[alg].extend(
                [without - with_skip for without, with_skip in zip(new_saved, ci_saved)]
            )
                
        

In [14]:
# Print the median of the current `differences` list of every algorithm.
for name in algorithms:
    values = differences[name]
    print(name, median(values))

BATCHBISECT 0.16155572812684582
BATCHSTOP4 0.13064713910889925
BATCHDIVIDE4 0.11798173630899811


In [15]:
# NOTE(review): no visible cell reads or fills this dict; the next cell writes
# to `differences` instead.
ciskip_differences = dict()

In [16]:
# For each algorithm, gather builds_saved differences (new_d minus base_d) per
# update method, split per factor when an update method has more than one.
# Both sides are sorted by project and paired by position.
#
# NOTE(review): results overwrite the per-algorithm lists in `differences`, and
# no visible later cell reads them -- confirm which dict was intended.
for alg in algorithms:
    differences[alg] = []
    baseline_rows = base_d[base_d['algorithm'] == alg]
    dynamic_rows = new_d[new_d['algorithm'] == alg]
    batch_sizes = set(baseline_rows['batch_size'].tolist())  # not used in this cell

    # The baseline side is the same for every update method, so sort it once.
    baseline_saved = baseline_rows.sort_values(by='project', ascending=True)['builds_saved'].tolist()

    for t in types:
        method_rows = dynamic_rows[dynamic_rows['update_method'] == t]
        factors = set(method_rows['factor'].tolist())

        # Several factors: compare each factor's rows on their own.
        # Otherwise: compare all rows of the update method in one go.
        if len(factors) > 1:
            groups = [method_rows[method_rows['factor'] == f] for f in factors]
        else:
            groups = [method_rows]

        for group in groups:
            dynamic_saved = group.sort_values(by='project', ascending=True)['builds_saved'].tolist()
            diff = [saved - baseline_saved[pos] for pos, saved in enumerate(dynamic_saved)]
            differences[alg].extend(diff)

In [22]:
# Per algorithm: print the median builds_saved of base_d, then the median of
# new_d for every update method (one line per factor when there are several;
# a single line labelled -1 otherwise).
for alg in algorithms:
    print(alg)
    dynamic_rows = new_d[new_d['algorithm'] == alg]
    baseline_saved = base_d[base_d['algorithm'] == alg]['builds_saved'].tolist()
    print('baseline_dynamic \t\t\t', median(baseline_saved))

    for t in types:
        method_rows = dynamic_rows[dynamic_rows['update_method'] == t]
        factors = set(method_rows['factor'].tolist())

        if len(factors) <= 1:
            # Zero or one factor value: report the update method as a whole.
            print(t, '\t\t\t', -1, '\t\t', median(method_rows['builds_saved'].tolist()))
            continue

        for f in factors:
            factor_saved = method_rows[method_rows['factor'] == f]['builds_saved'].tolist()
            print(t, '\t\t\t', f, '\t\t', median(factor_saved))

BATCHBISECT
baseline_dynamic 			 39.66995847022551
random_random 			 -1 		 41.58598432457354
random_exponential 			 -1 		 38.88903749549511
linear 			 1.0 		 41.556822032250714
linear 			 2.0 		 42.064758662269035
linear 			 3.0 		 40.22059084299748
linear 			 4.0 		 41.22431122124572
stagger 			 2.0 		 40.45997801184524
stagger 			 3.0 		 41.29543057547525
random_linear 			 -1 		 38.580345426818454
stagger_mfu 			 2.0 		 26.873267665460183
stagger_mfu 			 3.0 		 34.430115185693275
exponential 			 2.0 		 39.67088697379154
exponential 			 3.0 		 41.612943645968194
BATCHSTOP4
baseline_dynamic 			 44.3179241984737
random_random 			 -1 		 46.950452777292746
random_exponential 			 -1 		 46.63385243394134
linear 			 1.0 		 47.570909220448556
linear 			 2.0 		 48.639020825721595
linear 			 3.0 		 47.342525357307515
linear 			 4.0 		 49.902144307983434
stagger 			 2.0 		 46.48479404388709
stagger 			 3.0 		 47.58284345109908
random_linear 			 -1 		 47.40670767848266
stagger_mfu 			 2.0 		 40.3

In [50]:
# Median builds_saved of the new_d rows, one line per algorithm.
for name in algorithms:
    saved = new_d.loc[new_d['algorithm'] == name, 'builds_saved'].tolist()
    print(name, '\t\t', median(saved))

BATCHBISECT 		 39.815217969534224
BATCHSTOP4 		 46.88148340248962
BATCHDIVIDE4 		 40.7501382340958


In [53]:
# Median builds_saved of the static rows, one line per algorithm and batch size.
for alg in algorithms:
    static_rows = static[static['algorithm'] == alg]

    for b in set(static_rows['batch_size'].tolist()):
        saved = static_rows[static_rows['batch_size'] == b]['builds_saved'].tolist()
        print(alg, b, '\t\t', median(saved))

BATCHBISECT 8 		 25.642507871423533
BATCHBISECT 16 		 24.287399629930704
BATCHBISECT 2 		 20.73354451369034
BATCHBISECT 4 		 26.15485172271487
BATCHSTOP4 8 		 38.717668377684795
BATCHSTOP4 16 		 38.2995328478506
BATCHSTOP4 4 		 37.75175249727814
BATCHDIVIDE4 8 		 38.717668377684795
BATCHDIVIDE4 16 		 28.447407757259306
BATCHDIVIDE4 4 		 37.75175249727814


In [15]:
# min_max_db = {}
# db = {}
# for alg in algorithms:
#     print(alg)
#     db[alg] = {}
#     min_max_db[alg] = {}
    
#     skip_alg = ci_skip[ ci_skip['algorithm'] == alg]
#     base_alg = base[ base['algorithm'] == alg]
    
# #     print(base_alg)
    
#     builds = base_alg['builds_saved'].tolist()
#     projects = base_alg['project'].tolist()
    
#     for i in range(len(projects)):
#         db[alg][projects[i]] = [builds[i], 0]
        
#     for t in types:
#         print(t)
#         skip_alg_t = skip_alg[ skip_alg['update_method']==t]
#         skip_alg_t = skip_alg_t.sort_values(by=['project'])
        
# #         print(skip_alg_t)
        
#         builds = skip_alg_t['builds_saved'].tolist()
#         projects = skip_alg_t['project'].tolist()
        
#         for i in range(len(projects)):
#             db[alg][projects[i]][1] = builds[i]
        
#         l = []
        
#         for p in db[alg]:
#             l.append(abs(db[alg][p][0]-db[alg][p][1]))
        
#         projects = list(db[alg].keys())
        
#         print('Min: {} {}'.format(min(l), projects[l.index(min(l))]))
#         print('Max: {} {}'.format(max(l), projects[l.index(max(l))]))
#         min_max_db[alg][t] = [(min(l), projects[l.index(min(l))]), (max(l), projects[l.index(max(l))])]
        
# #         print(db[alg])

In [16]:
# for alg in db:
#     keys = list(db[alg].keys())
#     vals = list(db[alg].values())
    
#     print('For {}, \nMinimum {} = {}, \nMaximum = {} = {}'.format(alg, keys[vals.index(min(vals))], min(vals), keys[vals.index(max(vals))], max(vals)))
    

In [17]:
# med_medians = []

# for alg in algorithms:
    
    
    
#     print(alg)
    
#     skip_alg = ci_skip[ ci_skip['algorithm'] == alg]
#     base_alg = base[ base['algorithm'] == alg]
    
#     batch_sizes = set(base_alg['batch_size'].tolist())
    
#     for b in batch_sizes:
#         print(b)
#         base_alg_b = base_alg[ base_alg['batch_size'] == b]
        
#         builds = base_alg_b['builds_saved'].tolist()
#         base_median = median(builds)
#         print(base_median)
        
#         for t in types:
#             meds = []
#             print(t)
#             skip_alg_t = skip_alg[ skip_alg['update_method']==t]
#             factors = set(skip_alg_t['factor'].tolist())

#             if len(factors) > 1:

#                 for f in factors:

#                     skip_alg_t_f = skip_alg_t[ skip_alg_t['factor'] == f]
#                     builds = skip_alg_t_f['builds_saved'].tolist()
#                     print(len(builds))
#                     skip_t_f_median = median(builds)

#                     print(base_median, skip_t_f_median, skip_t_f_median-base_median)
#                     meds.append(skip_t_f_median-base_median)
#             else:

#                 builds = skip_alg_t['builds_saved'].tolist()
#                 skip_t_median = median(builds)

#                 print(base_median, skip_t_median, skip_t_median-base_median)
#                 meds.append(skip_t_median-base_median)
#             medians.append(meds)