In [15]:
import pandas as pd
from statistics import mean, median
import sys
import pprint
import pickle

In [16]:
project_ytest_lib = {}

In [17]:
def output_values(Y_data):
    Y_t = []
    for e in Y_data:
        if e == 'passed':
            Y_t.append(1)
        else:
            Y_t.append(0) 
    return Y_t

In [18]:
def get_first_failures(df):
    
    results = df['tr_status'].tolist()
    length = len(results)
    verdict = ['keep']
    prev = results[0]
    
    for i in range(1, length):
        if results[i] == 0:
            if prev == 0:
                verdict.append('discard')
                #print(i+1)
            else:
                verdict.append('keep')
        else:
            verdict.append('keep')
        prev = results[i]
    
    df['verdict'] = verdict
    df = df[ df['verdict'] == 'keep' ]
    df.drop('verdict', inplace=True, axis=1)
    return df

In [19]:
def str_to_list(s):
    if s == '[]':
        return [0]
    l = s[1:-1].split(', ')
    l = [int(x) for x in l]
    return l

In [20]:
def separate_versions(results):
    version_dfs = []
    
    for i in range(1,11):
        ver = results[ results['version']==i]
        version_dfs.append(ver)
    return version_dfs

In [21]:
def get_project_delays(ci, y_test, batch_size):
    
    
    y_test = output_values(y_test)
    
    sbs_list = []
    missed = []
    b = batch_size
    
    i = 0

    while i < len(ci):

        if ci[i] == 0:
            if y_test[i] == 0:
                sbs_list.append(0)

            while len(missed) > 0:
                ind = missed.pop()
                sbs_list.append(i - ind)

            b -= 1
            if b == -1:
                b = batch_size - 1

        if ci[i] == 1:
            if y_test[i] == 0:
                missed.append(i)

        i += 1
    while len(missed) > 0:
            sbs_list.append(i - missed.pop())
    
    return sbs_list
    

In [22]:
def start_result_collection(filename):
    
    global project_ytest_lib
    
    results = pd.read_csv(filename)
    results = results.rename(columns={'project_delays':'sbs_delays', 'batch_delays':'total_batch_delay', 'batch_median':'batch_delays'})
    
    projects = set(results['project'].tolist())
    all_versions = pd.DataFrame()
        
    for p in projects:
        
        p_name = p.split('/')[1]
        #getting project data
        p_data = results[ results['project']==p]
        pframe = pd.DataFrame()
        
        #splitting data into versions
        versions = separate_versions(p_data)
        
        for start in range(0,10):
            if len(versions[start]) > 0:
                pframe = versions[start]
                break
        
        #starting with the first version's project frame
        for x in range(len(pframe)):
            row = pframe.iloc[x]            
            alg = row['algorithm']
            b = row['batch_size']
            conf = row['confidence']
            
            if p in project_ytest_lib:
                y_test = project_ytest_lib[p]
            else:
                test_file = '../data/25_1_travis_data/' + p
                y_test = pd.read_csv(test_file, usecols=['tr_build_id', 'tr_status'])
                project_ytest_lib[p] = y_test
                
            
            index_file = 'datasets/' + p_name + '_' + str(row['version']) + '.pkl'
            with open(index_file, 'rb') as infile:
                test_indexes = pickle.load(infile)
            
            ver_xtest = y_test [ y_test['tr_build_id'].isin(test_indexes)]
            ver_ytest = ver_xtest['tr_status'].tolist()
                
            
            final_batch_delays = str_to_list(row['batch_delays'])
            final_ci = str_to_list(row['ci'])
            final_sbs_delays = get_project_delays(final_ci, ver_ytest, b)
            
            
            #appending other frames to outer frame
            for i in range(start+1,10):
                next_ver = versions[i]
                
                #extracting corresponding outer row for each version
                new_df = next_ver[ (next_ver['algorithm']==alg) & (next_ver['batch_size']==b) & (next_ver['confidence']==conf)]
                
                if len(new_df) > 0:
                    new_row = new_df.iloc[0]
                else:
                    continue
                
                total_reqd_builds = (row['project_reqd_builds']*row['testall_size']) + (new_row['project_reqd_builds']*new_row['testall_size'])
                total_missed_builds = (row['project_missed_builds']*row['testall_size']) + (new_row['project_missed_builds']*new_row['testall_size'])
                total_saved_builds = (row['project_saved_builds']*row['testall_size']) + (new_row['project_saved_builds']*new_row['testall_size'])
                total_size = row['testall_size'] + new_row['testall_size']
                
                row['project_reqd_builds'] = total_reqd_builds/total_size
                row['project_missed_builds'] = total_missed_builds/total_size
                row['project_saved_builds'] = total_saved_builds/total_size
                row['testall_size'] = total_size
                
                index_file = 'datasets/' + p_name + '_' + str(new_row['version']) + '.pkl'
                with open(index_file, 'rb') as infile:
                    test_indexes = pickle.load(infile)
                            
                ver_xtest = y_test [ y_test['tr_build_id'].isin(test_indexes)]
                ver_ytest = ver_xtest['tr_status'].tolist()
                
                new_ci = str_to_list(new_row['ci'])
                final_sbs_delays.extend(get_project_delays(new_ci, ver_ytest, b))
                final_batch_delays.extend(str_to_list(new_row['batch_delays']))
                final_ci.extend(new_ci)
                
                row['sbs_delays'] = final_sbs_delays
                row['batch_delays'] = final_batch_delays
                row['ci'] = final_ci
                                                            
            pframe.iloc[x] = row
        all_versions = all_versions.append(pframe)
    
    return all_versions

In [23]:
filenames = ['all_timeout_results.csv']
file_root = './'
dfs = []

for file in filenames[:1]:
    dfs.append(start_result_collection(file_root+file))

#combining dfs
final_df = pd.DataFrame()
final_df = dfs[0]

for i in range(1, len(dfs)):
    final_df = final_df.append(dfs[i])

final_df.to_csv('all_combined_results.csv')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cacher_needs_updating = self._check_is_chained_assignment_possible()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, v, pi)


In [24]:
final_df = final_df.loc[:, ~final_df.columns.str.contains('^Unnamed')]

In [25]:
final_df.to_csv('all_combined_results.csv')

In [26]:
final_df

Unnamed: 0,version,project,algorithm,batch_size,confidence,project_reqd_builds,project_missed_builds,project_saved_builds,sbs_delays,testall_size,total_batch_delay,batch_delays,ci
53181,1,floragunncom-search-guard/floragunncom-search-...,BATCHBISECT,1,2,55.629506,0.110926,44.370494,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",3606,0.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, ..."
53182,1,floragunncom-search-guard/floragunncom-search-...,BATCHBISECT,1,3,39.462008,2.468109,60.537992,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, ...",3606,0.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, ..."
53183,1,floragunncom-search-guard/floragunncom-search-...,BATCHBISECT,1,4,33.361065,0.388242,66.638935,"[0, 0, 0, 0, 0, 1, 2, 3, 0, 0, 0, 0, 0, 0, 0, ...",3606,0.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, ..."
53184,1,floragunncom-search-guard/floragunncom-search-...,BATCHBISECT,1,5,26.067665,4.104271,73.932335,"[0, 0, 0, 0, 0, 1, 2, 0, 0, 1, 0, 0, 0, 1, 0, ...",3606,0.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, ..."
53185,1,floragunncom-search-guard/floragunncom-search-...,BATCHBISECT,1,6,22.878536,4.215197,77.121464,"[0, 0, 0, 0, 0, 1, 2, 3, 0, 1, 2, 0, 0, 0, 0, ...",3606,0.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
73867,1,ngageoint-geowave/ngageoint-geowave.csv,BATCHSTOP4,16,16,46.772132,3.020134,24.496644,"[7, 8, 9, 10, 11, 12, 0, 0, 0, 0, 0, 0, 0, 0, ...",6258,3600.0,"[15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
73868,1,ngageoint-geowave/ngageoint-geowave.csv,BATCHSTOP4,16,17,41.099393,4.202621,27.372963,"[8, 9, 10, 11, 12, 13, 0, 0, 0, 0, 0, 0, 0, 0,...",6258,3600.0,"[15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
73869,1,ngageoint-geowave/ngageoint-geowave.csv,BATCHSTOP4,16,18,45.078300,3.835091,27.085331,"[9, 10, 11, 12, 13, 14, 0, 0, 0, 0, 0, 0, 0, 0...",6258,3486.0,"[15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
73870,1,ngageoint-geowave/ngageoint-geowave.csv,BATCHSTOP4,16,19,42.154043,4.426334,28.331735,"[10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, ...",6258,3480.0,"[15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
