In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
from pathlib import Path
import math

In [3]:
formulaCompressionTestHome = "C:\\Users\\jonat\\Documents\\Dataspread\\FormulaCompressionTest"

In [4]:
def getMiscDict(path):
    dirList = path.split('\\')
    syncType = dirList[len(dirList) - 7]
    memoryStorage = dirList[len(dirList) - 6]
    baseline = dirList[len(dirList) -5]
    if baseline.lower() == 'comp':
        baseline = 'TACO'
    elif baseline.lower() == 'async':
        baseline = 'Antifreeze'
    elif baseline.lower() == 'pgimpl':
        baseline = 'NoComp'
    testSpreadsheet = dirList[len(dirList) - 4]
    run = int(dirList[len(dirList) - 3].split('-')[1])
    rows = int(dirList[len(dirList) - 2].split('-')[1])
    return {'Sync/Async':syncType, 'Memory Storage':memoryStorage, 'Baseline':baseline,
            'Test Spreadsheet':testSpreadsheet, 'Run':run, 'Rows':rows}

def getMetrics(path):
    """
    Takes in a path string to a .stat file and returns a Pandas DataFrame of the file's contents
    """
    reportsFile = open(path, 'r')
    dataDict = {}
    dataDict = getMiscDict(path)
    for line in reportsFile:
        if ': ' in line:
            metric, value = line.split(": ")
            if metric not in dataDict:
                dataDict[metric] = []
            try:
                dataDict[metric].append(int(value))
            except ValueError:
                dataDict[metric].append(float(value))
            except:
                dataDict[metric].append(value)
    reportsFile.close
    
    return pd.DataFrame(dataDict)


def getAllMetrics(basePath, miscDict={'sync type':'', 'mem type':'', 'impl':'', 'test':'', 'run':0}):
    """
    Takes in a file path and searches all of its subdirectories for .stat files and returns all of
    their contents in a Pandas DataFrame
    """
    data = pd.DataFrame()
    for file in os.listdir(basePath):
        filePath = os.path.join(basePath, file)
        if os.path.isdir(filePath):
            data = data.append(getAllMetrics(filePath))
        elif file == "core.stat":
            data = data.append(getMetrics(filePath))
    return data


def plotData(data, 
             researchQuestion,
             plotFunc,
             relevantColumns,
             sheets=['Rate', 'RunningTotalFast', 'RunningTotalSlow'], 
             memStorage = ['DB', 'IN_MEM', 'MEM'],
             syncMethod=['async', 'sync'],
             save=False,
             plotHomePath='../plots/Dixin-Exp'):
    plt.rcParams.update({'font.size': 23, 'figure.figsize': (12,8), 'lines.linewidth':3, 'font.family':'serif'})
    for sheet in sheets:
        for sync in syncMethod:
            for mem in memStorage:
                dataSlice = pd.DataFrame()
                try:
                    dataSlice = data.loc[sheet, mem, sync]
                except:
                    print('No data for (' + sheet + ', ' + mem + ', ' + sync + ')')
                
                if len(dataSlice > 0):
                    relevantData = pd.pivot_table(dataSlice, 
                                              index='Rows', 
                                              columns=dataSlice.index)[[relevantColumns]][relevantColumns]
                    relevantData = relevantData[relevantData.columns.tolist()[-2:] + relevantData.columns.tolist()[:-2]]
                    splitData = [relevantData[relevantData.index < 100000], relevantData[relevantData.index >= 100000]]
                    if len(splitData[0]) > 0:
                        plotFunc(splitData[0], researchQuestion, sheet, mem, sync, False, save)
                    if len(splitData[1]) > 0:
                        plotFunc(splitData[1], researchQuestion, sheet, mem, sync, True, save)
                        
def findYMax(data):
    digits = 0
    maxTime = 0
    for column in data.columns:
        maxTime = max(maxTime, data[column].max())
    digitTracker = maxTime
    while digitTracker > 10:
        digits += 1
        digitTracker = digitTracker // 10
    return math.ceil(maxTime / 10 ** digits) * 10**digits + 10 ** (digits - 1)
        
def checkForAntifreeze(df):
    return len(df['Antifreeze']) - df['Antifreeze'].isnull().sum()


def savePlot(plot, homePath, rq, sheet, mem, big):
    filePath = homePath+rq+'/'+sheet+'/'+mem+'/'
    Path(filePath).mkdir(parents=True, exist_ok=True)
    if big:
        filePath += sheet+'_'+mem+'_big.pdf'
    else:
        filePath += sheet+'_'+mem+'_small.pdf'
    plt.savefig(filePath, bbox_inches='tight')
    print('Plot saved to ' + filePath)

In [5]:
#RQ1
def plotTotalTime(plotData, researchQuestion, sheet, mem, sync, big, save):
    plt.figure()
    display(plotData)
    numAntifreeze = checkForAntifreeze(plotData)
    if numAntifreeze == 0:
        plotData.drop(columns='Antifreeze', inplace=True)
    elif numAntifreeze < 5:
        'label last point'
    
    lineplt = sns.lineplot(markers=True, markersize=20, ci=None, dashes=False, alpha=.5, data=plotData)
    lineplt.yaxis.grid()
    lineplt.legend(markerscale=3)
    for index, label in enumerate(lineplt.get_xticklabels()):
        if index % 2 == 0:
            label.set_visible(False)
        else:
            label.set_visible(True)
    lineplt.set_xticklabels(['{:,.0f}'.format(x) + 'K' for x in lineplt.get_xticks()/1000])
    lineplt.set_ybound(upper=findYMax(plotData))
    if not big and sheet=='RunningTotalSlow':
        lineplt.set_ybound(upper=1000)
    if 0 < numAntifreeze < 5:
        row = plotData.iloc[numAntifreeze - 1]
        lineplt.text(x=int(row.name) + lineplt.get_xlim()[1] * .015, 
                       y=plotData.loc[row.name, 'Antifreeze'] + ((lineplt.get_ylim()[1] - lineplt.get_ylim()[0]) * .03),
                       fontsize=20,
                       s='Did not\nfinish after')
    handles, labels = lineplt.get_legend_handles_labels()
    leg = lineplt.legend(
        bbox_to_anchor=(0, 0, 1, 1),
        loc="upper left",
        handles=handles[0:], 
        labels=labels[0:],
        markerscale=3
        )
    if big: 
        lineplt.set_yticklabels(['{:,.0f}'.format(x) + 'K' for x in lineplt.get_yticks()/1000])
    lineplt.set_ylabel('Time of returning control (ms)')
    lineplt.set_xlabel('Number of Rows ')
    if save: 
        savePlot(lineplt, '../plots/Dixin-Exp/', researchQuestion, sheet, mem, big)
        plt.close()
        
        
def plotBars(testData, numRows, big, save=False):
    plt.rcParams.update({'font.size': 40, 'figure.figsize': (12,8), 'lines.linewidth':3, 'font.family':'serif'})
    relevantData = testData[testData['Rows'] == numRows]
    try:
        relevantData = relevantData.loc[:, 'MEM', 'async']
    except:
        relevantData = relevantData.loc[:, 'IN_MEM', 'async']
    relevantData = relevantData[['Total time to update cells (ms)']]
    relevantData = relevantData.reset_index(level=['Test Spreadsheet', 'Baseline']).pivot(index='Test Spreadsheet', columns='Baseline', values='Total time to update cells (ms)')
    relevantData = relevantData[relevantData.columns.tolist()[-2:] + relevantData.columns.tolist()[:-2]]
    display(relevantData)
    ax = relevantData.plot.bar()
    yMax = 200
    ax.set_ylabel('Time of returning control (ms)')
    ax.set_xlabel('')
    ax.set_xticklabels(['Rate', 'Fast', 'Slow'])
    ax.set_ybound(upper=yMax)
    ax.yaxis.grid()
    for label in ax.get_xticklabels():
        label.set_rotation(0)
    for bar in ax.patches:
        if bar.get_height() > yMax:
            ax.annotate('{:.1f}'.format(bar.get_height() / 1000) + '\nsec',
                        (bar.get_x() + bar.get_width() * 1.7, yMax * .9), ha='center', va='center',
                        size=23, xytext=(0, -8),
                        textcoords='offset points',
                        fontsize=40)
    handles, labels = ax.get_legend_handles_labels()
    leg = ax.legend(
        bbox_to_anchor=(.025, .1, 1, 1),
        loc="lower right",
        handles=handles[0:], 
        labels=labels[0:],
        markerscale=3
        )
    
    if save:
        filePath = '../plots/Dixin-Exp/rq1/MemBarPlots/'
        Path(filePath).mkdir(parents=True, exist_ok=True)
        if big:
            plt.savefig(filePath + 'mem_' + 'big.pdf', bbox_inches="tight")
        else:
            plt.savefig(filePath + 'mem_' + 'small.pdf', bbox_inches="tight")
        plt.close()
        
        
formulaCompressionTestHome = "C:\\Users\\jonat\\Documents\\Dataspread\\FormulaCompressionTest"
testPath = formulaCompressionTestHome + "\\experiments_data\\rq1\\report"
pd.set_option('max_rows', 115)
testData = getAllMetrics(testPath)
testData = testData.groupby(['Test Spreadsheet', 'Memory Storage', 'Sync/Async', 'Baseline', 'Rows']).mean()
testData = testData.reset_index(level='Rows')
#display(testData)
plotData(testData, 'rq1', plotTotalTime, 'Total time of getting dependents (ms)', memStorage=['DB'], save=True)

plotBars(testData, 10000, False, save=True)
plotBars(testData, 500000, True, save=True)

Baseline,NoComp,TACO,Antifreeze
Rows,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
5000,67.666667,1.0,1.0
10000,125.333333,1.666667,1.0
15000,192.0,1.666667,1.0
20000,253.666667,1.333333,1.0
25000,326.333333,1.333333,


Plot saved to ../plots/Dixin-Exp/rq1/Rate/DB/Rate_DB_small.pdf


  lineplt.set_xticklabels(['{:,.0f}'.format(x) + 'K' for x in lineplt.get_xticks()/1000])


Baseline,NoComp,TACO,Antifreeze
Rows,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
100000,979.0,1.0,
200000,2058.0,1.0,
300000,3160.0,1.333333,
400000,4298.0,1.0,
500000,5300.0,1.333333,


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


Plot saved to ../plots/Dixin-Exp/rq1/Rate/DB/Rate_DB_big.pdf
No data for (Rate, DB, sync)


  lineplt.set_xticklabels(['{:,.0f}'.format(x) + 'K' for x in lineplt.get_xticks()/1000])
  lineplt.set_yticklabels(['{:,.0f}'.format(x) + 'K' for x in lineplt.get_yticks()/1000])


Baseline,NoComp,TACO,Antifreeze
Rows,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
5000,299.0,1.333333,1.0
10000,579.666667,1.333333,1.0
15000,871.0,1.666667,1.0
20000,1031.0,2.0,
25000,1329.666667,2.0,


  lineplt.set_xticklabels(['{:,.0f}'.format(x) + 'K' for x in lineplt.get_xticks()/1000])


Plot saved to ../plots/Dixin-Exp/rq1/RunningTotalFast/DB/RunningTotalFast_DB_small.pdf


Baseline,NoComp,TACO,Antifreeze
Rows,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
100000,11415.0,1.333333,
200000,14268.0,2.0,
300000,16651.0,1.666667,
400000,24765.0,2.0,
500000,31623.0,1.666667,


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
  lineplt.set_xticklabels(['{:,.0f}'.format(x) + 'K' for x in lineplt.get_xticks()/1000])
  lineplt.set_yticklabels(['{:,.0f}'.format(x) + 'K' for x in lineplt.get_yticks()/1000])


Plot saved to ../plots/Dixin-Exp/rq1/RunningTotalFast/DB/RunningTotalFast_DB_big.pdf
No data for (RunningTotalFast, DB, sync)


Baseline,NoComp,TACO,Antifreeze
Rows,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
5000,162.666667,1.0,1.0
10000,286.333333,1.0,1.0
15000,463.0,1.0,1.0
20000,636.333333,1.0,
25000,811.333333,1.0,


  lineplt.set_xticklabels(['{:,.0f}'.format(x) + 'K' for x in lineplt.get_xticks()/1000])


Plot saved to ../plots/Dixin-Exp/rq1/RunningTotalSlow/DB/RunningTotalSlow_DB_small.pdf


Baseline,NoComp,TACO,Antifreeze
Rows,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
100000,3392.333333,1.333333,
200000,5882.666667,1.0,
300000,8969.666667,1.0,
400000,13466.333333,1.0,
500000,18014.333333,1.0,


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
  lineplt.set_xticklabels(['{:,.0f}'.format(x) + 'K' for x in lineplt.get_xticks()/1000])
  lineplt.set_yticklabels(['{:,.0f}'.format(x) + 'K' for x in lineplt.get_yticks()/1000])


Plot saved to ../plots/Dixin-Exp/rq1/RunningTotalSlow/DB/RunningTotalSlow_DB_big.pdf
No data for (RunningTotalSlow, DB, sync)


Baseline,NoComp,TACO,Antifreeze
Test Spreadsheet,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Rate,162.0,2.0,1.0
RunningTotalFast,142.0,2.666667,1.666667
RunningTotalSlow,155.666667,2.0,1.333333


Baseline,NoComp,TACO
Test Spreadsheet,Unnamed: 1_level_1,Unnamed: 2_level_1
Rate,10292.666667,2.666667
RunningTotalFast,6606.333333,2.333333
RunningTotalSlow,9723.333333,3.666667


In [6]:
#RQ2
def plotBatchLoadingTime(plotData, researchQuestion, sheet, mem, sync, big, save):
    display(plotData)
    plt.figure()
    yMax = findYMax(plotData[['NoComp', 'TACO']])
    if sheet == 'RunningTotalFast':
        yMax = 80000
    numAntifreeze = checkForAntifreeze(plotData)
    if numAntifreeze == 0:
        plotData.drop(columns='Antifreeze', inplace=True)
    cappedData = plotData.copy()
    for column in plotData.columns:
        cappedData[column] = plotData[column].map(lambda x: min(x, yMax * .95))
    batchPlot = sns.lineplot(markers=True, markersize=20, ci=None, dashes=False, alpha=.5, color="black", data=cappedData)

    for index, label in enumerate(batchPlot.get_xticklabels()):
        if index % 2 == 0:
            label.set_visible(False)
        else:
            label.set_visible(True)
    batchPlot.set_xticklabels(['{:,.0f}'.format(x) + 'K' for x in batchPlot.get_xticks()/1000])
    if 0 < numAntifreeze < 5:
        row = plotData.iloc[numAntifreeze - 1]
        batchPlot.text(x=int(row.name) + batchPlot.get_xlim()[1] * .015, 
                       y=cappedData.loc[row.name, column] - (batchPlot.get_ylim()[1] + batchPlot.get_ylim()[0]) * .045,
                       fontsize=20,
                       s='Did not\nfinish after')
    batchPlot.yaxis.grid()
    batchPlot.set_ybound(upper=yMax)
    xAdjust = - max(plotData.index.values) * .03
    yAdjust = yMax * .1
    xMax = max(plotData.index.values)
    for column in plotData.columns:
        for i, row in plotData[plotData[column] > cappedData[column]].iterrows():
            batchPlot.text(x=int(row.name) + xAdjust, 
                           y=cappedData.loc[row.name, column] - yAdjust, 
                           s='{:,.1f}'.format(row[column] / 60000) + ' min')
    batchPlot.legend(markerscale=3)
    handles, labels = batchPlot.get_legend_handles_labels()
    leg = batchPlot.legend(
        bbox_to_anchor=(.57, .1, 1, 1),
        loc="lower left",
        handles=handles[0:], 
        labels=labels[0:],
        markerscale=3
        )
    batchPlot.set_yticklabels(['{:.0f}'.format(x) + 'K' for x in batchPlot.get_yticks()/1000])
    batchPlot.set_ylabel('Time of building the graph (ms)')
    batchPlot.set_xlabel('Number of Rows')
    if save: 
        savePlot(batchPlot, '../plots/Dixin-Exp/', researchQuestion, sheet, mem, big)
        plt.close()
        
        
        
        
        
formulaCompressionTestHome = "C:\\Users\\jonat\\Documents\\Dataspread\\FormulaCompressionTest"
testPath = formulaCompressionTestHome + "\\experiments_data\\rq2\\report"
pd.set_option('max_rows', 115)
testData = getAllMetrics(testPath)
testData = testData.groupby(['Test Spreadsheet', 'Memory Storage', 'Sync/Async', 'Baseline', 'Rows']).mean()
testData = testData.reset_index(level='Rows')
#display(testData)

plotData(testData, 'rq2', plotBatchLoadingTime, 'Total time of adding the batch (ms)', memStorage=['DB'], save=True)



Baseline,NoComp,TACO,Antifreeze
Rows,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
5000,8065.333333,37.333333,26888.333333
10000,19865.666667,63.0,94770.333333
15000,30636.666667,65.333333,458252.333333
20000,44446.333333,80.666667,867453.0
25000,56966.666667,89.333333,


Plot saved to ../plots/Dixin-Exp/rq2/Rate/DB/Rate_DB_small.pdf


  batchPlot.set_xticklabels(['{:,.0f}'.format(x) + 'K' for x in batchPlot.get_xticks()/1000])
  batchPlot.set_yticklabels(['{:.0f}'.format(x) + 'K' for x in batchPlot.get_yticks()/1000])


Baseline,NoComp,TACO,Antifreeze
Rows,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
100000,159493.0,192.666667,
200000,299052.0,292.0,
300000,490979.0,393.666667,
400000,596563.0,483.0,
500000,748533.0,499.666667,


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
  batchPlot.set_xticklabels(['{:,.0f}'.format(x) + 'K' for x in batchPlot.get_xticks()/1000])
  batchPlot.set_yticklabels(['{:.0f}'.format(x) + 'K' for x in batchPlot.get_yticks()/1000])


Plot saved to ../plots/Dixin-Exp/rq2/Rate/DB/Rate_DB_big.pdf
No data for (Rate, DB, sync)


Baseline,NoComp,TACO,Antifreeze
Rows,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
5000,9099.0,57.0,236127.3
10000,19512.333333,59.333333,569920.3
15000,34126.0,63.0,1444335.0
20000,43717.333333,86.666667,
25000,64261.666667,90.0,


Plot saved to ../plots/Dixin-Exp/rq2/RunningTotalFast/DB/RunningTotalFast_DB_small.pdf


  batchPlot.set_xticklabels(['{:,.0f}'.format(x) + 'K' for x in batchPlot.get_xticks()/1000])
  batchPlot.set_yticklabels(['{:.0f}'.format(x) + 'K' for x in batchPlot.get_yticks()/1000])


Baseline,NoComp,TACO,Antifreeze
Rows,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
100000,167951.0,167.333333,
200000,324233.0,269.0,
300000,463598.0,390.333333,
400000,647305.0,449.0,
500000,782883.0,462.0,


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
  batchPlot.set_xticklabels(['{:,.0f}'.format(x) + 'K' for x in batchPlot.get_xticks()/1000])


Plot saved to ../plots/Dixin-Exp/rq2/RunningTotalFast/DB/RunningTotalFast_DB_big.pdf
No data for (RunningTotalFast, DB, sync)


  batchPlot.set_yticklabels(['{:.0f}'.format(x) + 'K' for x in batchPlot.get_yticks()/1000])


Baseline,NoComp,TACO,Antifreeze
Rows,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
5000,4296.666667,26.666667,130542.7
10000,8718.666667,34.666667,348346.3
15000,13056.0,34.0,1558271.0
20000,19624.0,34.0,
25000,26527.0,65.333333,


  batchPlot.set_xticklabels(['{:,.0f}'.format(x) + 'K' for x in batchPlot.get_xticks()/1000])


Plot saved to ../plots/Dixin-Exp/rq2/RunningTotalSlow/DB/RunningTotalSlow_DB_small.pdf


  batchPlot.set_yticklabels(['{:.0f}'.format(x) + 'K' for x in batchPlot.get_yticks()/1000])


Baseline,NoComp,TACO,Antifreeze
Rows,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
100000,83338.0,94.333333,
200000,166016.0,126.666667,
300000,229874.0,175.666667,
400000,310085.666667,164.666667,
500000,386317.666667,228.333333,


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
  batchPlot.set_xticklabels(['{:,.0f}'.format(x) + 'K' for x in batchPlot.get_xticks()/1000])
  batchPlot.set_yticklabels(['{:.0f}'.format(x) + 'K' for x in batchPlot.get_yticks()/1000])


Plot saved to ../plots/Dixin-Exp/rq2/RunningTotalSlow/DB/RunningTotalSlow_DB_big.pdf
No data for (RunningTotalSlow, DB, sync)


In [47]:
#RQ3
def plotRQ3(plotData, researchQuestion, sheet, mem, sync, big, save):
    display(plotData)
    plt.figure()
    if big:
        plotData.iloc[1, 0] = float('nan')
        plotData.iloc[2, 0] = float('nan')
    numAntifreeze = checkForAntifreeze(plotData)
    if numAntifreeze == 0:
        plotData.drop(columns='Antifreeze', inplace=True)
    elif numAntifreeze < 5:
        'label last point'
    cappedData = plotData.copy()
    for column in plotData.columns:
        plotData[column] = plotData[column] / 1000
    yMax = 100
    for column in plotData.columns:
        cappedData[column] = plotData[column].map(lambda x: min(x, yMax * .95))
    batchPlot = sns.lineplot(markers=True, markersize=35, ci=None, dashes=False, alpha=.5, color="black", data=cappedData)
    #batchPlot.set_xticklabels(['{:.0f}'.format(x) + 'K' for x in batchPlot.get_xticks()/1000])
    if big:
        batchPlot.set_xticklabels(['{:,.0f}'.format(x) + 'K' for x in range(0, 501, 100)])
    else:
        batchPlot.set_xticklabels(['{:,.0f}'.format(x) + 'K' for x in range(0, 11, 2)])
    batchPlot.yaxis.grid()
    xAdjust = - max(plotData.index.values) * .03
    yAdjust = yMax * .1
    xMax = max(plotData.index.values)
    for column in plotData.columns:
        for i, row in plotData[plotData[column] > cappedData[column]].iterrows():
            xMax = max(plotData.index.values) * 1.14
            batchPlot.text(x=int(row.name) + xAdjust, 
                           y=cappedData.loc[row.name, column] - yAdjust, 
                           s='{:.1f}'.format(row[column] / 60) + ' min',
                          fontsize=25)
    batchPlot.set_yticklabels(['{:.0f}'.format(x) + 'K' for x in batchPlot.get_yticks()])
    batchPlot.set_ybound(upper=yMax)
    batchPlot.set_xbound(upper=xMax)
    batchPlot.legend(markerscale=5)
    batchPlot.set_ylabel('Time of finishing the modification (ms)')
    batchPlot.set_xlabel('Number of Rows Modified')
    if big:
        display(cappedData)
        batchPlot.text(x=100000 + batchPlot.get_xlim()[1] * .03, 
                       y=95.0 - (batchPlot.get_ylim()[1] + batchPlot.get_ylim()[0]) * .045,
                       fontsize=20,
                       s='Did not\nfinish after')
    if save: 
        savePlot(batchPlot, '../plots/Dixin-Exp/', researchQuestion, sheet, mem, big)
        plt.close()


testPath = formulaCompressionTestHome + "\\experiments_data\\rq3\\report"
pd.set_option('max_rows', 115)
testData = getAllMetrics(testPath)
testData = testData.groupby(['Test Spreadsheet', 'Memory Storage', 'Sync/Async', 'Baseline', 'Rows']).mean()
testData = testData.reset_index(level='Rows')

plotData(testData, 'rq3', plotRQ3, 'Total time of refreshing the cache (ms)', sheets=['RefreshCache'], memStorage=['DB', 'MEM'], save=True)

Baseline,NoComp,TACO,Antifreeze
Rows,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2000,7744.0,278.666667,744962.0
4000,11460.0,474.666667,795208.0
6000,27402.0,751.0,844966.0
8000,35325.0,833.666667,877087.0
10000,44308.0,1019.666667,928016.0


Plot saved to ../plots/Dixin-Exp/rq3/RefreshCache/DB/RefreshCache_DB_small.pdf


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  plotData[column] = plotData[column] / 1000
  batchPlot.set_xticklabels(['{:,.0f}'.format(x) + 'K' for x in range(0, 11, 2)])
  batchPlot.set_yticklabels(['{:.0f}'.format(x) + 'K' for x in batchPlot.get_yticks()])


Baseline,NoComp,TACO,Antifreeze
Rows,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
100000,1698297.0,7537.666667,
200000,3818311.0,16265.666667,
300000,6172711.0,21775.333333,
400000,,28923.666667,
500000,,40823.333333,


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value, self.name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-ver

Baseline,NoComp,TACO
Rows,Unnamed: 1_level_1,Unnamed: 2_level_1
100000,95.0,7.537667
200000,,16.265667
300000,,21.775333
400000,,28.923667
500000,,40.823333


Plot saved to ../plots/Dixin-Exp/rq3/RefreshCache/DB/RefreshCache_DB_big.pdf


Baseline,NoComp,TACO,Antifreeze
Rows,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2000,8342.0,298.666667,693942.333333
4000,13624.333333,452.0,821938.333333
6000,23228.666667,644.333333,853950.666667
8000,36259.333333,809.333333,873881.0
10000,42339.0,945.333333,902972.0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  plotData[column] = plotData[column] / 1000
  batchPlot.set_xticklabels(['{:,.0f}'.format(x) + 'K' for x in range(0, 11, 2)])
  batchPlot.set_yticklabels(['{:.0f}'.format(x) + 'K' for x in batchPlot.get_yticks()])


Plot saved to ../plots/Dixin-Exp/rq3/RefreshCache/MEM/RefreshCache_MEM_small.pdf


Baseline,NoComp,TACO,Antifreeze
Rows,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
100000,,9536.666667,
200000,,14229.0,
300000,,22676.0,
400000,,30132.333333,
500000,,69960.666667,


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value, self.name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-ver

Baseline,NoComp,TACO
Rows,Unnamed: 1_level_1,Unnamed: 2_level_1
100000,,9.536667
200000,,14.229
300000,,22.676
400000,,30.132333
500000,,69.960667


Plot saved to ../plots/Dixin-Exp/rq3/RefreshCache/MEM/RefreshCache_MEM_big.pdf
No data for (RefreshCache, DB, sync)
No data for (RefreshCache, MEM, sync)


In [21]:

testPaths = [formulaCompressionTestHome + "\\experiments_data\\rq1\\report",
             formulaCompressionTestHome + "\\experiments_data\\rq2\\report",
             formulaCompressionTestHome + "\\experiments_data\\rq3\\report"]
for testPath in testPaths:
    testData = getAllMetrics(testPath)
    testData = testData.groupby(['Test Spreadsheet', 'Memory Storage', 'Sync/Async', 'Baseline', 'Rows']).mean()
    testData = testData.groupby(['Memory Storage', 'Baseline', 'Rows']).max()
    display(testData)


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Run,Total test time (ms),Number of cells to update,Number of cells updated,Total time to update cells (ms),Total time after the update (ms),Total time of getting dependents (ms),Total time of adding the batch (ms),Total time of refreshing the cache (ms),Area under curve
Memory Storage,Baseline,Rows,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
DB,Antifreeze,5000,2.0,446511.0,5001.0,0.0,1.666667,0.0,1.0,236127.3,203875.7,0.0
DB,Antifreeze,10000,2.0,1113132.0,10001.0,0.0,1.666667,0.0,1.0,569920.3,530616.7,0.0
DB,Antifreeze,15000,2.0,3094977.0,15001.0,0.0,11.0,0.0,1.0,1558271.0,1526966.0,0.0
DB,Antifreeze,20000,2.0,2526406.0,20001.0,0.0,2.0,0.0,1.0,867453.0,1634834.0,0.0
DB,NoComp,5000,2.0,16674.0,5001.0,0.0,307.0,0.0,299.0,9099.0,3.666667,0.0
DB,NoComp,10000,2.0,35318.33,10001.0,0.0,590.666667,0.0,579.666667,19865.67,5.666667,0.0
DB,NoComp,15000,2.0,66158.33,15001.0,0.0,3055.666667,0.0,871.0,34126.0,2.333333,0.0
DB,NoComp,20000,2.0,83505.0,20001.0,0.0,1050.333333,0.0,1031.0,44446.33,8.0,0.0
DB,NoComp,25000,2.0,112602.3,25001.0,0.0,1360.333333,0.0,1329.666667,64261.67,4.0,0.0
DB,NoComp,100000,2.0,315382.0,100001.0,0.0,11526.0,0.0,11415.0,167951.0,35.0,0.0


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Run,Total test time (ms),Number of cells to update,Number of cells updated,Total time to update cells (ms),Total time after the update (ms),Total time of getting dependents (ms),Total time of adding the batch (ms),Total time of refreshing the cache (ms),Area under curve
Memory Storage,Baseline,Rows,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
DB,Antifreeze,5000,2.0,446511.0,5001.0,0.0,1.666667,0.0,1.0,236127.3,203875.7,0.0
DB,Antifreeze,10000,2.0,1113132.0,10001.0,0.0,1.666667,0.0,1.0,569920.3,530616.7,0.0
DB,Antifreeze,15000,2.0,3094977.0,15001.0,0.0,11.0,0.0,1.0,1558271.0,1526966.0,0.0
DB,Antifreeze,20000,2.0,2526406.0,20001.0,0.0,2.0,0.0,1.0,867453.0,1634834.0,0.0
DB,NoComp,5000,2.0,16674.0,5001.0,0.0,640.333333,0.0,632.333333,9099.0,3.666667,0.0
DB,NoComp,10000,2.0,35318.33,10001.0,0.0,590.666667,0.0,579.666667,19865.67,5.666667,0.0
DB,NoComp,15000,2.0,66158.33,15001.0,0.0,3055.666667,0.0,3037.666667,34126.0,2.333333,0.0
DB,NoComp,20000,2.0,83505.0,20001.0,0.0,1050.333333,0.0,1031.0,44446.33,8.0,0.0
DB,NoComp,25000,2.0,112602.3,25001.0,0.0,1360.333333,0.0,1329.666667,64261.67,4.0,0.0
DB,NoComp,100000,2.0,315382.0,100001.0,0.0,11526.0,0.0,11415.0,167951.0,35.0,0.0


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Run,Total test time (ms),Number of cells to update,Number of cells updated,Total time to update cells (ms),Total time after the update (ms),Total time of getting dependents (ms),Total time of adding the batch (ms),Total time of refreshing the cache (ms),Area under curve
Memory Storage,Baseline,Rows,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
DB,Antifreeze,2000,2.0,1186802.0,10001.0,0.0,744963.0,0.0,1.0,428234.0,744962.0,0.0
DB,Antifreeze,4000,2.0,1338883.0,10001.0,0.0,935211.0,0.0,1.0,384130.0,795208.0,0.0
DB,Antifreeze,6000,2.0,1352203.0,10001.0,0.0,844970.0,0.0,1.0,479932.0,844966.0,0.0
DB,Antifreeze,8000,2.0,1232627.0,10001.0,0.0,817090.0,0.0,1.0,389287.0,877087.0,0.0
DB,Antifreeze,10000,2.0,1338157.0,10001.0,0.0,908020.0,0.0,1.0,397890.0,928016.0,0.0
DB,NoComp,2000,2.0,29931.0,10001.0,0.0,8612.0,0.0,858.0,10782.0,7744.0,0.0
DB,NoComp,4000,2.0,35625.0,10001.0,0.0,12275.0,0.0,804.0,8861.0,11460.0,0.0
DB,NoComp,6000,2.0,62787.0,10001.0,0.0,29654.0,0.0,2240.0,9881.0,27402.0,0.0
DB,NoComp,8000,2.0,61186.0,10001.0,0.0,31498.0,0.0,3162.0,8560.0,35325.0,0.0
DB,NoComp,10000,2.0,88001.0,10001.0,0.0,47276.0,0.0,2956.0,10301.0,44308.0,0.0
