In [58]:
import pandas as pd
import os
# All data locations are derived from the directory the notebook is run from.
notebook_home = os.path.abspath('')
lookup_results = f"{notebook_home}/lookup_results"
# Enron experiment outputs (result CSVs) and inputs (dependency references).
enron_output = f"{lookup_results}/enron/output"
enron_input = f"{lookup_results}/enron/input"

In [59]:
def rename_columns(df, is_long=False):
    """Return a new DataFrame with the lookup-metric columns renamed.

    The eight known metric columns (TACO / NoComp, size / time, raw /
    post-processed) are mapped to snake_case names carrying a prefix that
    identifies which result set they came from.

    Args:
        df: DataFrame whose columns should be renamed. It is not modified.
        is_long: When truthy the new names are prefixed with ``"long_"``;
            otherwise they are prefixed with ``"max_"``.

    Returns:
        The result of ``df.rename`` with the prefixed mapping applied.
        Columns that are not part of the mapping keep their names.
    """
    prefix = "long_" if is_long else "max_"

    # Raw CSV header -> unprefixed short name.
    short_names = {
        "TACOLookupSize": "taco_size",
        "TACOLookUpTime": "taco_time",
        "TACOPostProcessedLookupSize": "taco_post_size",
        "TACOPostProcessedLookupTime": "taco_post_time",
        "NoCompLookupSize": "nocomp_size",
        "NoCompLookUpTime": "nocomp_time",
        "NoCompPostProcessedLookupSize": "nocomp_post_size",
        "NoCompPostProcessedLookupTime": "nocomp_post_time",
    }

    prefixed = {raw: prefix + short for raw, short in short_names.items()}
    return df.rename(columns=prefixed)

In [60]:
# Load the four Enron result CSVs and normalise their metric column names.
# is_long selects the "long_" prefix; otherwise the "max_" prefix is used.
df_taco_max = pd.read_csv(f"{enron_output}/enron_max_taco.csv").pipe(rename_columns, is_long=False)
df_taco_long = pd.read_csv(f"{enron_output}/enron_long_taco.csv").pipe(rename_columns, is_long=True)
df_nocomp_max = pd.read_csv(f"{enron_output}/enron_max_nocomp.csv").pipe(rename_columns, is_long=False)
df_nocomp_long = pd.read_csv(f"{enron_output}/enron_long_nocomp.csv").pipe(rename_columns, is_long=True)

In [61]:
# Attach the NoComp measurements to the TACO ones. A left join keeps every
# TACO row even when no NoComp row shares the same file and dependency ref.
df_max = df_taco_max.merge(
    df_nocomp_max,
    how="left",
    on=["fileName", "Max Dep Ref"],
)
df_long = df_taco_long.merge(
    df_nocomp_long,
    how="left",
    on=["fileName", "Longest Dep Ref"],
)

In [62]:
# Row counts of the two merged frames, as a (max, long) pair.
tuple(len(frame) for frame in (df_max, df_long))

(7382, 7381)

In [63]:
# Combine the "max" and "long" results per file. A right join keeps every row
# of df_long; files without a match in df_max get NaN in the max_* columns.
df_merged = df_max.merge(df_long, how="right", on=["fileName"])

In [64]:
# df_merged.to_csv("./enron_result_merged.csv", index=False)

## Analyze Enron results

In [65]:
# Files where TACO was slower than NoComp on the max-dependency lookup,
# restricted to cases whose NoComp time exceeds 10 so that tiny timings
# are excluded (NOTE(review): time unit is not shown here, confirm).
df_larger_than = df_merged.loc[
    (df_merged["max_nocomp_time"] > 10)
    & (df_merged["max_taco_time"] > df_merged["max_nocomp_time"])
]
df_larger_than.loc[:, ["fileName", "max_taco_time", "max_nocomp_time"]]

Unnamed: 0,fileName,max_taco_time,max_nocomp_time
211,matthew_lenhart_000_1_1.pst.1.xls,567.0,483.0
802,vkaminski_000_1_1_1.pst.27.xls,5204.0,130.0
817,mary_fischer_000_1_1.pst.116.xls,24.0,13.0
2265,vkaminski_000_1_1_1.pst.101.xls,26.0,16.0
2510,mary_fischer_000_1_1.pst.114.xls,23.0,16.0
5320,vkaminski_000_1_1_1.pst.113.xls,29.0,14.0
6396,mary_fischer_000_1_1.pst.101.xls,24.0,15.0
7216,vkaminski_000_1_1_1.pst.169.xls,345.0,49.0
7233,jim_schwieger_000_1_1.pst.110.xls,515.0,100.0


In [66]:
# Cut-off applied to the *_time columns in this cell and the ones that follow.
threshold = 500
# Files whose NoComp max-dependency lookup time exceeds the cut-off.
df_merged.loc[df_merged["max_nocomp_time"] > threshold, ["fileName", "max_nocomp_time"]]

Unnamed: 0,fileName,max_nocomp_time
91,mike_grigsby_000_1_1_1.pst.106.xls,1505.0
753,jim_schwieger_000_1_1.pst.100.xls,10466.0
770,phillip_allen_000_1_1.pst.102.xls,1128.0
789,benjamin_rogers_000_1_1.pst.39.xls,4010.0
1289,phillip_allen_000_1_1.pst.253.xls,1121.0
2465,mike_grigsby_000_1_1_1.pst.108.xls,1825.0
4062,jim_schwieger_000_1_1.pst.71.xls,9869.0
4300,harry_arora_000_1_1.pst.130.xls,9636.0
4546,kevin_presto_000_1_1.pst.16.xls,4946.0
5275,vkaminski_001_1_2_1.pst.62.xls,760.0


In [67]:
# Files whose NoComp longest-dependency lookup time exceeds the cut-off.
df_merged.loc[df_merged["long_nocomp_time"] > threshold, ["fileName", "long_nocomp_time"]]

Unnamed: 0,fileName,long_nocomp_time
401,john_griffith_000_1_1.pst.185.xls,4540
753,jim_schwieger_000_1_1.pst.100.xls,8238
789,benjamin_rogers_000_1_1.pst.39.xls,3590
4062,jim_schwieger_000_1_1.pst.71.xls,8807
4300,harry_arora_000_1_1.pst.130.xls,9927
4546,kevin_presto_000_1_1.pst.16.xls,4405
5275,vkaminski_001_1_2_1.pst.62.xls,538
5987,jim_schwieger_000_1_1.pst.106.xls,8946
6161,mark_haedicke_000_1_2.pst.31.xls,873
6982,dutch_quigley_000_1_1.pst.51.xls,9096


In [68]:
# Files whose TACO max-dependency lookup time exceeds the cut-off.
df_merged.loc[df_merged["max_taco_time"] > threshold, ["fileName", "max_taco_time"]]

Unnamed: 0,fileName,max_taco_time
91,mike_grigsby_000_1_1_1.pst.106.xls,628.0
211,matthew_lenhart_000_1_1.pst.1.xls,567.0
770,phillip_allen_000_1_1.pst.102.xls,542.0
802,vkaminski_000_1_1_1.pst.27.xls,5204.0
1289,phillip_allen_000_1_1.pst.253.xls,551.0
2465,mike_grigsby_000_1_1_1.pst.108.xls,537.0
5969,matthew_lenhart_000_1_1.pst.19.xls,539.0
6142,matthew_lenhart_000_1_1.pst.20.xls,508.0
7028,matthew_lenhart_000_1_1.pst.23.xls,579.0
7233,jim_schwieger_000_1_1.pst.110.xls,515.0


In [69]:
# Files whose TACO longest-dependency lookup time exceeds the cut-off.
df_merged.loc[df_merged["long_taco_time"] > threshold, ["fileName", "long_taco_time"]]

Unnamed: 0,fileName,long_taco_time
802,vkaminski_000_1_1_1.pst.27.xls,4839
7233,jim_schwieger_000_1_1.pst.110.xls,609


In [70]:
# Load the per-file dependency reference workbook from the Enron input folder.
df_enron_dep_ref = pd.read_excel(f"{enron_input}/enron_dep_ref.xlsx")

In [71]:
# Display the dependency reference table (pandas truncates long frames when rendering).
df_enron_dep_ref

Unnamed: 0,File name,Max Dep Ref,Max Dep,Longest Dep Ref,Longest Dep
0,benjamin_rogers_000_1_1.pst.124.xls,firstrun33000:D9,1,firstrun33000:F17,1
1,benjamin_rogers_000_1_1.pst.159.xls,Sheet1:G4,1,Sheet1:C4,1
2,benjamin_rogers_000_1_1.pst.197.xls,Sheet1:F27,1,Sheet1:F32,1
3,benjamin_rogers_000_1_1.pst.200.xls,Sheet1:F27,1,Sheet1:F32,1
4,benjamin_rogers_001_1_1.pst.138.xls,capacity:U77,1,capacity:S84,1
...,...,...,...,...,...
7397,matthew_lenhart_000_1_1.pst.23.xls,Power Curve:J5,34481,Curves:A1,268
7398,mike_grigsby_000_1_1_1.pst.108.xls,Power Curve:J5,34481,Curves:A1,268
7399,john_lavorato_001_1_1_1.pst.94.xls,CP Trade Data:F3,34739,CP Trade Data:F2,6952
7400,benjamin_rogers_000_1_1.pst.39.xls,Sheet1:Z1,175496,Sheet1:Z1,184
