In [200]:
import pandas as pd
import os
# Absolute path of the current working directory (abspath of an empty path).
notebook_home = os.path.abspath("")
# Suffix appended to the dataset directory names below (e.g. "enron_old").
old = "_old"
lookup_results = f"{notebook_home}/lookup_results"
# Per-dataset input/output directories underneath lookup_results.
enron_input = f"{lookup_results}/enron{old}/input"
enron_output = f"{lookup_results}/enron{old}/output"
github_input = f"{lookup_results}/github{old}/input"
github_output = f"{lookup_results}/github{old}/output"

In [201]:
def rename_columns(df, is_long=False):
    """Return a copy of ``df`` with the raw lookup-result columns renamed.

    Each recognised raw column (e.g. ``TACOLookupSize``) is mapped to a short
    snake_case name (e.g. ``taco_size``) and prefixed with ``long_`` when
    ``is_long`` is truthy, otherwise with ``max_``. Columns that are not in
    the mapping are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns should be renamed; it is not modified.
    is_long : bool, default False
        Selects the ``long_`` prefix instead of ``max_``.

    Returns
    -------
    pandas.DataFrame
        New frame carrying the renamed columns.
    """
    prefix = "long_" if is_long else "max_"
    short_names = {
        "TACOLookupSize": "taco_size",
        "TACOLookupTime": "taco_time",
        "TACOPostProcessedLookupSize": "taco_post_size",
        "TACOPostProcessedLookupTime": "taco_post_time",
        "NoCompLookupSize": "nocomp_size",
        "NoCompLookupTime": "nocomp_time",
        "NoCompPostProcessedLookupSize": "nocomp_post_size",
        "NoCompPostProcessedLookupTime": "nocomp_post_time",
    }
    prefixed = {raw: prefix + short for raw, short in short_names.items()}
    return df.rename(columns=prefixed)

In [202]:
# Load the eight result CSVs (enron/github x max/long x taco/nocomp) and
# normalise their column names: "max" files get the max_ prefix, "long"
# files the long_ prefix.
df_enron_taco_max = pd.read_csv(f"{enron_output}/enron_max_taco.csv").pipe(rename_columns, is_long=False)
df_enron_taco_long = pd.read_csv(f"{enron_output}/enron_long_taco.csv").pipe(rename_columns, is_long=True)
df_enron_nocomp_max = pd.read_csv(f"{enron_output}/enron_max_nocomp.csv").pipe(rename_columns, is_long=False)
df_enron_nocomp_long = pd.read_csv(f"{enron_output}/enron_long_nocomp.csv").pipe(rename_columns, is_long=True)
df_github_taco_max = pd.read_csv(f"{github_output}/github_max_taco.csv").pipe(rename_columns, is_long=False)
df_github_taco_long = pd.read_csv(f"{github_output}/github_long_taco.csv").pipe(rename_columns, is_long=True)
df_github_nocomp_max = pd.read_csv(f"{github_output}/github_max_nocomp.csv").pipe(rename_columns, is_long=False)
df_github_nocomp_long = pd.read_csv(f"{github_output}/github_long_nocomp.csv").pipe(rename_columns, is_long=True)

## Merge

In [203]:
# Attach the NoComp measurements to the TACO ones for the same file and
# reference; the left join keeps every TACO row.
df_enron_max = df_enron_taco_max.merge(
    df_enron_nocomp_max, how="left", on=["fileName", "Max Dep Ref"]
)
df_enron_long = df_enron_taco_long.merge(
    df_enron_nocomp_long, how="left", on=["fileName", "Longest Dep Ref"]
)

In [204]:
# Row counts of the two merged Enron frames (max, long).
df_enron_max.shape[0], df_enron_long.shape[0]

(7382, 7381)

In [205]:
# Combine the max and long measurements per file. The right join keeps every
# row of df_enron_long; files absent from df_enron_max get NaN in its columns.
df_enron_merged = df_enron_max.merge(df_enron_long, how="right", on=["fileName"])
# Optional export of the merged table (left disabled):
# df_enron_merged.to_csv("./enron_result_merged.csv", index=False)

In [206]:
# Preview the first five merged Enron rows.
df_enron_merged.head(n=5)

Unnamed: 0,fileName,Max Dep Ref,max_taco_size,max_taco_time,max_taco_post_size,max_taco_post_time,max_nocomp_size,max_nocomp_time,max_nocomp_post_size,max_nocomp_post_time,Longest Dep Ref,long_taco_size,long_taco_time,long_taco_post_size,long_taco_post_time,long_nocomp_size,long_nocomp_time,long_nocomp_post_size,long_nocomp_post_time
0,chris_germany_000_1_1_1.pst.173.xls,Sheet1:U231,1.0,9.0,1.0,10.0,1.0,30.0,1.0,31.0,Sheet1:T553,1,10,1,11,1,28,1,29
1,hunter_shively_000_1_1.pst.213.xls,Sheet1:R27,1.0,1.0,1.0,1.0,11.0,1.0,1.0,1.0,Sheet1:R27,1,0,1,0,11,1,1,2
2,phillip_allen_000_1_1.pst.220.xls,weekly:I37,10.0,1.0,7.0,1.0,10.0,1.0,7.0,1.0,weekly:F6,10,1,9,1,10,1,9,1
3,sara_shackleton_000_1_1_1.pst.40.xls,Momma Credit Form :I29,3.0,0.0,3.0,0.0,3.0,0.0,3.0,0.0,Momma Credit Form :H33,3,0,3,0,3,0,3,0
4,lindy_donoho_000_1_1_1.pst.122.xls,2001:E1,27.0,1.0,26.0,1.0,27.0,3.0,26.0,3.0,2001:J1,25,2,25,2,25,3,25,3


In [207]:
# Attach the NoComp measurements to the TACO ones for the same file and
# reference; the left join keeps every TACO row.
df_github_max = df_github_taco_max.merge(
    df_github_nocomp_max, how="left", on=["fileName", "Max Dep Ref"]
)
df_github_long = df_github_taco_long.merge(
    df_github_nocomp_long, how="left", on=["fileName", "Longest Dep Ref"]
)

In [208]:
# Row counts of the two merged GitHub frames (max, long).
df_github_max.shape[0], df_github_long.shape[0]

(5343, 5344)

In [209]:
# Combine the max and long measurements per file. The right join keeps every
# row of df_github_long; files absent from df_github_max get NaN in its columns.
df_github_merged = df_github_max.merge(df_github_long, how="right", on=["fileName"])
# Optional export of the merged table (left disabled):
# df_github_merged.to_csv("./github_result_merged.csv", index=False)

In [210]:
# Preview the first five merged GitHub rows.
df_github_merged.head(n=5)

Unnamed: 0,fileName,Max Dep Ref,max_taco_size,max_taco_time,max_taco_post_size,max_taco_post_time,max_nocomp_size,max_nocomp_time,max_nocomp_post_size,max_nocomp_post_time,Longest Dep Ref,long_taco_size,long_taco_time,long_taco_post_size,long_taco_post_time,long_nocomp_size,long_nocomp_time,long_nocomp_post_size,long_nocomp_post_time
0,BaoGia2_09122016.xlsx,QR 5@:D18,8.0,8.0,8.0,9.0,8.0,36.0,8.0,37.0,QR 5@:D17,8,8,8,9,8.0,24.0,8.0,26.0
1,tiny.xlsx,Orders:C35,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,Orders:C35,1,0,1,0,1.0,1.0,1.0,1.0
2,GL_Driver_Safety_Report_ferguson_20211018_1441...,Report:Y5,1.0,0.0,1.0,0.0,2216.0,71.0,1.0,75.0,Pivot Tables:M8,1,0,1,0,1.0,0.0,1.0,0.0
3,Complete Raw Data CMC.xlsx,Complete Raw Data:A422,2.0,0.0,2.0,0.0,2.0,0.0,2.0,0.0,Complete Raw Data:A333,2,0,2,0,2.0,0.0,2.0,0.0
4,1. Excel Homework.xlsx,Sheet1:E4030,2.0,0.0,1.0,0.0,2.0,0.0,1.0,0.0,Sheet1:J3248,1,0,1,0,1.0,0.0,1.0,0.0


## Analyze Enron results

In [211]:
# Enron rows where max_taco_time exceeds max_nocomp_time, restricted to rows
# whose max_nocomp_time is itself above 10.
df_larger_than = df_enron_merged.loc[
    lambda d: d["max_taco_time"].gt(d["max_nocomp_time"]) & d["max_nocomp_time"].gt(10)
]
df_larger_than.loc[:, ["fileName", "max_taco_time", "max_nocomp_time"]]

Unnamed: 0,fileName,max_taco_time,max_nocomp_time
211,matthew_lenhart_000_1_1.pst.1.xls,567.0,483.0
802,vkaminski_000_1_1_1.pst.27.xls,5204.0,130.0
817,mary_fischer_000_1_1.pst.116.xls,24.0,13.0
2265,vkaminski_000_1_1_1.pst.101.xls,26.0,16.0
2510,mary_fischer_000_1_1.pst.114.xls,23.0,16.0
5320,vkaminski_000_1_1_1.pst.113.xls,29.0,14.0
6396,mary_fischer_000_1_1.pst.101.xls,24.0,15.0
7216,vkaminski_000_1_1_1.pst.169.xls,345.0,49.0
7233,jim_schwieger_000_1_1.pst.110.xls,515.0,100.0


In [212]:
# Cut-off applied to the *_time columns in the cells that follow.
threshold = 500
# Enron rows whose max_nocomp_time exceeds the cut-off.
df_enron_merged.loc[
    df_enron_merged["max_nocomp_time"] > threshold,
    ["fileName", "max_nocomp_time"],
]

Unnamed: 0,fileName,max_nocomp_time
91,mike_grigsby_000_1_1_1.pst.106.xls,1505.0
753,jim_schwieger_000_1_1.pst.100.xls,10466.0
770,phillip_allen_000_1_1.pst.102.xls,1128.0
789,benjamin_rogers_000_1_1.pst.39.xls,4010.0
1289,phillip_allen_000_1_1.pst.253.xls,1121.0
2465,mike_grigsby_000_1_1_1.pst.108.xls,1825.0
4062,jim_schwieger_000_1_1.pst.71.xls,9869.0
4300,harry_arora_000_1_1.pst.130.xls,9636.0
4546,kevin_presto_000_1_1.pst.16.xls,4946.0
5275,vkaminski_001_1_2_1.pst.62.xls,760.0


In [213]:
# Enron rows whose long_nocomp_time exceeds `threshold` (set in an earlier cell).
df_enron_merged.loc[
    df_enron_merged["long_nocomp_time"] > threshold,
    ["fileName", "long_nocomp_time"],
]

Unnamed: 0,fileName,long_nocomp_time
401,john_griffith_000_1_1.pst.185.xls,4540
753,jim_schwieger_000_1_1.pst.100.xls,8238
789,benjamin_rogers_000_1_1.pst.39.xls,3590
4062,jim_schwieger_000_1_1.pst.71.xls,8807
4300,harry_arora_000_1_1.pst.130.xls,9927
4546,kevin_presto_000_1_1.pst.16.xls,4405
5275,vkaminski_001_1_2_1.pst.62.xls,538
5987,jim_schwieger_000_1_1.pst.106.xls,8946
6161,mark_haedicke_000_1_2.pst.31.xls,873
6982,dutch_quigley_000_1_1.pst.51.xls,9096


In [214]:
# Enron rows whose max_taco_time exceeds `threshold` (set in an earlier cell).
df_enron_merged.loc[
    df_enron_merged["max_taco_time"] > threshold,
    ["fileName", "max_taco_time"],
]

Unnamed: 0,fileName,max_taco_time
91,mike_grigsby_000_1_1_1.pst.106.xls,628.0
211,matthew_lenhart_000_1_1.pst.1.xls,567.0
770,phillip_allen_000_1_1.pst.102.xls,542.0
802,vkaminski_000_1_1_1.pst.27.xls,5204.0
1289,phillip_allen_000_1_1.pst.253.xls,551.0
2465,mike_grigsby_000_1_1_1.pst.108.xls,537.0
5969,matthew_lenhart_000_1_1.pst.19.xls,539.0
6142,matthew_lenhart_000_1_1.pst.20.xls,508.0
7028,matthew_lenhart_000_1_1.pst.23.xls,579.0
7233,jim_schwieger_000_1_1.pst.110.xls,515.0


In [215]:
# Enron rows whose long_taco_time exceeds `threshold` (set in an earlier cell).
df_enron_merged.loc[
    df_enron_merged["long_taco_time"] > threshold,
    ["fileName", "long_taco_time"],
]

Unnamed: 0,fileName,long_taco_time
802,vkaminski_000_1_1_1.pst.27.xls,4839
7233,jim_schwieger_000_1_1.pst.110.xls,609


In [216]:
# Per-file statistics for the Enron workbooks (the sheet includes columns
# such as "File name" and "Edges", which later cells rely on).
df_enron_dep_ref = pd.read_excel(f"{enron_input}/enron_dep_ref.xlsx")
df_enron_dep_ref

Unnamed: 0,File name,Formulae,Vertices,Edges,TACO Vertices,TACO Edges,Max Dep Ref,Max Dep,Longest Dep Ref,Longest Dep,...,RR-GapFour-NoComp,RR-GapFive,RR-GapFive-NoComp,RR-GapSix,RR-GapSix-NoComp,RR-GapSeven,RR-GapSeven-NoComp,NoType,NoType-NoComp,Unnamed: 36
0,benjamin_rogers_000_1_1.pst.124.xls,18,72,54,24,18,firstrun33000:D9,1,firstrun33000:F17,1,...,0,0,0,0,0,0,0,0,0,54
1,benjamin_rogers_000_1_1.pst.159.xls,15,30,15,10,5,Sheet1:G4,1,Sheet1:C4,1,...,0,0,0,0,0,0,0,0,0,15
2,benjamin_rogers_000_1_1.pst.197.xls,24,48,24,12,6,Sheet1:F27,1,Sheet1:F32,1,...,0,0,0,0,0,0,0,0,0,24
3,benjamin_rogers_000_1_1.pst.200.xls,24,48,24,12,6,Sheet1:F27,1,Sheet1:F32,1,...,0,0,0,0,0,0,0,0,0,24
4,benjamin_rogers_001_1_1.pst.138.xls,415,818,409,88,44,capacity:U77,1,capacity:S84,1,...,0,0,0,0,0,0,0,0,0,409
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7396,phillip_allen_000_1_1.pst.253.xls,36455,49227,121520,1945,2029,Power Curve:J5,31846,Curves:A1,268,...,26,8,16,1,2,3,6,341,341,119710
7397,matthew_lenhart_000_1_1.pst.23.xls,39086,52138,132032,1975,2071,Power Curve:J5,34481,Curves:A1,268,...,26,8,16,1,2,3,6,341,341,130226
7398,mike_grigsby_000_1_1_1.pst.108.xls,39075,52104,131968,1972,2069,Power Curve:J5,34481,Curves:A1,268,...,26,8,16,1,2,3,6,341,341,130206
7399,john_lavorato_001_1_1_1.pst.94.xls,69539,104347,312703,37,36,CP Trade Data:F3,34739,CP Trade Data:F2,6952,...,0,0,0,0,0,0,0,3,3,222388


In [217]:
# Join the timing results with the per-file statistics. This is an inner
# join, so only files present in both frames are kept.
df_enron_complete = pd.merge(
    df_enron_merged,
    df_enron_dep_ref,
    how="inner",
    left_on="fileName",
    right_on="File name",
)
df_enron_complete

Unnamed: 0,fileName,Max Dep Ref_x,max_taco_size,max_taco_time,max_taco_post_size,max_taco_post_time,max_nocomp_size,max_nocomp_time,max_nocomp_post_size,max_nocomp_post_time,...,RR-GapFour-NoComp,RR-GapFive,RR-GapFive-NoComp,RR-GapSix,RR-GapSix-NoComp,RR-GapSeven,RR-GapSeven-NoComp,NoType,NoType-NoComp,Unnamed: 36
0,chris_germany_000_1_1_1.pst.173.xls,Sheet1:U231,1.0,9.0,1.0,10.0,1.0,30.0,1.0,31.0,...,0,0,0,3,6,0,0,1,1,471
1,hunter_shively_000_1_1.pst.213.xls,Sheet1:R27,1.0,1.0,1.0,1.0,11.0,1.0,1.0,1.0,...,0,0,0,0,0,0,0,0,0,11
2,phillip_allen_000_1_1.pst.220.xls,weekly:I37,10.0,1.0,7.0,1.0,10.0,1.0,7.0,1.0,...,0,0,0,0,0,0,0,12,12,176
3,sara_shackleton_000_1_1_1.pst.40.xls,Momma Credit Form :I29,3.0,0.0,3.0,0.0,3.0,0.0,3.0,0.0,...,0,0,0,0,0,0,0,0,0,19
4,lindy_donoho_000_1_1_1.pst.122.xls,2001:E1,27.0,1.0,26.0,1.0,27.0,3.0,26.0,3.0,...,3,52,104,0,0,0,0,38,38,621
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7376,theresa_staab_000_1_1.pst.9.xls,Box Draw:E12,16.0,0.0,14.0,0.0,230.0,3.0,199.0,3.0,...,0,0,0,0,0,0,0,26,26,1285
7377,chris_germany_000_1_2.pst.195.xls,Sheet1:B8,63.0,0.0,7.0,0.0,63.0,1.0,7.0,1.0,...,0,0,0,0,0,0,0,0,0,184
7378,dutch_quigley_000_1_1.pst.129.xls,NG-EXOTICS:K7663,2.0,0.0,2.0,0.0,2.0,0.0,2.0,0.0,...,0,0,0,0,0,0,0,4,4,12
7379,don_baughman_000_1_1.pst.127.xls,Sheet1:C47,37.0,0.0,17.0,1.0,37.0,1.0,17.0,1.0,...,0,0,0,0,0,0,0,6,6,12416


In [218]:
# NOTE: this rebinds `threshold` (500 in an earlier cell); every cell executed
# after this one sees 1000.
threshold = 1_000
# Files count as "large" when their Edges value exceeds this bound.
edge_threshold = 100_000
df_enron_large = df_enron_complete.loc[df_enron_complete["Edges"].gt(edge_threshold)]

In [219]:
# Large Enron files ordered by ascending max_nocomp_time.
df_enron_large.loc[:, ["fileName", "max_nocomp_time", "Edges"]].sort_values("max_nocomp_time")

Unnamed: 0,fileName,max_nocomp_time,Edges
2632,jim_schwieger_000_1_1.pst.111.xls,0.0,106850
6213,larry_may_000_1_1.pst.311.xls,0.0,107004
435,benjamin_rogers_000_1_1.pst.18.xls,0.0,158814
2150,sally_beck_000_1_1_1_1.pst.577.xls,0.0,124828
5590,john_lavorato_000_1_1_1.pst.57.xls,0.0,210430
...,...,...,...
6142,matthew_lenhart_000_1_1.pst.20.xls,1819.0,121520
2465,mike_grigsby_000_1_1_1.pst.108.xls,1825.0,131968
789,benjamin_rogers_000_1_1.pst.39.xls,4010.0,876092
4546,kevin_presto_000_1_1.pst.16.xls,4946.0,154820


In [220]:
# Large Enron files whose max_nocomp_time exceeds `threshold`, slowest last.
df_enron_large.loc[
    df_enron_large["max_nocomp_time"] > threshold,
    ["fileName", "max_nocomp_time", "Edges"],
].sort_values("max_nocomp_time")

Unnamed: 0,fileName,max_nocomp_time,Edges
1289,phillip_allen_000_1_1.pst.253.xls,1121.0,121520
770,phillip_allen_000_1_1.pst.102.xls,1128.0,121501
6161,mark_haedicke_000_1_2.pst.31.xls,1215.0,220270
91,mike_grigsby_000_1_1_1.pst.106.xls,1505.0,120614
6366,rick_buy_000_1_1_1_1.pst.186.xls,1514.0,111182
7028,matthew_lenhart_000_1_1.pst.23.xls,1586.0,132032
5969,matthew_lenhart_000_1_1.pst.19.xls,1749.0,120662
6142,matthew_lenhart_000_1_1.pst.20.xls,1819.0,121520
2465,mike_grigsby_000_1_1_1.pst.108.xls,1825.0,131968
789,benjamin_rogers_000_1_1.pst.39.xls,4010.0,876092


In [221]:
# Subset of the large Enron files whose max_nocomp_time exceeds `threshold`.
df_enron_large_long = df_enron_large.loc[df_enron_large["max_nocomp_time"].gt(threshold)]

In [222]:
# Fraction of the large Enron files that exceed the time threshold.
len(df_enron_large_long) / len(df_enron_large)

0.1935483870967742

## Analyze GitHub results

In [223]:
# GitHub rows where max_taco_time exceeds max_nocomp_time, restricted to rows
# whose max_nocomp_time is itself above 10. Rebinds df_larger_than.
df_larger_than = df_github_merged.loc[
    lambda d: d["max_taco_time"].gt(d["max_nocomp_time"]) & d["max_nocomp_time"].gt(10)
]
df_larger_than.loc[:, ["fileName", "max_taco_time", "max_nocomp_time"]]

Unnamed: 0,fileName,max_taco_time,max_nocomp_time
215,DPS Calculator - Ninja.xlsx,302.0,21.0
248,mary_fischer__25393__yr 2001 Commissioning Sum...,34.0,13.0
292,5_4_합성곱신경망.xlsx,98.0,87.0
409,BAO CAO CHI 16.10-22.10 TEMPLATE.xlsx,194.0,193.0
612,mary_fischer__25415__yr 2001 Commissioning Sum...,28.0,12.0
843,LR2.xlsx,998.0,27.0
883,001-functions_financial_amortization_schedules...,157.0,52.0
938,zaloi_1_nogrants.xlsx,262.0,231.0
1147,teeeest.xlsx,5882.0,243.0
1650,Davis SOVC_G2012.xlsx,19.0,11.0


In [224]:
# GitHub rows whose max_nocomp_time exceeds `threshold`. The variable is not
# reassigned in this section; the latest earlier assignment sets it to 1000.
df_github_merged.loc[
    df_github_merged["max_nocomp_time"] > threshold,
    ["fileName", "max_nocomp_time"],
]

Unnamed: 0,fileName,max_nocomp_time
178,gaslines_model_public.xlsx,719910.0
334,big+troubles.xlsx,1468.0
340,WM3796_LT_timeseries_normalization.xlsx,1543.0
394,Fact2Sheet_Corona_20200605.xlsx,6642.0
440,分解法乘法 C.xlsx,78304.0
...,...,...
5073,Fact2Sheet_Corona_20200529.xlsx,5152.0
5174,UST10y_update2.xlsx,1336.0
5226,Detroit data (stata & Eviews outputs).xlsx,2871.0
5264,螺纹_LLT模型-带止损-实盘记录优化版.xlsx,14204.0


In [225]:
# GitHub rows whose long_nocomp_time exceeds `threshold` (set in an earlier cell).
df_github_merged.loc[
    df_github_merged["long_nocomp_time"] > threshold,
    ["fileName", "long_nocomp_time"],
]

Unnamed: 0,fileName,long_nocomp_time
178,gaslines_model_public.xlsx,669027.0
511,GaussianClusterTest.xlsx,1510.0
1099,metro.xlsx,7715.0
1103,211018 BTC 2012년부터 변동성 돌파 확인.xlsx,11573.0
1377,Sample_Libreview_forCoding.xlsx,1326.0
1530,葛林布雷選股.xlsx,55612.0
1565,Infiltration_Mohammed.xlsx,1512.0
1607,Regression.xlsx,6622.0
1617,이정인_퍼셉트론엑셀.xlsx,1211.0
1900,Joystick Piloting formulas and calculations Re...,2783.0


In [226]:
# GitHub rows whose max_taco_time exceeds `threshold` (set in an earlier cell).
df_github_merged.loc[
    df_github_merged["max_taco_time"] > threshold,
    ["fileName", "max_taco_time"],
]

Unnamed: 0,fileName,max_taco_time
1147,teeeest.xlsx,5882.0
1804,GLab_SC2_sequencing_data.xlsx,2102.0
2039,sgl_line.xlsx,47553.0
2919,Southern Water Corp Statistical Analysis.xlsx,1650.0
3224,TPV4.9.xlsx,1425.0
3646,all reit industris per info.xlsx,1351.0
4985,Schema_generator.xlsx,24237.0


In [227]:
# GitHub rows whose long_taco_time exceeds `threshold` (set in an earlier cell).
df_github_merged.loc[
    df_github_merged["long_taco_time"] > threshold,
    ["fileName", "long_taco_time"],
]

Unnamed: 0,fileName,long_taco_time
1789,rcv_ep2.xlsx,5048
2039,sgl_line.xlsx,46637
2859,Tio Cash Master 1600 1 seventh 0a in the neigh...,2220


In [228]:
# Per-file statistics for the GitHub workbooks (the sheet includes columns
# such as "File name" and "Edges", which later cells rely on).
df_github_dep_ref = pd.read_excel(f"{github_input}/github_dep_ref.xlsx")
df_github_dep_ref

Unnamed: 0,File name,Formulae,Vertices,Edges,TACO Vertices,TACO Edges,Max Dep Ref,Max Dep,Longest Dep Ref,Longest Dep,...,RR-GapFour-NoComp,RR-GapFive,RR-GapFive-NoComp,RR-GapSix,RR-GapSix-NoComp,RR-GapSeven,RR-GapSeven-NoComp,NoType,NoType-NoComp,Unnamed: 36
0,HotOcall_latencies_in_cycles.xlsx,400008,400010,400006,8,4,HotOcall_latencies_in_cycles:A1033055,4,HotOcall_latencies_in_cycles:B2,199999,...,0,0,0,0,0,0,0,0,0,400006.0
1,aero_test.xlsx,131532,131533,131532,2,1,Sheet2:A8,131532,Sheet2:A8,131532,...,0,0,0,0,0,0,0,0,0,131532.0
2,datalog1.xlsx,124534,124535,124534,2,1,datalog1:A2,124534,datalog1:A2,124534,...,0,0,0,0,0,0,0,0,0,124534.0
3,기록차이모음.xlsx,299997,599997,599994,9,6,Sheet1:A3,99998,Sheet1:D1,99999,...,0,0,0,0,0,0,0,0,0,599994.0
4,Test13_100k_lines.xlsx,99999,99999,99998,2,1,Sheet1:A2,99998,Sheet1:A2,99998,...,0,0,0,0,0,0,0,0,0,99998.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5431,销售合同及发票管理系统.xlsx,100,300,200,6,4,开票统计表:B40,1,开票统计表:C54,1,...,0,0,0,0,0,0,0,0,0,200.0
5432,销售提成工资核算系统（按金额提成）.xlsx,18111,36449,36334,13,8,个人销售明细查询表:H42,1,个人销售明细查询表:D2,1,...,0,0,0,0,0,0,0,3,3,18338.0
5433,M002_ユーザ登録画面.xlsx,11,11,33,11,11,,0,,0,...,0,0,0,0,0,0,0,11,11,11.0
5434,services_subsectors_sa_q_nace2.xlsx,38,38,76,38,38,,0,,0,...,0,0,0,0,0,0,0,38,38,38.0


In [229]:
# Join the timing results with the per-file statistics. This is an inner
# join, so only files present in both frames are kept.
df_github_complete = pd.merge(
    df_github_merged,
    df_github_dep_ref,
    how="inner",
    left_on="fileName",
    right_on="File name",
)
df_github_complete

Unnamed: 0,fileName,Max Dep Ref_x,max_taco_size,max_taco_time,max_taco_post_size,max_taco_post_time,max_nocomp_size,max_nocomp_time,max_nocomp_post_size,max_nocomp_post_time,...,RR-GapFour-NoComp,RR-GapFive,RR-GapFive-NoComp,RR-GapSix,RR-GapSix-NoComp,RR-GapSeven,RR-GapSeven-NoComp,NoType,NoType-NoComp,Unnamed: 36
0,BaoGia2_09122016.xlsx,QR 5@:D18,8.0,8.0,8.0,9.0,8.0,36.0,8.0,37.0,...,0,0,0,0,0,0,0,2,2,182.0
1,tiny.xlsx,Orders:C35,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,...,0,0,0,0,0,0,0,0,0,101.0
2,GL_Driver_Safety_Report_ferguson_20211018_1441...,Report:Y5,1.0,0.0,1.0,0.0,2216.0,71.0,1.0,75.0,...,0,0,0,0,0,0,0,0,0,9014.0
3,Complete Raw Data CMC.xlsx,Complete Raw Data:A422,2.0,0.0,2.0,0.0,2.0,0.0,2.0,0.0,...,0,0,0,0,0,0,0,0,0,4980.0
4,1. Excel Homework.xlsx,Sheet1:E4030,2.0,0.0,1.0,0.0,2.0,0.0,1.0,0.0,...,0,0,0,0,0,0,0,0,0,32936.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5339,Wuxia_World_-_Book_of_Information_Volume_1_-_A...,Misc:K67,3.0,0.0,1.0,0.0,30.0,0.0,15.0,0.0,...,0,0,0,0,0,0,0,10,10,218.0
5340,DGSalloc_master.xlsx,DDGS market share estimates:M27,16.0,0.0,4.0,0.0,20.0,0.0,5.0,0.0,...,0,0,0,0,0,0,0,2,2,397.0
5341,UserTransactions.xlsx,Sheet2:N2538,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,...,0,0,0,0,0,0,0,0,0,2391.0
5342,Productwise Sales comparison in Quntity1 MASTE...,PANEER:Q4,7.0,0.0,5.0,0.0,11.0,1.0,5.0,1.0,...,0,0,0,0,0,0,0,0,0,891.0


In [230]:
# "Large" GitHub files: Edges above `edge_threshold` (set in an earlier cell).
df_github_large = df_github_complete.loc[df_github_complete["Edges"].gt(edge_threshold)]

In [231]:
# Large GitHub files ordered by ascending max_nocomp_time.
df_github_large.loc[:, ["fileName", "max_nocomp_time", "Edges"]].sort_values("max_nocomp_time")

Unnamed: 0,fileName,max_nocomp_time,Edges
32,NGCC_CCS.xlsx,0.0,603340
2526,89b2a317-9106-43c8-b100-c4c7a2a067db.xlsx,0.0,127750
2527,SSD-TestData-All.xlsx,0.0,199450
5034,201225_Input_data_general_V0.xlsx,0.0,183653
2615,first_lockdown.xlsx,0.0,216854
...,...,...,...
178,gaslines_model_public.xlsx,719910.0,402373
599,BBG_Security_Application_User_List_v5_34.xlsx,,2220971
1912,Master-Calculations-Global-WASH-CBA_GLOBAL_PUB...,,2353656
2144,DINAS PENDIDIKAN GABUNG BOS OK BANGET.xlsx,,1817598


In [232]:
# Large GitHub files whose max_nocomp_time exceeds `threshold`, slowest last.
df_github_large.loc[
    df_github_large["max_nocomp_time"] > threshold,
    ["fileName", "max_nocomp_time", "Edges"],
].sort_values("max_nocomp_time")

Unnamed: 0,fileName,max_nocomp_time,Edges
3295,Kimblesworth CD Weir Logger.xlsx,1014.0,190749
1429,RES_SLP_2012.xlsx,1027.0,351322
3369,Tio Cash Master 0700 PS One Million Test Prime...,1269.0,1256016
778,aim120_amraap.xlsx,1270.0,282775
1617,이정인_퍼셉트론엑셀.xlsx,1291.0,275992
2616,aero_test.xlsx,1298.0,131532
2340,内部ADC计算.xlsx,1321.0,561404
1377,Sample_Libreview_forCoding.xlsx,1357.0,397245
2101,HTMA_3rd_Ch6-2.xlsx,1429.0,527179
4015,File 13 CrUMI_comb.xlsx,1482.0,184109


In [233]:
# Subset of the large GitHub files whose max_nocomp_time exceeds `threshold`.
df_github_large_long = df_github_large.loc[df_github_large["max_nocomp_time"].gt(threshold)]

In [234]:
# Fraction of the large GitHub files that exceed the time threshold.
# NOTE(review): rows with a missing max_nocomp_time stay in the denominator
# but can never pass the `> threshold` filter - confirm that is intended.
len(df_github_large_long) / len(df_github_large)

0.09316770186335403