In [1]:
import pandas as pd
import ipywidgets as widgets

In [2]:
# Load the exported per-origin call counts.
cvs_path = "/home/ubuntu/hwpg-ae/analysis_ae/js_results_1736761230.csv"
df = pd.read_csv(cvs_path)

# The MessagePort handler setters are removed before anything is counted.
message_port_setters = [
    "MessagePort.onmessage.set",
    "MessagePort.onmessageerror.set",
]
df = df[~df["Call"].isin(message_port_setters)]

# General Information

# Counted while the "Empty" rows are still present (they are dropped right below).
total_origins = df['origin'].drop_duplicates().count()

excluded_calls = [
    # Filter out Empty as this was only there to count origins
    "Empty",
    # All the CSS stuff was not hooked ("CSS.escape" is not filtered)
    "CSS.supports",
    "CSS.registerProperty",
    "CSS.paintWorklet",
    "CSS.px",
    "CSS.paintWorklet.get",
    "CSS.highlights.get",
    # Remove style set, because it is not specified but still used
    "HTMLElement.style.set",
    "StyleSheetList.",
    # Filter out calls for the port 1 & 2 getter as we cannot hook this attribution
    "MessageChannel.port1.get",
    "MessageChannel.port2.get",
]
df = df[~df["Call"].isin(excluded_calls)]

# The "(only GT)" figures are restricted to rows with a non-zero 'Appearances JS'.
rows_seen_in_js = df[df['Appearances JS'] != 0]
total_appearances = rows_seen_in_js['origin'].count()
total_executions = rows_seen_in_js['Appearances JS'].sum()


print("Number of origins:", total_origins)
print("Number of origins with appearance:", df['origin'].nunique())
print("Number of appearances:", df['origin'].count())
print("Number of appearances (only GT):", total_appearances)
print("Number of executions (only GT):", total_executions)

Number of origins: 1
Number of origins with appearance: 1
Number of appearances: 1
Number of appearances (only GT): 1
Number of executions (only GT): 1


In [None]:
# Information about appearances general

# Every comparison below only looks at rows with a non-zero 'Appearances JS'.
js_nonzero = df['Appearances JS'] != 0

for experiment in ['Appearances PG', 'Appearances HAR JS', 'Appearances WARC JS']:
    print('Appearances JS vs.', experiment)
    js_counts = df['Appearances JS']
    other_counts = df[experiment]

    # Rows where both counts agree.
    eq_ap = df.loc[js_nonzero & (js_counts == other_counts), 'origin'].count()
    print(f"Equal number of appearances: {eq_ap} / {total_appearances} ({round(eq_ap / total_appearances, 2)})")

    # Distinct origins with at least one row where the counts disagree.
    neq_ap = df.loc[js_nonzero & (js_counts != other_counts), 'origin'].nunique()
    print(f"Not equal number of appearances (origins): {neq_ap} / {total_origins} ({round(neq_ap / total_origins, 2)})")

    # JS saw fewer appearances than the experiment (the experiment count is the greater one).
    fewer_in_js = df.loc[js_nonzero & (js_counts < other_counts), 'origin']
    gt_ap = fewer_in_js.count()
    gt_orig = fewer_in_js.nunique()
    print(f"Less JS appearances: {gt_ap} / {total_appearances} ({round(gt_ap / total_appearances, 2)})")
    print(f"Less JS appearances (origins): {gt_orig} / {total_origins} ({round(gt_orig / total_origins, 2)})")

    # JS saw more appearances than the experiment.
    more_in_js = df.loc[js_nonzero & (js_counts > other_counts), 'origin']
    lt_ap = more_in_js.count()
    lt_orig = more_in_js.nunique()
    print(f"More JS appearances: {lt_ap} / {total_appearances} ({round(lt_ap / total_appearances, 2)})")
    print(f"More JS appearances (origins): {lt_orig} / {total_origins} ({round(lt_orig / total_origins, 2)})")
    print()



Appearances JS vs. Appearances PG
Equal number of appearances: 1 / 1 (1.0)
Not equal number of appearances (origins): 0 / 1 (0.0)
Less JS appearances: 0 / 1 (0.0)
Less JS appearances (origins): 0 / 1 (0.0)
More JS appearances: 0 / 1 (0.0)
More JS appearances (origins): 0 / 1 (0.0)

Appearances JS vs. Appearances HAR JS
Equal number of appearances: 1 / 1 (1.0)
Not equal number of appearances (origins): 0 / 1 (0.0)
Less JS appearances: 0 / 1 (0.0)
Less JS appearances (origins): 0 / 1 (0.0)
More JS appearances: 0 / 1 (0.0)
More JS appearances (origins): 0 / 1 (0.0)

Appearances JS vs. Appearances WARC JS
Equal number of appearances: 1 / 1 (1.0)
Not equal number of appearances (origins): 0 / 1 (0.0)
Less JS appearances: 0 / 1 (0.0)
Less JS appearances (origins): 0 / 1 (0.0)
More JS appearances: 0 / 1 (0.0)
More JS appearances (origins): 0 / 1 (0.0)



In [4]:
# How many origins use how many features of the APIs.
def standard_categorizer(call):
    """Return the standard label a call name is counted under.

    Names beginning with "MessageChannel" or "MessagePort" map to "HTML_MC";
    every other name falls through to "CSS_OM".
    """
    is_messaging_call = call.startswith(("MessageChannel", "MessagePort"))
    return "HTML_MC" if is_messaging_call else "CSS_OM"
    
# Tag every row with the standard its call is counted under.
df["Standard"] = df["Call"].apply(standard_categorizer)


def _origin_percentage(column, standard, correction=0):
    """Share of distinct origins with a positive count in `column` for `standard`.

    `correction` is subtracted from the distinct-origin count before scaling.
    The denominator 8479 is hard-coded; presumably the origin count of the
    full data set -- TODO confirm.
    """
    matching = df[(df[column] > 0) & (df["Standard"] == standard)]
    return (len(matching["origin"].unique()) - correction) / 8479 * 100


with pd.option_context("display.max_rows", 70):
    # display(df[["Call", "Standard"]].sort_values(by=["Standard", "Call"]).drop_duplicates())
    # Number of distinct calls per standard.
    for standard in ("CSS_OM", "HTML_MC"):
        print(standard, len(df[df["Standard"] == standard]["Call"].unique()))
    print()

    # GT origins with JS standard calls
    print("CSS_OM origins JS", _origin_percentage("Appearances JS", "CSS_OM"))
    print("HTML_MC origins JS", _origin_percentage("Appearances JS", "HTML_MC"))
    print()

    # PG origins with JS standard calls
    print("CSS_OM origins PG", _origin_percentage("Appearances PG", "CSS_OM"))
    print("HTML_MC origins PG", _origin_percentage("Appearances PG", "HTML_MC"))
    print()

    # HAR origins with JS standard calls
    print("CSS_OM origins HAR JS", _origin_percentage("Appearances HAR JS", "CSS_OM"))
    print("HTML_MC origins HAR JS", _origin_percentage("Appearances HAR JS", "HTML_MC"))
    print()

    # WARC origins with JS standard calls
    # Rows of HTMLElement.style.get with exactly 5 WARC appearances and none in JS
    # are subtracted from the CSS_OM origin count.
    correction_rows = (
        (df["Appearances WARC JS"] == 5)
        & (df["Appearances JS"] == 0)
        & (df["Call"] == "HTMLElement.style.get")
    )
    not_found_correction = len(df[correction_rows])
    print("CSS_OM origins WARC JS", _origin_percentage("Appearances WARC JS", "CSS_OM", not_found_correction))
    print("HTML_MC origins WARC JS", _origin_percentage("Appearances WARC JS", "HTML_MC"))

CSS_OM 1
HTML_MC 0

CSS_OM origins JS 0.011793843613633684
HTML_MC origins JS 0.0

CSS_OM origins PG 0.011793843613633684
HTML_MC origins PG 0.0

CSS_OM origins HAR JS 0.011793843613633684
HTML_MC origins HAR JS 0.0

CSS_OM origins WARC JS 0.011793843613633684
HTML_MC origins WARC JS 0.0


In [5]:
# Show me origins for which WARC has more origins than HAR for CSS_OM

with pd.option_context("display.max_rows", 300):
    js_absent = df["Appearances JS"] == 0
    is_css_om = df["Standard"] == "CSS_OM"

    # Number of HTMLElement.style.get rows with exactly 5 WARC appearances and none in JS.
    print(len(df[(df["Appearances WARC JS"] == 5) & js_absent & (df["Call"] == "HTMLElement.style.get")]))

    # Match the origin as a literal substring: with the default regex=True every
    # "." in the pattern would match an arbitrary character.
    display(df[is_css_om & df["origin"].str.contains("http_movilnet.com.ve", regex=False)])

    # CSS_OM rows with exactly 2 WARC appearances and none in JS.
    display(df[(df["Appearances WARC JS"] == 2) & js_absent & is_css_om])


0


Unnamed: 0,Call,Appearances JS,Appearances PG,Appearances HAR JS,Appearances WARC JS,origin,Standard


Unnamed: 0,Call,Appearances JS,Appearances PG,Appearances HAR JS,Appearances WARC JS,origin,Standard


In [6]:
# How many origins have GT and PG more than 0 executions and WARC and HAR = 0

# Masks that do not depend on the experiment are built once.
js_seen = df["Appearances JS"] > 0
js_absent = df["Appearances JS"] == 0
pg_seen = df["Appearances PG"] > 0
pg_absent = df["Appearances PG"] == 0

for experiment in ['Appearances HAR JS', 'Appearances WARC JS']:
    experiment_seen = df[experiment] > 0
    experiment_absent = df[experiment] == 0

    # JS and PG both positive, experiment zero.
    df_zero = df[experiment_absent & pg_seen & js_seen]
    print(f"GT and PG are greater zero, {experiment} is not: {df_zero['origin'].count()}")

    # Only PG positive.
    df_zero = df[experiment_absent & pg_seen & js_absent]
    print(f"PG is greater zero, GT and {experiment} is not: {df_zero['origin'].count()}")

    # JS and experiment positive, PG zero.
    df_zero = df[experiment_seen & pg_absent & js_seen]
    print(f"PG is zero, GT and {experiment} is not: {df_zero['origin'].count()}")
    print("---")


GT and PG are greater zero, Appearances HAR JS is not: 0
PG is greater zero, GT and Appearances HAR JS is not: 0
PG is zero, GT and Appearances HAR JS is not: 0
---
GT and PG are greater zero, Appearances WARC JS is not: 0
PG is greater zero, GT and Appearances WARC JS is not: 0
PG is zero, GT and Appearances WARC JS is not: 0
---


In [7]:
# df_test is an alias of df (not a copy): the PercDiff columns are added to df itself.
# To inspect a single call, restrict it here, e.g. df[df["Call"] == "CSSRuleList.length.get"].
df_test = df

experiments = ['Appearances PG', 'Appearances HAR JS', 'Appearances WARC JS']

for experiment in experiments:
    js_counts = df_test["Appearances JS"]
    other_counts = df_test[experiment]

    # Symmetric relative difference: |a - b| divided by the mean of a and b.
    # The mean needs its own parentheses -- `x / (a + b) / 2` evaluates left to
    # right and divides by 2 * (a + b), a quarter of the intended value.
    perc_diff = (js_counts - other_counts).abs() / ((js_counts + other_counts) / 2)

    # 0 / 0 (both counts zero) yields NaN; that is recorded as "no difference".
    df_test["PercDiff " + experiment] = perc_diff.infer_objects(copy=False).fillna(0)

display(df_test)

# Mean difference per experiment, restricted to rows with a non-zero 'Appearances JS'.
for experiment in experiments:
    print(df_test[df_test['Appearances JS'] != 0]["PercDiff " + experiment].mean())


Unnamed: 0,Call,Appearances JS,Appearances PG,Appearances HAR JS,Appearances WARC JS,origin,Standard,PercDiff Appearances PG,PercDiff Appearances HAR JS,PercDiff Appearances WARC JS
0,Window.setTimeout,1,1,1,1,http_localtest.me:8000,CSS_OM,0.0,0.0,0.0


0.0
0.0
0.0


In [8]:
perc_diff_columns = [
    "PercDiff Appearances PG",
    "PercDiff Appearances HAR JS",
    "PercDiff Appearances WARC JS",
]

# Mean of every PercDiff column per call, ordered by the PG column.
df_pg_diff = (
    df_test[["Call"] + perc_diff_columns]
    .groupby(by=["Call"])
    .mean()
    .sort_values("PercDiff Appearances PG")
)

# Keep the calls whose PG mean exceeds the HAR mean or the WARC mean.
pg_mean = df_pg_diff["PercDiff Appearances PG"]
filt = pg_mean.gt(df_pg_diff["PercDiff Appearances HAR JS"]) | pg_mean.gt(df_pg_diff["PercDiff Appearances WARC JS"])

display(df_pg_diff[filt])

Unnamed: 0_level_0,PercDiff Appearances PG,PercDiff Appearances HAR JS,PercDiff Appearances WARC JS
Call,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1


In [9]:
# Rows where JS recorded more appearances than PG.
js_exceeds_pg = df['Appearances JS'].gt(df['Appearances PG'])
differing_appearances = df.loc[js_exceeds_pg]

# Number of distinct origins among those rows.
distinct_origins = differing_appearances['origin'].drop_duplicates()
display(distinct_origins.count())

# display(df)

0