In [None]:
import os
from database_connector import connect, postgresql_to_dataframe

In [None]:
# Copy KEY=VALUE pairs from the env file into os.environ.
# Blank lines and '#' comment lines are skipped, so a trailing newline or a
# commented-out setting does not abort the cell with an unpacking error.
with open("../../database.env") as env_file:
    for line_no, raw_line in enumerate(env_file, start=1):
        entry = raw_line.strip()
        if not entry or entry.startswith("#"):
            continue
        # Split on the first '=' only, so values may themselves contain '='.
        key, sep, value = entry.partition("=")
        if not sep:
            raise ValueError(
                f"database.env line {line_no}: expected KEY=VALUE, got {entry!r}"
            )
        os.environ[key] = value

In [None]:
# Open one connection per database. DB_NAME is set immediately before each
# connect() call -- presumably connect() reads it from the environment
# (confirm in database_connector).
# The order is significant: a later cell indexes the results by position
# (dates[2]), i.e. by position in this sequence.
cons = []
for db_name in ("cf", "cf_win", "webkitmac"):
    os.environ["DB_NAME"] = db_name
    cons += [connect()]

In [None]:
def get_dfs(cons, query, columns):
    """Run the same query against every connection and collect the results.

    Args:
        cons: Iterable of connections; each one is passed to
            ``postgresql_to_dataframe``.
        query: Query handed unchanged to ``postgresql_to_dataframe``.
        columns: Column names handed unchanged to ``postgresql_to_dataframe``.

    Returns:
        list: One ``postgresql_to_dataframe`` result per connection, in the
        same order as ``cons``. Empty if ``cons`` is empty.
    """
    return [postgresql_to_dataframe(conn, query, columns) for conn in cons]

# Analysis

## Observations
- all observations
- most have roughly the number of records they should have
- window.open was broken for Chrome and Firefox (reason: node 14), so it was rerun with node 17 into a separate database (db=cf_win)

In [None]:
# Expected number of records per (inclusion method, browser) pair: urls * runs.
urls, runs = 359_424, 2
print(f"Should be {urls * runs} records for every inclusion method and browser")

In [None]:
# Number of observations per inclusion method and browser, one DataFrame per
# connection in `cons`.
columns = ["inc_method", "browser", "count"]
query = """
SELECT 
    inc_method, browser, COUNT(id)
FROM
    observations GROUP BY inc_method, browser;
"""
counts = get_dfs(cons, query, columns)

In [None]:
# Show the per-database observation counts, in connection order.
for counts_df in counts:
    display(counts_df)

## Errors and retries
- some retries (especially Firefox) have no error entry (i.e., failed without an entry in the db?)
- overall almost no errors/retries; Firefox has somewhat more

In [None]:
# Observation counts broken down by inclusion method, browser, retry and
# error, one DataFrame per connection in `cons`.
columns = ["inc_method", "browser", "retry", "error", "count"]
query = """
SELECT 
    inc_method, browser, retry, error, COUNT(id)
FROM
    observations GROUP BY inc_method, browser, retry, error;
"""
errs = get_dfs(cons, query, columns)

In [None]:
# Show the per-database error/retry breakdown, in connection order.
for errs_df in errs:
    display(errs_df)

## Notes
- promiseAny -> node 14 did not support it
- Popup event often does not fire: webkit almost never fires it, firefox 50%?, chromium 70%?
- ...

In [None]:
# Count of notes per (note, inclusion method, browser), most frequent first.
# The numeric path segment after /echo/ is replaced with <redacted> so that
# notes differing only in that number fall into the same group.
#
# The query is a raw string: the regex contains "\d", which is an invalid
# escape sequence in a normal Python string literal (SyntaxWarning on
# Python 3.12+). The SQL text sent to the database is unchanged.
query = r"""
SELECT
    regexp_replace(notes, '/echo/\d+/', '/echo/<redacted>/', 'g') AS note,
    inc_method,
    browser,
    count(id)
FROM
    observations
GROUP BY
    note,
    inc_method,
    browser
ORDER BY
    count DESC;
"""
columns = ["note", "inc_method", "browser", "count"]
notes = get_dfs(cons, query, columns)

In [None]:
# Show the per-database note counts, in connection order.
for notes_df in notes:
    display(notes_df)

## Timing 
- webkit very fast for iframe and stuff (min)?
- webkit averages are high because echo + opg only run on the iMac, so mbp1 and mbp2 have to go over the network
- firefox has a higher standard deviation than chromium
- audio, video, img -> firefox and webkit way higher times than chromium

In [None]:
# Timing statistics: min / avg / stddev / max of the `loading_time` and
# `complete_time` fields of the `observation` column, grouped by inclusion
# method and browser. One DataFrame per connection in `cons`.
columns = [
    "inc_method",
    "browser",
    "min loading",
    "avg loading",
    "std loading",
    "max loading",
    "min complete",
    "avg complete",
    "std complete",
    "max complete",
]
query = """
SELECT
	inc_method,
	browser,
	min(cast (observation->'loading_time' as INT)) as "min loading",
	avg(CAST (observation->'loading_time' as INT)) as "avg loading",
	stddev(CAST (observation->'loading_time' as INT)) as "std loading",
	max(CAST (observation->'loading_time' as INT)) as "max loading",
	min(CAST (observation->'complete_time' as INT)) as "min complete",
	avg(cast (observation->'complete_time' as INT)) as "avg complete",
	stddev(CAST (observation->'complete_time' as INT)) as "std complete",
	max(cast (observation->'complete_time' as INT)) as "max complete"
FROM
	observations
GROUP BY
	inc_method,
	browser
"""
times = get_dfs(cons, query, columns)

In [None]:
# Summary statistics (DataFrame.describe) of each per-database timing table.
for times_df in times:
    summary = times_df.describe()
    display(summary)

In [None]:
# Show the full per-database timing tables, in connection order.
for times_df in times:
    display(times_df)

In [None]:
# Time taken: earliest and latest `insertion_time` in `observations`,
# one single-row DataFrame per connection in `cons`.
columns = ["First entry", "Last entry"]
query = """
SELECT
	min(insertion_time), max(insertion_time)
FROM
	observations
"""
dates = get_dfs(cons, query, columns)

In [None]:
# Show the first/last insertion timestamps of each database, in connection order.
for dates_df in dates:
    display(dates_df)

In [None]:
# Span between first and last entry for the third connection (index 2 in
# `cons`, i.e. the "webkitmac" database given the order used when connecting).
third_db_dates = dates[2]
third_db_dates["Last entry"] - third_db_dates["First entry"]