# Headers Experiment Demo Analysis

## Warning

In demonstration mode, the experiment is run without setting up the account framework or registering any accounts. Thus, the experiment will run in logged-out mode twice, as the sessions are not valid. Still, this can be used to see how the experiments can be run and what data they produce.

In [None]:
%load_ext autoreload
%autoreload 2

## General Analysis

In [None]:
# imports
from database import Task, URL
import matplotlib.pyplot as plt
import pandas as pd
from IPython.display import display, HTML

In [None]:
# Fetch every task and URL row recorded for the demo job and materialise
# each query result as a DataFrame (one row per database record).
tasks = pd.DataFrame(list(Task.select().where(Task.job == "demoheaders").dicts()))
urls = pd.DataFrame(list(URL.select().where(URL.job == "demoheaders").dicts()))

# Overwrite each URL's note with the note of the task it belongs to.
# Both frames carry a 'note' column, so the merge suffixes them and the
# task-side (right-hand) value ends up in 'note_y'.
urls_with_tasks = urls.merge(tasks, how='left', left_on='task', right_on='id')
urls['note'] = urls_with_tasks['note_y']

# Render the tasks table in the notebook output
display(tasks)

In [None]:
# Count the crawled URLs for every (code, note) combination and show the
# resulting table; the 'id' column is only used as the thing being counted.
urls_by_state = urls.groupby(['code', 'note'])
url_counts = urls_by_state[['id']].count()
display(url_counts)

## Header Analysis

In [None]:
# imports
from modules.headersexperiment import Header
import tld
import ast

In [None]:
# Restrict the header records of the demo job to main-frame document responses
main_page_query = Header.select().where(
    Header.job == "demoheaders",
    Header.resource == "document",
    Header.mainframe == True,  # ORM column expression, must stay `== True`
)

# Materialise the query result as a DataFrame
headers = pd.DataFrame(list(main_page_query.dicts()))

# 'fromurl' holds the id of a row in `urls`; replace it with that row's
# 'urlfinal' value (rows without a matching URL become NaN via the left join).
headers_with_urls = headers.merge(urls, how='left', left_on='fromurl', right_on='id')
headers['fromurl'] = headers_with_urls['urlfinal']

In [None]:
# Get those rows which originated from and went to the same site

def is_same_site(row):
    """Tell whether a header row stayed on its own site.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) providing the
            entries 'fromurl', 'tourlfinal' and 'site'.

    Returns:
        True when the ``fld`` that ``tld.get_tld`` extracts from both
        'fromurl' and 'tourlfinal' equals ``row['site']``; False when they
        differ or when either URL cannot be processed.
    """
    try:
        origin_fld = tld.get_tld(row['fromurl'], as_object=True).fld
        target_fld = tld.get_tld(row['tourlfinal'], as_object=True).fld
    except Exception:
        # Best effort: any failure while resolving the URLs (missing value,
        # unparsable URL, ...) simply excludes the row.
        return False
    else:
        site = row['site']
        return (site == origin_fld) & (site == target_fld)

# Evaluate the same-site predicate once per row, then keep the matching rows
same_site_mask = headers.apply(is_same_site, axis=1)
headers_ss = headers[same_site_mask]

In [None]:
# Transform desired headers into columns

def get_considered_headers(headers):
    """Reduce a response's header list to the two headers under analysis.

    Args:
        headers: Iterable of mappings, each with a 'name' and a 'value'
            string entry.

    Returns:
        A ``pd.Series`` indexed by 'strict-transport-security' and
        'x-frame-options'. Each entry is the comma-joined sequence of all
        values seen for that header, or ``None`` when the header is absent.
        Header names are matched case-insensitively and values are stripped
        of surrounding whitespace.
    """
    def joined(values):
        # An absent header stays None; repeated headers are merged with ','.
        return ','.join(values) if values else values

    # Collect every value per lower-cased header name, in order of appearance
    values_by_name = {}
    for entry in headers:
        key = entry['name'].lower()
        cleaned = entry['value'].strip()
        values_by_name.setdefault(key, []).append(cleaned)

    return pd.Series({
        'strict-transport-security': joined(values_by_name.get('strict-transport-security')),
        'x-frame-options': joined(values_by_name.get('x-frame-options')),
    })

# Unpack headers str list to objects
# Each 'headers' cell is a string holding a Python literal (a list of header
# dicts); literal_eval parses literals only and never executes code.
header_lists = headers_ss["headers"].apply(ast.literal_eval)

# Build the frame explicitly instead of chaining Series.apply: for an empty
# selection Series.apply returns an empty Series rather than a DataFrame, so
# the header columns would be missing and later group-bys would raise KeyError.
# The column names mirror the keys returned by get_considered_headers.
headers_parsed = pd.DataFrame(
    [get_considered_headers(header_list) for header_list in header_lists],
    index=headers_ss.index,
    columns=['strict-transport-security', 'x-frame-options'],
)

# Attach the extracted header columns to the same-site rows (aligned on index)
headers_parsed = pd.concat([headers_ss, headers_parsed], axis=1)

In [None]:
# Bucket the rows by site, state (note), status code and HSTS header value,
# keeping rows whose key is missing (dropna=False), then show how often each
# HSTS value occurs within every bucket.
hsts_groups = headers_parsed.groupby(
    ['site', 'note', 'code', 'strict-transport-security'],
    dropna=False,
)
hsts_counts = hsts_groups[['strict-transport-security']].value_counts(dropna=False)
display(pd.DataFrame(hsts_counts))