In [1]:
%load_ext autoreload
%autoreload 2
%pylab inline

from IPython.display import display, HTML

Populating the interactive namespace from numpy and matplotlib


In [2]:
import sys
import pickle
import seaborn as sns
import itertools
from time import time

sys.path.append('..')

import warnings
warnings.filterwarnings("ignore")

In [None]:
from simulations.scenarios import *
from simulations.agent import *
from simulations.utils import Context
from scripts.parse_enron import Message

In [None]:
# Global plot appearance: "paper" context with fonts scaled by 1.5 on the
# "white" style.
sns.set_context(context="paper", font_scale=1.5)
sns.set_style(style="white")
# Default colour cycle: a 4-colour cubehelix palette
# (alternative kept for reference: sns.set_palette("deep")).
sns.set_palette(palette=sns.cubehelix_palette(n_colors=4))

In [None]:
# Folder that holds the parsed Enron artifacts loaded below.
parsed_logs_folder = '../Enron/parsing/'

# NOTE(review): pickle.load can execute arbitrary code stored in the file;
# only load artifacts that were produced locally by a trusted script.
# The context managers make sure each file handle is closed right after loading.
with open(parsed_logs_folder + "social.pkl", "rb") as social_file:
    social_graph = pickle.load(social_file)

with open(parsed_logs_folder + "replay_log.pkl", "rb") as replay_log_file:
    enron_log = pickle.load(replay_log_file)

# Number of entries in the replay log.
LOG_SIZE = len(enron_log)

In [None]:
# Both settings pass key_update_every_nb_days=5 and disable the
# sent-email-count trigger (key_update_every_nb_sent_emails=None).
dynamic_private_setting = AgentSettings(
    key_update_every_nb_sent_emails=None,
    key_update_every_nb_days=5,
)
# Same key-update arguments, plus public_contacts_policy as the
# introduction policy.
dynamic_public_setting = AgentSettings(
    key_update_every_nb_sent_emails=None,
    key_update_every_nb_days=5,
    introduction_policy=public_contacts_policy,
)

## Sample plots

In [None]:
def make_encryption_status_plots(scenario_name, reports, breakpoints=None, mask_fn=None):
    """Plot the encryption-status history of each report on a two-column grid.

    The grid is sized from the number of panels (two per row, 6 inches of
    height per row), so no report is silently dropped when more than ten are
    given. For ten reports the layout is the 5x2, 15x30 inch figure.

    Args:
        scenario_name: Label inserted into each panel title.
        reports: Sequence of simulation reports; each must expose
            ``encryption_status_data`` and ``link_status_data``.
        breakpoints: Log offsets of the chunks. The last entry is ignored, so
            it should hold one more entry than ``reports``. Defaults to
            ``range(len(reports) + 1)``.
        mask_fn: Optional callable mapping a report to a mask that is
            forwarded to ``visualize_encryption_status_history``.

    NOTE(review): the per-chunk title is passed on, but
    ``visualize_encryption_status_history`` only renders it when called with
    ``show_title=True``, which this function does not do -- confirm intended.
    """
    if breakpoints is None:
        breakpoints = range(len(reports) + 1)

    # Pair every report with the offset at which its chunk starts.
    panels = list(zip(breakpoints[:-1], reports))

    ncols = 2
    nrows = max(1, (len(panels) + ncols - 1) // ncols)
    # squeeze=False keeps `axes` two-dimensional even for a single row.
    fig, axes = plt.subplots(nrows=nrows, ncols=ncols, squeeze=False)
    fig.set_figwidth(15)
    fig.set_figheight(6 * nrows)
    flat_axes = list(axes.flat)  # row-major order

    for (offset, report), ax in zip(panels, flat_axes):
        mask = mask_fn(report) if mask_fn is not None else None
        visualize_encryption_status_history('%s (log chunk @%d)' % (scenario_name, offset),
                                            report.encryption_status_data,
                                            report.link_status_data,
                                            mask=mask,
                                            ax=ax)

    # Hide grid cells that did not receive a report (odd number of panels).
    for unused_ax in flat_axes[len(panels):]:
        unused_ax.set_visible(False)

In [None]:
def compute_enc_stats_in_batches(enc_status_data, link_status_data, mask=None, batch_size=1000):
    """Compute per-batch percentages of each email encryption outcome.

    The log is cut into consecutive, non-overlapping batches of
    ``batch_size`` index labels: ``[0, batch_size)``, ``[batch_size,
    2 * batch_size)``, and so on.

    Args:
        enc_status_data: Series of encryption statuses (compared against
            ``EncStatus`` members). Assumed to carry a sorted integer index,
            since the code computes ``index[-1] + 1`` and slices by integer
            label -- TODO confirm against the report producer.
        link_status_data: DataFrame with ``'greeting'`` and ``'followup'``
            columns, indexed like ``enc_status_data``.
        mask: Optional boolean Series used to filter the rows of each batch.
        batch_size: Number of index labels per batch.

    Returns:
        DataFrame indexed by batch start label with the columns
        ``'Stale key'``, ``'Encrypted'``, ``'Plaintext (initial contact)'``
        and ``'Plaintext (follow-up)'``, all in percent. Batches with no
        remaining status rows are skipped. The first two columns are relative
        to the number of status rows in the batch, the plaintext columns to
        the number of link-status rows.
    """
    batch_stats_data = pd.DataFrame(columns=['Stale key', 'Encrypted',
                                             'Plaintext (initial contact)', 'Plaintext (follow-up)'])
    # Nothing to batch: return the empty frame instead of failing on index[-1].
    if len(enc_status_data) == 0:
        return batch_stats_data

    log_size = enc_status_data.index[-1] + 1
    for i in range(0, log_size, batch_size):
        # .loc slices by label and includes BOTH endpoints, so the last label
        # of a batch is i + batch_size - 1. Using i + batch_size would count
        # the boundary row in two consecutive batches.
        last = i + batch_size - 1
        enc_status_batch = enc_status_data.loc[i:last].dropna()
        link_status_batch = link_status_data.loc[i:last].dropna()
        if mask is not None:
            mask_batch = mask.loc[i:last]
            enc_status_batch = enc_status_batch[mask_batch]
            link_status_batch = link_status_batch[mask_batch]

        if len(enc_status_batch) == 0:
            continue

        # Rows without an encryption status were already dropped by dropna().
        encrypted_prop = np.mean(enc_status_batch == EncStatus.encrypted) * 100
        stale_prop = np.mean(enc_status_batch == EncStatus.stale) * 100

        # Split plaintext emails into initial contacts and follow-ups.
        plain_status_batch = link_status_batch[enc_status_batch == EncStatus.plaintext]
        greeting_mask = plain_status_batch['greeting'] > 0
        followup_mask = plain_status_batch['followup'] > 0
        # A row flagged as both greeting and follow-up counts as follow-up only.
        greeting_prop = np.sum(greeting_mask & ~followup_mask) / len(link_status_batch) * 100
        followup_prop = np.sum(followup_mask) / len(link_status_batch) * 100

        batch_stats_data.loc[i] = [stale_prop, encrypted_prop, greeting_prop, followup_prop]
    return batch_stats_data

def visualize_encryption_status_history(title, enc_status_data, link_status_data, mask=None,
                                        batch_size=1000, legend_kwargs=None, ax=None,
                                        show_title=False, show_legend=False):
    """Draw a stacked area chart of per-batch encryption-status percentages.

    Args:
        title: Axes title; only rendered when ``show_title`` is true.
        enc_status_data, link_status_data, mask, batch_size: Forwarded to
            ``compute_enc_stats_in_batches``.
        legend_kwargs: Extra keyword arguments for ``ax.legend``.
        ax: Axes to draw on; a new figure is created when omitted.
        show_title: Whether to set ``title`` on the axes.
        show_legend: Whether to keep the legend on the axes.
    """
    legend_options = {} if legend_kwargs is None else legend_kwargs
    if ax is None:
        _, ax = plt.subplots()

    stats_per_batch = compute_enc_stats_in_batches(
        enc_status_data, link_status_data, mask, batch_size)
    stats_per_batch.plot.area(ax=ax)

    # The legend lists its entries in reverse order of the plotted columns.
    handles, labels = ax.get_legend_handles_labels()
    legend = ax.legend(list(reversed(handles)), list(reversed(labels)),
                       frameon=True, **legend_options)
    if not show_legend:
        # The legend is always built, then removed again when not wanted.
        legend.remove()

    if show_title:
        ax.set_title(title)
    ax.set_xlabel("Email sent")
    ax.set_ylabel("Email encryption status, %")
    ax.set_ylim(0, 100)

In [None]:
# Replay the first 1000 log entries once per setting: public first, then
# private. Each run gets a fresh Context built inside the setting's
# as_default() block.
sample_reports = {}
for setting_label, setting in (('public', dynamic_public_setting),
                               ('private', dynamic_private_setting)):
    with setting.as_default():
        context = Context(enron_log[0:1000], social_graph=social_graph)
        sample_reports[setting_label] = simulate_claimchain(context)

public_report = sample_reports['public']
private_report = sample_reports['private']

In [None]:
fig, axes = plt.subplots(nrows=2, ncols=2, sharex=True, sharey=True)
batch_size = 100

# Top row: all traffic. Bottom row: rows whose participants type is `userset`.
# Each entry is (title, report, restrict to userset?, target axes).
panel_specs = [
    ('Public ClaimChain', public_report, False, axes[0][0]),
    ('Private ClaimChain', private_report, False, axes[0][1]),
    ('Public ClaimChain (Userset only)', public_report, True, axes[1][0]),
    ('Private ClaimChain (Userset only)', private_report, True, axes[1][1]),
]

for panel_title, panel_report, userset_only, panel_ax in panel_specs:
    panel_mask = None
    if userset_only:
        panel_mask = panel_report.participants_type_data == ParticipantsTypes.userset
    visualize_encryption_status_history(panel_title,
                                        panel_report.encryption_status_data,
                                        panel_report.link_status_data,
                                        panel_mask,
                                        batch_size=batch_size,
                                        ax=panel_ax)

fig.set_figwidth(15)
fig.set_figheight(10)

## Simulate private ClaimChain at different starting points in the log

In [None]:
# Length of one log chunk, in log entries. Used as the slice length in the
# (commented-out) simulation code and as the range bound in
# get_average_storage_data.
LOG_CHUNK_SIZE = 10000

In [None]:
private_reports = []
# 11 evenly spaced offsets from 0 to LOG_SIZE; all but the last serve as
# chunk start offsets below.
breakpoints = [int(point) for point in np.linspace(0, LOG_SIZE, 11)]

## Compute new reports
# for i in breakpoints[:-1]:
#     context = Context(enron_log[i : i + LOG_CHUNK_SIZE], social_graph=social_graph)
#     with dynamic_private_setting.as_default():
#         private_reports.append(simulate_claimchain(context))
#     with open('reports/private_claimchain_report-%d.pkl' % i, 'wb') as h:
#         pickle.dump(private_reports[-1], h)

# Load reports: one pickled report per chunk start offset.
for chunk_start in breakpoints[:-1]:
    report_path = 'reports/private_claimchain_report-%d.pkl' % chunk_start
    with open(report_path, 'rb') as report_file:
        private_reports.append(pickle.load(report_file))

Global encryption traffic

In [None]:
# One panel per loaded private report, without any mask.
make_encryption_status_plots('Private ClaimChain', private_reports, breakpoints=breakpoints)

Traffic within the userset

In [None]:
def _userset_mask(report):
    """Boolean mask selecting the rows whose participants type is `userset`."""
    return report.participants_type_data == ParticipantsTypes.userset


make_encryption_status_plots('Private ClaimChain', private_reports,
                             breakpoints=breakpoints, mask_fn=_userset_mask)

Average encryption proportions

In [None]:
def get_average_enc_status_data(report, mask=None, batch_size=1000, cuts=None):
    """Summarize the share of encrypted emails at several points of a report.

    Args:
        report: Simulation report exposing ``encryption_status_data`` and
            ``link_status_data``.
        mask: Optional boolean Series restricting which rows are counted.
        batch_size: Batch length used for the "last batch" figure. It is
            forwarded to ``compute_enc_stats_in_batches`` so that the batch
            looked up at ``cut - batch_size`` actually exists for non-default
            values.
        cuts: Index labels at which to report. Each should be a multiple of
            ``batch_size``. Defaults to ``[4000, 7000, 10000]``.

    Returns:
        DataFrame indexed by cut with two percentage columns:
        ``'Last batch average'`` (encrypted share of the batch ending at the
        cut) and ``'Overall average'`` (encrypted share of all rows before
        the cut).

    Raises:
        KeyError: If the batch starting at ``cut - batch_size`` is missing,
            e.g. because the mask left it empty.
    """
    if cuts is None:
        cuts = [4000, 7000, 10000]

    enc_stats_batches = compute_enc_stats_in_batches(
        report.encryption_status_data, report.link_status_data,
        mask=mask, batch_size=batch_size)
    encrypted_by_batch = enc_stats_batches['Encrypted']

    enc_status_data = report.encryption_status_data
    if mask is not None:
        enc_status_data = enc_status_data[mask]

    enc_avg_data = pd.DataFrame(columns=['Last batch average', 'Overall average'])
    for cut in cuts:
        # Encrypted share within the batch that ends at this cut.
        batch_avg = encrypted_by_batch.loc[cut - batch_size]

        # Encrypted share over everything before the cut. Label slicing is
        # inclusive, hence `cut - 1` to cover exactly the labels 0..cut-1.
        slice_stats = enc_status_data.loc[:cut - 1].value_counts()
        nb_encrypted = slice_stats.get(EncStatus.encrypted, 0)
        overall_avg = nb_encrypted / slice_stats.sum() * 100

        enc_avg_data.loc[cut] = (batch_avg, overall_avg)

    return enc_avg_data

In [None]:
summaries = []

# One summary table per private report: averages over all traffic ('global')
# next to averages restricted to userset rows ('userset').
for offset, report in zip(breakpoints, private_reports):
    display(HTML('<h4>@%d</h4>' % offset))
    global_avg = get_average_enc_status_data(report)
    userset_avg = get_average_enc_status_data(
        report,
        mask=report.participants_type_data == ParticipantsTypes.userset)
    summary = pd.concat({'global': global_avg, 'userset': userset_avg}, axis=1)
    summaries.append(summary)
    display(summary)

In [None]:
# Stats over all chunks
def _gather_over_chunks(scope, column):
    """Place one summary column from every chunk side by side (one column per chunk)."""
    return pd.concat([summary[scope][column] for summary in summaries], axis=1)


global_last_batch_df = _gather_over_chunks('global', 'Last batch average')
userset_last_batch_df = _gather_over_chunks('userset', 'Last batch average')
global_overall_df = _gather_over_chunks('global', 'Overall average')
userset_overall_df = _gather_over_chunks('userset', 'Overall average')


def compute_stats_over_chunks(data):
    """Return the row-wise mean, standard deviation and 2.26 x standard deviation.

    Args:
        data: DataFrame with one column per chunk.
    """
    row_mean = data.mean(axis=1)
    row_std = data.std(axis=1)
    return pd.DataFrame({
        'Avg': row_mean,
        'Std': row_std,
        # 2.26 is the two-sided 95% t-value for 9 degrees of freedom,
        # i.e. it presumes ten chunks.
        'Std * 2.26': row_std * 2.26,
    })


last_batch_table = pd.concat({
    'Last batch average (global)': compute_stats_over_chunks(global_last_batch_df),
    'Last batch average (userset)': compute_stats_over_chunks(userset_last_batch_df),
}, axis=1)
display(last_batch_table)

overall_table = pd.concat({
    'Overall average (global)': compute_stats_over_chunks(global_overall_df),
    'Overall average (userset)': compute_stats_over_chunks(userset_overall_df),
}, axis=1)
display(overall_table)

## Simulate public ClaimChain at one of the points (it's slower)

Pick one chunk

In [None]:
# Position of the chunk to examine; the offset and the private report are
# taken from the same position.
chunk_index = 4
offset, private_report = breakpoints[chunk_index], private_reports[chunk_index]

In [None]:
## Compute new reports
#with dynamic_public_setting.as_default():
#    context = Context(enron_log[offset:offset+LOG_CHUNK_SIZE], social_graph=social_graph)
#    public_report = simulate_claimchain(context)
#with open('reports/public_claimchain_report-%d.pkl' % offset, 'wb') as h:
#    pickle.dump(public_report, h)

# Load computed reports: the public report pickled for the chosen offset.
public_report_path = 'reports/public_claimchain_report-%d.pkl' % offset
with open(public_report_path, 'rb') as report_file:
    public_report = pickle.load(report_file)

Compare public and private versions

In [None]:
batch_size = 1000


def _add_reference_grid(axes, y_ticks=(20, 40, 60, 80), x_ticks=(2000, 4000, 6000, 8000)):
    """Overlay faint horizontal and vertical guide lines on each of ``axes``."""
    for ax in axes:
        for tick in y_ticks:
            # axhline's xmin/xmax (and axvline's ymin/ymax) are fractions of
            # the axes in [0, 1], not data coordinates. The defaults (0, 1)
            # already span the whole axes, so no extent is passed.
            ax.axhline(tick, alpha=.25, linestyle='-')
        for tick in x_ticks:
            ax.axvline(tick, alpha=.25, linestyle='-')


def _plot_public_vs_private(output_path, userset_only=False, show_legend=False):
    """Draw the public and private reports side by side and save the figure.

    Args:
        output_path: File the figure is saved to.
        userset_only: When true, each report is masked to the rows whose
            participants type is ``userset``.
        show_legend: Whether the right-hand (private) panel keeps its legend.
    """
    fig, axes = plt.subplots(nrows=1, ncols=2, sharey=True)
    fig.set_tight_layout(tight=True)

    # (title, report, keep legend?) for the left and right panel.
    panels = (('Public claims', public_report, False),
              ('Private claims', private_report, show_legend))
    for ax, (title, panel_report, with_legend) in zip(axes, panels):
        mask = None
        if userset_only:
            mask = panel_report.participants_type_data == ParticipantsTypes.userset
        visualize_encryption_status_history(title,
                                            panel_report.encryption_status_data,
                                            panel_report.link_status_data,
                                            mask,
                                            batch_size=batch_size,
                                            show_legend=with_legend,
                                            show_title=True,
                                            ax=ax)

    fig.set_figwidth(10)
    _add_reference_grid(axes)
    fig.savefig(output_path)


# All traffic; only this figure carries a legend.
_plot_public_vs_private('images/enc_status_data_global.pdf', show_legend=True)

# Traffic restricted to userset rows.
_plot_public_vs_private('images/enc_status_data_userset.pdf', userset_only=True)

In [None]:
# For each report, show the averages over all traffic ('global') next to the
# averages restricted to userset rows ('userset'). The userset mask is always
# derived from the report being summarized, so the mask and the data share the
# same index.
for heading, summarized_report in (('Public ClaimChain', public_report),
                                   ('Private ClaimChain', private_report)):
    print(heading)
    a = get_average_enc_status_data(summarized_report)
    b = get_average_enc_status_data(
        summarized_report,
        mask=summarized_report.participants_type_data == ParticipantsTypes.userset)
    display(pd.concat({'global': a, 'userset': b}, axis=1))

## Visualize storage

In [None]:
def get_average_storage_data(report, batch_size=2500):
    """Average bandwidth and storage series per log window.

    The range ``[0, LOG_CHUNK_SIZE)`` is cut into consecutive,
    non-overlapping windows of ``batch_size`` index labels. For every series
    in a report attribute, the mean of its non-missing values inside a window,
    divided by 1024, becomes one data point of that window.

    Args:
        report: Simulation report exposing ``outgoing_bandwidth_data``,
            ``gossip_store_size_data`` and ``local_store_size_data``, each a
            dict of Series. The Series are assumed to carry integer index
            labels -- TODO confirm against the report producer.
        batch_size: Number of index labels per window.

    Returns:
        Tuple ``(bandwidth, gossip_storage, local_storage)`` of DataFrames.
        Each has one column per window, labelled by the window's end offset
        and ordered chronologically, and one row per contributing series.
        Relies on the notebook-level ``LOG_CHUNK_SIZE``.
    """

    def collect_data_points(series_dict):
        """Build the per-window DataFrame for one dict of Series."""
        data_points_by_batch = {}
        for offset in range(0, LOG_CHUNK_SIZE, batch_size):
            points = []
            for series in series_dict.values():
                # .loc label slices include both endpoints; stopping at
                # offset + batch_size - 1 keeps a boundary row out of the
                # next window.
                data_slice = series.loc[offset:offset + batch_size - 1].dropna().values
                if len(data_slice) == 0:
                    continue
                points.append(data_slice.mean() / 1024)
            # Every window gets a column, even when no series contributed.
            data_points_by_batch[offset + batch_size] = pd.Series(points, dtype=float)

        return pd.DataFrame(data_points_by_batch)

    bandwidth_data = collect_data_points(report.outgoing_bandwidth_data)
    gossip_storage_data = collect_data_points(report.gossip_store_size_data)
    local_storage_data = collect_data_points(report.local_store_size_data)

    return bandwidth_data, gossip_storage_data, local_storage_data

In [None]:
# Per-window bandwidth / gossip-storage / local-storage averages for the
# private report at position 4 and for the loaded public report.
report = private_reports[4]
(private_bandwidth_data,
 private_gossip_storage_data,
 private_local_storage_data) = get_average_storage_data(report)
(public_bandwidth_data,
 public_gossip_storage_data,
 public_local_storage_data) = get_average_storage_data(public_report)

In [None]:
fig, ax = plt.subplots()

# No legend is requested: "right bottom" is not a valid matplotlib legend
# location, and no labelled artist exists on the axes at this point anyway.
ax.set_ylabel("Kb")
ax.set_xlabel("Email sent")
ax.set_ylim(0, 30)
private_bandwidth_data.plot.box(ax=ax)

fig.savefig('images/private_bandwidth.pdf')

In [None]:
fig, ax = plt.subplots()

# No legend is requested: "right bottom" is not a valid matplotlib legend
# location, and no labelled artist exists on the axes at this point anyway.
ax.set_ylabel("Kb")
ax.set_xlabel("Email sent")
ax.set_ylim(0, 2000)
public_bandwidth_data.plot.box(ax=ax)

fig.savefig('images/public_bandwidth.pdf')

In [None]:
fig, ax = plt.subplots()

# Labels and y-range are set before the box plot is drawn onto the axes.
ax.set(ylabel="Kb", xlabel="Email sent", ylim=(0, 2000))
private_gossip_storage_data.plot.box(ax=ax)

fig.savefig('images/private_gossip_storage.pdf')

In [None]:
fig, ax = plt.subplots()

# Labels and y-range are set before the box plot is drawn onto the axes.
ax.set(ylabel="Kb", xlabel="Email sent", ylim=(0, 20000))
public_gossip_storage_data.plot.box(ax=ax)

fig.savefig('images/public_gossip_storage.pdf')

In [None]:
fig, ax = plt.subplots()

# Labels and y-range are set before the box plot is drawn onto the axes.
ax.set(ylabel="Kb", xlabel="Email sent", ylim=(0, 30))
private_local_storage_data.plot.box(ax=ax)

fig.savefig('images/private_local_storage.pdf')

In [None]:
fig, ax = plt.subplots()

# Labels and y-range are set before the box plot is drawn onto the axes.
ax.set(ylabel="Kb", xlabel="Email sent", ylim=(0, 5000))
public_local_storage_data.plot.box(ax=ax)

fig.savefig('images/public_local_storage.pdf')