In [None]:
import json
import glob
from tqdm import tqdm
from datetime import datetime, timedelta

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
import matplotlib.dates as mdates

In [None]:
DATA = "bandwidth.json"

def hostnameToLocation(hostname):
    """Translate a VM-reported hostname into its location code.

    The location code is whatever follows the last "-" in the first
    dot-separated label of the hostname. The exact hostname
    "fruchter-ipfs-probe" is special-cased and mapped to "chs1".
    """
    # Special-cased host: return its fixed location code directly.
    if hostname == "fruchter-ipfs-probe":
        return "chs1"

    firstLabel = hostname.partition(".")[0]
    # rpartition yields the whole label when it contains no "-" at all.
    return firstLabel.rpartition("-")[2]

# Read file from Mongo export (one JSON document per line)
with open(DATA) as f:
    # Skip blank lines so a trailing newline in the export does not break parsing
    bw = [json.loads(line) for line in f if line.strip()]

# Normalize every record in place before handing the list to pandas
for tx in tqdm(bw):
    # Remove Mongo ID (tolerate records that do not carry one)
    tx.pop('_id', None)

    # Parse hostname into a location code; keep it in a local so the short
    # form below is derived from the parsed code rather than an undefined name
    location = hostnameToLocation(tx['VANTAGE'])
    tx['VANTAGE'] = location
    tx['VP_SHORT'] = location[0:3]

    # Timestamp to Python datetime.
    # NOTE: fromtimestamp() returns naive local time, so the resulting values
    # depend on the timezone of the machine running the notebook.
    tx['TIMESTAMP'] = datetime.fromtimestamp(int(tx['TIMESTAMP']))

    # Deal with export errors: a dict where a rate value is expected is
    # treated as missing data
    if isinstance(tx['RateIn'], dict):
        tx['RateIn'] = np.nan
    if isinstance(tx['RateOut'], dict):
        tx['RateOut'] = np.nan
        
# Build the master DataFrame from the parsed records
df = pd.DataFrame(bw)
df['dti'] = pd.DatetimeIndex(df['TIMESTAMP'])

# Lookup table: location code -> region code
region = {
    'lax': 'na', 'chs1': 'na', 'chs2': 'na',
    'ams': 'eu', 'ams2': 'eu', 'hel': 'eu',
    'bom': 'ap', 'sin': 'ap',
    'gru': 'sa',
}
df['Region'] = df.VANTAGE.map(region)

# Split by log type. Each subset is indexed by timestamp while the 'dti'
# column itself is kept in the frame.
dfSum = df[df.LOGTYPE == 'bw_all'].set_index('dti', drop=False)
dfProto = df[df.LOGTYPE == 'bw_proto'].set_index('dti', drop=False)


In [None]:
# Cumulative data usage: all protocols
# Reshape to long form: one row per (timestamp, vantage point, direction)
subset = dfSum[['TotalIn', 'TotalOut', 'VANTAGE']]
subset = subset.reset_index().set_index(['dti', 'VANTAGE'])
subset = subset.stack().reset_index().set_index('dti')
subset.columns = pd.Index(['Vantage', 'Direction', 'Total'])

# Scale the raw totals for plotting. The builtin int is used because the
# np.int alias (which was just an alias of int) was removed in NumPy 1.24.
# NOTE(review): the column is labelled "MBs" but the divisor is 1e3, while the
# per-protocol cell divides by 1e6 for MB -- confirm the unit of TotalIn/TotalOut.
subset['Total MBs'] = subset.Total.astype(int) / 1e3

In [None]:
# Cumulative data usage by protocol
# Region and the rate columns are carried along because the instantaneous-rate
# cells further down group this same `subset` by Region and average
# RateIn/RateOut; without them those cells fail on a fresh top-to-bottom run.
subset = dfProto[['TotalIn', 'TotalOut', 'RateIn', 'RateOut', 'VANTAGE', 'Region', 'NOTE']]
# Keep only the three protocols of interest
subset = subset[ (subset.NOTE == "/ipfs/kad/1.0.0") | (subset.NOTE == '/ipfs/id/1.0.0') | (subset.NOTE == '/ipfs/ping/1.0.0') ]
# Drop the two vantage points that are excluded from the analysis
subset = subset[ (subset.VANTAGE != 'mbp13') & (subset.VANTAGE != "big") ]

# Resampled bandwidth data: median total per protocol and vantage point in 10-minute bins
avg = subset.groupby(['NOTE', 'VANTAGE'])[['TotalIn', 'TotalOut']].resample("10Min").median()

# Stack to long form for Seaborn.
# NOTE: this rebinds `bw`, which previously held the raw record list.
bw = avg.stack().reset_index().set_index('dti')
bw.columns = pd.Index(['Protocol', 'Vantage', 'Direction', 'Transfer'])
bw = bw.reset_index()

# Create a numeric day index for visualization purposes
bw['Day'] = (bw.dti - timedelta(days=22)).dt.day
# The stacked labels are the source column names (TotalIn/TotalOut); map those
# to the Down/Up legend labels.
bw['Direction'] = bw.Direction.replace({"TotalIn": "Down", "TotalOut": "Up"})
bw['Transfer (MB)'] = bw.Transfer / 1e6

# One panel per protocol, one line per direction
g = sns.FacetGrid(data=bw,
                  col='Protocol', hue='Direction', sharey=False)
g.map(sns.lineplot, 'Day', 'Transfer (MB)', estimator='mean')


In [None]:
# Instantaneous rates: startup / first 5 hours
sns.set(rc={'figure.figsize':(6,2)})

# Mean rate per (protocol, region) in 6-minute bins over the startup window.
# The chain is parenthesized instead of using backslash continuations, which
# break as soon as any character (even a space) follows the backslash.
# NOTE(review): requires `subset` to be timestamp-indexed and to carry the
# NOTE, Region, RateIn and RateOut columns -- confirm the upstream selection.
avg = (
    subset.loc['2019-04-23 17:00':'2019-04-23 22:00']
    .groupby(["NOTE", "Region"])[['RateIn', 'RateOut']]
    .resample("6Min")
    .mean()
)

# Create new dataset that's stacked for Seaborn plotting.
# The 'Vantage' column holds the Region grouping key here.
startup = avg.stack().reset_index().set_index('dti')
startup.columns = pd.Index(['Protocol', 'Vantage', 'Direction', 'Rate'])
startup['Direction'] = startup.Direction.replace({"RateIn": "Down", "RateOut": "Up"})
startup = startup.reset_index()

# We started measurement at 16H
startup['Hour'] = (startup.dti - timedelta(hours=16)).dt.hour

# Plot on an explicit Axes so the labels below apply to the same figure
fig, ax = plt.subplots()
sns.lineplot(data=startup, x='Hour', y='Rate', hue='Direction', estimator='mean', ax=ax)
ax.set(xlabel='Hrs After Startup', ylabel='Rate (kB/s)')

In [None]:
# Instantaneous rates: not startup
sns.set(rc={'figure.figsize':(6,2)})

# All data after the first day of measurement, averaged per (protocol, region)
# in 2-hour bins.
# NOTE(review): requires `subset` to be timestamp-indexed and to carry the
# NOTE, Region, RateIn and RateOut columns -- confirm the upstream selection.
avg = subset.loc['2019-04-24':].groupby(["NOTE", "Region"])[ ['RateIn', 'RateOut'] ].resample("2H").mean()

# Create a new subset of data that's stacked for Seaborn plotting.
# The 'Vantage' column holds the Region grouping key here.
restOfActivity = avg.stack().reset_index().set_index('dti')
restOfActivity.columns = pd.Index(['Protocol', 'Vantage', 'Direction', 'Rate'])
restOfActivity['Direction'] = restOfActivity.Direction.replace({"RateIn": "Down", "RateOut": "Up"})

# The plot uses a numeric 'Day' x-axis, which has to be derived from the
# timestamp first; the 22-day offset matches the cumulative per-protocol plot.
restPlot = restOfActivity.reset_index()
restPlot['Day'] = (restPlot.dti - timedelta(days=22)).dt.day

# Plot with log scale due to data's rapid increase
fig, ax = plt.subplots()
sns.lineplot(data=restPlot,
             x='Day', y='Rate', hue='Direction', estimator='mean', ax=ax).set(yscale='log')
ax.set(ylabel='Rate (log kB/s)')


# Unused visualizations

In [None]:
# Instantaneous rates: startup
# Mean rate per (protocol, region) in 6-minute bins over 2019-04-23
avg = (
    subset.loc['2019-04-23':'2019-04-23']
    .groupby(["NOTE", "Region"])[['RateIn', 'RateOut']]
    .resample("6Min")
    .mean()
)

# Long form for Seaborn; the 'Vantage' column holds the Region grouping key
startup = avg.stack().reset_index().set_index('dti')
startup.columns = pd.Index(['Protocol', 'Vantage', 'Direction', 'Rate'])
startup['Direction'] = startup.Direction.replace({"RateIn": "Down", "RateOut": "Up"})

# One panel per 'Vantage' value, one line per direction
g = sns.FacetGrid(
    data=startup.reset_index(),
    col='Vantage',
    hue='Direction',
    sharey=False,
)
g.map(sns.lineplot, 'dti', 'Rate', estimator="mean")

# Label both major and minor x ticks with the hour only
for ax in g.axes.flat:
    xaxis = ax.xaxis
    xaxis.set_major_formatter(mdates.DateFormatter("%H"))
    xaxis.set_minor_formatter(mdates.DateFormatter("%H"))
    

In [None]:
# Graph: cumulative totals per vantage point
# `subset` is rebound by the per-protocol cell earlier in the notebook, so the
# long-form totals are rebuilt here from dfSum instead of relying on whatever
# `subset` currently holds.
totals = dfSum[['TotalIn', 'TotalOut', 'VANTAGE']].reset_index().set_index(['dti', 'VANTAGE'])
totals = totals.stack().reset_index()
totals.columns = pd.Index(['dti', 'Vantage', 'Direction', 'Total'])
# NOTE(review): labelled "MBs" but scaled by 1e3 -- confirm the unit of the totals.
totals['Total MBs'] = totals.Total.astype(int) / 1e3

# One panel per vantage point (three per row), one line per direction
g = sns.FacetGrid(data=totals,
                  col='Vantage', hue='Direction', col_wrap=3)
# Label x ticks with the day of month.
# NOTE: "%-d" (no zero padding) is a platform-specific strftime extension.
for ax in g.axes.flat:
    ax.xaxis.set_major_formatter(mdates.DateFormatter("%-d"))
    ax.xaxis.set_minor_formatter(mdates.DateFormatter("%-d"))
g.map(sns.lineplot, 'dti', 'Total MBs', estimator=None)