In [1]:
from IPython.display import display, Math, Latex, HTML
# Emit a "Show Code" / "Hide Code" button for the rendered notebook.
# The embedded script hides every `div.input` element once the document is
# ready and toggles their visibility each time the form is submitted.
# It relies on `$` being available in the page (presumably jQuery — confirm).
HTML('''<script>
  function code_toggle() {
    if (code_shown){
      $('div.input').hide('500');
      $('#toggleButton').val('Show Code')
    } else {
      $('div.input').show('500');
      $('#toggleButton').val('Hide Code')
    }
    code_shown = !code_shown
  }
  
  $( document ).ready(function(){
    code_shown=false;
    $('div.input').hide()
  });
</script>
<form action="javascript:code_toggle()"><input type="submit" id="toggleButton" value="Show Code"></form>''') 

In [2]:
from influxdb import DataFrameClient
from influxdb import InfluxDBClient
import json
import pandas as pd

import datetime 
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
import plotly.plotly as py
from plotly import tools
#from datetime import datetime
import dateutil.parser
init_notebook_mode(connected=True)

In [3]:
# Categorical palette shared by the plots below, stored as '#RRGGBB' strings.
# The leading '#' is required: plotly documents hex colours in that form and
# its property validation does not accept bare 'RRGGBB' strings.
colors = [
    '#F2F3F4', '#222222', '#F3C300', '#875692', '#F38400', '#A1CAF1', '#BE0032', '#C2B280',
    '#848482', '#008856', '#E68FAC', '#0067A5', '#F99379', '#604E97', '#F6A600', '#B3446C',
    '#DCD300', '#882D17', '#8DB600', '#654522', '#E25822', '#2B3D26',
]

In [58]:
# Scratch check: display the second palette entry.
colors[1]

'222222'

In [4]:
def barplot_averages(df,column, title,index_col='SK_PI'):
    """Draw a grouped bar chart of the mean, max and median of ``column``.

    Rows of ``df`` are grouped by ``index_col``; for every group one bar per
    statistic is drawn. The figure is rendered inline with ``iplot`` and
    nothing is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing both ``index_col`` and ``column``.
    column : str
        Name of the column to summarise.
    title : str
        Chart title.
    index_col : str, optional
        Name of the column to group by (default ``'SK_PI'``).
    """
    # One pass over the groups instead of three separate group-bys that were
    # then stitched back together with two merges.
    stats = (
        df.groupby(index_col)[column]
        .agg(['mean', 'max', 'median'])
        .reset_index()
    )

    # Trace order (Mean, Max, Median) fixes the bar and legend order.
    data = [
        go.Bar(x=stats[index_col], y=stats[stat], name=label)
        for stat, label in (('mean', 'Mean'), ('max', 'Max'), ('median', 'Median'))
    ]
    layout = go.Layout(
        title=title,
        # Generous bottom margin; presumably to fit long category labels.
        margin=dict(l=30, r=30, t=50, b=250),
    )

    iplot(go.Figure(data=data, layout=layout))

In [5]:
# Read connection settings from a JSON file one directory above the notebook.
with open('../credentials.json', 'r') as f_credentials:
    credentials_config = json.load(f_credentials)

In [6]:
# Connection settings: the host comes from the credentials file, while the
# port and database name are fixed here.
host=credentials_config['influxdb_host']
port=8086
dbname = 'net_speed_md'
# The two empty strings are presumably the username and password (confirm
# against the influxdb client signature).
client = InfluxDBClient(host, port, '', '', dbname)

In [7]:
# List every SK_PI tag value present in the PING measurement and keep them
# as a sorted list of integers.
query_unique_devices = "SHOW TAG VALUES FROM PING WITH KEY=SK_PI;"
result_unique_devices = client.query(query_unique_devices)
points_unique_devices = result_unique_devices.get_points()
device_numbers = sorted(int(entry['value']) for entry in points_unique_devices)

In [8]:
# Count the non-zero PING values per device (grouped by the SK_PI tag):
# first over all recorded time, then restricted to the last four weeks.
query_ping_counts = 'SELECT COUNT(PING) FROM PING WHERE PING!=0 GROUP BY SK_PI;'
result_ping_counts = client.query(query_ping_counts)
query_ping_counts_dec = "SELECT COUNT(PING) FROM PING WHERE time >= now()-4w  AND PING!=0 GROUP BY SK_PI ;"
result_ping_counts_dec = client.query(query_ping_counts_dec)

In [9]:
# Count the non-zero PING_DROPRATE values per device (grouped by the SK_PI
# tag): first over all recorded time, then restricted to the last four weeks.
query_pingdroprate_counts = 'SELECT COUNT(PING_DROPRATE) FROM PING WHERE PING_DROPRATE!=0 GROUP BY SK_PI;'
result_pingdroprate_counts = client.query(query_pingdroprate_counts)
query_pingdroprate_counts_dec = 'SELECT COUNT(PING_DROPRATE) FROM PING WHERE time >= now()-4w AND PING_DROPRATE!=0 GROUP BY SK_PI;'
result_pingdroprate_counts_dec = client.query(query_pingdroprate_counts_dec)

In [10]:
def _last_count(result, device):
    """Return the 'count' of the last point tagged with ``device``, or 0 if none."""
    value = 0
    for row in result.get_points(tags={'SK_PI': str(device)}):
        value = row['count']
    return value


# All-time totals, one entry per device in device_numbers order:
# ping_counts is the non-zero PING count plus the non-zero PING_DROPRATE count,
# pingdroprate_counts is the PING_DROPRATE count on its own.
ping_counts = []
pingdroprate_counts = []
for device in device_numbers:
    count_ping = _last_count(result_ping_counts, device)
    count_pingdroprate = _last_count(result_pingdroprate_counts, device)
    ping_counts.append(count_ping + count_pingdroprate)
    pingdroprate_counts.append(count_pingdroprate)

In [11]:
# Same bookkeeping as the all-time totals, but for the last four weeks.
# Every device gets an entry in the three count lists; device_numbers_dec
# only collects the devices that reported anything in that window.
ping_counts_dec = []
device_numbers_dec = []
pingdroprate_counts_dec = []
ping_sent_counts_dec = []
for device in device_numbers:
    tag_filter = {'SK_PI': str(device)}
    received = 0
    dropped = 0
    # Keep the last point of each result; 0 stays when nothing matches.
    for row in result_ping_counts_dec.get_points(tags=tag_filter):
        received = row['count']
    for row in result_pingdroprate_counts_dec.get_points(tags=tag_filter):
        dropped = row['count']
    total = dropped + received
    if total != 0:
        device_numbers_dec.append(device)
    ping_counts_dec.append(total)
    pingdroprate_counts_dec.append(dropped)
    ping_sent_counts_dec.append(received)

In [12]:
# Second client against the same database; its query results are indexed by
# measurement name and used below as pandas DataFrames.
client_df = DataFrameClient(host, port, '', '', dbname)
# Fetch all SPEEDTEST_UPLOAD rows except those whose PROVIDER is 'iperf'.
query_upload = "SELECT * FROM SPEEDTEST_UPLOAD WHERE PROVIDER!='iperf';"
result_upload= client_df.query(query_upload)
upload_df = result_upload['SPEEDTEST_UPLOAD']

In [13]:
# Build the tidy upload frame from the raw query result without mutating it,
# so this cell can be re-run safely (the in-place version failed on a second
# run because the 'index' column no longer held timestamps).
upload_df = result_upload['SPEEDTEST_UPLOAD'].reset_index(level=0)
# Round-trip through a formatted string: drops sub-second precision and
# yields values re-parsed from plain 'YYYY-MM-DD HH:MM:SS' text.
upload_df['index'] = pd.to_datetime(upload_df['index'].dt.strftime('%Y-%m-%d %H:%M:%S'))
upload_df['SK_PI'] = pd.to_numeric(upload_df['SK_PI'])
upload_df = upload_df.rename(columns={'index': 'time'})
# Discard rows with a zero upload value; copy so that columns added later are
# written to an independent frame rather than to a slice.
upload_df = upload_df[upload_df.UPLOAD != 0].copy()

In [14]:
# Speedtest upload data points per device over all time.
points_by_device = upload_df.groupby(['SK_PI']).size().reset_index(name='counts').sort_values('SK_PI')

# Cut-off at midnight 28 days before today. A pandas Timestamp is used because
# comparing a datetime64 column against a plain datetime.date is rejected by
# newer pandas versions.
four_weeks_ago = pd.Timestamp(datetime.date.today()) - pd.Timedelta(days=28)
upload_df_last4weeks = upload_df[upload_df['time'] > four_weeks_ago]
points_by_device_las4weeks = upload_df_last4weeks.groupby(['SK_PI']).size().reset_index(name='counts').sort_values('SK_PI')

# counts_x = all-time count, counts_y = last-four-weeks count (0 when the
# device has no recent rows), result = everything older than four weeks.
merged_results = pd.merge(points_by_device, points_by_device_las4weeks, on='SK_PI', how='outer')
merged_results['counts_y'] = merged_results['counts_y'].fillna(0)
merged_results['result'] = merged_results['counts_x'].sub(merged_results['counts_y'], axis=0)

# Part1: Number of data points, device reporting times.

In [15]:
def _stacked_pair(x, recent, older):
    """Return two bar traces over ``x``: last-four-weeks counts and the remainder."""
    return (
        go.Bar(x=x, y=recent, name='Last 4 weeks', marker=dict(color=colors[1])),
        go.Bar(x=x, y=older, name='The rest of the time', marker=dict(color=colors[2])),
    )


# collectd: the older share is the all-time total minus the recent total.
trace1, trace2 = _stacked_pair(
    device_numbers,
    ping_counts_dec,
    [total - recent for total, recent in zip(ping_counts, ping_counts_dec)],
)
# speedtest: both shares were precomputed in merged_results.
trace3, trace4 = _stacked_pair(
    merged_results["SK_PI"],
    merged_results["counts_y"],
    merged_results['result'],
)

data1 = [trace1, trace2]
layout1 = go.Layout(barmode='stack', title="Number of datapoints :collectd")
fig1 = go.Figure(data=data1, layout=layout1)
iplot(fig1)

data2 = [trace3, trace4]
layout2 = go.Layout(barmode='stack', title="Number of datapoints :speedtest")
fig2 = go.Figure(data=data2, layout=layout2)
iplot(fig2)

In [16]:
# Per device (SK_PI tag): the last PING point among rows where PING or
# PING_DROPRATE is non-zero. Its timestamp is used below as the end of the
# device's reporting period.
query_ping_last = "SELECT LAST(PING), time FROM PING WHERE PING!=0 OR PING_DROPRATE!=0 GROUP BY SK_PI;"
result_ping_last = client.query(query_ping_last)

In [17]:
# Per device (SK_PI tag): the first PING point among rows where PING or
# PING_DROPRATE is non-zero. Its timestamp is used below as the start of the
# device's reporting period.
query_ping_first = "SELECT FIRST(PING), time FROM PING WHERE PING!=0 OR PING_DROPRATE!=0 GROUP BY SK_PI;"
result_ping_first = client.query(query_ping_first)

In [18]:
def _reporting_span(device, first, last):
    """Return a line trace from ``first`` to ``last`` drawn at height ``device``."""
    return go.Scatter(
        x=[first, last],
        y=[device, device],
        # Explicit string: device may be a numpy integer, which is not a str/int/float.
        name=str(device),
        # Wrap around the palette so device ids beyond its length cannot raise IndexError.
        marker=dict(color=colors[device % len(colors)]),
    )


# --- collectd: first and last PING timestamps per device ---
time_format = '%Y-%m-%d %H:%M:%S'
data = []
for device in device_numbers:
    tag_filter = {'SK_PI': str(device)}
    first = None
    last = None
    for point in result_ping_first.get_points(tags=tag_filter):
        first = dateutil.parser.parse(point['time']).strftime(time_format)
    for point in result_ping_last.get_points(tags=tag_filter):
        last = dateutil.parser.parse(point['time']).strftime(time_format)
    if first is None or last is None:
        # No matching points: skip the device instead of plotting a
        # placeholder 0 on the time axis.
        continue
    data.append(_reporting_span(device, first, last))
layout = dict(title = "Device reporting times - collectd")
fig1 = go.Figure(data=data, layout=layout)
iplot(fig1)

# --- speedtest: earliest and latest upload rows per device ---
start_times = upload_df.groupby('SK_PI')['time'].min()
end_times = upload_df.groupby('SK_PI')['time'].max()
# NOTE: from here on device_numbers holds the devices present in the speedtest
# upload data (it is rebound here, replacing the list built from PING tags).
device_numbers = sorted(upload_df['SK_PI'].unique())
data = [
    _reporting_span(device, start_times[device], end_times[device])
    for device in device_numbers
]
layout = dict(title = "Device reporting times   - speedtest")
fig2 = go.Figure(data=data, layout=layout)
iplot(fig2)

## Summary: 
There are 18 devices set up.     
Devices 1, 2, 4, 6 and 8 started reporting and then stopped.  
Device 13 has not sent any speedtest data (it sent several collectd data points).  
Devices 10, 13 and 14 do not send collectd data but are sending speedtest data.  
More detailed timeline can be found in [grafana](https://grafana-connectin.cybera.ca)  

# Part2: Speedtest statistics  by provider

In [30]:
# Distinct providers seen by each device (one row per device, one column per
# provider slot) and the number of providers per device.
providers_per_device = upload_df.groupby('SK_PI').apply(lambda x: x["PROVIDER"].unique()).apply(pd.Series)
providers_per_device['provider_count'] = providers_per_device.apply(lambda x: x.count(), axis=1)
provider_counts_per_device = pd.Series(providers_per_device['provider_count']).value_counts().reset_index()

# Number of upload rows for every (device, provider) pair; the size column is named 0.
points_by_device_by_provider = upload_df.groupby(['SK_PI', 'PROVIDER']).size().reset_index()
providers = upload_df['PROVIDER'].unique()

# (device, provider) -> row count, built once so the loops below do not
# re-filter the frame for every pair.
pair_counts = dict(zip(
    zip(points_by_device_by_provider['SK_PI'], points_by_device_by_provider['PROVIDER']),
    points_by_device_by_provider[0],
))

data = []
for i, provider in enumerate(providers):
    # Pairs that never occur contribute a zero-height segment.
    prov = [pair_counts.get((device, provider), 0) for device in device_numbers]
    data.append(go.Bar(
        x=device_numbers,
        y=prov,
        name=provider,
        # Wrap around the palette if there are more providers than colours.
        marker=dict(color=colors[i % len(colors)]),
    ))

# Built once, outside the loop, so it also exists when there are no providers.
layout = go.Layout(
    barmode='stack',
    title="Number of data points per device per provider"
)
fig2 = go.Figure(data=data, layout=layout)
iplot(fig2)

In [23]:
def _tidy_speedtest(raw):
    """Return a cleaned copy of a speedtest frame returned by ``client_df.query``.

    The index is moved into a 'time' column truncated to whole seconds and
    SK_PI is converted to a numeric column. ``raw`` itself is not modified.
    """
    tidy = raw.reset_index(level=0)
    # Formatting and re-parsing drops the sub-second part of the timestamps.
    tidy['index'] = pd.to_datetime(tidy['index'].dt.strftime('%Y-%m-%d %H:%M:%S'))
    tidy['SK_PI'] = pd.to_numeric(tidy['SK_PI'])
    return tidy.rename(columns={'index': 'time'})


barplot_averages(upload_df,'UPLOAD',"Maximum, mean and median upload speed per provider",index_col='PROVIDER')

# --- download ---
query_download = "SELECT * FROM SPEEDTEST_DOWNLOAD WHERE PROVIDER!='iperf';"
result_download = client_df.query(query_download)
download_df = _tidy_speedtest(result_download['SPEEDTEST_DOWNLOAD'])
# Drop zero readings; copy so later column assignments hit an independent frame.
download_df = download_df[download_df.DOWNLOAD != 0].copy()

barplot_averages(download_df,'DOWNLOAD',"Maximum, mean and median download speed per provider", index_col='PROVIDER')

# --- ping ---
query_ping = "SELECT * FROM SPEEDTEST_PING WHERE PROVIDER!='iperf';"
result_ping = client_df.query(query_ping)
ping_df = _tidy_speedtest(result_ping['SPEEDTEST_PING'])
# Remove the outlier value that was found in the MS SQL table.
ping_df = ping_df[ping_df.PING != 1800000.000].copy()

barplot_averages(ping_df,'PING',"Maximum, mean and median ping latency per provider", index_col='PROVIDER')

## Summary:
There are 10 Internet Service Providers.  
Most of the devices have one provider; devices 8 and 14 use 2 providers. Perhaps they were moved from one location to another?
Multiple devices use the Bell and Keewaytinook providers.   
Each of the remaining providers is used by only one device.  
   
For upload speed, the CRTC target is 50 Mbps: the average upload speeds of Commstream Communications, MERLIN, Xplornet Communications and Manitoba Hydro International are around or above 50 Mbps.
   
For download speed, the CRTC target is 10 Mbps. For every provider except Bell MTS and TeraGo Networks, the download speed is above 10 Mbps.

The largest ping latencies are for TeraGo Networks, Bell MTS and Cogent Communications.

# Part3: Speedtest statistics  by test server.

In [31]:
# Silences pandas' chained-assignment warning for the rest of the session;
# the column assignment below may otherwise trigger it.
pd.options.mode.chained_assignment = None
# A test server is identified by its name together with its province.
upload_df["server"] = upload_df["TEST_SERVER"] + " " +upload_df["PROVINCE"]

# Distinct servers used by each device and the number of servers per device.
test_servers_per_device = upload_df.groupby('SK_PI').apply(lambda x: x['server'].unique()).apply(pd.Series)
test_servers_per_device['server_count'] = test_servers_per_device.apply(lambda x: x.count(), axis=1)
test_server_counts_per_device = pd.Series(test_servers_per_device['server_count']).value_counts().reset_index()

data = [go.Bar(
    x=test_servers_per_device.index,
    y=test_servers_per_device["server_count"],
)]
layout = go.Layout(title="Number of test servers per device")
fig1 = go.Figure(data=data, layout=layout)
iplot(fig1)

# Number of upload rows for every (device, server) pair; the size column is named 0.
points_by_device_by_server = upload_df.groupby(['SK_PI', 'server']).size().reset_index()
test_servers = upload_df["server"].unique()

# (device, server) -> row count, built once so the loops below do not
# re-filter the frame for every pair.
pair_counts = dict(zip(
    zip(points_by_device_by_server['SK_PI'], points_by_device_by_server['server']),
    points_by_device_by_server[0],
))

data = []
for i, server in enumerate(test_servers):
    # Pairs that never occur contribute a zero-height segment.
    serv = [pair_counts.get((device, server), 0) for device in device_numbers]
    data.append(go.Bar(
        x=device_numbers,
        y=serv,
        name=server,
        # Wrap around the palette if there are more servers than colours.
        marker=dict(color=colors[i % len(colors)]),
    ))

# Built once, outside the loop, so it also exists when there are no servers.
layout = go.Layout(
    barmode='stack',
    title="Number of data points per device per test server"
)
fig2 = go.Figure(data=data, layout=layout)
iplot(fig2)

In [25]:
# Mean / max / median per test server. upload_df is grouped by its existing
# 'server' column; the download and ping frames get the same
# "<TEST_SERVER> <PROVINCE>" column here before being plotted.
barplot_averages(upload_df,'UPLOAD',"Maximum, mean and median upload speed per test server",index_col='server')
download_df["server"] = download_df["TEST_SERVER"] + " " +download_df["PROVINCE"]
barplot_averages(download_df,'DOWNLOAD',"Maximum, mean and median download speed per test server", index_col='server')
ping_df["server"] = ping_df["TEST_SERVER"] + " " +ping_df["PROVINCE"]
barplot_averages(ping_df,'PING',"Maximum, mean and median ping latency per test server", index_col='server')

## Summary:
There are 21 test servers. Devices use 3-9 test servers.   
The servers with the highest upload/download speeds (Axia, Telus, DataHive and Cybera) were actually used by our test device in the Cybera office and can be ignored.  
Of the remaining test servers, only 'Bell Mobility (Winnipeg  MB)', 'VOI Network Solutions (Winnipeg  MB)' and 'SaskTel (Regina  SK)' have average upload speeds above 50 Mbps.
 
For download speed, most of the servers have an average above 10 Mbps (except for 'Morenet (Morden  MB)', 'Swift High Speed.com (Winnipeg  MB)', 'Midco (Minot  ND)', 'Access Communications Co-operative Limited (Regina  SK)' and 'Rogers (Winnipeg  MB)').

# Part4: statistics by device

In [33]:
# Max, mean and median of the non-zero PING values per device (SK_PI tag).
# Despite the `_dec` suffix, this query has no time filter.
query_ping_max_dec = "SELECT MAX(PING), MEAN(PING), MEDIAN(PING) FROM PING WHERE PING!=0  GROUP BY SK_PI;"
result_ping_max_dec = client.query(query_ping_max_dec)

In [34]:
# Collect the statistics rows in device_numbers order. A device without a
# matching row contributes nothing, so the three lists can be shorter than
# device_numbers.
stat_rows = [
    row
    for device in device_numbers
    for row in result_ping_max_dec.get_points(tags={'SK_PI': str(device)})
]
device_max_dec = [row['max'] for row in stat_rows]
device_mean_dec = [row['mean'] for row in stat_rows]
device_median_dec = [row['median'] for row in stat_rows]

In [45]:
#trace1 = go.Bar(
#            x=device_numbers,
#            y=device_mean_dec,
#            name='Mean',
#    )
#trace2 = go.Bar(
#            x=device_numbers,
#            y=device_max_dec,
#            name='Max',
#    
#    )
#trace3 = go.Bar(
#            x=device_numbers,
#            y=device_median_dec,
#            name='Median',
#    
#    )
#data = [trace1, trace2, trace3]
#layout = go.Layout(
#       # barmode='stack',
#        title="Collectd: Maximum, mean and median ping latency per device"
#    )
#
#fig1 = go.Figure(data=data, layout=layout)
#iplot(fig1)
#ping_df1=ping_df.loc[ping_df['PROVIDER']!='iperf']
#barplot_averages(ping_df,'PING',"Sppedtest: Maximum, mean and median ping latency per device")

In [57]:
def _violins_per_device(frame, value_col):
    """Return one violin trace of ``frame[value_col]`` per entry of device_numbers."""
    return [
        go.Violin(
            y=frame.loc[frame['SK_PI'] == device][value_col],
            # Explicit string: device may be a numpy integer, which is not a str/int/float.
            name=str(device),
            # Wrap around the palette so device ids beyond its length cannot raise IndexError.
            marker=dict(color=colors[device % len(colors)]),
        )
        for device in device_numbers
    ]


# Distribution of download speed per device.
data1 = _violins_per_device(download_df, 'DOWNLOAD')
layout1 = go.Layout(title="Download speed per device")
fig1 = go.Figure(data=data1, layout=layout1)
iplot(fig1)

# Distribution of upload speed per device.
data2 = _violins_per_device(upload_df, 'UPLOAD')
layout2 = go.Layout(title="Upload speed per device")
fig = go.Figure(data=data2, layout=layout2)
iplot(fig)

# Distribution of speedtest ping latency per device.
data3 = _violins_per_device(ping_df, 'PING')
layout3 = go.Layout(title="Ping latency per device")
fig = go.Figure(data=data3, layout=layout3)
iplot(fig)