In [47]:
from IPython.display import display, Math, Latex, HTML
# Render a "Show Code" / "Hide Code" button as this cell's output. The embedded
# script hides every `div.input` element once the document is ready and toggles
# their visibility each time the form is submitted. It calls `$`, so it assumes
# the hosting page already provides jQuery.
HTML('''<script>
  function code_toggle() {
    if (code_shown){
      $('div.input').hide('500');
      $('#toggleButton').val('Show Code')
    } else {
      $('div.input').show('500');
      $('#toggleButton').val('Hide Code')
    }
    code_shown = !code_shown
  }
  
  $( document ).ready(function(){
    code_shown=false;
    $('div.input').hide()
  });
</script>
<form action="javascript:code_toggle()"><input type="submit" id="toggleButton" value="Show Code"></form>''') 

In [48]:
from influxdb import DataFrameClient
from influxdb import InfluxDBClient
import json
import pandas as pd

import datetime 
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
import plotly.plotly as py
from plotly import tools
#from datetime import datetime
import dateutil.parser
# Initialise plotly's offline notebook mode before any iplot() call below.
# NOTE(review): connected=True presumably loads plotly.js from a CDN instead of
# embedding it in the notebook — confirm against the plotly documentation.
init_notebook_mode(connected=True)

In [49]:
# Shared palette of 22 hex colours. Later cells index it directly by device
# number (colors[device]) or by a running provider counter (colors[i]), so it
# has to hold more entries than the largest index used.
colors=['#F2F3F4', '#222222', '#F3C300', '#875692', '#F38400', '#A1CAF1', '#BE0032', '#C2B280', 
        '#848482', '#008856', '#E68FAC', '#0067A5', '#F99379', '#604E97', '#F6A600', '#B3446C',
        '#DCD300', '#882D17', '#8DB600', '#654522', '#E25822', '#2B3D26']

In [50]:
def barplot_averages(df,column, title,index_col='SK_PI', margin=False, line='',xtitle="Device Numbers", ytitle="Mbps"):
    """Plot grouped bars of the mean, max and median of ``column`` per group.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing both ``column`` and ``index_col``.
    column : str
        Name of the numeric column to summarise.
    title : str
        Figure title.
    index_col : str, optional
        Column to group by; its values become the x axis.
    margin : bool, optional
        When true, use a larger bottom margin (200) so long x tick labels
        fit under the chart.
    line : plotly trace, optional
        Extra trace drawn on top of the bars. The default ``''`` (or any
        other falsy value) means "no extra trace".
    xtitle, ytitle : str, optional
        Axis titles.

    Returns
    -------
    None
        The figure is displayed with ``iplot`` as a side effect.
    """
    # One aggregation pass yields all three statistics, already aligned on
    # the group key, so no intermediate frames or merges are needed.
    stats = df.groupby(index_col)[column].agg(['mean', 'max', 'median']).reset_index()
    groups = stats[index_col]

    data = [
        go.Bar(x=groups, y=stats['mean'], name='Avg'),
        go.Bar(x=groups, y=stats['max'], name='Max'),
        go.Bar(x=groups, y=stats['median'], name='Median'),
    ]
    if line:
        data.append(line)

    # Both layout variants share everything except the optional margin.
    layout_options = dict(
        title=title,
        xaxis=dict(title=xtitle),
        yaxis=dict(title=ytitle),
    )
    if margin:
        layout_options['margin'] = dict(l=60, r=30, t=50, b=200)

    fig = go.Figure(data=data, layout=go.Layout(**layout_options))
    iplot(fig)

In [51]:
# Read connection settings from a JSON file one directory above the notebook;
# the 'influxdb_host' key is used below when creating the InfluxDB clients.
with open('../credentials.json', 'r') as f_credentials:
    credentials_config = json.load(f_credentials)

In [52]:
# Connection settings for the InfluxDB database that holds the measurements.
host=credentials_config['influxdb_host']
port=8086
dbname = 'net_speed_md'
# NOTE(review): the two empty strings are presumably username and password —
# confirm against the InfluxDBClient constructor signature.
client = InfluxDBClient(host, port, '', '', dbname)

In [53]:
# List every SK_PI tag value present in the PING measurement.
query_unique_devices = "SHOW TAG VALUES FROM PING WITH KEY=SK_PI;"
result_unique_devices = client.query(query_unique_devices)
points_unique_devices = result_unique_devices.get_points()
# The 'value' field of each point is converted to int so that the device ids
# sort numerically rather than lexically.
device_numbers = sorted(int(point['value']) for point in points_unique_devices)
#print(device_numbers)

In [54]:
# Count of non-zero PING values per device (grouped by the SK_PI tag) over the
# whole history stored in the PING measurement.
query_ping_counts = 'SELECT COUNT(PING) FROM PING WHERE PING!=0 GROUP BY SK_PI;'
result_ping_counts = client.query(query_ping_counts)
# Same count restricted to the last four weeks (time >= now()-4w).
query_ping_counts_dec = "SELECT COUNT(PING) FROM PING WHERE time >= now()-4w  AND PING!=0 GROUP BY SK_PI ;"
result_ping_counts_dec = client.query(query_ping_counts_dec)

In [55]:
# Count of non-zero PING_DROPRATE values per device (grouped by the SK_PI tag)
# over the whole history stored in the PING measurement.
query_pingdroprate_counts = 'SELECT COUNT(PING_DROPRATE) FROM PING WHERE PING_DROPRATE!=0 GROUP BY SK_PI;'
result_pingdroprate_counts = client.query(query_pingdroprate_counts)
# Same count restricted to the last four weeks (time >= now()-4w).
query_pingdroprate_counts_dec = 'SELECT COUNT(PING_DROPRATE) FROM PING WHERE time >= now()-4w AND PING_DROPRATE!=0 GROUP BY SK_PI;'
result_pingdroprate_counts_dec = client.query(query_pingdroprate_counts_dec)

In [56]:
# All-time datapoint counts, one entry per device in device_numbers order.
# For each query result the count of the last returned point is used, or 0
# when the device has no point in that result.
ping_counts = []
pingdroprate_counts = []
for device in device_numbers:
    device_tag = str(device)
    ping_values = [
        point['count']
        for point in result_ping_counts.get_points(tags={'SK_PI': device_tag})
    ]
    droprate_values = [
        point['count']
        for point in result_pingdroprate_counts.get_points(tags={'SK_PI': device_tag})
    ]
    count_ping = ping_values[-1] if ping_values else 0
    count_pingdroprate = droprate_values[-1] if droprate_values else 0
    # ping_counts holds the sum of both counts; the drop-rate count is also
    # kept on its own.
    ping_counts.append(count_ping + count_pingdroprate)
    pingdroprate_counts.append(count_pingdroprate)

In [57]:
# Datapoint counts for the last four weeks, one entry per device in
# device_numbers order. device_numbers_dec only collects the devices whose
# combined count is non-zero.
ping_counts_dec = []
device_numbers_dec = []
pingdroprate_counts_dec = []
ping_sent_counts_dec = []
for device in device_numbers:
    device_tag = str(device)
    recent_ping_values = [
        point['count']
        for point in result_ping_counts_dec.get_points(tags={'SK_PI': device_tag})
    ]
    recent_droprate_values = [
        point['count']
        for point in result_pingdroprate_counts_dec.get_points(tags={'SK_PI': device_tag})
    ]
    # Use the last returned count, falling back to 0 when nothing came back.
    point_ping = recent_ping_values[-1] if recent_ping_values else 0
    point_pingdroprate = recent_droprate_values[-1] if recent_droprate_values else 0
    combined = point_pingdroprate + point_ping
    if combined != 0:
        device_numbers_dec.append(device)
    ping_counts_dec.append(combined)
    pingdroprate_counts_dec.append(point_pingdroprate)
    ping_sent_counts_dec.append(point_ping)
#print("Devices, that have data in last 4 weeks: ",device_numbers_dec)

In [58]:
# Second client against the same host/port/database; its query result is
# indexed by measurement name below to obtain a pandas DataFrame.
client_df = DataFrameClient(host, port, '', '', dbname)
# All speedtest upload rows except those whose PROVIDER is 'iperf'.
query_upload = "SELECT * FROM SPEEDTEST_UPLOAD WHERE PROVIDER!='iperf';"
result_upload= client_df.query(query_upload)
upload_df = result_upload['SPEEDTEST_UPLOAD']

In [59]:
# Build the cleaned upload frame from the raw query result rather than
# mutating `upload_df` in place, so this cell can be re-run safely and the
# frame stored in `result_upload` is left untouched.
upload_df = (
    result_upload['SPEEDTEST_UPLOAD']
    .reset_index(level=0)                 # timestamp index -> 'index' column
    .rename(columns={'index': 'time'})
)
# Round-trip through a second-resolution string: this drops sub-second
# precision and yields plain (timezone-naive) timestamps.
upload_df['time'] = pd.to_datetime(upload_df['time'].dt.strftime('%Y-%m-%d %H:%M:%S'))
upload_df['SK_PI'] = pd.to_numeric(upload_df['SK_PI'])
# Rows with an UPLOAD value of 0 are excluded from every later statistic.
upload_df = upload_df[upload_df.UPLOAD != 0]

In [60]:
# Number of speedtest upload samples per device over the whole history.
points_by_device=upload_df.groupby(['SK_PI']).size().reset_index(name='counts').sort_values('SK_PI')

# Cut-off at midnight 28 days ago. It is wrapped in pd.Timestamp because
# current pandas refuses to order-compare a datetime64 column with a plain
# datetime.date object.
four_weeks_ago = pd.Timestamp(datetime.date.today() - datetime.timedelta(days=28))
upload_df_last4weeks = upload_df[upload_df['time']>four_weeks_ago]
points_by_device_las4weeks = upload_df_last4weeks.groupby(['SK_PI']).size().reset_index(name='counts').sort_values('SK_PI')

# After the merge, counts_x is the all-time count and counts_y the count for
# the last four weeks; devices without recent samples get 0 instead of NaN.
merged_results=pd.merge(points_by_device, points_by_device_las4weeks, on='SK_PI', how='outer')
merged_results['counts_y']=merged_results['counts_y'].fillna(0)
# 'result' = samples older than four weeks.
merged_results['result']=merged_results['counts_x'].sub(merged_results['counts_y'], axis=0)

# 1. Statistics by device

In [61]:
# All speedtest download rows except those whose PROVIDER is 'iperf'.
query_download = "SELECT * FROM SPEEDTEST_DOWNLOAD WHERE PROVIDER!='iperf';"
result_download= client_df.query(query_download)

# Derive the cleaned frame without in-place operations so the frame stored in
# `result_download` is left untouched.
download_df = (
    result_download['SPEEDTEST_DOWNLOAD']
    .reset_index(level=0)                 # timestamp index -> 'index' column
    .rename(columns={'index': 'time'})
)
# Round-trip through a second-resolution string: this drops sub-second
# precision and yields plain (timezone-naive) timestamps.
download_df['time'] = pd.to_datetime(download_df['time'].dt.strftime('%Y-%m-%d %H:%M:%S'))
download_df['SK_PI'] = pd.to_numeric(download_df['SK_PI'])

# Rows with a DOWNLOAD value of 0 are excluded from every later statistic.
download_df = download_df[download_df.DOWNLOAD != 0]

In [62]:
# All speedtest ping rows except those whose PROVIDER is 'iperf'.
query_ping = "SELECT * FROM SPEEDTEST_PING WHERE PROVIDER!='iperf';"
result_ping= client_df.query(query_ping)

# Derive the cleaned frame without in-place operations so the frame stored in
# `result_ping` is left untouched.
ping_df = (
    result_ping['SPEEDTEST_PING']
    .reset_index(level=0)                 # timestamp index -> 'index' column
    .rename(columns={'index': 'time'})
)
# Round-trip through a second-resolution string: this drops sub-second
# precision and yields plain (timezone-naive) timestamps.
ping_df['time'] = pd.to_datetime(ping_df['time'].dt.strftime('%Y-%m-%d %H:%M:%S'))
ping_df['SK_PI'] = pd.to_numeric(ping_df['SK_PI'])

# Known outlier value that was found in the MS SQL table; rows carrying
# exactly this PING value are dropped.
PING_OUTLIER_VALUE = 1800000.000
ping_df = ping_df[ping_df.PING != PING_OUTLIER_VALUE]

In [63]:
# Max, mean and median of the non-zero PING values per device (grouped by the
# SK_PI tag). No time filter is applied, so this covers the whole history
# despite the "_dec" suffix in the variable names.
query_ping_max_dec = "SELECT MAX(PING), MEAN(PING), MEDIAN(PING) FROM PING WHERE PING!=0  GROUP BY SK_PI;"
result_ping_max_dec = client.query(query_ping_max_dec)

In [64]:
# Per-device max / mean / median ping, one entry per device in device_numbers
# order. These lists are plotted against device_numbers, so a device without
# any matching point gets None placeholders instead of being skipped;
# skipping would shift every later value onto the wrong device.
device_max_dec=[]
device_mean_dec = []
device_median_dec = []
for device in device_numbers:
    points_max_dec=result_ping_max_dec.get_points(tags={'SK_PI':str(device)})
    # At most one point is taken per device; an empty dict stands in when
    # the query returned nothing for it.
    point = next(iter(points_max_dec), {})
    device_median_dec.append(point.get('median'))
    device_max_dec.append(point.get('max'))
    device_mean_dec.append(point.get('mean'))

In [65]:
# One bar trace per statistic, all sharing the device numbers as x values.
trace1, trace2, trace3 = [
    go.Bar(x=device_numbers, y=values, name=label)
    for label, values in (
        ('Mean', device_mean_dec),
        ('Max', device_max_dec),
        ('Median', device_median_dec),
    )
]
data = [trace1, trace2, trace3]
# A stacked layout (barmode='stack') was tried and is currently disabled.
layout = go.Layout(title="Collectd: Maximum, mean and median ping latency per device")

# The figure itself is currently not rendered:
#fig1 = go.Figure(data=data, layout=layout)
#iplot(fig1)

In [81]:
# Red reference markers at 10 (upload) and 50 (download), one marker per
# device. The y list is sized from device_numbers so x and y always have the
# same length, whatever the number of devices.
upload_line=go.Scatter(x=device_numbers,y=[10] * len(device_numbers), mode='markers',marker=dict(color='red'), name='10Mps')
download_line=go.Scatter(x=device_numbers,y=[50] * len(device_numbers), mode='markers',marker=dict(color='red'), name='50Mps')
barplot_averages(download_df,'DOWNLOAD',"Download speed", line=download_line)
barplot_averages(upload_df,'UPLOAD',"Upload speed",line=upload_line)

In [67]:
# Mean / max / median of the speedtest PING column per device (default
# grouping by SK_PI), with the y axis relabelled for latency.
barplot_averages(ping_df,'PING',"Ping latency",ytitle="Miliseconds")

### Brief note on box plots

Boxplots are an excellent way to summarize statistical data because they readily display how the data is distributed, giving you a better idea of the range and, more importantly, the distinguishability of your data. If you have never seen one before, here are a few important things to note about boxplots and what their elements mean. To begin, here is a labeled boxplot:

![alt-text](https://cdn-images-1.medium.com/max/800/1*2c21SkzJMf3frPXPAR_gZA.png)

There are a few important pieces to note here:
1. The 'box' itself spans from the first quartile to the third quartile, with a line inside it marking the median. Its height equals the interquartile range (IQR), which means the box contains the middle 50% of the data.
2. The lines that extend out of the box are known as "whiskers". Each one reaches from the edge of the box to the most extreme data point that lies within 1.5 times the IQR of that edge.
3. Finally, any points outside of the whiskers are drawn individually as outliers.

In [82]:
def _device_boxplot(df, column, title, ytitle, target=None):
    """Build a figure with one box trace of ``df[column]`` per device.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an 'SK_PI' column and the column to plot.
    column : str
        Name of the column whose distribution is drawn.
    title, ytitle : str
        Figure title and y-axis title.
    target : int, optional
        When given, a red marker is added at this y value for every device
        and labelled '<target>Mps'.

    Returns
    -------
    plotly Figure
        The assembled figure; displaying it is left to the caller.
    """
    traces = [
        go.Box(
            y=df.loc[df['SK_PI'] == device][column],
            name=device,
            # Wrap around the palette so a device number beyond its length
            # cannot raise an IndexError.
            marker=dict(color=colors[device % len(colors)]),
        )
        for device in device_numbers
    ]
    if target is not None:
        traces.append(go.Scatter(
            x=device_numbers,
            # One marker per device: keeps x and y the same length.
            y=[target] * len(device_numbers),
            mode='markers',
            marker=dict(color='red'),
            name='{}Mps'.format(target),
        ))
    layout = go.Layout(
        title=title,
        xaxis=dict(title="Device Number"),
        yaxis=dict(title=ytitle),
    )
    return go.Figure(data=traces, layout=layout)


fig1 = _device_boxplot(download_df, 'DOWNLOAD', "Download speed", "Mbps", target=50)
iplot(fig1)

fig = _device_boxplot(upload_df, 'UPLOAD', "Upload speed", "Mbps", target=10)
iplot(fig)

fig = _device_boxplot(ping_df, 'PING', "Ping latency", "Miliseconds")
iplot(fig)

##  Summary
For download speed, only devices 8 and 14 go above 50 Mbps.  
For upload speed, devices 8, 9, 11, 14, 15 and 17 go above 10 Mbps.  
Device 14 shows high variability in the data for upload/download speeds.  
Devices 5 and 16 show many outliers for ping latency.  

# 2. Statistics by Internet Service Provider

In [79]:
# Distinct providers seen per device, plus how many devices have 1, 2, ...
# providers. These frames only feed the disabled pie chart below.
providers_per_device = upload_df.groupby('SK_PI').apply(lambda x: x["PROVIDER"].unique()).apply(pd.Series)
providers_per_device['provider_count'] = providers_per_device.apply(lambda x: x.count(), axis=1)
#providers_per_device.head()
provider_counts_per_device = pd.Series(providers_per_device['provider_count']).value_counts().reset_index()
#provider_counts_per_device
#data = [go.Pie(
#            labels=provider_counts_per_device['index'],
#            values=provider_counts_per_device["provider_count"],
#    )]
#layout = go.Layout(
#        #barmode='stack',
#        title="Number of providers per device"
#    )

#fig1 = go.Figure(data=data, layout=layout)
#iplot(fig1)

# Sample counts per (device, provider) pair; the unnamed size column is 0.
points_by_device_by_provider=upload_df.groupby(['SK_PI', 'PROVIDER']).size().reset_index()
providers=upload_df['PROVIDER'].unique()

# The layout does not depend on the provider, so it is built once up front.
# This also keeps it defined when there are no providers at all.
layout = go.Layout(
    barmode='stack',
    title="Percentage of service providers by device",
    xaxis=dict(title="Device Number"),
    yaxis=dict(title="Percentage of data"),
)

data=[]
for i, provider in enumerate(providers):
    # Share (in percent) of each device's upload samples that came through
    # this provider; 0 when the device never used it.
    prov=[]
    for device in device_numbers:
        by_provider=points_by_device_by_provider.loc[(points_by_device_by_provider['SK_PI']==device)&(points_by_device_by_provider['PROVIDER']==provider)]
        if not by_provider.empty:
            device_total=points_by_device.loc[points_by_device['SK_PI']==device]['counts'].iloc[0]
            prov.append(by_provider[0].iloc[0]/device_total*100)
        else:
            prov.append(0)
    # Wrap around the palette so more providers than colours cannot raise.
    trace = go.Bar(x=device_numbers,y=prov, name = provider, marker=dict(color=colors[i % len(colors)]))
    data.append(trace)
fig2 = go.Figure(data=data, layout=layout)
iplot(fig2)

In [83]:
# Red reference markers at 10 (upload) and 50 (download), one marker per
# provider. The y list is sized from `providers` so it matches the x values;
# a fixed length of 18 does not line up with the provider list.
upload_line=go.Scatter(x=providers,y=[10] * len(providers), mode='markers',marker=dict(color='red'), name='10Mps')
download_line=go.Scatter(x=providers,y=[50] * len(providers), mode='markers',marker=dict(color='red'), name='50Mps')
barplot_averages(upload_df,'UPLOAD',"Upload speed",index_col='PROVIDER',line=upload_line, xtitle="Provider",margin=True)
barplot_averages(download_df,'DOWNLOAD',"Download speed", index_col='PROVIDER',line=download_line,xtitle="Provider",margin=True)
barplot_averages(ping_df,'PING',"Ping Latency", index_col='PROVIDER',xtitle="Provider",ytitle="Miliseconds",margin=True)

In [77]:
# One box trace of speedtest ping values per provider. The palette index
# starts at 1, so colors[0] is never used in this chart.
data3 = []
i = 0
for i, provider in enumerate(providers, start=1):
    provider_pings = ping_df.loc[ping_df['PROVIDER'] == provider]['PING']
    data3.append(
        go.Box(y=provider_pings, name=provider, marker={'color': colors[i]})
    )

# Large bottom margin leaves room for the provider names under the x axis.
layout3 = go.Layout(
    title="Ping latency",
    xaxis={'title': "Provider"},
    yaxis={'title': "Miliseconds"},
    margin={'l': 60, 'r': 30, 't': 50, 'b': 200},
)

fig = go.Figure(data=data3, layout=layout3)
iplot(fig)

## Summary:
There are 10 Internet Service Providers.  
Most of the devices have one provider; devices 8 and 14 use 2 providers (possibly because they were moved from one location to another?).  
Multiple devices use the Bell and Keewaytinook providers.   
Each of the remaining providers is used by only one device.  
   
For upload speed the CRTC target is 10 Mbps: every provider except Bell MTS, High Speed Crow and TeraGo Networks has an upload speed above 10 Mbps.
   
For download speed the CRTC target is 50 Mbps. Only Commstream Communications, MERLIN and Manitoba Hydro International reached this number.

The largest ping latencies are seen with Bell MTS, TeraGo Networks and Keewaytinook Okimakanak.

# 3. Number of data points, device reporting times.

In [72]:
# collectd: datapoints from the last four weeks vs. everything older.
collectd_older_counts = [total - recent for total, recent in zip(ping_counts, ping_counts_dec)]
trace1 = go.Bar(x=device_numbers, y=ping_counts_dec,
                name='Last 4 weeks', marker={'color': colors[1]})
trace2 = go.Bar(x=device_numbers, y=collectd_older_counts,
                name='The rest of the time', marker={'color': colors[2]})

# speedtest: the same split, taken from the merged upload counts.
trace3 = go.Bar(x=merged_results["SK_PI"], y=merged_results["counts_y"],
                name='Last 4 weeks', marker={'color': colors[1]})
trace4 = go.Bar(x=merged_results["SK_PI"], y=merged_results['result'],
                name='The rest of the time', marker={'color': colors[2]})

# A side-by-side subplot version was tried and is kept here, disabled:
#fig = tools.make_subplots(rows=1, cols=2)#, shared_xaxes=True)
#fig.append_trace(trace1, 1,1)
#fig.append_trace(trace2, 1,1)
#fig.append_trace(trace3, 1,2)
#fig.append_trace(trace4, 1,2)
#fig['layout'].update(barmode='stack',title="Number of datapoints: collectd data on the right, speedtest data on the left")#, width=1000)

# Each source gets its own stacked bar chart.
data1 = [trace1, trace2]
layout1 = go.Layout(barmode='stack', title="Number of datapoints :collectd")
fig1 = go.Figure(data=data1, layout=layout1)
iplot(fig1)

data2 = [trace3, trace4]
layout2 = go.Layout(barmode='stack', title="Number of datapoints :speedtest")
fig2 = go.Figure(data=data2, layout=layout2)
iplot(fig2)

In [73]:
# Most recent PING point per device (grouped by the SK_PI tag); its 'time'
# field is read later as the end of that device's collectd reporting window.
query_ping_last = "SELECT LAST(PING), time FROM PING GROUP BY SK_PI;"
result_ping_last = client.query(query_ping_last)

In [74]:
# Earliest PING point per device (grouped by the SK_PI tag); its 'time' field
# is read later as the start of that device's collectd reporting window.
query_ping_first = "SELECT FIRST(PING), time FROM PING  GROUP BY SK_PI;"
result_ping_first = client.query(query_ping_first)

In [75]:
# --- collectd: first and last PING timestamp per device ---------------------
data=[]
for device in device_numbers:
    points_ping_last=result_ping_last.get_points(tags={'SK_PI':str(device)})
    points_ping_first=result_ping_first.get_points(tags={'SK_PI':str(device)})
    # None (not 0) marks a missing timestamp, so a device without data is not
    # plotted at a bogus position on the time axis.
    first=None
    last=None
    for point in points_ping_first:
        first=dateutil.parser.parse(point['time']).strftime('%Y-%m-%d %H:%M:%S')
    for point in points_ping_last:
        last=dateutil.parser.parse(point['time']).strftime('%Y-%m-%d %H:%M:%S')
   # print("Device: ", device,"  was reporting from ", first, " to ", last)
    # One horizontal segment per device, drawn at y == device number. The
    # palette index wraps around so a large device number cannot raise.
    trace = go.Scatter(x=[first,last],y=[device,device], name = str(device),marker=dict(color=colors[int(device) % len(colors)]))
    data.append(trace)
layout = dict(title = "Device reporting times - collectd")
fig1 = go.Figure(data=data, layout=layout)
iplot(fig1)

# --- speedtest: first and last upload timestamp per device ------------------
start_times=upload_df.groupby('SK_PI')['time'].min()
end_times=upload_df.groupby('SK_PI')['time'].max()
# Kept under its own name: overwriting the global `device_numbers` here would
# silently change what every other cell plots if it is run afterwards.
speedtest_device_numbers=sorted(upload_df['SK_PI'].unique())
#print(speedtest_device_numbers)
data=[]
for device in speedtest_device_numbers:
    time_last=end_times[device]
    time_first=start_times[device]
    #print("Device: ", device,"  was reporting from ", time_first, " to ", time_last)
    trace = go.Scatter(x=[time_first,time_last],y=[device,device], name = str(device),marker=dict(color=colors[int(device) % len(colors)]))
    data.append(trace)
layout = dict(title = "Device reporting times   - speedtest")
fig2 = go.Figure(data=data, layout=layout)
iplot(fig2)

## Summary: 
There are 18 devices set up.     
Devices 1, 2, 4, 6 and 8 started reporting and then stopped.  
Device 13 has not sent any speedtest data (it sent several collectd data points).  
Devices 10, 13 and 14 do not send collectd data but are sending speedtest data.    
A more detailed timeline can be found in [grafana](https://grafana-connectin.cybera.ca).  