In [None]:
from IPython.display import display, Math, Latex, HTML
# Render a "Show Code"/"Hide Code" button. The embedded script hides every
# `div.input` element once the document is ready and toggles their visibility
# each time the form is submitted.
HTML('''<script>
  function code_toggle() {
    if (code_shown){
      $('div.input').hide('500');
      $('#toggleButton').val('Show Code')
    } else {
      $('div.input').show('500');
      $('#toggleButton').val('Hide Code')
    }
    code_shown = !code_shown
  }
  
  $( document ).ready(function(){
    code_shown=false;
    $('div.input').hide()
  });
</script>
<form action="javascript:code_toggle()"><input type="submit" id="toggleButton" value="Show Code"></form>''') 

# ConnectIN draft analysis, part 2 - Jan 24, 2019

This document contains observations for average download/upload speeds and ping latency by hour and day of the week.

The analysis is configured to run for the 4 weeks before the current date. In this snapshot, this time period includes the Christmas / New Year breaks, as well as the first weeks of January. Given that the devices investigated here are deployed with organizations that would close over the holidays, this might affect the analysis and outcomes here. 

Data was collected from a total of 18 devices, most of which are located in Manitoba. Some devices stopped reporting and are not displayed on the plots. Device number 3, which is a control device set up in Cybera's offices in Calgary, was excluded from the analysis.

Note that the timezone used during data collection is UTC, which was converted to 'America/Winnipeg' to represent correct local time.

NB: This is a draft and results need to be further confirmed/verified.

In [None]:
#Load libraries:
from data_exploration import *

In [None]:
# Default: the queries below are restricted to the configured time window
# instead of selecting every row (see the `if not all_data` branches).
all_data=False

In [None]:
#Collect all data? Uncomment the next line to skip the time-window filter.
#all_data=True

#Set up test time interval (subtracted from starting_point in the queries below):
time_interval='4w' #2w

#Set up starting point; by default it will start from the current time
#starting_point=datetime.now().strftime('%Y-%m-%d %H:%M:%S')
starting_point="2019-01-24 14:00:00"  # to set up an alternative starting point

In [None]:
# Report which slice of the database the queries below will cover.
if all_data:
    print("Selecting all data from database")
else:
    print("Time interval: ", time_interval)
    print("Starting point:",starting_point )

In [None]:
#Set up influxdb connection:
# Two handles are returned: `client` is later passed as `client_influx`,
# `client_df` is later passed to get_dataframe_from_influxdb.
# NOTE(review): connect_to_influxdb presumably comes from the data_exploration star-import - confirm.
client, client_df = connect_to_influxdb()

## Download speed

In [None]:
# Build the download query. PROVIDER!='iperf' excludes iperf rows and
# DOWNLOAD>0 excludes non-positive readings.
if not all_data:
    # Bound the window on both sides: [starting_point - time_interval, starting_point].
    # With only the lower bound, re-running the notebook later would also pull in
    # measurements recorded after starting_point.
    query_download = ("SELECT * FROM SPEEDTEST_DOWNLOAD WHERE PROVIDER!='iperf'"
                      " AND time >= '"+starting_point+"'-"+time_interval+
                      " AND time <= '"+starting_point+"' AND DOWNLOAD>0;")
else:
    query_download = "SELECT * FROM SPEEDTEST_DOWNLOAD WHERE PROVIDER!='iperf' AND DOWNLOAD>0;"
download_df = get_dataframe_from_influxdb(client_df=client_df,query_influx=query_download,table_name='SPEEDTEST_DOWNLOAD')
# Exclude device 3. .copy() gives an independent frame, so the column
# assignments here and in later cells do not write to a slice of the query result.
download_df=download_df[download_df["SK_PI"]!=3].copy()
# Timestamps are treated as UTC and converted to Manitoba local time.
download_df['time']= download_df['time'].dt.tz_localize('UTC').dt.tz_convert('America/Winnipeg')

In [None]:
# Sorted list of the device ids (as plain ints) present in the download data.
device_numbers_d = sorted(int(sk_pi) for sk_pi in download_df['SK_PI'].unique())
#device_numbers

In [None]:
# Hour of day taken from the `time` column; used as the x axis / grouping key
# of the by-hour plots below.
download_df["hour"]=pd.to_numeric(download_df["time"].dt.hour)

In [None]:
# Summary statistics of DOWNLOAD: grouped by "hour" (by_hour_by_device_d),
# with index_col='hour' (download_summary_by_hour), and with default arguments
# (download_summary).
# NOTE(review): both helpers come from the data_exploration star-import; presumably
# value1..value4 feed the mean/max/median/min columns - confirm there.
by_hour_by_device_d = mean_max_median_min_by2(input_dataframe=download_df,value1="DOWNLOAD", value2="DOWNLOAD",
                                              value3="DOWNLOAD",value4="DOWNLOAD",group_by_value="hour", rename_columns=True)
download_summary_by_hour=mean_max_median_min_by1(download_df,'DOWNLOAD',index_col='hour')
download_summary=mean_max_median_min_by1(download_df,'DOWNLOAD')

In [None]:
# NOTE(review): this repeats, with identical arguments, the by_hour_by_device_d
# computation already performed in the previous cell.
by_hour_by_device_d = mean_max_median_min_by2(input_dataframe=download_df,value1="DOWNLOAD", value2="DOWNLOAD",
                                              value3="DOWNLOAD",value4="DOWNLOAD",group_by_value="hour", rename_columns=True)

In [None]:
# One scatter trace per device: hour of day (x) against that device's
# standardized download speed (y).
traces=[]
for device in device_numbers_d:
    subset=download_df[download_df["SK_PI"]==device]
    # z-score within the device, so devices with very different absolute
    # speeds can share one axis
    standardized=(subset['DOWNLOAD']-subset['DOWNLOAD'].mean())/subset['DOWNLOAD'].std()
    trace = go.Scatter(
        x = subset['hour'],
        y = standardized,
        mode = 'markers',
        marker = dict(color=colors[device]),
        name = device
    )
    traces.append(trace)
layout = go.Layout(
        title="Average download speed by hour over the last "+time_interval+ " starting from "+ starting_point,
        xaxis=dict(title="Hour of the day"),
        # the plotted values are z-scores (dimensionless), not Mbps
        yaxis=dict(title="Standardized download speed (z-score per device)")
        )
data = traces
fig = go.Figure(data=data, layout=layout)
iplot(fig)

On this plot we see download speed by hour (x axis) for all the devices. Each device's measurements were standardized (centred on that device's mean and divided by its standard deviation) so that data from all of our devices can be shown together and trends can be spotted. We can see that during the night/early morning download speeds seem a little higher and go down during working hours. However, it should be noted that this has not been quantified yet or tested for statistical significance.

In [None]:
# Box plot of download speed per device (sort_value='SK_PI'). The title now
# says "by device" to match the grouping; it previously said "by hour".
if all_data:
    t="Download speed"
else:
    t="Download speed by device over the last "+time_interval+ " starting from "+ starting_point
simple_boxplot(dataframe=download_df,plot_value='DOWNLOAD',sort_value='SK_PI',
               title=t, 
               ytitle="Download speed (Mbps)",
               xtitle="Device number", downloadline=True)

[This boxplot](https://towardsdatascience.com/understanding-boxplots-5e2df7bcbd51) shows that for most of the devices download speeds are below the CRTC target of 50 Mbps.  
Let's calculate the statistics - the percentage of datapoints below 50 Mbps for every device.

In [None]:
# Flag every measurement: 1 when DOWNLOAD is under 50, otherwise 0.
download_df["below50"]=(download_df["DOWNLOAD"]<50).astype("int64")
#download_df.head()

In [None]:
# Per-device download summary: measurement count, count below 50, mean,
# standard deviation, and the percentage of measurements below 50.
per_device_download=download_df.groupby("SK_PI")

summary_download=(per_device_download['below50'].count()
                  .reset_index()
                  .rename(columns={'below50':'download_count'}))

mean_download=(per_device_download['DOWNLOAD'].mean().round(2)
               .reset_index()
               .rename(columns={'DOWNLOAD':'download_mean'}))

std_download=(per_device_download['DOWNLOAD'].std().round(2)
              .reset_index()
              .rename(columns={'DOWNLOAD':'download_std'}))

subset_below50=download_df[download_df['below50']==1]
summary_below50=(subset_below50.groupby("SK_PI")['below50'].count()
                 .reset_index()
                 .rename(columns={'below50':'download_below50'}))

# Outer merges keep devices that have no measurement below 50; their missing
# count becomes 0 in the fillna step.
for extra_columns in (summary_below50, mean_download, std_download):
    summary_download=summary_download.merge(extra_columns, how='outer', on='SK_PI')

summary_download=summary_download.fillna(0)
summary_download['Download speeds less than 50  Mbps  (%)']=(
    summary_download['download_below50']/summary_download['download_count']*100
).round()
summary_download=summary_download.sort_values(by='SK_PI')
summary_download["device number"]=summary_download["SK_PI"]
summary_download=summary_download.reset_index().set_index("device number")
summary_download[["download_mean","download_std","Download speeds less than 50  Mbps  (%)"]]

Looks like device #8 is the only device that has 100% of data above 50Mbps for download speed. Devices 5, 7, 9, 10, 11, 12, 15, 16, 17, 18 are clearly in underserved areas - 99%-100% of datapoints below CRTC target.
For device #14 the situation is not clear - 32% of data is below 50Mbps and 68% above. 

We will take a more detailed look at device #14.


In [None]:
import plotly.figure_factory as ff

# Distribution plot (bin size 10) of the download speeds of one device.
device_number=14
subset=download_df[download_df["SK_PI"]==device_number]
group_labels = ['device '+str(device_number)+ ' download speed']
hist_data = [subset['DOWNLOAD']]
fig = ff.create_distplot(hist_data, group_labels, bin_size=10)
fig['layout']['xaxis'].update(title='Download speed (Mbps)')
iplot(fig)

This plot shows the distribution of data for device #14. We can see two distinct clusters - low speeds and high speeds. The data is concentrated around 30 Mbps and around 160 Mbps, and there is nothing in between. The panels below show the raw speed data for device 14 and the other devices using a tool called Grafana (a visualization tool that we have set up to display time series data).

![](images/grafana_download14.png)

The panel above definitely shows the New Year break (relatively high speeds); speeds go down before and after the break, with lots of fluctuation.

![](images/grafana_download14_1.png)

This panel shows how device #14 (green) is different from the rest of the devices. The fluctuations between low and high speeds are quite noticeable, whereas the other lines are much more stable and do not display the same large fluctuations in speed.

Let's try analyzing the same device by hour of the day.

In [None]:
# Box plot of download speed per hour of day for the device held in `device_number`.
subset=download_df[download_df["SK_PI"]==device_number]
hourly_download_title=("Download speed by hour for the device "+str(device_number)+
                       " over the "+time_interval+" starting from "+starting_point)
simple_boxplot(
    dataframe=subset,
    plot_value='DOWNLOAD',
    sort_value='hour',
    title=hourly_download_title,
    ytitle="Download speed (Mbps)",
    xtitle="Hour of the day",
    downloadline=True,
)

Some hours are relatively stable (4am, 7am, 10am, 3pm, 5pm, 7pm) and the rest of the hours show a lot of variability. We can't spot any trend here.   
Let's check by day of the week.

In [None]:
# English day name of each measurement. dt.day_name() replaces the
# dt.weekday_name attribute, which no longer exists in current pandas.
download_df["weekday"]=download_df["time"].dt.day_name()
# Categorical with an explicit Monday..Sunday category list (instead of alphabetical).
download_df["weekday"] = pd.Categorical(download_df["weekday"], ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"])

In [None]:
# Box plot (with jittered points) of download speed per weekday for the device in `device_number`.
subset=download_df[download_df["SK_PI"]==device_number]
weekday_download_title=("Download speed by day of the week for the device "+str(device_number)+
                        " over the "+time_interval+" starting from "+starting_point)
simple_boxplot(
    dataframe=subset,
    plot_value='DOWNLOAD',
    sort_value='weekday',
    title=weekday_download_title,
    ytitle="Mbps",
    downloadline=True,
    weekdays=True,
    jitter=True,
)

On the jitter plots (dots) to the left of every boxplot we can see distinct clusters of high and low speeds for every day of the week. At the same time, looking at the medians, we can see that Monday, Tuesday and Friday have the worst/lowest speeds compared to Wednesday, Thursday and Saturday. Sunday has the best results.

Another thing we can check is test servers. Let's look at this plot:

![](images/plotly_testservers_14.png)

For device #14, the majority of datapoints  are coming from 2 test servers: Westman Communication (Brandon, MB) and Morenet (Morden, MB).   
Let's check in Grafana the datapoints coming from different test servers:

![](images/grafana_download14_3.png)

We can clearly see that the download speeds measured using the Morden test server form the high-speed cluster and the data measured using the Brandon test server forms the low-speed cluster.

We know that device #14 is located in Dakota Plains, MB. Geographically, these two test servers are not far from Dakota Plains. Why are the speeds so noticeably different? How different is the packet route to reach these servers?

In [None]:
#devices_download_below50=summary_download[summary_download['download_below50_percent']>50]['SK_PI'].unique()
#hist_data=[]
#group_labels=[]
#colors_d=[]
#for device_number in devices_download_below50:
#    hist_data.append(download_df[download_df["SK_PI"]==device_number]["DOWNLOAD"])
#    group_labels.append(device_number)
#    colors_d.append(colors[device_number])

#fig = ff.create_distplot(hist_data, group_labels, colors=colors_d,
#                         bin_size=1, show_rug=False)
#
#fig['layout'].update(title='Download speed: distribution for all devices')
#iplot(fig)

## Upload speed

In [None]:
# Build the upload query. PROVIDER!='iperf' excludes iperf rows and
# UPLOAD>0 excludes non-positive readings.
if not all_data:
    # Bound the window on both sides: [starting_point - time_interval, starting_point].
    # With only the lower bound, re-running the notebook later would also pull in
    # measurements recorded after starting_point.
    query_upload = ("SELECT * FROM SPEEDTEST_UPLOAD WHERE PROVIDER!='iperf'"
                    " AND time >= '"+starting_point+"'-"+time_interval+
                    " AND time <= '"+starting_point+"' AND UPLOAD>0;")
else:
    query_upload = "SELECT * FROM SPEEDTEST_UPLOAD WHERE PROVIDER!='iperf' AND UPLOAD>0;"
upload_df = get_dataframe_from_influxdb(client_df=client_df,query_influx=query_upload,table_name='SPEEDTEST_UPLOAD')
# Exclude device 3. .copy() gives an independent frame, so the column
# assignments here and in later cells do not write to a slice of the query result.
upload_df=upload_df[upload_df["SK_PI"]!=3].copy()
# Timestamps are treated as UTC and converted to Manitoba local time.
upload_df['time']= upload_df['time'].dt.tz_localize('UTC').dt.tz_convert('America/Winnipeg')

In [None]:
# Sorted list of the device ids (as plain ints) present in the upload data.
device_numbers_u = sorted(int(sk_pi) for sk_pi in upload_df['SK_PI'].unique())
#device_numbers_u

In [None]:
# Hour of day taken from the `time` column, then UPLOAD statistics grouped by "hour".
# NOTE(review): mean_max_median_min_by2 comes from the data_exploration star-import;
# presumably value1..value4 feed the mean/max/median/min columns - confirm there.
upload_df["hour"]=pd.to_numeric(upload_df["time"].dt.hour)
by_hour_by_device_u = mean_max_median_min_by2(input_dataframe=upload_df,value1="UPLOAD", value2="UPLOAD",
                                              value3="UPLOAD",value4="UPLOAD",group_by_value="hour", rename_columns=True)

In [None]:
# Same call, with the same arguments, as in the previous cell.
by_hour_by_device_u = mean_max_median_min_by2(input_dataframe=upload_df,value1="UPLOAD", value2="UPLOAD",
                                              value3="UPLOAD",value4="UPLOAD",group_by_value="hour", rename_columns=True)
# One scatter trace per device: hour of day (x) against that device's
# standardized upload speed (y).
traces=[]
# Iterate over the devices present in the UPLOAD data; the download device
# list was used here before, which can differ from the upload one.
for device in device_numbers_u:
    subset=upload_df[upload_df["SK_PI"]==device]
    # z-score within the device, so devices with very different absolute
    # speeds can share one axis
    standardized=(subset['UPLOAD']-subset['UPLOAD'].mean())/subset['UPLOAD'].std()
    trace = go.Scatter(
        x = subset['hour'],
        y = standardized,
        mode = 'markers',
        marker = dict(color=colors[device]),
        name = device
    )
    traces.append(trace)
layout = go.Layout(
        title="Average upload speed by hour over the last "+time_interval+ " starting from "+ starting_point,
        xaxis=dict(title="Hour of the day"),
        # the plotted values are z-scores (dimensionless), not Mbps
        yaxis=dict(title="Standardized upload speed (z-score per device)")
        )
data = traces
fig = go.Figure(data=data, layout=layout)
iplot(fig)

This plot shows the upload speeds for all the devices normalized around zero.
We can see the same trend - speeds are a little bit higher during the night/early morning and take a dip during business hours.

In [None]:
# Summary statistics of the UPLOAD column.
# NOTE(review): the exact output shape is defined by mean_max_median_min_by1 in data_exploration - confirm there.
upload_summary=mean_max_median_min_by1(upload_df,'UPLOAD')

In [None]:
# Box plot of upload speed per device (sort_value='SK_PI').
# The plot gets its own upload title; the shared variable `t` holds the
# download title set for the download boxplot.
if all_data:
    upload_title="Upload speed"
else:
    upload_title="Upload speed by device over the last "+time_interval+ " starting from "+ starting_point
simple_boxplot(dataframe=upload_df,plot_value='UPLOAD',sort_value='SK_PI',
               title=upload_title,
               ytitle="Upload speed (Mbps)",
               # grouped by device, so the x axis shows device numbers, not hours
               xtitle="Device number",uploadline=True)

On this plot we see that for upload speed the situation is a little bit better: some of the devices are above the CRTC target of 10 Mbps (red dots).

Let's calculate the statistics - percentage of datapoints below 10Mbps for every device.

In [None]:
# Flag every measurement: 1 when UPLOAD is under 10, otherwise 0.
upload_df["below10"]=(upload_df["UPLOAD"]<10).astype("int64")
#upload_df.head()

In [None]:
# Per-device upload summary: measurement count, count below 10, mean,
# standard deviation, and the percentage of measurements below 10.
per_device_upload=upload_df.groupby("SK_PI")

summary_upload=(per_device_upload['below10'].count()
                .reset_index()
                .rename(columns={'below10':'upload_count'}))

mean_upload=(per_device_upload['UPLOAD'].mean().round(2)
             .reset_index()
             .rename(columns={'UPLOAD':'upload_mean'}))

std_upload=(per_device_upload['UPLOAD'].std().round(2)
            .reset_index()
            .rename(columns={'UPLOAD':'upload_std'}))

subset_below10=upload_df[upload_df['below10']==1]
summary_below10=(subset_below10.groupby("SK_PI")['below10'].count()
                 .reset_index()
                 .rename(columns={'below10':'upload_below10'}))

# Outer merges keep devices that have no measurement below 10; their missing
# count becomes 0 in the fillna step.
for extra_columns in (summary_below10, mean_upload, std_upload):
    summary_upload=summary_upload.merge(extra_columns, how='outer', on='SK_PI')

summary_upload=summary_upload.fillna(0)
summary_upload["Upload speeds less than 10  Mbps  (%)"]=(
    summary_upload['upload_below10']/summary_upload['upload_count']*100
).round()
summary_upload=summary_upload.sort_values(by='SK_PI')

summary_upload["device number"]=summary_upload["SK_PI"]
summary_upload=summary_upload.reset_index().set_index("device number")
summary_upload[["upload_mean","upload_std","Upload speeds less than 10  Mbps  (%)"]]

We can clearly see that devices 5, 10, 12, 16 and 18  have 99%-100% of data below 10Mbps.  
Devices 8, 9, 11, 14, 15, 17 look good: they have 1%-0% of data below 10Mbps.  
It's not quite clear for device #7 - it has 77% of data above and 23% of data below the CRTC target.  
Let's calculate some other metrics for this device:

In [None]:
import plotly.figure_factory as ff

# Distribution plot (bin size 0.25) of the upload speeds of one device.
device_number=7
subset=upload_df[upload_df["SK_PI"]==device_number]
group_labels = ['device '+str(device_number)+ ' upload speed']
hist_data = [subset['UPLOAD']]
fig = ff.create_distplot(hist_data, group_labels, bin_size=0.25)
fig['layout']['xaxis'].update(title='Upload speed (Mbps)')
iplot(fig)


Looking at the distribution plot - we can see that data is concentrated around 6-10Mbps and is not spread as widely as it was for device #14.  The same pattern emerges from the raw data drawn from Grafana below.

![](images/grafana_upload7.png)

Let's check statistics by hour for this device as well:

In [None]:
# Box plot of upload speed per hour of day for the device held in `device_number`.
upload_df["hour"]=pd.to_numeric(upload_df["time"].dt.hour)
subset=upload_df[upload_df["SK_PI"]==device_number]
simple_boxplot(dataframe=subset,plot_value='UPLOAD',sort_value='hour',
               title="Upload speed by hour for the device "+str(device_number)+" over the "+time_interval+" starting from "+starting_point,
               # the plotted value is UPLOAD; the axis previously read "Download speed (Mbps)"
               ytitle="Upload speed (Mbps)",
               xtitle="Hour of the day", uploadline=True)

On this plot we can see that the only time when device #7 meets the CRTC target and the upload speed is above 10Mbps - is early in the morning (4am-7am). The rest of the time  the upload speed is  below 10Mbps.

In [None]:
# Assign every measurement to a part of the day. `hour` is the integer hour of
# the timestamp, so hour h covers h:00-h:59. Half-open ranges make the groups
# match their labels:
#   night   23:00-07:00 -> hours 23, 0..6
#   day      7:00-17:00 -> hours 7..16
#   evening 17:00-23:00 -> hours 17..22
# (The previous `hour>23` test could never be true, which put hour 23 in
# "evening", hour 7 in "night" and hour 17 in "day".)
upload_df["time_group"]=""
upload_df.loc[(upload_df["hour"]>=23)|(upload_df["hour"]<7),"time_group"]="night 23:00-07:00"
upload_df.loc[(upload_df["hour"]>=7)&(upload_df["hour"]<17),"time_group"]="day 7:00-17:00"
upload_df.loc[(upload_df["hour"]>=17)&(upload_df["hour"]<23),"time_group"]="evening 17:00-23:00"

In [None]:
# Box plot (with jittered points) of upload speed per time group for the device in `device_number`.
subset=upload_df[upload_df["SK_PI"]==device_number]
timegroup_title=("Upload speed by timegroup for device "+str(device_number)+
                 " over the "+time_interval+" starting from "+starting_point)
simple_boxplot(
    dataframe=subset,
    plot_value='UPLOAD',
    sort_value='time_group',
    title=timegroup_title,
    ytitle="Mbps",
    uploadline=True,
    jitter=True,
)

The same picture emerges on this boxplot - the highest speeds (and median of 9.74Mbps) is seen during the night. Day time has a slightly higher median than what is seen during the evening (8Mbps and 6.8Mbps, respectively). Note that we have not tested for statistical significance yet. 

In [None]:
# English day name of each measurement. dt.day_name() replaces the
# dt.weekday_name attribute, which no longer exists in current pandas.
upload_df["weekday"]=upload_df["time"].dt.day_name()
# Categorical with an explicit Monday..Sunday category list (instead of alphabetical).
upload_df["weekday"] = pd.Categorical(upload_df["weekday"], ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"])

In [None]:
# Box plot (with jittered points) of upload speed per weekday for the device in `device_number`.
subset=upload_df[upload_df["SK_PI"]==device_number]
weekday_upload_title=("Upload speed by day of the week for the device "+str(device_number)+
                      " over the "+time_interval+" starting from "+starting_point)
simple_boxplot(
    dataframe=subset,
    plot_value='UPLOAD',
    sort_value='weekday',
    title=weekday_upload_title,
    ytitle="Mbps",
    uploadline=True,
    weekdays=True,
    jitter=True,
)

Checking upload speeds for device #7 by day of the week does not show any trends. (Wednesday is the fastest for some reason?) There are also no distinct results if we group by working day/weekend (on the plot below).

In [None]:
# Label each measurement "Weekend" (Saturday/Sunday) or "Weekday" (everything
# else), then box-plot the device in `device_number` by that label.
falls_on_weekend=upload_df["weekday"].isin(["Saturday","Sunday"])
upload_df["day_group"]="Weekday"
upload_df.loc[falls_on_weekend,"day_group"]="Weekend"
subset=upload_df[upload_df["SK_PI"]==device_number]
day_group_title=("Upload speed by day group for device "+str(device_number)+
                 " over the "+time_interval+" starting from "+starting_point)
simple_boxplot(
    dataframe=subset,
    plot_value='UPLOAD',
    sort_value='day_group',
    title=day_group_title,
    ytitle="Mbps",
    uploadline=True,
    jitter=True,
)

In [None]:
#devices_upload_below10=summary_upload[summary_upload['upload_below10_percent']>50]['SK_PI'].unique()
#hist_data=[]
#group_labels=[]
#colors_d=[]
#for device_number in devices_upload_below10:
#    hist_data.append(upload_df[upload_df["SK_PI"]==device_number]["UPLOAD"])
#    group_labels.append(device_number)
#    colors_d.append(colors[device_number])
#
#fig = ff.create_distplot(hist_data, group_labels, colors=colors_d,
#                         bin_size=0.25, show_rug=False)
#
# Add title
#fig['layout'].update(title='Upload speed: distribution for all devices')
#iplot(fig)

In [None]:
# Join the upload and download summaries on SK_PI and rank devices: highest
# share of sub-target download measurements first, then upload, then device id.
summary_upload_download=summary_upload.merge(summary_download, how='outer', on='SK_PI')
summary_upload_download=summary_upload_download.sort_values(
    by=['Download speeds less than 50  Mbps  (%)','Upload speeds less than 10  Mbps  (%)','SK_PI'],
    ascending=[False, False, True],
)
summary_upload_download["device number"]=summary_upload_download["SK_PI"]
summary_upload_download=summary_upload_download.reset_index().set_index("device number")
summary_upload_download[['Upload speeds less than 10  Mbps  (%)','Download speeds less than 50  Mbps  (%)']]

Combining the datasets for upload and download speeds shows that devices 5, 10, 16, 12, 18 and 7 have the majority of datapoints below the CRTC targets for both upload and download speeds.  

Devices 15, 17, 9, 11 meet the CRTC target for upload speed but do not hit the 50Mbps for download speed.  
Device #14 has only 32% of data below CRTC target for download (but with high fluctuation as we have seen before).  
And device #8 is the only device that consistently meets the upload/download speeds targets.

Let's add latitude and longitude of the 8 devices that we have coordinates for and display them on a map:

In [None]:
# Coordinates and names of the 8 devices with a known location, listed
# column-wise; position i of every list describes the same device.
d = {
    'SK_PI': [7,12,14,17,11,16,10,18],
    'lat': [51.7834662, 50.366367,49.81892,56.5020674,
            49.13794,50.533,50.9080366,56.0867787],
    'long': [-96.6966330,-96.613671,-98.52097,-94.2086756,
             -97.24325,-100.3155,-98.5971688,-96.0936590],
    'name': ['Bloodvein','Brokenhead','Dakota Plains','Fox Lake',
             'Ginew','Keeseekoowenin','Lake Manitoba','York Landing'],
}
coordinates_df = pd.DataFrame(data=d)
# Right join: keep exactly the devices that have coordinates.
summary_ud_loc=summary_upload_download.merge(coordinates_df, how='right', on='SK_PI')
summary_ud_loc[['Upload speeds less than 10  Mbps  (%)','Download speeds less than 50  Mbps  (%)','lat','long']]

In [None]:
# Hover text per device: id, name, mean download speed and its standard deviation.
summary_ud_loc['text1'] = summary_ud_loc['SK_PI'].astype(str)+'.'+summary_ud_loc['name'] +': Average download speed '+\
                          summary_ud_loc['download_mean'].astype(str) + 'Mbps , standart deviaton: '+\
                          summary_ud_loc['download_std'].astype(str)
# One scattergeo trace, one marker per device, coloured by mean download speed.
# Removed `locationmode`: 'north america' is not one of its values, and it only
# applies to traces positioned via `locations` (this one uses lon/lat).
data = [ dict(
        type = 'scattergeo',
        lon = summary_ud_loc['long'],
        lat = summary_ud_loc['lat'],
        text = summary_ud_loc['text1'],
        mode = 'markers',
        marker = dict(
            size = 15,
            opacity = 0.8,
            reversescale = True,
            autocolorscale = False,
            symbol = 'circle',
            line = dict(
                width=1,
                # was 'rgba(102, 102, 102)': an rgba() colour needs a fourth (alpha) component
                color='rgb(102, 102, 102)'
            ),
            colorscale='Jet',
            cmin = 0,
            color = summary_ud_loc['download_mean'],
            cmax = summary_ud_loc['download_mean'].max(),
            colorbar=dict(
               # title="Percentage of download speed data below 50Mbs"
            )
        ))]

# Map layout restricted to a lon/lat window around Manitoba.
# Removed the top-level `colorbar = True`: it is not a layout attribute.
layout = dict(
        title = 'Devices colored by average download speed',
        geo = dict(
            scope = 'north america',
            showland = True,
            landcolor = "rgb(212, 212, 212)",
            countrycolor = "rgb(255, 255, 255)",
            showlakes = True,
            lakecolor = "rgb(255, 255, 255)",
            showsubunits = True,
            showcountries = True,
            resolution = 50,
            projection = dict(
                type = 'kavrayskiy7',
            ),
            lonaxis = dict(
                gridwidth = 2,
                range= [ -110, -80 ],
                dtick = 10
            ),
            lataxis = dict (
                range= [ 47.0, 60.0 ],
                dtick = 10
            )
        ),
    )

fig = dict( data=data, layout=layout )
iplot( fig, validate=False)

On this map we have devices colored by average download speed: green and blue colours represent average speeds at or above the CRTC target of 50 Mbps. 
Most of the devices have speeds below or around 30Mbps - they are colored orange to red.
Device #14 - Dakota Plains - has an average download speed of 108Mbps but a standard deviation of 56, which indicates that the data varies a lot.


In [None]:
# Hover text per device: id, name and the share of download measurements below 50Mbps.
summary_ud_loc['text'] = summary_ud_loc['SK_PI'].astype(str)+'.'+summary_ud_loc['name'] +': Download speed: '+\
                         summary_ud_loc['Download speeds less than 50  Mbps  (%)'].astype(str)+\
                        '% of the data is below 50Mbps'
# One scattergeo trace, one marker per device, coloured by the percentage of
# download measurements below 50Mbps.
# Removed `locationmode`: 'north america' is not one of its values, and it only
# applies to traces positioned via `locations` (this one uses lon/lat).
data = [ dict(
        type = 'scattergeo',
        lon = summary_ud_loc['long'],
        lat = summary_ud_loc['lat'],
        text = summary_ud_loc['text'],
        mode = 'markers',
        marker = dict(
            size = 15,
            opacity = 0.8,
            reversescale = True,
            autocolorscale = False,
            symbol = 'circle',
            line = dict(
                width=1,
                # was 'rgba(102, 102, 102)': an rgba() colour needs a fourth (alpha) component
                color='rgb(102, 102, 102)'
            ),
            colorscale='Greens',
            cmin = 0,
            color = summary_ud_loc['Download speeds less than 50  Mbps  (%)'],
            cmax = summary_ud_loc['Download speeds less than 50  Mbps  (%)'].max(),
            colorbar=dict(
               # title="Percentage of download speed data below 50Mbs"
            )
        ))]

# Map layout restricted to a lon/lat window around Manitoba.
# Removed the top-level `colorbar = True`: it is not a layout attribute.
layout = dict(
        title = 'Devices colored by percentage of download speed data less than 50  Mbps',
        geo = dict(
            scope = 'north america',
            showland = True,
            landcolor = "rgb(212, 212, 212)",
            countrycolor = "rgb(255, 255, 255)",
            showlakes = True,
            lakecolor = "rgb(255, 255, 255)",
            showsubunits = True,
            showcountries = True,
            resolution = 50,
            projection = dict(
                type = 'kavrayskiy7',
            ),
            lonaxis = dict(
                gridwidth = 2,
                range= [ -110, -80 ],
                dtick = 10
            ),
            lataxis = dict (
                range= [ 47.0, 60.0 ],
                dtick = 10
            )
        ),
    )

fig = dict( data=data, layout=layout )
iplot( fig, validate=False)

On this map we have devices colored by percentage of datapoints below 50Mbps. Most of the devices are dark green: 100% of the data is below the target. The light green is again device #14 with 32% of data below 50Mbps.

These results can be compared with the map listed on CRTC web site: "Areas to Enhance Broadband Access" (https://crtc.gc.ca/eng/internet/band.htm).
All 8 devices that we have on our map are listed as "underserved" on the CRTC map as well.

![](images/manitoba_crtc.png)

## Ping latency

Let's check ping latency for all the devices grouped by hour. 
For the plot below we have used data coming from collectd. This data arrives every 5 seconds (compared to the speedtest data, which comes in every 3 hours and 42 minutes), so there are many more datapoints in the database.

In [None]:
# Sorted device ids (as ints) that appear as SK_PI tag values in the PING table.
device_numbers=get_tag_values_influxdb(client_influx=client,table_name='PING', tag_name='SK_PI')
device_numbers=list(map(int, device_numbers))
device_numbers= sorted(device_numbers)
# Hourly MAX/MEAN/MEDIAN of PING per device, ignoring zero readings.
# The window is bounded on both sides: [starting_point - time_interval, starting_point].
# With only the lower bound, re-running the notebook later would also include
# data recorded after starting_point.
ping_mean_query="SELECT MAX(PING),MEAN(PING), MEDIAN(PING) FROM PING WHERE PING!=0 AND time >= '"+starting_point+"'-"+\
                time_interval+" AND time <= '"+starting_point+"' GROUP BY time(1h), SK_PI;"

In [None]:
# Run the hourly ping query; the three aggregates are named max / mean / median.
ping_df=get_3_stats_influxdb(client_influx=client,
                                           query_influx=ping_mean_query,
                                           stat_name1='max',
                                           stat_name2='mean',
                                           stat_name3='median',
                                           device_numbers=device_numbers)
# Exclude device 3. .copy() gives an independent frame, so the column
# assignments here and in later cells do not write to a slice of the query result.
ping_df=ping_df[ping_df["SK_PI"]!=3].copy()
# Timestamps are treated as UTC and converted to Manitoba local time.
ping_df['time']= ping_df['time'].dt.tz_localize('UTC').dt.tz_convert('America/Winnipeg')

In [None]:
# Sorted list of the device ids (as plain ints) present in the ping data.
device_numbers_p = sorted(int(sk_pi) for sk_pi in ping_df['SK_PI'].unique())
#device_numbers

In [None]:
# Hour of day taken from the `time` column; grouping key of the by-hour ping plots below.
ping_df["hour"]=pd.to_numeric(ping_df["time"].dt.hour)

In [None]:
# Box plot of the hourly mean ping of all devices, grouped by hour of day.
ping_hour_title=("Ping latency by hour for all the devices over the "+time_interval+
                 " starting from "+starting_point)
simple_boxplot(
    dataframe=ping_df,
    plot_value='mean',
    sort_value='hour',
    title=ping_hour_title,
    ytitle="Miliseconds",
    xtitle="Hour of the day",
)

On this plot we can see a clear spike in latencies during business hours (8am to 4pm) and a down time during the night (12am to 7am).

In [None]:
# English day name of each hourly bucket. dt.day_name() replaces the
# dt.weekday_name attribute, which no longer exists in current pandas.
ping_df["weekday"]=ping_df["time"].dt.day_name()
# Categorical with an explicit Monday..Sunday category list (instead of alphabetical).
ping_df["weekday"] = pd.Categorical(ping_df["weekday"], ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"])

In [None]:
# Box plot of the hourly mean ping of all devices, grouped by weekday.
# The title now names the weekday grouping; it previously said "by hour".
simple_boxplot(dataframe=ping_df,plot_value='mean',sort_value='weekday',
               title="Ping latency by day of the week for all the devices over the "+time_interval+" starting from "+starting_point,
               ytitle="Miliseconds", weekdays=True)

If we check by day of the week - Saturday and Sunday show lower latencies than business days.   
And the same is summarized on the plot below:

In [None]:
# Label each hourly bucket "Weekend" (Saturday/Sunday) or "Weekday" (everything else).
ping_on_weekend=ping_df["weekday"].isin(["Saturday","Sunday"])
ping_df["day_group"]="Weekday"
ping_df.loc[ping_on_weekend,"day_group"]="Weekend"

In [None]:
# Box plot of the hourly mean ping of all devices, weekday vs. weekend.
ping_day_group_title=("Ping latency by day group for all the devices over the "+time_interval+
                      " starting from "+starting_point)
simple_boxplot(
    dataframe=ping_df,
    plot_value='mean',
    sort_value='day_group',
    title=ping_day_group_title,
    ytitle="Miliseconds",
)

Checking ping latencies for device #7, which we examined before, we can see the same trend - night time and early morning have the lowest latencies:

In [None]:
# Per-device, per-hour statistics of the collectd ping data, then a grouped
# (non-stacked) bar chart of max / mean / median by hour for one device.
by_hour_by_device_p1=mean_max_median_by2(
    input_dataframe=ping_df,
    value1="mean",
    value2="max",
    value3="median",
    group_by_value="hour",
)
device_number=7
subset=by_hour_by_device_p1[by_hour_by_device_p1["SK_PI"]==device_number]
collectd_ping_title=("Ping latency(collectd) by hour for the device "+str(device_number)+
                     " over the "+time_interval+" starting from "+starting_point)
combined_bar_plot_3traces(
    xvalues=subset["hour"],
    yvalues1=subset["max"],
    yvalues2=subset["mean"],
    yvalues3=subset["median"],
    name1="Max",
    name2="Mean",
    name3="Median",
    title=collectd_ping_title,
    xtitle="hour",
    stack=False,
)

In [None]:
# Speedtest ping rows, excluding iperf and zero readings.
# The window is bounded on both sides: [starting_point - time_interval, starting_point].
# With only the lower bound, re-running the notebook later would also pull in
# measurements recorded after starting_point.
query_ping = ("SELECT * FROM SPEEDTEST_PING WHERE PROVIDER!='iperf'"
              " AND time >='"+starting_point+"'-"+time_interval+
              " AND time <= '"+starting_point+"' AND PING!=0;")
ping_speedtest_dataframe = get_dataframe_from_influxdb(client_df=client_df,query_influx=query_ping,table_name='SPEEDTEST_PING')

In [None]:
# Exclude device 3. .copy() gives an independent frame, so the column
# assignments here and in later cells do not write to a slice of the query result.
ping_speedtest_dataframe=ping_speedtest_dataframe[ping_speedtest_dataframe["SK_PI"]!=3].copy()
# Timestamps are treated as UTC and converted to Manitoba local time.
ping_speedtest_dataframe['time']= ping_speedtest_dataframe['time'].dt.tz_localize('UTC').dt.tz_convert('America/Winnipeg')

In [None]:
# Hour of day taken from the `time` column; grouping key for the per-hour statistics below.
ping_speedtest_dataframe["hour"]=pd.to_numeric(ping_speedtest_dataframe["time"].dt.hour)

In [None]:
# Statistics of the speedtest PING column grouped by "hour".
# NOTE(review): mean_max_median_by2 comes from the data_exploration star-import; with
# rename_columns=True the result presumably exposes "mean"/"max"/"median" columns
# (the next cell reads them) - confirm there.
by_hour_by_device_p2=mean_max_median_by2(input_dataframe=ping_speedtest_dataframe,value1="PING", value2="PING",
                                          value3="PING",group_by_value="hour", rename_columns=True)

In [None]:
# Grouped (non-stacked) bar chart of max / mean / median speedtest ping by hour for one device.
device_number=14
subset=by_hour_by_device_p2[by_hour_by_device_p2["SK_PI"]==device_number]
speedtest_ping_title=("Ping latency(speedtest) by hour for the device "+str(device_number)+
                      " over the "+time_interval+" starting from "+starting_point)
combined_bar_plot_3traces(
    xvalues=subset["hour"],
    yvalues1=subset["max"],
    yvalues2=subset["mean"],
    yvalues3=subset["median"],
    name1="Max",
    name2="Mean",
    name3="Median",
    title=speedtest_ping_title,
    xtitle="hour",
    stack=False,
)

Device #14 does not have any collectd data as data acquisition stopped (most likely due to an error). In this case, we are checking ping latency coming from speedtest. Here we don't notice any obvious trends, except for a spike at 6am, all data looks quite even.

## Next steps

For next steps, we will focus the analysis on solidifying the conclusions that can be drawn from the data. That is, we will set up statistical tests to help determine whether the speeds observed are significantly different from the targets set by the CRTC. We will also examine the iperf test data - another testing tool that has not been analyzed yet. 