In [1]:
from IPython.display import display, Math, Latex, HTML
# Render a "Show Code"/"Hide Code" button that toggles every notebook input
# cell. Inputs start hidden so the rendered report shows prose and figures only.
HTML('''<script>
  // Tracks whether the input cells are currently visible.
  var code_shown = false;

  function code_toggle() {
    // jQuery only honours numeric durations (or 'fast'/'slow'); the string
    // '500' used previously silently fell back to the default duration.
    if (code_shown) {
      $('div.input').hide(500);
      $('#toggleButton').val('Show Code');
    } else {
      $('div.input').show(500);
      $('#toggleButton').val('Hide Code');
    }
    code_shown = !code_shown;
  }

  $(document).ready(function () {
    code_shown = false;
    $('div.input').hide();
  });
</script>
<form action="javascript:code_toggle()"><input type="submit" id="toggleButton" value="Show Code"></form>''')

# ConnectIN draft analysis, part 3 - Feb 07, 2019

This document contains:
- information about bandwidth purchased (possibly outdated), added to the upload/download speed graphs;
- observations on data coming from the `iperf` test (download/upload speeds, ping latency) and a comparison of this data with the data coming from the `speedtest` test;
- statistical analysis of download speed data.


In [2]:
#Load libraries:
from data_exploration import *
#for plotly distribution plots
import plotly.figure_factory as ff

from data_statistics import *
%matplotlib inline

import numpy as np

In [3]:
import pyodbc
def connect_to_mssql(credentials_path='../credentials.json'):
    """Open a pyodbc connection to the ``net_speed_md`` MS SQL database.

    Parameters
    ----------
    credentials_path : str, optional
        Path to a JSON file providing the ``mssql_password`` and
        ``mssql_host`` keys. Defaults to the location the notebook has
        always read from.

    Returns
    -------
    The object returned by ``pyodbc.connect``. The caller is responsible for
    closing it.

    Raises
    ------
    OSError
        If the credentials file cannot be opened.
    KeyError
        If one of the two expected keys is missing from the file.
    """
    # Imported locally so the function does not depend on ``json`` leaking in
    # through the star imports at the top of the notebook.
    import json

    with open(credentials_path, 'r') as f_credentials:
        credentials_config = json.load(f_credentials)

    password = credentials_config['mssql_password']
    srv = credentials_config['mssql_host']
    return pyodbc.connect(
        driver='/usr/local/lib/libtdsodbc.so',
        server=srv,
        port='1433',
        database='net_speed_md',
        uid='cybera_sql',
        pwd=password,
    )

In [4]:
# Set up the InfluxDB connection. Two handles come back: `client` is handed to
# the helpers that take `client_influx`, `client_df` to the helper that takes
# `client_df` (get_dataframe_from_influxdb).
client, client_df = connect_to_influxdb()

### 1. Bandwidth purchased

Displaying possible bandwidth purchased information (from the Excel spreadsheet provided).  
Will add it to download/upload speeds graphs.  
On the graphs we will display 4 weeks of data coming from `speedtest` back from the meeting date: `Feb 07, 2019, 14:00`

In [5]:
# --- Time window for the "bandwidth purchased" graphs ------------------------
# Set all_data=True to drop the time filter and query everything.
all_data=False

# Length of the window as an InfluxQL duration literal (e.g. '2w').
time_interval='4w'

# End of the window. To end at the current time use:
#   datetime.now().strftime('%Y-%m-%d %H:%M:%S')
# NOTE(review): the accompanying text says the window ends at the meeting date
# (Feb 07, 2019 14:00) but this value is Jan 07 - confirm which one is intended.
starting_point="2019-01-07 14:00:00"

title_tail=""
query_tail=""

if not all_data:
    # Tail appended to every plot title.
    title_tail=" over the last "+time_interval+ " back  from "+ starting_point
    # Tail appended to every InfluxDB query. The window is bounded on both
    # sides so the data matches the title; previously only the lower bound was
    # applied and everything recorded after starting_point was included too.
    query_tail=(" AND time >= '"+starting_point+"'-"+time_interval
                +" AND time <= '"+starting_point+"'")

In [6]:
## Match the site coordinates (keyed by MAC address) to device numbers from the MS SQL DIM_PI table
coordinates_df = pd.read_csv("coordinates.csv")

sql = "SELECT DISTINCT PI_MAC, PK_PI FROM  DIM_PI;"
cnxn = connect_to_mssql()
try:
    df_frommssql = pd.read_sql(sql, cnxn)
finally:
    # The connection is only needed for this lookup; release it even when the
    # query fails instead of leaving it open for the life of the kernel.
    cnxn.close()

# Left join keeps every row of the CSV; rows whose MAC is not present in
# DIM_PI end up with a missing device_number.
coordinates_df = pd.merge(
    coordinates_df, df_frommssql, how='left', left_on=['mac'], right_on=['PI_MAC']
).rename(columns={'PK_PI': 'device_number'})
coordinates_df = coordinates_df[["lat","long","name","mac","device_number","Up","Down"]]

In [7]:
# Purchased upload/download bandwidth per site and device.
coordinates_df.loc[:, ["name","device_number","Up","Down"]]

Unnamed: 0,name,device_number,Up,Down
0,Bloodvein,13,2.0,10.0
1,Brokenhead,12,5.0,25.0
2,Dakota Plains,14,10.0,10.0
3,Fox Lake,17,10.0,10.0
4,Ginew,11,1.5,5.0
5,Keeseekoowenin,16,0.496,7.0
6,Lake Manitoba,10,5.5,30.0
7,York Landing,18,9.72,2.87
8,Fisher river Band Office,4,,
9,Fisher River Clinic,6,,


In [8]:
# speedtest-only download measurements (iperf rows share the table and are
# excluded through the PROVIDER condition).
query_download = "SELECT * FROM SPEEDTEST_DOWNLOAD WHERE PROVIDER!='iperf' AND DOWNLOAD>0"+ query_tail+";"
download_df = get_dataframe_from_influxdb(client_df=client_df,query_influx=query_download,table_name='SPEEDTEST_DOWNLOAD')
# Leave out device 3 (described further down as the Calgary control device).
# .copy() makes the result an independent frame, so adding columns to it later
# does not write into a slice of the query result (SettingWithCopyWarning).
download_df=download_df[download_df["SK_PI"]!=3].copy()

In [9]:
# Sorted device ids present in the download data, as plain Python ints.
device_numbers_d = sorted(int(device_id) for device_id in download_df['SK_PI'].unique())
# Hour of the day of every measurement.
download_df["hour"] = pd.to_numeric(download_df["time"].dt.hour)

In [10]:
# Sites that both report download data and have a finite purchased "Down" value,
# ordered by device number.
reports_download = coordinates_df["device_number"].isin(device_numbers_d)
has_purchased_down = np.isfinite(coordinates_df['Down'])
coordinates_subset_d = (
    coordinates_df[reports_download & has_purchased_down]
    .sort_values(by="device_number", ascending=True)
)

In [11]:
t = "Download speed by device" + title_tail

# Purchased download bandwidth, overlaid on the boxplot as blue markers.
b_line = go.Scatter(
    x=coordinates_subset_d["device_number"],
    y=coordinates_subset_d["Down"],
    mode='markers',
    marker={'color': 'blue'},
    name='Bandwidth bought',
)

simple_boxplot(
    dataframe=download_df,
    plot_value='DOWNLOAD',
    sort_value='SK_PI',
    title=t,
    xtitle="Device number",
    ytitle="Download speed (Mbps)",
    downloadline=True,
    boughtline=b_line,
)

For download speeds it looks correct for device 16 (all data is concentrated below 7) and it's also possibly correct for devices 10 and 12. For the rest of the devices the "bandwidth bought" dot is significantly lower than the actual speeds. Possibly the information is outdated.

In [12]:
# speedtest-only upload measurements (iperf rows share the table and are
# excluded through the PROVIDER condition).
query_upload = "SELECT * FROM SPEEDTEST_UPLOAD WHERE PROVIDER!='iperf' AND UPLOAD>0"+ query_tail+";"
upload_df = get_dataframe_from_influxdb(client_df=client_df,query_influx=query_upload,table_name='SPEEDTEST_UPLOAD')
# Leave out device 3 (described further down as the Calgary control device).
# .copy() makes the result an independent frame, so adding columns to it later
# does not write into a slice of the query result (SettingWithCopyWarning).
upload_df=upload_df[upload_df["SK_PI"]!=3].copy()

In [13]:
# Sorted device ids present in the upload data, as plain Python ints.
device_numbers_u = sorted(int(device_id) for device_id in upload_df['SK_PI'].unique())
# Hour of the day of every measurement.
upload_df["hour"] = pd.to_numeric(upload_df["time"].dt.hour)

In [14]:
# Sites that both report upload data and have a finite purchased "Up" value,
# ordered by device number.
reports_upload = coordinates_df["device_number"].isin(device_numbers_u)
has_purchased_up = np.isfinite(coordinates_df['Up'])
coordinates_subset_u = (
    coordinates_df[reports_upload & has_purchased_up]
    .sort_values(by="device_number", ascending=True)
)

In [15]:
t = "Upload speed by device" + title_tail

# Purchased upload bandwidth, overlaid on the boxplot as blue markers.
b_line = go.Scatter(
    x=coordinates_subset_u["device_number"],
    y=coordinates_subset_u["Up"],
    mode='markers',
    marker={'color': 'blue'},
    name='Bandwidth bought',
)

simple_boxplot(
    dataframe=upload_df,
    plot_value='UPLOAD',
    sort_value='SK_PI',
    title=t,
    xtitle="Device number",
    ytitle="Upload speed (Mbps)",
    uploadline=True,
    boughtline=b_line,
)

Displaying the bandwidth purchased information on the Upload speeds graph shows that it might possibly be correct only for device 10. 
For device 16, which looked correct on the Download speeds graph, the Excel spreadsheet shows 0.496, but on the graph we see that the data is concentrated below 4Mbps.
How can we get the correct information on the bandwidth purchased?

### 2. Iperf data

#### Number of data points, reporting times

In [16]:
# Window length as an InfluxQL duration literal (alternative: '5d').
time_interval = '4w'

# Reference date for the iperf section. To use the current time instead:
#   starting_point = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
starting_point = "2019-02-07 14:00:00"

# Tail appended to the plot titles in this section.
title_tail = " to the date {}".format(starting_point)

In [17]:
# All SK_PI tag values known to the download table, as a sorted list of ints.
device_numbers = sorted(
    int(tag_value)
    for tag_value in get_tag_values_influxdb(
        client_influx=client, table_name='SPEEDTEST_DOWNLOAD', tag_name='SK_PI'
    )
)

In [18]:
# Number of positive iperf download measurements per device up to starting_point.
query_download_counts = (
    "SELECT COUNT(DOWNLOAD) FROM SPEEDTEST_DOWNLOAD WHERE PROVIDER='iperf' "
    "AND time<= '{}' AND DOWNLOAD>0 GROUP BY SK_PI;"
).format(starting_point)
download_counts = get_stats_influxdb(
    client_influx=client,
    query_influx=query_download_counts,
    stat_name='count',
    device_numbers=device_numbers,
)

In [19]:
# Bar chart of how many iperf download datapoints each device has reported.
# The trace used to be labelled "ping datapoints", although the values plotted
# are the iperf download counts computed in the previous cell.
simple_bar_plot(xvalues=device_numbers,
                yvalues=download_counts,
                name="iperf download datapoints",
                title="Number of data points per device(Iperf) "+ title_tail,
                ytitle="Number of datapoints")

This graph shows the number of datapoints collected by the `iperf` test until today (meeting date 2019-02-07 14:00). These numbers are not very large, so let's examine the reporting times for every device.

In [20]:
# Timestamp of the last positive iperf upload measurement per device.
query_upload_last = (
    "SELECT LAST(UPLOAD), time FROM SPEEDTEST_UPLOAD WHERE PROVIDER='iperf' "
    "AND time <= '{}' AND UPLOAD>0 GROUP BY SK_PI;"
).format(starting_point)
result_upload_last = get_stats_influxdb(
    client_influx=client,
    query_influx=query_upload_last,
    stat_name='time',
    device_numbers=device_numbers,
)

In [21]:
# Timestamp of the first positive iperf upload measurement per device.
query_upload_first = (
    "SELECT FIRST(UPLOAD), time FROM SPEEDTEST_UPLOAD WHERE PROVIDER='iperf' "
    "AND time <= '{}' AND UPLOAD>0 GROUP BY SK_PI;"
).format(starting_point)
result_upload_first = get_stats_influxdb(
    client_influx=client,
    query_influx=query_upload_first,
    stat_name='time',
    device_numbers=device_numbers,
)

In [22]:
def _format_report_time(raw_time):
    """Return raw_time re-formatted as 'YYYY-MM-DD HH:MM:SS'.

    Returns None when the value is missing or cannot be parsed, so the device
    still gets a (point-less) trace instead of aborting the whole plot.
    """
    try:
        return dateutil.parser.parse(raw_time).strftime('%Y-%m-%d %H:%M:%S')
    except (TypeError, ValueError, OverflowError, AttributeError):
        # Only parse failures are treated as "no data"; anything else
        # (e.g. KeyboardInterrupt) is no longer swallowed by a bare except.
        return None


# One line per device, drawn from its first to its last iperf report.
data=[]
for i, device in enumerate(device_numbers):
    # The result lists are updated in place, as before, so later cells see
    # the formatted timestamps.
    result_upload_first[i] = _format_report_time(result_upload_first[i])
    result_upload_last[i] = _format_report_time(result_upload_last[i])
    trace = go.Scatter(x=[result_upload_first[i],result_upload_last[i]],y=[device,device],
                       name = device,marker=dict(color=colors[i]))
    data.append(trace)
layout = dict(title = "Device reporting times(iperf) "+ title_tail,xaxis=dict(title="Time"),
        yaxis=dict(title="Device Number"))
fig = go.Figure(data=data, layout=layout)
iplot(fig)

iperf3 stopped listening on the test server on Dec3?  
Able to ping `clearskystatus.info` but all iperf3 test failing:
   >/usr/bin/iperf3 -c clearskystatus.info  
   >iperf3: error - unable to connect to server: Operation timed out

We can see the same in grafana(screenshot below), all devices stopped reporting `iperf` data on Dec 3rd.

![](images/grafana-iperf1.png)

Checking `speedtest` charts in grafana also shows that there is no data since Jan 23rd (screenshot below).

![](images/grafana-speedtest.png)

#### Comparing  iperf and speedtest data
Since `iperf` stopped reporting on Dec 3rd, we will only select all data collected before Dec 4th for both sources `speedtest` and `iperf`.

In [23]:
# Cut-off for the speedtest/iperf comparison (exclusive upper bound).
# To use the current time instead:
#   starting_point = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
starting_point = "2018-12-04 00:00:00"
print("Starting point:", starting_point)

# Tails appended to the plot titles and to the InfluxDB queries of this section.
title_tail = " to the date {}".format(starting_point)
query_tail = " AND time < '{}'".format(starting_point)

Starting point: 2018-12-04 00:00:00


In [24]:
# speedtest download measurements inside the comparison window.
query_download1 = "SELECT * FROM SPEEDTEST_DOWNLOAD WHERE PROVIDER!='iperf' AND DOWNLOAD>0{};".format(query_tail)
download_df1 = get_dataframe_from_influxdb(
    client_df=client_df,
    query_influx=query_download1,
    table_name='SPEEDTEST_DOWNLOAD',
)
# Sorted device ids reporting speedtest download data.
device_numbers_d1 = sorted(int(device_id) for device_id in download_df1['SK_PI'].unique())

In [25]:
# iperf download measurements inside the comparison window.
query_download2 = "SELECT * FROM SPEEDTEST_DOWNLOAD WHERE PROVIDER='iperf' AND DOWNLOAD>0{};".format(query_tail)
download_df2 = get_dataframe_from_influxdb(
    client_df=client_df,
    query_influx=query_download2,
    table_name='SPEEDTEST_DOWNLOAD',
)
# iperf and speedtest store speeds in different units; the 0.001 factor
# presumably converts the iperf values to Mbps - TODO confirm the source unit.
download_df2['DOWNLOAD'] = download_df2['DOWNLOAD'].mul(0.001)
# Sorted device ids reporting iperf download data.
device_numbers_d2 = sorted(int(device_id) for device_id in download_df2['SK_PI'].unique())

In [26]:
# Datapoints per device: size1 = speedtest, size2 = iperf.
result1 = download_df1.groupby('SK_PI').size().to_frame('size1')
result2 = download_df2.groupby('SK_PI').size().to_frame('size2')

# Outer join keeps devices that appear in only one of the two sources.
result = result2.join(result1, how='outer')

In [27]:
# Side-by-side bars of the datapoint counts per device for the two sources.
combined_bar_plot_2traces(
    xvalues=result.index.astype(int),
    yvalues1=result['size1'],
    name1='speedtest',
    yvalues2=result['size2'],
    name2='iperf',
    title="Comparing number of datapoints for speedtest and iperf" + title_tail,
    ytitle="Number of datapoints",
)

This graph shows how many datapoints were collected by `speedtest` and `iperf` up to December 4th. Both tests are scheduled every 3 hrs 42 mins, but looking at the graph the numbers are slightly different: less data was collected by `iperf`.  
The Grafana screenshot below shows that the `iperf` test (yellow) was not as consistent as the `speedtest` test (green): there are lots of gaps in the data. (We have chosen to display the ping latency chart, as Download/Upload speeds are collected in different units by `speedtest` and `iperf` and are more difficult to compare visually in Grafana.)

![](images/grafana-speedtest1.png)

In [28]:
# Per-device scatter of download speeds, speedtest vs iperf.
t = "Download speed by device for speedtest vs iperf data" + title_tail
scatterplot_2groups(
    dataframe1=download_df1,
    dataframe2=download_df2,
    plot_value="DOWNLOAD",
    title=t,
    ytitle="Download speed (Mbps)",
)

Plotting Download speeds by device, we can see that the `iperf` data is more concentrated at the bottom (lower speeds) and the `speedtest` data is more towards the top (higher speeds). Let's take a closer look at the Mean/Median chart:

In [29]:
# Per-device mean and median download speed; suffix 1 = speedtest, 2 = iperf.
speedtest_download = download_df1.groupby('SK_PI')['DOWNLOAD']
result11 = speedtest_download.mean().to_frame('mean1')
result12 = speedtest_download.median().to_frame('median1')
result1 = result11.join(result12, how='outer')

iperf_download = download_df2.groupby('SK_PI')['DOWNLOAD']
result21 = iperf_download.mean().to_frame('mean2')
result22 = iperf_download.median().to_frame('median2')
result2 = result21.join(result22, how='outer')

# Outer join keeps devices that appear in only one of the two sources.
result = result2.join(result1, how='outer')

In [30]:
device_numbers_d = result.index

# Red markers at 50 Mbps on every device, used as the reference line.
download_line = go.Scatter(
    x=device_numbers_d,
    y=[50] * len(device_numbers_d),
    mode='markers',
    marker={'color': 'red'},
    name='50Mbps',
)

# Grouped (non-stacked) bars: mean and median per device for both sources.
combined_bar_plot_4traces(
    xvalues=result.index,
    yvalues1=result["mean1"], name1="Mean speedtest",
    yvalues2=result["mean2"], name2="Mean iperf",
    yvalues3=result["median1"], name3="Median speedtest",
    yvalues4=result["median2"], name4="Median iperf",
    title="Download speed by device" + title_tail,
    ytitle="Mbps",
    line=download_line,
    stack=False,
)

Here it's interesting to see that for devices 1,2,4,5,12  the data from both sources looks similar.
But for devices 3, 8, 7 and 11 the `speedtest` and `iperf` speeds are very different (speedtest has much better results). Device #3 is our control device located in Calgary, so it's trying to use the test server in Manitoba and shows worse results. Where are devices 8 and 9 located?  Device 11 is in Ginew, Manitoba.

In [31]:
# speedtest upload measurements inside the comparison window.
query_upload1 = "SELECT * FROM SPEEDTEST_UPLOAD WHERE PROVIDER!='iperf' AND UPLOAD>0{};".format(query_tail)
upload_df1 = get_dataframe_from_influxdb(
    client_df=client_df,
    query_influx=query_upload1,
    table_name='SPEEDTEST_UPLOAD',
)
# Sorted device ids reporting speedtest upload data.
device_numbers_u1 = sorted(int(device_id) for device_id in upload_df1['SK_PI'].unique())

In [32]:
# iperf upload measurements inside the comparison window.
query_upload2 = "SELECT * FROM SPEEDTEST_UPLOAD WHERE PROVIDER='iperf' AND UPLOAD>0"+ query_tail+";"
upload_df2 = get_dataframe_from_influxdb(client_df=client_df,query_influx=query_upload2,table_name='SPEEDTEST_UPLOAD')
# iperf and speedtest store speeds in different units; the 0.001 factor
# presumably converts the iperf values to Mbps - TODO confirm the source unit.
upload_df2['UPLOAD']=upload_df2['UPLOAD']*0.001
# Device ids must come from the upload frame; this previously read
# download_df2 (copy/paste slip), so the list described the wrong dataset.
device_numbers_u2=sorted(int(device_id) for device_id in upload_df2['SK_PI'].unique())

In [33]:
# Per-device scatter of upload speeds, speedtest vs iperf.
t = "Upload speed by device for speedtest vs iperf data" + title_tail
scatterplot_2groups(
    dataframe1=upload_df1,
    dataframe2=upload_df2,
    plot_value="UPLOAD",
    title=t,
    ytitle="Upload speed (Mbps)",
)

For upload speeds we see the same trend: `iperf` data is concentrated at the bottom.

In [34]:
# Per-device mean and median upload speed; suffix 1 = speedtest, 2 = iperf.
speedtest_upload = upload_df1.groupby('SK_PI')['UPLOAD']
result11 = speedtest_upload.mean().to_frame('mean1')
result12 = speedtest_upload.median().to_frame('median1')
result1 = result11.join(result12, how='outer')

iperf_upload = upload_df2.groupby('SK_PI')['UPLOAD']
result21 = iperf_upload.mean().to_frame('mean2')
result22 = iperf_upload.median().to_frame('median2')
result2 = result21.join(result22, how='outer')

# Outer join keeps devices that appear in only one of the two sources.
result = result2.join(result1, how='outer')

In [35]:
device_numbers_u=result.index
# Red markers at 10 Mbps on every device, used as the reference line.
# x now uses the upload device index; it previously reused device_numbers_d
# left over from the download chart, so x and y could differ in length and
# the markers could sit on the wrong devices.
upload_line=go.Scatter(x=device_numbers_u,y=[10] * len(device_numbers_u), mode='markers',marker=dict(color='red'), name='10Mbps')

# Grouped (non-stacked) bars: mean and median per device for both sources.
combined_bar_plot_4traces(xvalues=result.index,
                         yvalues1=result["mean1"],
                         yvalues2=result["mean2"],
                         yvalues3=result["median1"],
                         yvalues4=result["median2"],
                         name1="Mean speedtest",
                         name2="Mean iperf",
                         name3="Median speedtest",
                         name4="Median iperf",
                         title="Upload speed by device" +title_tail,
                         ytitle="Mbps",
                         line=upload_line,
                         stack=False)

Comparing this chart with the same one for Download speed, we can see that more devices have worse results for the iperf test. It looks fairly similar for devices 1,2,4 and 5; for the rest of the devices the speeds are quite different between `iperf` and `speedtest`.

In [36]:
# Add the hour of the day of every measurement to both upload frames.
for upload_frame in (upload_df1, upload_df2):
    upload_frame["hour"] = pd.to_numeric(upload_frame["time"].dt.hour)

In [37]:
# Upload speed by hour of the day for a single device, both sources.
device_number = 7
subset1 = upload_df1.loc[upload_df1["SK_PI"] == device_number]
subset2 = upload_df2.loc[upload_df2["SK_PI"] == device_number]

hourly_title = (
    "Upload speed by hour for device: " + str(device_number)
    + " (orange - speedtest, purple - iperf)" + title_tail
)
boxplot_2groups(
    dataframe1=subset1,
    dataframe2=subset2,
    plot_value='UPLOAD',
    sort_value='hour',
    title=hourly_title,
    xtitle="Hour of the day",
    ytitle="Mbps",
    uploadline=True,
)

This plot ("upload speed by hour of the day for device #7") was used in the last meeting's report (Jan 24, 2019) to demonstrate that, with `speedtest` data, this device reaches the CRTC target (10Mbps) for upload speed only at night/early morning.  
Now we have added data coming from `iperf` as well and we can see the same trend - speeds are going up between 2 to 5 am and then going down during business hours. But at the same time according to the `iperf` data this device never reaches CRTC target of 10Mbps.  
The same tendency we can see on the plot below.

In [38]:
def _label_time_group(hours):
    """Map hour-of-day values (0-23) to the night/day/evening labels.

    The boundaries follow the labels themselves: an hour value h covers
    h:00-h:59, so 23 and 0-6 are night, 7-16 are day and 17-22 are evening.
    Values that match no range (e.g. missing hours) get an empty string.

    The previous conditions were shifted by one hour: `hour > 23` could never
    be true, so 23:00-23:59 was labelled evening, 07:00-07:59 night and
    17:00-17:59 day.
    """
    conditions = [
        (hours >= 23) | (hours < 7),
        (hours >= 7) & (hours < 17),
        (hours >= 17) & (hours < 23),
    ]
    labels = ["night 23:00-07:00", "day 7:00-17:00", "evening 17:00-23:00"]
    return np.select(conditions, labels, default="")


# Same grouping for the speedtest (1) and iperf (2) upload frames.
upload_df1["time_group"] = _label_time_group(upload_df1["hour"])
upload_df2["time_group"] = _label_time_group(upload_df2["hour"])

In [39]:
# Upload speed by part of the day for a single device, both sources.
device_number = 7
subset1 = upload_df1.loc[upload_df1["SK_PI"] == device_number]
subset2 = upload_df2.loc[upload_df2["SK_PI"] == device_number]

time_group_title = (
    "Upload speed by time of the day for device: " + str(device_number)
    + " (orange - speedtest, purple - iperf)" + title_tail
)
boxplot_2groups(
    dataframe1=subset1,
    dataframe2=subset2,
    plot_value='UPLOAD',
    sort_value='time_group',
    title=time_group_title,
    xtitle="",
    ytitle="Mbps",
    uploadline=True,
    jitter=True,
)

In [40]:
# speedtest ping measurements inside the comparison window.
query_ping1 = "SELECT * FROM SPEEDTEST_PING WHERE PROVIDER!='iperf' AND PING>0{};".format(query_tail)
ping_df1 = get_dataframe_from_influxdb(
    client_df=client_df,
    query_influx=query_ping1,
    table_name='SPEEDTEST_PING',
)
# Sorted device ids reporting speedtest ping data.
device_numbers_p1 = sorted(int(device_id) for device_id in ping_df1['SK_PI'].unique())

In [41]:
# iperf ping measurements inside the comparison window.
query_ping2 = "SELECT * FROM SPEEDTEST_PING WHERE PROVIDER='iperf' AND PING>0{};".format(query_tail)
ping_df2 = get_dataframe_from_influxdb(
    client_df=client_df,
    query_influx=query_ping2,
    table_name='SPEEDTEST_PING',
)
# Sorted device ids reporting iperf ping data.
device_numbers_p2 = sorted(int(device_id) for device_id in ping_df2['SK_PI'].unique())

In [42]:
# Per-device scatter of ping latency, speedtest vs iperf.
t = "Ping latency by device for speedtest vs iperf data" + title_tail
scatterplot_2groups(
    dataframe1=ping_df1,
    dataframe2=ping_df2,
    plot_value="PING",
    title=t,
    ytitle="Ping latency(Miliseconds)",
)

For the ping latency we can see the same result: the purple dots (iperf) are more concentrated at the bottom, which actually indicates better (lower) latencies coming from `iperf`.

In [43]:
# Per-device mean and median ping latency; suffix 1 = speedtest, 2 = iperf.
speedtest_ping = ping_df1.groupby('SK_PI')['PING']
result11 = speedtest_ping.mean().to_frame('mean1')
result12 = speedtest_ping.median().to_frame('median1')
result1 = result11.join(result12, how='outer')

iperf_ping = ping_df2.groupby('SK_PI')['PING']
result21 = iperf_ping.mean().to_frame('mean2')
result22 = iperf_ping.median().to_frame('median2')
result2 = result21.join(result22, how='outer')

# Outer join keeps devices that appear in only one of the two sources.
result = result2.join(result1, how='outer')

In [44]:
# Grouped (non-stacked) bars: mean and median ping per device for both sources.
combined_bar_plot_4traces(
    xvalues=result.index,
    yvalues1=result["mean1"], name1="Mean speedtest",
    yvalues2=result["mean2"], name2="Mean iperf",
    yvalues3=result["median1"], name3="Median speedtest",
    yvalues4=result["median2"], name4="Median iperf",
    title="Ping latency by device" + title_tail,
    ytitle="Miliseconds",
    stack=False,
)

This graph shows that for almost all of the devices (except for #3 (Calgary control device) and #8(?)), median and average ping latencies coming from `iperf` are better than latencies coming from `speedtest`.

## 3. Statistics (download speed)
In order to evaluate whether a device reaches the CRTC goal of 50Mbps, we calculated the percentage of datapoints below and above the threshold in the last report (Jan 24th). This time we will try to apply a more formal, statistical approach.  
We will apply normality tests, resampling and t-tests to the download speed data. The same approach can be applied to upload speed data and ping latency (except that for ping latency we will compare with the control device rather than with a threshold).  
We will take all the data we have in the speedtest database up to today (meeting date 2019-02-07 14:00) and keep only devices with a minimum of 100 datapoints.

In [45]:
# Set to True to query the whole database without a time filter.
all_data = False

# Upper bound of the data used for the statistics. To use the current time:
#   starting_point = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
# (time_interval is not used in this section; a '4w'/'2w' window could be set here.)
starting_point = "2019-02-07 14:00:00"

# Tails appended to the plot titles and to the InfluxDB queries; both stay
# empty when all data is requested.
title_tail = "" if all_data else " to the date " + starting_point
query_tail = "" if all_data else " AND time <= '" + starting_point + "'"

In [46]:
# Report which slice of the speedtest data the statistics will use.
if all_data:
    print("Selecting all data(speedtest) from the database")
else:
    print("Selecting all data(speedtest) from the database back from starting point:", starting_point)

Selecting all data(speedtest) from the database back from starting point: 2019-02-07 14:00:00


In [47]:
# Threshold on the number of datapoints per device; used below to decide
# which devices are excluded from the statistical tests.
min_sample_size=100
print("Minimum sample size:",min_sample_size)

Minimum sample size: 100


In [48]:
# All positive speedtest download measurements up to starting_point.
query_download = "SELECT * FROM SPEEDTEST_DOWNLOAD WHERE PROVIDER!='iperf' AND DOWNLOAD>0{};".format(query_tail)
download_df = get_dataframe_from_influxdb(
    client_df=client_df,
    query_influx=query_download,
    table_name='SPEEDTEST_DOWNLOAD',
)

In [49]:
# Datapoints per device.
result=download_df.groupby('SK_PI').size().to_frame('size')
# min_sample_size is a minimum: a device with exactly that many points is
# kept. The previous `<=` comparison dropped such devices as well.
tobe_excluded=result[result['size']<min_sample_size].index
download_df=download_df[~download_df['SK_PI'].isin(tobe_excluded)]
# Sorted ids of the devices that remain, as plain Python ints.
device_numbers_d=sorted(int(device_id) for device_id in download_df['SK_PI'].unique())
print("Devices that are going to be statistically tested:",device_numbers_d)

Devices that are going to be statistically tested: [2, 3, 4, 5, 7, 9, 10, 11, 12, 14, 15, 16, 17, 18]


#### Test if data is normally distributed:
First step we need to do is to test if data is [normally distributed](https://www.varsitytutors.com/hotmath/hotmath_help/topics/normal-distribution-of-data). This will impact the method we can choose for the statistical testing.  
To find out if data is normally distributed or not we will perform visual tests (distribution plots) and statistical tests (Kolmogorov-Smirnov).

In [50]:
# One distribution plot of download speed per retained device.
t = 'Distribution plots for download speed per device' + title_tail
dist_subplots(
    dataframe=download_df,
    plot_value='DOWNLOAD',
    device_numbers=device_numbers_d,
    title=t,
)

This is the format of your plot grid:
[ (1,1) x1,y1 ]    [ (1,2) x2,y2 ]    [ (1,3) x3,y3 ]    [ (1,4) x4,y4 ]  
[ (2,1) x5,y5 ]    [ (2,2) x6,y6 ]    [ (2,3) x7,y7 ]    [ (2,4) x8,y8 ]  
[ (3,1) x9,y9 ]    [ (3,2) x10,y10 ]  [ (3,3) x11,y11 ]  [ (3,4) x12,y12 ]
[ (4,1) x13,y13 ]  [ (4,2) x14,y14 ]  [ (4,3) x15,y15 ]  [ (4,4) x16,y16 ]



The visual test shows that possibly devices #14 and #10 fall under the bell curve.
Let's compare with the Kolmogorov-Smirnov test:

In [51]:
alpha = 0.05

# Run three normality tests on the download speeds of every device.
# NOTE(review): the Kolmogorov-Smirnov reference distribution uses the mean and
# std estimated from the same sample; p-values of a KS test with estimated
# parameters are generally too large - worth confirming whether a Lilliefors
# correction is wanted.
matrix_sw = []
for device in device_numbers_d:
    subset = download_df[download_df["SK_PI"]==device]
    speeds = subset['DOWNLOAD']
    sw_stat, sw_p = scipy.stats.shapiro(speeds)
    ks_stat, ks_p = scipy.stats.kstest(speeds, cdf='norm', args=(speeds.mean(), speeds.std()))
    k2_stat, k2_p = scipy.stats.normaltest(speeds)
    matrix_sw.append([device, len(speeds) - 1, sw_stat, sw_p, ks_stat, ks_p, k2_stat, k2_p])

df = pd.DataFrame(matrix_sw)
df.columns = [
    'Device number', 'Degrees of freedom',
    'Shapiro-Wilk Test Statistic', 'Shapiro-Wilk p-value',
    'Kolmogorov-Smirnov test Statistic', 'Kolmogorov-Smirnov p-value',
    "D'Agostino's K-squared test Statistic", "D'Agostino's K-squared p-value",
]
df = df.set_index("Device number")

# 'y' when the test does not reject normality at level alpha, 'n' otherwise.
for test_name in ("Shapiro-Wilk", "Kolmogorov-Smirnov", "D'Agostino's K-squared"):
    df[test_name] = np.where(df[test_name + " p-value"] > alpha, 'y', 'n')

result = df[["Shapiro-Wilk","Kolmogorov-Smirnov","D'Agostino's K-squared"]]
result_table = ff.create_table(result, index=True)
#iplot(result_table)

In [52]:
# Split the devices by the Kolmogorov-Smirnov verdict.
passes_ks = result["Kolmogorov-Smirnov"] == 'y'
fails_ks = result["Kolmogorov-Smirnov"] == 'n'
normally_distributed_d = list(result[passes_ks].index)
not_normally_distributed_d = list(result[fails_ks].index)
print("Normally distributed devices according to the Kolmogorov-Smirnov test: ", normally_distributed_d)

Normally distributed devices according to the Kolmogorov-Smirnov test:  [2, 10, 15]


#### Test if mean of the population of normally distributed data statistically different from the threshold of 50Mbps:
Now we can run [t-test](https://researchbasics.education.uconn.edu/t-test/) for normally distributed data.

A t-test is commonly used to determine whether the mean of a population significantly differs from a specific value (called the hypothesized mean) or from the mean of another population.

For normally distributed devices  - how far are they from the threshold of 50Mbps?   
**H0** - null hypothesis - mean is less than or equal to 50  
**Ha** - alternative hypothesis - mean is greater than 50  

Confidence level 95% (alpha = 0.05)

**1-tailed 1-sample t-test**:  
t-statistic > 0 and p/2 < alpha: reject H0, the mean is statistically greater than 50  
otherwise: fail to reject H0, there is no evidence that the mean is greater than 50  

In [53]:
# One-sample, one-tailed t-test of each normally distributed device against
# the 50 Mbps threshold.
true_mean = 50
alpha = 0.05
list_t = []
for device in normally_distributed_d:
    subset=download_df[download_df["SK_PI"]==device]
    speeds = subset["DOWNLOAD"]
    # ttest_1samp returns a two-sided p-value; halve it for the one-tailed test.
    t_stat, p_two_sided = scipy.stats.ttest_1samp(speeds, true_mean)
    list_t.append(
        [device, round(speeds.mean(),2), round(speeds.std(),2), round(t_stat,2), p_two_sided/2])

# Passing the column names to the constructor keeps this working when no
# device qualified (empty list_t); set_index replaces the former
# reset_index/set_index/drop('index', 1) chain, whose positional axis
# argument is rejected by pandas 2.x.
df1 = pd.DataFrame(
    list_t,
    columns=['Device number',"Mean","Standard deviation","t-statistics","p-value/2"],
).set_index("Device number")

# 'y' only when the mean is above the threshold AND the difference is significant.
df1["Statistically > 50Mbps"] = np.where(
    (df1["t-statistics"]>0) & (df1["p-value/2"]< alpha), 'y', 'n')
df1=df1[["Mean","Statistically > 50Mbps"]]
result_table = ff.create_table(df1, index=True)
iplot(result_table)

According to the t-test results we can statistically prove that for these 3 devices with normally distributed data, if the data will be collected in the same way the mean of the population will not go above 50Mbps(based on the sample of data we collected so far).  
We can visually prove it with our current sample of data:

In [54]:
# Boxplot of download speed restricted to the normally distributed devices.
t = "Download speed by device " + title_tail
normal_devices_df = download_df[download_df["SK_PI"].isin(normally_distributed_d)]
simple_boxplot(
    dataframe=normal_devices_df,
    plot_value='DOWNLOAD',
    sort_value='SK_PI',
    title=t,
    xtitle="Device number",
    ytitle="Download speed (Mbps)",
    downloadline=True,
    jitter=True,
)

#### Test if data that is not normally distributed  statistically different from the threshold of 50Mbps

In [55]:
# Devices whose Kolmogorov-Smirnov flag was 'n' in the normality table.
print("Devices with data not normally distributed: ",not_normally_distributed_d)

Devices with data not normally distributed:  [3, 4, 5, 7, 9, 11, 12, 14, 16, 17, 18]


How can we statistically evaluate the data that is not normally distributed? According to the [SamKnows methodology](https://availability.samknows.com/broadband/uploads/methodology/SamKnows_Sample_Size_Whitepaper_20150610.pdf)  we can apply resampling and use Central Limit Theorem:

>If the download speed distribution is not normal, the mean has an unknown
>distribution and strictly speaking the t-test is inapplicable. However according to
>the central limit theorem, as the sample size increases, the distribution of the
>mean tends to be normal. Therefore if the sample size is big enough, the t-test
>and confidence interval are valid even if the download speed is not from a normal
>distribution.

We will create 500 samples with replacement with 45 values each and calculate mean for every sample. SamKnows recommends using 1000 samples (sample size 45)

In [56]:
sample_size_r = 45    # values drawn per resample
num_samples_r = 500   # resamples per device

# For every non-normal device, draw num_samples_r samples with replacement and
# keep the mean of each. The loop counter is used as random_state, so every
# draw has its own fixed seed and the cell gives the same means on re-run.
matrix_r = []
for device in not_normally_distributed_d:
    subset = download_df[download_df["SK_PI"]==device]
    list_r = [
        resample(subset["DOWNLOAD"], replace=True, n_samples=sample_size_r, random_state=seed).mean()
        for seed in range(num_samples_r)
    ]
    matrix_r.append([device] + list_r)

# One column per device: the first row holds the device id, which becomes the
# column label and is then removed from the data.
df_r = pd.DataFrame(matrix_r).transpose()
df_r.columns = df_r.iloc[0]
df_r = df_r.drop(index=0)
#df_r.head()

In [57]:
alpha = 0.05

# Run the same three normality tests on the resampled means of every device.
# NOTE(review): the Kolmogorov-Smirnov reference distribution uses the mean and
# std estimated from the same sample; p-values of a KS test with estimated
# parameters are generally too large - worth confirming whether a Lilliefors
# correction is wanted.
matrix_sw = []
for device in not_normally_distributed_d:
    subset = df_r[device]
    sw_stat, sw_p = scipy.stats.shapiro(subset)
    ks_stat, ks_p = scipy.stats.kstest(subset, cdf='norm', args=(subset.mean(), subset.std()))
    k2_stat, k2_p = scipy.stats.normaltest(subset)
    matrix_sw.append([device, len(subset) - 1, sw_stat, sw_p, ks_stat, ks_p, k2_stat, k2_p])

df = pd.DataFrame(matrix_sw)
df.columns = [
    'Device number', 'Degrees of freedom',
    'Shapiro-Wilk Test Statistic', 'Shapiro-Wilk p-value',
    'Kolmogorov-Smirnov test Statistic', 'Kolmogorov-Smirnov p-value',
    "D'Agostino's K-squared test Statistic", "D'Agostino's K-squared p-value",
]
df = df.set_index("Device number")

# 'y' when the test does not reject normality at level alpha, 'n' otherwise.
for test_name in ("Shapiro-Wilk", "Kolmogorov-Smirnov", "D'Agostino's K-squared"):
    df[test_name] = np.where(df[test_name + " p-value"] > alpha, 'y', 'n')

result = df[["Shapiro-Wilk","Kolmogorov-Smirnov","D'Agostino's K-squared"]]
result_table = ff.create_table(result, index=True)
#iplot(result_table)

In [58]:
# Devices whose resampled means are flagged 'y' in the Kolmogorov-Smirnov
# column of `result`.
normally_distributed_d1=list(result[result["Kolmogorov-Smirnov"]=='y'].index)
#print("Normally distributed devices according to the Kolmogorov-Smirnov test: ",normally_distributed_d1)

After applying this method, the data from almost all the devices has been transformed to "normal", and we can apply the t-test the same way we applied it before:

In [59]:
# One-sample, one-tailed t-test of the resampled means of each device against
# the 50 Mbps threshold.
true_mean = 50
alpha = 0.05
list_t = []
for device in normally_distributed_d1:
    subset=df_r[device]
    # ttest_1samp returns a two-sided p-value; halve it for the one-tailed test.
    t_stat, p_two_sided = scipy.stats.ttest_1samp(subset, true_mean)
    list_t.append(
        [device, round(subset.mean(),2), round(subset.std(),2), round(t_stat,2), p_two_sided/2])

# Passing the column names to the constructor keeps this working when no
# device qualified (empty list_t); set_index replaces the former
# reset_index/set_index/drop('index', 1) chain, whose positional axis
# argument is rejected by pandas 2.x.
df2 = pd.DataFrame(
    list_t,
    columns=['Device number',"Mean","Standard deviation","t-statistics","p-value/2"],
).set_index("Device number")

# 'y' only when the mean is above the threshold AND the difference is significant.
df2["Statistically > 50Mbps"] = np.where(
    (df2["t-statistics"]>0) & (df2["p-value/2"]< alpha), 'y', 'n')
df2=df2[["Mean","Statistically > 50Mbps"]]
result_table = ff.create_table(df2, index=True)
iplot(result_table)

According to the t-test results we can statistically prove that for  devices 5,7,9,11,12,16,17,18, if the data will be collected in the same way the mean of the population will not go above 50Mbps(based on the sample of data we collected so far).  
For the devices 3 and 14  we can statistically prove that mean of the data will go above 50Mbps (Current means are 172 and 92)  
We can visually prove it with our current sample of data:

In [60]:
# Boxplot of download speed restricted to the non-normally distributed devices.
t = "Download speed by device " + title_tail
non_normal_devices_df = download_df[download_df["SK_PI"].isin(not_normally_distributed_d)]
simple_boxplot(
    dataframe=non_normal_devices_df,
    plot_value='DOWNLOAD',
    sort_value='SK_PI',
    title=t,
    xtitle="Device number",
    ytitle="Download speed (Mbps)",
    downloadline=True,
    jitter=True,
)

For device 14, part of the data is below 50 Mbps and part of the data is above. We can statistically show that the mean of the data will stay above 50Mbps. But does it mean this device is in an "ok" or an "underserved" area? How does the CRTC define an "underserved" area? There is no clear definition on the CRTC web-site.

### Next Steps

For next steps, we will focus on creating the dashboard combining the most interesting and useful graphs with statistical testing. We will also try to find out from the CRTC the definition of an "underserved" region.