In [1]:
import json
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pycountry
from sqlalchemy import create_engine

In [2]:
def convert_country_names(df):
    """Return a copy of ``df`` whose ``country`` codes read "<name><flag>".

    Every value of the ``country`` column is looked up with
    ``pycountry.countries.get(alpha_2=...)``. The input frame is not modified,
    so the calling cell can be re-run without feeding already-converted labels
    back into the lookup.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``country`` column of alpha-2 country codes. Any index is
        accepted (rows are not addressed by position).

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the ``country`` column rewritten. Values that
        cannot be resolved are kept as they are.
    """
    def _label(code):
        # One lookup per value; fall back to the raw value when unresolved.
        try:
            country = pycountry.countries.get(alpha_2=code)
        except LookupError:
            # NOTE(review): some pycountry versions appear to raise instead of
            # returning None for values they cannot look up — treated the same.
            country = None
        if country is None:
            return code
        return country.name + country.flag

    converted = df.copy()
    converted['country'] = converted['country'].map(_label)
    return converted

# Packet Loss

In [3]:
# URL format: postgresql://user:password@host:port/databasename
# PING_DB_URL overrides the local development default, so real credentials
# never have to be written into the notebook.
engine = create_engine(
    os.environ.get("PING_DB_URL", "postgresql://postgres:postgres@postgres:5432/postgres")
)
# Total sent/received packets per country. ORDER BY pins the row order, which
# GROUP BY alone does not guarantee, so the table is identical on every run.
temp_df = pd.read_sql_query(
    "SELECT SUM(sent_packets) as sent, SUM(received_packets) as rcvd, country "
    "FROM ping_data GROUP BY country ORDER BY country;",
    con=engine,
)

In [4]:
# Per-country loss: the share of sent packets that never came back,
# kept both as a ratio and as a percentage.
losses = [1 - (rcvd / sent) for rcvd, sent in zip(temp_df['rcvd'], temp_df['sent'])]
losses_percent = [ratio * 100 for ratio in losses]

# Swap the alpha-2 codes for readable labels, then display the frame with the
# two loss columns attached (the joined frame is shown, not stored).
df = convert_country_names(temp_df)
loss_columns = pd.DataFrame({'loss_ratio': losses, 'loss_percent': losses_percent})
df.join(loss_columns)

Unnamed: 0,sent,rcvd,country,loss_ratio,loss_percent
0,1034184,1031883,Austria🇦🇹,0.002225,0.222494
1,1458335,1447150,Australia🇦🇺,0.00767,0.766971
2,465870,465213,Belgium🇧🇪,0.00141,0.141026
3,600501,573321,Brazil🇧🇷,0.045262,4.526221
4,4571922,4547207,Canada🇨🇦,0.005406,0.540582
5,5416125,4401392,Germany🇩🇪,0.187354,18.735406
6,2332480,2295240,Spain🇪🇸,0.015966,1.596584
7,9480963,8320991,France🇫🇷,0.122347,12.234749
8,4442860,4420661,United Kingdom🇬🇧,0.004997,0.499656
9,463038,461862,Greece🇬🇷,0.00254,0.253975


# Latency

In [5]:
# Every country code present in ping_data. ORDER BY makes the list order (and
# therefore the left-to-right order of the boxes plotted later) stable across
# runs; DISTINCT alone returns rows in no particular order.
cdata = pd.read_sql_query("SELECT DISTINCT country FROM ping_data ORDER BY country;", con=engine)
countries = cdata["country"].to_list()

In [6]:
# Maps a country code to the list of float values parsed from that country's
# `result` JSON rows; latency_by_country() appends to these lists.
rtts = {}
def latency_by_country(country):
    """Append every numeric value found in ``country``'s ping results to ``rtts``.

    Reads the ``result`` column of ``ping_data`` for the given country. Each
    row is parsed as a JSON list of objects, and the first value of each object
    is taken as one sample. Samples equal to ``"*"`` and samples that cannot be
    converted to ``float`` are skipped.

    Parameters
    ----------
    country : str
        Country code to filter ``ping_data.country`` on.

    Side effects
    ------------
    Extends ``rtts[country]`` (module-level dict), creating the list if the key
    is missing. Nothing is returned.
    """
    # Bound parameter instead of string concatenation: a quote inside `country`
    # can no longer break or alter the statement. `%(name)s` is the DBAPI
    # "pyformat" style, which the psycopg2 driver behind `postgresql://` uses.
    temp_data = pd.read_sql_query(
        "SELECT result AS json, country FROM ping_data WHERE country = %(country)s;",
        con=engine,
        params={"country": country},
    )

    # setdefault keeps a pre-seeded list and creates one otherwise, so a missing
    # key can no longer make every sample vanish silently.
    samples = rtts.setdefault(country, [])
    for point in temp_data["json"]:
        for entry in json.loads(point):
            # First value of the object; None for an empty object.
            value = next(iter(entry.values()), None)
            if value == "*":
                continue
            try:
                samples.append(float(value))
            except (TypeError, ValueError):
                # Non-numeric sample: skip it, but let real errors surface.
                continue

# Give every country a fresh, empty sample list, then load its measurements.
for country_code in countries:
    rtts[country_code] = []
    latency_by_country(country_code)

In [7]:
def is_outlier(points, thresh=3.5):
    """
    Returns a boolean array with True if points are outliers and False
    otherwise.

    Parameters:
    -----------
        points : Array-like of observations, either 1-D (numobservations,) or
            2-D (numobservations, numdimensions). Lists are accepted.
        thresh : The modified z-score to use as a threshold. Observations with
            a modified z-score (based on the median absolute deviation) greater
            than this value will be classified as outliers.

    Returns:
    --------
        mask : A numobservations-length boolean array. Empty input gives an
            empty mask. When the median absolute deviation is 0, every point
            that differs from the median is flagged.

    References:
    ----------
        Boris Iglewicz and David Hoaglin (1993), "Volume 16: How to Detect and
        Handle Outliers", The ASQC Basic References in Quality Control:
        Statistical Techniques, Edward F. Mykytka, Ph.D., Editor.
    """
    points = np.atleast_1d(np.asarray(points, dtype=float))
    # Treat a flat sequence as n one-dimensional observations; without this the
    # axis=-1 sum below collapses the whole input into a single number.
    if points.ndim == 1:
        points = points[:, None]
    if points.shape[0] == 0:
        return np.zeros(0, dtype=bool)

    median = np.median(points, axis=0)
    # Euclidean distance of each observation from the per-dimension median.
    diff = np.sqrt(np.sum((points - median) ** 2, axis=-1))
    med_abs_deviation = np.median(diff)

    # At least half the points sit exactly on the median: the score would be
    # 0/0 or x/0. Same outcome as the division (inf > thresh, nan is not),
    # without the runtime warnings.
    if med_abs_deviation == 0:
        return diff > 0

    modified_z_score = 0.6745 * diff / med_abs_deviation

    return modified_z_score > thresh

def visualize_latencies(latencies, filter = False, max_latency = 1000):
    """Draw one box plot per country from a ``{country: [latency, ...]}`` mapping.

    Parameters
    ----------
    latencies : dict
        Maps a country label to its list of latency samples. Boxes are drawn
        in the mapping's iteration order.
    filter : bool, default False
        When True, only samples strictly below ``max_latency`` are plotted.
        (The name shadows the builtin but is kept because callers pass it by
        keyword.)
    max_latency : float, default 1000
        Cutoff applied when ``filter`` is True.

    The figure is shown with ``plt.show()``; nothing is returned.
    """
    names = list(latencies)
    samples = [latencies[name] for name in names]

    if filter:
        samples = [[latency for latency in ls if latency < max_latency] for ls in samples]

    plt.boxplot(samples)
    # boxplot puts the i-th dataset at x = i (1-based) by default; label those
    # positions so each box can be matched to its country.
    plt.xticks(list(range(1, len(names) + 1)), names)

    # Add labels and title
    plt.xlabel('Country')
    plt.ylabel('Average Latency (ms)')
    plt.title('Average Latency by Country (Data Filtered = ' + str(filter) + ')')

    # Display the plot
    plt.tight_layout()
    plt.show()

In [8]:
# Plot the filtered view first, then the unfiltered one, so the two can be compared.
for apply_filter in (True, False):
    visualize_latencies(rtts, filter=apply_filter)

TypeError: visualize_latencies() got an unexpected keyword argument 'filtered'