# SEN163A - Fundamentals of Data Analytics
# Assignment 2 - Large-scale Internet Data Analysis
### Ir. Jacopo De Stefani - [J.deStefani@tudelft.nl](mailto:J.deStefani@tudelft.nl)
### Joao Pizani Flor, M.Sc. - [J.p.pizaniflor@tudelft.nl](mailto:J.p.pizaniflor@tudelft.nl)

### 05-03-2022
## Group 2
- Emmanuel M Boateng - '5617642'
- Joost Oortwijn - '4593472'
- Philip Busscher - ''4611993''
- Floris Kool - ''4975243''


# Introduction
This notebook presents the results of our analysis of potential ASN's which can be used for hosting the mobile banking IT infrastructure for GNI Bank. Based on three datasets (described in chapter 1), we have idenified 4 locations which are, based on the available data, the best hosting options for GNI Bank. The indepth analysis consists of 4 parts and is shown in chapter 2. The final conclusion of our analysis is presented in chapter 3. In addition, we have identified different limitations to the available data which could decrease the usability of our recommendations in chapter 3, therefore these should be taken into account. These limitations are described in chapter 1.2.

# 1. Dataset description

Short description of the 4 datasets used

In [3]:
#Description of variables used accross the entire notebook

#AS_df - Complete AS dataset as provided
#P_df - Complete probe dataset as provided
#EU_list - list of countries in EU
#ipv4_df - Complete ip2location dataset
    
#as_probe_joined_df - Merge of AS and Probe dataset, from 2.1 on filtered to contain only type hosting and location from EU
#AS_Probe_RIPE_df - Merge of AS and Probe dataset with probe ids in RIPE dataset and ASNs of type hosting and location from EU
#display_df - Same as AS_Probe_RIPE_df with removed duplicate ASNs

#RIPE_df - Complete useful contents of a single hour of ripe data (used for 2.1 & 2.2)
#RIPE_HostAS_df - Entries of a single hour of ripe data with probe connected to an EU ASN with type host,
    #From 2.2 on also filtered to only contain entries with destination address in EU
    
#Complete_ASN_Set - Set of ASNs of hosting type from EU and in complete RIPE dataset (Ended up being the same for each hour of data)
#Complete_RIPE_Entries_df - Complete set of RIPE entries with probe ASN in eu and type host and destination in EU
    #Can be loaded from all ripe files
    #Also saved in RIPE_00-23.pkl (RIPE_00-5.pkl since we didn't read through the entire set)
    
#ASN_Country_Avg_df - Combination of each country, ASN and average ping
#ASN_Country_Matrix_df - Combination of each country, ASN and average ping, with Country as index and ASN as column labels

## 1.1 Opening the data

In [1]:
import pickle
import time
import bz2
import os
import sys
import json
import pandas as pd
import numpy as np
import ipaddress
import io

### AS and Probe datasets

In [12]:
#AS Dataset

AS_Filename = 'data/AS_dataset.pkl'

with open(AS_Filename, 'rb') as file:
    
    AS_df = pickle.load(file)

In [13]:
#Probe dataset

Probe_Filename = 'data/probe_dataset.pkl'

with open(Probe_Filename, 'rb') as file:
    
    P_df = pickle.load(file)

In [14]:
# EU country codes retrieved from: https://ec.europa.eu/eurostat/statistics-explained/index.php?title=Glossary:Country_codes
EU_list = ['BE','BG','CZ','DK','DE','EE','IE','EL','ES','FR','HR','IT','CY','LV','LT','LU','HU','MT','NL','AT','PL','PT','RO','SI','SK','FI','SE']

In [15]:
#Merge the AS and Probe datasets
as_probe_joined_df = pd.merge(P_df,AS_df, on='ASN')

### IP2Location dataset

In [16]:
#IP 2 Location dataset

IP_Filename = "data/IP2LOCATION-LITE-DB1.CSV"

ipv4_df = pd.read_csv(IP_Filename)

ipv4_df.rename(columns = {'0':'ip_from', '16777215':'ip_to',
                              '-':'country_code','-.1':'country_name'}, inplace = True)

### Single ripe file (Used for C)

In [None]:
#Ripe dataset (Single file)

#Option 1 decompressed file
decomFilename = 'data/ping-2022-03-01T2300_decom'
#decomFile     = open(decomFilename, 'rt')

#Option 2 BZ2 file
bz2Filename = 'data/ping-2022-03-01T0000.bz2'
bz2File     = bz2.open(bz2Filename, 'rt')


# List of tuples
# https://stackoverflow.com/questions/28056171/how-to-build-and-fill-pandas-dataframe-from-for-loop
tuple_list = []

start  = time.time()

#for line in bz2File:
for line in bz2File:
    
    decoded_line = json.loads(line)
    if "af" in decoded_line and "dst_addr" in decoded_line and "prb_id" in decoded_line and "avg" in decoded_line: 
        if decoded_line["af"] == 4:
            tuple_list.append((decoded_line["dst_addr"],decoded_line["prb_id"],decoded_line["avg"]))

            
dur         = round(time.time() - start,2)
print("Loading took: "  + str(dur) + " seconds")
print("Lines added to tuple: " + str(len(tuple_list)))

#finally close bz2File
bz2File.close()

In [32]:
#Load tuples data into dataframe
start  = time.time()

RIPE_df = pd.DataFrame(tuple_list)

dur         = round(time.time() - start,2)
print("Loading took: "  + str(dur) + " seconds")

Loading took: 6.05 seconds


### Complete ripe dataset from BZ2 files
### Only loaded through 6 files due to time limitations
Changed reading method to raw characters to save about 20% in loading time

In [None]:
#Changed 24 -> 6
RIPE_Filenames = pd.date_range('2022-03-01', periods=6, freq='60min').strftime('D:/FoDa Data/ping-%Y-%m-%dT%H%M.bz2').tolist()

Complete_ASN_List = []
Complete_RIPE_Entries_df = pd.DataFrame({0:[], 1:[], 2:[], 'Country':[]})


for filename in RIPE_Filenames:
    #Read RIPE data
    print(filename)
    
    start  = time.time()
    with open(filename, 'rb') as fi:
        decomp = bz2.BZ2Decompressor()
        residue = b''
        total_lines = 0
        m = 0
        tuple_list = []
    
        for data in iter(lambda: fi.read(100 * 1024), b''):
            raw = residue + decomp.decompress(data) # process the raw data and  concatenate residual of the previous block to the beginning of the current raw data block
            residue = b''
            # process_data(current_block) => do the processing of the current data block
            current_block = raw.split(b'\n')
            if raw[-1] != b'\n':
                residue = current_block.pop() # last line could be incomplete

            for items in current_block:
                df_dict = json.loads(items.decode('utf-8'))
                if ('dst_addr' in df_dict) and (df_dict['af'] == 4) and (df_dict["avg"] > 0):
                    tuple_list.append((df_dict["dst_addr"],df_dict["prb_id"],df_dict["avg"]))
    
    fi.close()
    
    Temp_RIPE_df = pd.DataFrame(tuple_list)
    
    #Get list of ASNs
    unique_prbID = Temp_RIPE_df[1].unique()
    
    Temp_AS_Probe_RIPE_df = as_probe_joined_df.loc[as_probe_joined_df['prb_id'].isin(unique_prbID)]
    
    unique_ASNs = Temp_AS_Probe_RIPE_df['ASN'].unique()
    Complete_ASN_List.extend(unique_ASNs)
    
    
    #Get RIPE entries with dst addr in eu
    Temp_RIPE_HostAS_df = Temp_RIPE_df.loc[Temp_RIPE_df[1].isin(Temp_AS_Probe_RIPE_df['prb_id'])]
    
    for i in Temp_RIPE_HostAS_df.index:
        Temp_RIPE_HostAS_df.at[i, 0] = int(ipaddress.IPv4Address(Temp_RIPE_HostAS_df[0][i]))

    Temp_RIPE_HostAS_df = Temp_RIPE_HostAS_df.sort_values(by=[0])
    ipv4_df = ipv4_df.sort_values(by=["ip_from"])

    Dest_Addr_Countries = []
    ripeindex = 0
    ipindex = 0

    while Temp_RIPE_HostAS_df.iat[ripeindex, 0] < ipv4_df.at[ipindex, "ip_from"]:
        ripeindex = ripeindex + 1
        Dest_Addr_Countries.append("-")

    for ipindex in ipv4_df.index:
        while Temp_RIPE_HostAS_df.iat[ripeindex, 0] >= ipv4_df.at[ipindex, "ip_from"] and Temp_RIPE_HostAS_df.iat[ripeindex, 0] <= ipv4_df.at[ipindex, "ip_to"]:
            Dest_Addr_Countries.append(ipv4_df.at[ipindex, "country_code"])
            ripeindex = ripeindex + 1
            if ripeindex >= len(Temp_RIPE_HostAS_df[0]):
                break

        if ripeindex >= len(Temp_RIPE_HostAS_df[0]):
            break
            
    Temp_RIPE_HostAS_df["Country"] = Dest_Addr_Countries
    Temp_RIPE_HostAS_df = Temp_RIPE_HostAS_df.loc[Temp_RIPE_HostAS_df['Country'].isin(EU_list)]    
    
    
    #Add entries to complete dataframe
    frames = [Complete_RIPE_Entries_df, Temp_RIPE_HostAS_df]
    Complete_RIPE_Entries_df = pd.concat(frames)
    
    dur         = round(time.time() - start,2)
    print("Added " + str(len(Temp_RIPE_HostAS_df[0])) + " entries and " + str(len(unique_ASNs)) + " ASNs in " + str(dur) + " seconds")
    print()
    

#Remove duplicates
Complete_ASN_Set = set(Complete_ASN_List)


In [None]:
#Save filtered RIPE data to file
print("Unique ASNs: " + str(len(Complete_ASN_Set)))
print("Ripe entries: " + str(len(Complete_RIPE_Entries_df[0])))

Complete_ASN_Set_df = pd.DataFrame(Complete_ASN_Set)

Complete_ASN_Set_df.to_pickle("data/ASN_00-06.pkl")

Complete_RIPE_Entries_df.to_pickle("data/RIPE_00-06.pkl")

### Complete ripe dataset from pkl file
### Only loaded through 6 files due to time limitations

In [10]:
#Probe dataset

Ripe_Filename = 'data/RIPE_00-05.pkl'
ASN_Filename = 'data/ASN_00-05.pkl'

with open(Ripe_Filename, 'rb') as file:
    
    Complete_RIPE_Entries_df = pickle.load(file)
    

with open(ASN_Filename, 'rb') as file:
    
    Complete_ASN_Set_df = pickle.load(file)

Complete_ASN_Set = Complete_ASN_Set_df[0]



More detailed description of data if needed (Can also be after opening each dataset)

## 1.2 Limitations in data (Question A)

Evaluate if there are limitations in the provided datasets (AS and probe data set). If you find limitations, describe these and conjecture possible reasons, supported with data.

### 1.2.1 Limitations in the AS and Probe dataset
A limitation in the AS and probe dataset is the fact that according to the AS dataset there are 534 AS's that can be used for hosting in the EU. But when the AS data set is merged with the probe dataset the number of potential AS's has decreased to 339. An explanation for this is the fact that the probe dataset consists of significantly less AS's then the AS dataset (3652 compared to 60122). This can be seen as limitation as potential AS's are left out of the analysis eventhough these could be a interesting options for hosting the EU. 

In [21]:
len(AS_df.loc[(AS_df['type'] == 'hosting') & (AS_df['Country'].isin(EU_list))])

534

In [22]:
as_probe_joined_df = pd.merge(P_df,AS_df, on='ASN')
len(as_probe_joined_df.loc[(as_probe_joined_df['type'] == 'hosting') & (as_probe_joined_df['Country'].isin(EU_list))])

339

In [23]:
AS_df['ASN'].value_counts()

AS55330     1
AS31573     1
AS200609    1
AS200605    1
AS51879     1
           ..
AS48290     1
AS48295     1
AS48303     1
AS48360     1
AS37485     1
Name: ASN, Length: 60122, dtype: int64

In [24]:
P_df['ASN'].value_counts()

AS3320      351
AS7922      334
AS6830      333
AS3215      243
AS12322     210
           ... 
AS8282        1
AS12775       1
AS199484      1
AS199853      1
AS49432       1
Name: ASN, Length: 3652, dtype: int64

### 1.2.2 Limitation in the IP location dataset

### 1.2.3 Limitations in the RIPE dataset

When looking at the RIPE dataset, it was found that for some lines the IP destination addresses were missing. This was not the case for the Probe ID's and average round trip times as these were always included within the lines of the RIPE dataset. Below the number of missing values in the first 5 million lines are shown. When looking at the lines where the destination address is missing, it can be seen that these adresses are missing because there is no node or server name provided or known. Obviously this results in the lack of a destination address of that specific ping measurement. Due to the size of the available dataset this is not assumed as a limitation and the lines with missing destination address are simply skipped when opening the RIPE dataset. 

In [26]:
bz2Filename = 'data/ping-2022-03-01T2300.bz2'
bz2File_limitation = bz2.open(bz2Filename, 'rt') 
missing_adres = 0
missing_probeID = 0
missing_avg = 0
line_number = 0

for line in bz2File_limitation:
    decoded_line = json.loads(line)
    line_number += 1
    if "dst_addr" not in decoded_line: 
        missing_adres += 1
      
    if "prb_id" not in decoded_line: 
        missing_probeID += 1
        
    if "avg" not in decoded_line: 
        missing_avg += 1
           
    if line_number > 5000000:
        print('There are', missing_adres, 'missing IP destination addresses in the first 5m lines of the RIPE dataset (for one hour)')
        print('There are', missing_probeID, 'missing probe ID\'s in the first 5m lines of the RIPE dataset (for one hour)')
        print('There are', missing_avg, 'missing average round-trip time values in the first 5m lines of the RIPE dataset (for one hour)')
        break
        
bz2File_limitation.close()

There are 11428 missing IP destination addresses in the first 5m lines of the RIPE dataset (for one hour)
There are 0 missing probe ID's in the first 5m lines of the RIPE dataset (for one hour)
There are 0 missing average round-trip time values in the first 5m lines of the RIPE dataset (for one hour)


# 2 Analysis

Short description of what is going to be analyzed

## 2.1 AS (Question B)

With the AS and probe data set, find the number m of AS’s that can be used for hosting in the EU
and have probes in the RIPE data set. Sort the ASN’s in ascending order and include the first and last
three in your report (number, name and country).


In [27]:
#Merge the AS and Probe datasets
as_probe_joined_df = pd.merge(P_df,AS_df, on='ASN')

In [28]:
# EU country codes retrieved from: https://ec.europa.eu/eurostat/statistics-explained/index.php?title=Glossary:Country_codes
EU_list = ['BE','BG','CZ','DK','DE','EE','IE','EL','ES','FR','HR','IT','CY','LV','LT','LU','HU','MT','NL','AT','PL','PT','RO','SI','SK','FI','SE']

# Filter data set for AS's that can be used for hosting in the EU
as_probe_joined_df = as_probe_joined_df.loc[(as_probe_joined_df['type'] == 'hosting') & (as_probe_joined_df['Country'].isin(EU_list))]

In [33]:
#Get the unique number of probe IDs that are in the RIPE Data
unique_prbID = RIPE_df[1].unique()

print("Unique probe IDs: " + str(len(unique_prbID)))

Unique probe IDs: 11597


In [34]:
#Filter the data set by only selecting the ASN's that have probes in the Ripe dataset
AS_Probe_RIPE_df = as_probe_joined_df.loc[as_probe_joined_df['prb_id'].isin(unique_prbID)]

#Sort by ASN
AS_Probe_RIPE_df.sort_values(by=['ASN']).sort_values(by=['ASN'])

print("Number of probes connected to AS that can be used for hosting in the EU and are in the RIPE dataset: " + str(len(AS_Probe_RIPE_df["ASN"])))


Number of probes connected to AS that can be used for hosting in the EU and are in the RIPE dataset: 234


In [36]:
#Remove duplicate ASNs (Probes connected to same AS)
display_df = AS_Probe_RIPE_df.drop_duplicates(subset=['ASN'])

#Remove unused columns
display_df = display_df.drop(columns=['prb_id', 'NumIPs', 'type'])

#Sort by ASN
display_df.insert(2, 'AS', display_df['ASN'].str.replace('AS', ''))
display_df['AS'] = pd.to_numeric(display_df['AS'])
display_df = display_df.sort_values('AS')

#Print anwser to question B
print("Number of AS that can be used for hosting in the EU and are in the RIPE dataset: " + str(len(display_df["ASN"])))


Number of AS that can be used for hosting in the EU and are in the RIPE dataset: 113


In [37]:
#First 3 probes
display_df.head(3)

Unnamed: 0,ASN,Country,AS,Name
6422,AS6724,DE,6724,Strato AG
10262,AS8304,FR,8304,Ecritel SARL
8489,AS8315,NL,8315,Sentia Netherlands BV


In [38]:
#Last 3 probes
display_df.tail(3)

Unnamed: 0,ASN,Country,AS,Name
8377,AS201978,CY,201978,Osbil Technology Ltd.
9379,AS203944,LU,203944,NTT Luxembourg PSF S.A.
2910,AS203953,DK,203953,Hiper A/S


Description of results

## 2.2 Hosting location (Question C)
For a single hour in the RIPE data set: find all valid entries where the probe has hosting type AS and
the target IPv4 is from an EU country. Implement this in an efficient way.

In [39]:
#Selects all entries in RIPE data with probe connected to EU as of type hosting
RIPE_HostAS_df = RIPE_df.loc[RIPE_df[1].isin(AS_Probe_RIPE_df['prb_id'])]

print("Entries with probe connected to an EU as with type hosting: " + str(len(RIPE_HostAS_df[1])))

Entries with probe connected to an EU as with type hosting: 704132


In [40]:
#Convert IP strings to IP integers
for i in RIPE_HostAS_df.index:  
    IP_Splitstring = RIPE_HostAS_df[0][i].split(".") 
    RIPE_HostAS_df.at[i, 0] = int(IP_Splitstring[0]) * 16581375 + int(IP_Splitstring[1]) * 65025 + int(IP_Splitstring[2]) * 255 + int(IP_Splitstring[3])

In [41]:
#Add country of dst_addr to RIPE_HostAS_df

#Sorting the IP lists so we can check from low to high IPs
RIPE_HostAS_df = RIPE_HostAS_df.sort_values(by=[0])
ipv4_df = ipv4_df.sort_values(by=["ip_from"])

Dest_Addr_Countries = []
ripeindex = 0
ipindex = 0

#Check if there are IP addresses lower than included in the IP2Location dataset
while RIPE_HostAS_df.iat[ripeindex, 0] < ipv4_df.at[ipindex, "ip_from"]:
    ripeindex = ripeindex + 1
    Dest_Addr_Countries.append("-")

print("IP addresses not included in IP2location dataset: " + str(ripeindex))

#Check for each range of IP addresses in the IP2Location dataset which dst_addr IPs are present
#Break loop early if the length of the RIPE dataset is reached
for ipindex in ipv4_df.index:
    while RIPE_HostAS_df.iat[ripeindex, 0] >= ipv4_df.at[ipindex, "ip_from"] and RIPE_HostAS_df.iat[ripeindex, 0] <= ipv4_df.at[ipindex, "ip_to"]:
        Dest_Addr_Countries.append(ipv4_df.at[ipindex, "country_code"])
        ripeindex = ripeindex + 1
        if ripeindex >= len(RIPE_HostAS_df[0]):
            break
    
    if ripeindex >= len(RIPE_HostAS_df[0]):
        break

print("IP addresses linked to country: " + str(len(Dest_Addr_Countries)))

#Add list for destination address location to dataframe
RIPE_HostAS_df["Country"] = Dest_Addr_Countries

IP addresses not included in IP2location dataset: 79
IP addresses linked to country: 704132


In [42]:
#Remove entries not in EU
RIPE_HostAS_df = RIPE_HostAS_df.loc[RIPE_HostAS_df['Country'].isin(EU_list)]

print("Entries with probe connected to an EU AS with type hosting and destination address within EU: " + str(len(RIPE_HostAS_df[1])))

Entries with probe connected to an EU AS with type hosting and destination address within EU: 133653


In [43]:
RIPE_HostAS_df.head(10)

Unnamed: 0,0,1,2,Country
13073077,34230351,1000114,5.056936,FR
14702804,34328463,55634,3.784625,FR
13797784,34350710,51992,11.232558,FR
11730303,34351632,18567,13.708424,FR
8886535,34376151,31065,4.453532,FR
15393251,34392634,13972,6.531589,FR
15310796,34392634,21994,0.608414,FR
1138640,34392634,17138,17.750491,FR
15366304,34392634,6693,3.573891,FR
7790751,34392634,21959,0.777758,FR


Description of results

## 2.3 Latency (Question D)
Move from using only an hour to the full day. It is advisable to store the raw results of each file. Then,
using all processed files, calculate the average latency’s for each country-AS combination and store
the results into one ncountries ×m matrix. If we could place one server in each country, what would the
minimum average latency be for each country? (include in your report)


In [111]:
#Load the avg ping for each country-AS combination into a DF
ASN_Country_Avg =[]
start  = time.time()

for country in EU_list:
    
    #Filter each country's ping values seperately into a dataframe
    country_df = Complete_RIPE_Entries_df.loc[Complete_RIPE_Entries_df['Country'] == country]
    
    for ASN in Complete_ASN_Set:
        
        #Filter probe IDs for each seperate ASN
        #There are more probes than ASs to calculate the average ping more accurately we use all probes
        prb_df = as_probe_joined_df.loc[as_probe_joined_df['ASN'] == ASN]                            
        
        #Filter the ping data so it includes all probes from selected ASN and selected country
        temp_df = country_df.loc[country_df[1].isin(prb_df['prb_id'])]
        
        #Create sum of all ASN - Country ping measurements
        sumvalue = 0
        i = 0
        for pingvalue in temp_df[2]:
            sumvalue = sumvalue + pingvalue
            i = i+1
        
        #Check if there are ping measurements between AS - Country
        #Calculate average when needed, enter nan when no data available
        if not i == 0:
            average = sumvalue/i
            ASN_Country_Avg.append((country, ASN, average))
        else:
            ASN_Country_Avg.append((country, ASN, np.nan))
            
    

#Load tuple list into dataframe
ASN_Country_Avg_df = pd.DataFrame(ASN_Country_Avg)  
ASN_Country_Avg_df.columns = ['Country','ASN','Average latency']

dur         = round(time.time() - start,2)
print("Loading took: " + str(dur) + " seconds")

ASN_Country_Avg_df.head(5)

Loading took: 15.48 seconds


Unnamed: 0,Country,ASN,Average latency
0,BE,AS50926,37.631237
1,BE,AS24961,15.303253
2,BE,AS39790,19.871554
3,BE,AS48854,20.9096
4,BE,AS62416,37.32258


In [112]:
#Display Country-AS-AVerage dataframe as a matrix

ASN_Country_Avg_df = ASN_Country_Avg_df.iloc[:, 1:] # asn and latency
df_groupby = ASN_Country_Avg_df.groupby('ASN')['Average latency'].apply(list)

new_dftesttest = np.zeros((len(df_groupby), len(df_groupby[0])))
for i in range(len(df_groupby)):
    for j in range(len(df_groupby[0])):
        new_dftesttest[i,j] = df_groupby[i][j]

df_groupby.index
ASN_Country_Matrix_df = pd.DataFrame(new_dftesttest.transpose())   

column_list = list(df_groupby.index)
ASN_Country_Matrix_df.columns=column_list
ASN_Country_Matrix_df.insert(0,'Country', EU_list)
ASN_Country_Matrix_df.set_index('Country')
#ASN_Country_Matrix_df.insert(0, 'Country', EU_list)
#ASN_Country_Matrix_df

Unnamed: 0_level_0,AS12676,AS12824,AS12859,AS12876,AS12993,AS13287,AS15401,AS15598,AS15685,AS15817,...,AS61211,AS62000,AS62282,AS62416,AS6724,AS8304,AS8315,AS8560,AS8893,AS9211
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
BE,11.156481,39.99105,10.863737,15.426602,47.130012,44.378539,32.022717,13.850941,7.24503,18.021685,...,16.165834,9.038073,30.031467,37.32258,22.10574,36.298499,14.670655,15.679276,14.239229,23.337293
BG,35.15371,58.282718,38.394082,,,,,,24.190486,39.580796,...,,38.017865,,66.94477,45.340079,,,31.703337,40.999549,34.482231
CZ,12.155558,,26.101986,,,,,,2.718502,16.392534,...,24.176706,19.323957,,52.145587,20.622238,37.162993,,11.712396,17.831068,
DK,21.028932,30.229738,16.442769,,34.615563,,,,,23.171498,...,,31.305757,,54.447622,21.598313,39.877256,,21.907005,15.833915,10.495258
DE,8.946247,26.107049,15.593103,16.04698,33.258409,38.762043,34.19517,6.402923,12.079615,13.481864,...,26.098151,17.739043,28.32944,46.813313,18.390852,32.461834,13.038404,9.949708,12.060789,11.322729
EE,29.877902,,36.278537,,9.662063,,,,,36.524693,...,,40.343486,,70.27507,29.945309,,,33.617919,27.866751,
IE,23.378083,,25.681111,,,,,,,28.969252,...,,23.853715,,46.950143,31.873797,37.075312,,25.522105,25.064629,
EL,,,,,,,,,,,...,,,,,,,,,,
ES,37.244236,53.957538,39.916419,19.806524,,6.24773,,,,38.716897,...,,28.723318,,19.346627,42.662035,32.945351,,34.003445,38.985322,
FR,17.536786,38.890575,22.038415,8.418621,,23.568624,11.139845,,16.252486,21.942327,...,,7.859537,35.479169,36.45756,27.828333,19.639763,26.828019,16.838865,21.960819,19.611572


In [113]:
#Calculate the minimum latency for each country

#Still needs to be done


min_latency_s = ASN_Country_Matrix_df.min(axis = 1)
#min_latency_df = pd.Series.to_frame(min_latency_s, 'min Latency')
min_latency_df
    

  min_latency_s = ASN_Country_Matrix_df.min(axis = 1)


Unnamed: 0,Country,min Latency
0,BE,2.766055
1,BG,8.660445
2,CZ,1.81391
3,DK,2.444158
4,DE,6.402923
5,EE,3.619814
6,IE,3.678095
7,EL,
8,ES,5.084935
9,FR,2.846752


Description of results

## 2.4 Optimal server locations (Question E)
Since we are only allowed to place four servers, determine the best four datacenters based on the total
latency for all countries. Report your findings and your procedure to obtain them. Also include the
average latency for each country.


In [175]:
#Code...

#ASN_Country_Matrix_df.insert(0, 'Country', EU_list)
ASN_latency_df = ASN_Country_Matrix_df.iloc[:, 1:]
results_byASN = ASN_latency_df.idxmin(axis=0)
ASN_min_index = list(results_byASN)
print(ASN_min_index)
result_byCountry = ASN_latency_df.idxmin(axis=1)
Country_min_index = list(result_byCountry)
#print(Country_min_index)
bestASN=[]
newList = []

# In this area, we get the index the minimum for each country and 
#the minuimum for each ASN column.
# Cross checking the two should give us a list of possible countries with 
# minimum latency

[4, 20, 18, 9, 5, 8, 9, 4, 2, 4, 9, 18, 8, 18, 14, 3, 0, 9, 2, 15, 26, 5, 2, 3, 18, 26, 3, 18, 18, 26, 26, 3, 21, 18, 4, 16, 18, 18, 18, 15, 18, 4, 26, 8, 9, 16, 26, 11, 26, 0, 18, 11, 18, 21, 18, 4, 0, 4, 11, 18, 9, 9, 2, 21, 6, 21, 2, 9, 18, 18, 22, 2, 0, 26, 16, 0, 9, 26, 18, 15, 26, 26, 18, 2, 18, 26, 20, 18, 20, 3, 5, 18, 4, 8, 15, 3, 26, 26, 9, 18, 3, 15, 26, 0, 9, 14, 21, 18, 9, 18, 15, 4, 26]


0Description of results

# Conclusions

... 
add code if needed

# Stuff we saved, delete if not used in final version

## Part C Alternative Approach reading all the dataset

In [None]:
import pickle
import time
import bz2
import os
import sys
import json
import pandas
import io
import datetime
import socket
import struct

def ip2int(addr):
    return struct.unpack("!I", socket.inet_aton(addr))[0]

with open('data/AS_dataset.pkl', 'rb') as file:
    AS_df = pickle.load(file)
    
with open('data/probe_dataset.pkl', 'rb') as file:    
    P_df = pickle.load(file)
    
decomFilename = 'data/ping-2022-03-01T2300.bz2'
#decomFile     = bz2.open(decomFilename, 'rt')   
merged_df = P_df.merge(AS_df)

ipv4_df = pandas.read_csv("data/IP2LOCATION-LITE-DB1.CSV")
ipv4_df.rename(columns = {'0':'ip_from', '16777215':'ip_to',
                              '-':'country_code','-.1':'country_name'}, inplace = True)


EU_Countries = ["AT","BE","HR","CY","CZ","DK","EE","FI","FR","GR","DE","HU",
                "IE","IT","LV","LT","LU","MT","NL","PL","PT","RO","SK","SI",
                "ES","SE"]

EU_data = merged_df[merged_df['Country'].isin(EU_Countries)]
EU_Hosting = EU_data[EU_data['type'] == 'hosting']



merged_df.insert(2, 'AS', merged_df['ASN'].str.replace('AS',''))
merged_df['AS'] = pandas.to_numeric(merged_df['AS'])
merged_df['prb_id'] = pandas.to_numeric(merged_df['prb_id'])


merged_df_sorted = merged_df.sort_values('AS')
df_HostingAS = merged_df[merged_df['type'] == 'hosting']

ipv4_df.head()
tpl = ipv4_df.loc[:, 'ip_from':'ip_to'].apply(tuple, 1).tolist()
idx = pandas.IntervalIndex.from_tuples(tpl, 'both')

t0 = time.time()
time.sleep(0.000001)
with open(decomFilename, 'rb') as file:
    decomp = bz2.BZ2Decompressor()
    residue = b''
    total_lines = 0
    m = 0
    checked = []
    #102400 Bytes = 102.4 KB (in decimal)
    #102400 Bytes = 100 KB (in binary)
    #Iterate over RIPE data in  100 KB chunks 
    for data in iter(lambda: file.read(100 * 1024), b''):
        # process the raw data and  concatenate residual of the previous block 
        #to the beginning of the current raw data block
        raw = residue + decomp.decompress(data) 
        residue = b''
        ## process_data(current_block) => do the processing of the 
        ##current data block
        current_block = raw.split(b'\n')
        if raw[-1] != b'\n':
            residue = current_block.pop() # last line could be incomplete
        ##Process all data in the current block to check    
        for items in current_block:
            df_dict = json.loads(items.decode('utf-8'))
            if ('dst_addr' in df_dict) and (df_dict['af'] == 4):# and (ip2int(df_dict['dst_addr'])>0:
                ##convert to interger
                df_ip = ip2int(df_dict['dst_addr'])
                
                if df_ip > 0: #and (df_dict['prb_id'] not in checked)): # certain lines have 0.0.0.0 IP
                    loc = idx.get_loc(df_ip)
                    if ((ipv4_df.loc[loc,'country_code'] in EU_Countries) and (df_dict['prb_id'] not in checked)):
                        #if len(EU_Hosting[EU_Hosting['prb_id'] == df_dict['prb_id']])!=0:
                            #print(df_HostingAS[df_HostingAS['prb_id'] == df_dict['prb_id']])
                        m +=1 ## increment count
                       ##create a list of probes that could be used later                     
                        checked.append(df_dict['prb_id']) 
        total_lines += len(current_block)
    total_lines += 1

print("Total processing time: ",(time.time() - t0))
print("Total number of probe entries with hosting type AS and EU target in RIPE is %i" %(m))
fi.close()



## Alternative solution to D (Emmanuel)

In [132]:
#We want a matrix of 26 countries * 113 ASNs (For a single file, should be more for 24 files)
import numpy as np
import pickle
import time
import bz2
import os
import sys
import json
import pandas
import io
import datetime
import socket
import struct
from ip2geotools.databases.noncommercial import HostIP #gets the country code from ip

def ip2int(addr):
    return struct.unpack("!I", socket.inet_aton(addr))[0]


with open('data/AS_dataset.pkl', 'rb') as file:
    AS_df = pickle.load(file)
    
with open('data/probe_dataset.pkl', 'rb') as file:    
    P_df = pickle.load(file)
    
decomFilename = 'data/ping-2022-03-01T2300.bz2'
#decomFile     = bz2.open(decomFilename, 'rt')   
merged_df = P_df.merge(AS_df)
probes = merged_df['prb_id'].tolist()
print(type(probes))

ipv4_df = pandas.read_csv("data/IP2LOCATION-LITE-DB1.CSV")
ipv4_df.rename(columns = {'0':'ip_from', '16777215':'ip_to',
                              '-':'country_code','-.1':'country_name'}, inplace = True)


EU_Countries = ["AT","BE","HR","CY","CZ","DK","EE","FI","FR","GR","DE","HU",
                "IE","IT","LV","LT","LU","MT","NL","PL","PT","RO","SK","SI",
                "ES","SE"]

EU_data = merged_df[merged_df['Country'].isin(EU_Countries)]
EU_Hosting = EU_data[EU_data['type'] == 'hosting']

merged_df.insert(2, 'AS', merged_df['ASN'].str.replace('AS',''))
merged_df['AS'] = pandas.to_numeric(merged_df['AS'])
merged_df['prb_id'] = pandas.to_numeric(merged_df['prb_id'])

df_HostingAS = merged_df[merged_df['type'] == 'hosting']

ipv4_df.head()
tpl = ipv4_df.loc[:, 'ip_from':'ip_to'].apply(tuple, 1).tolist()
idx = pandas.IntervalIndex.from_tuples(tpl, 'both')
    
l =pandas.date_range('2022-03-01', periods=24, freq='60min').strftime('data/ping-%Y-%m-%dT%H%M.bz2').tolist()
#print(l)
m = 0
data = []
df = []
for dataset in l:
    decomFilename = dataset
    print(decomFilename)
    tstamp = str(decomFilename.strip('.bz2')[-5:])
    #print(tstamp)

    with open(decomFilename, 'rb') as fi:
        decomp = bz2.BZ2Decompressor()
        residue = b''
        total_lines = 0
        
        for data in iter(lambda: fi.read(100 * 1024), b''):
            raw = residue + decomp.decompress(data) # process the raw data and  concatenate residual of the previous block to the beginning of the current raw data block
            residue = b''
            # process_data(current_block) => do the processing of the current data block
            current_block = raw.split(b'\n')
            if raw[-1] != b'\n':
                residue = current_block.pop() # last line could be incomplete
            for items in current_block:
                df_dict = json.loads(items.decode('utf-8'))
                if ('dst_addr' in df_dict and df_dict['dst_addr']!= '0.0.0.0' and df_dict['af'] == 4 and df_dict['avg']>0):
                    df_ip = ip2int(df_dict['dst_addr'])
                    loc = idx.get_loc(df_ip)
                    if ((ipv4_df.loc[loc,'country_code'] in EU_Countries) and df_dict['prb_id'] in probes):
                        res = {key: df_dict[key] for key in df_dict.keys()
                                   & {'avg','prb_id'}}
                        res['Country'] = ipv4_df.loc[loc,'country_code']
                        res['t'] = tstamp
                        #print(res)
                        df_line = pandas.DataFrame.from_dict(res, orient='index')
                        df.append(df_line)
                
            total_lines += len(current_block)
        total_lines += 1
        if total_lines > 1000:
            print('i should take a break')
            break
            
    df.groupby('Country', as_index=False)['avg'].mean() 
    m+=1
    if m>=1:
        print('Taking a break')
        break
df.head()


Unnamed: 0,0,1,2
0,BE,AS12859,85.256193
1,BE,AS25596,-1.0
2,BE,AS15598,
3,BE,AS203953,12.915613
4,BE,AS51815,3.754592
