# AS dataset

In [3]:
import pickle

with open('data/AS_dataset.pkl', 'rb') as file:
    
    AS_df = pickle.load(file)

In [4]:
AS_df.shape

(60122, 5)

In [5]:
AS_df.head()

Unnamed: 0,ASN,Country,Name,NumIPs,type
0,AS55330,AF,AFGHANTELECOM GOVERNMENT COMMUNICATION NETWORK,50432,hosting
1,AS17411,AF,Io Global Services Pvt. Limited,13568,business
2,AS55424,AF,Instatelecom Limited,13312,business
3,AS38742,AF,AWCC,11520,isp
4,AS131284,AF,Etisalat Afghan,10240,isp


# Probe dataset

In [6]:
import pickle

with open('data/probe_dataset.pkl', 'rb') as file:
    
    AS_df = pickle.load(file)
    
    
AS_df.shape

AS_df.head()

Unnamed: 0,prb_id,ASN
0,1,AS3265
1,2,AS1136
2,3,AS3265
3,6,AS6830
4,8,AS3265


In [7]:
AS_df.shape

(11008, 2)

# RIPE dataset

**Option 1 - Opening a data from a decompressed file (12gb each)**

In [None]:
import time
import bz2
import os
import sys
import json

# OPTION 1: open decompressed file
#decompression of one file can take up to 5 minutes (7zip @ Intel i5 4210U)
decomFilename = 'C:/Users/Kooltje/Downloads/FoDa Data/ping-2022-03-01T2300'
decomFile     = open(decomFilename, 'rt') 

#read first line and print
firstLine = decomFile.readline();
#print(firstLine)

#the line appears to be json-formatted: pretty print json
firstLineJson = json.loads(firstLine)
print(json.dumps(firstLineJson, sort_keys=True, indent=4))

#estimate total number of lines
firstLine_sizeInBytes   = sys.getsizeof(firstLine) 
decomFile_sizeInBytes   = os.stat(decomFilename).st_size
nrOfLines               = round(decomFile_sizeInBytes/firstLine_sizeInBytes)
#print("\nEstimated nr of lines = " + str(nrOfLines))

#read first 100k lines to estimate total loading time
count = 0;
st    = time.time()
for line in decomFile:
    count = count + 1
    if count>100000: break
    
#print time and estimate total time            
dur         = round(time.time() - st,2)
estTotTime  = round( (dur/100000)*nrOfLines )
#print("\nDecompressed file:" )
#print("Loading 100k lines took: "  + str(dur) + " seconds")
#print("Estimated loading time of entire decompression file: "  + \str(estTotTime) + " seconds" )

#finally close decomFile
decomFile.close()

**Option 1 results**

Loading time: 76s * 24 files = 30m 24s (On my desktop i5-7600K CPU @ 3.80GHz)

File size: 12.4 GB * 24 files = 297.6 GB

Number of lines in file: 27.968.458

**Option 2 - decompressing during analyzing**

In [5]:
import time
import bz2
import os
import sys
import json

#%% OPTION 2: 
#open .bz2 file directly
bz2Filename = 'data/ping-2022-03-01T2300.bz2'
bz2File     = bz2.open(bz2Filename, 'rt') 

firstLine = bz2File.readline();

firstLineJson = json.loads(firstLine)
print(json.dumps(firstLineJson, sort_keys=True, indent=4))

#estimate total number of lines
firstLine_sizeInBytes   = sys.getsizeof(firstLine) 
decomFile_sizeInBytes   = os.stat(bz2Filename).st_size
nrOfLines               = round(decomFile_sizeInBytes/firstLine_sizeInBytes)
print("\nEstimated nr of lines = " + str(nrOfLines))

#read first 100k lines to estimate total loading time
count = 0;
st    = time.time()
for line in bz2File:
    count = count + 1
    if count>100000: break

#print time and estimate total time            
dur         = round(time.time() - st,2)
estTotTime  = round( (dur/100000)*nrOfLines )
print("\nbz2 file:" )
print("Loading 100k lines took: "  + str(dur) + " seconds")
print("Estimated loading time of entire bz2 file: "  + str(estTotTime) + \
      " seconds" )


lastlinejson = json.loads(line)
#print(json.dumps(lastlinejson, sort_keys=True, indent=4))

#finally close bz2File
bz2File.close()

{
    "af": 4,
    "avg": 6.921626,
    "dst_addr": "192.5.5.241",
    "dst_name": "192.5.5.241",
    "dup": 0,
    "from": "83.243.73.91",
    "fw": 5020,
    "lts": -1,
    "max": 13.444936,
    "min": 3.557907,
    "msm_id": 1004,
    "msm_name": "Ping",
    "mver": "2.2.1",
    "prb_id": 1001660,
    "proto": "ICMP",
    "rcvd": 3,
    "result": [
        {
            "rtt": 13.444936
        },
        {
            "rtt": 3.762035
        },
        {
            "rtt": 3.557907
        }
    ],
    "sent": 3,
    "size": 32,
    "src_addr": "172.17.0.2",
    "step": 240,
    "timestamp": 1646176375,
    "ttl": 58,
    "type": "ping"
}

Estimated nr of lines = 2842055

bz2 file:
Loading 100k lines took: 0.84 seconds
Estimated loading time of entire bz2 file: 24 seconds


**Option 2 results**

Loading time: 487s * 24 files = 3h 14m 48s (On my desktop i5-7600K CPU @ 3.80GHz)

File size: 1.2 GB * 24 files = 28.8 GB

Number of lines in file: 27.968.458

**Result description**

Contains ping data from a lot of probes. This data is stored daily and divided into 24 files for 24 hours. The data loaded here is 1 hour, only one line is shown. Description of the fields is found here: https://web.archive.org/web/20200127090442/https://atlas.ripe.net/docs/data_struct/#v5000_ping

Example of a line:
<pre>
"af": 4,
    "avg": 27.20397,
    "dst_addr": "199.7.91.13",
    "dst_name": "199.7.91.13",
    "dup": 0,
    "from": "79.144.142.112",
    "fw": 5040,
    "lts": 28,
    "max": 27.302378,
    "min": 27.122802,
    "msm_id": 1012,
    "msm_name": "Ping",
    "mver": "2.4.1",
    "prb_id": 34344,
    "proto": "ICMP",
    "rcvd": 3,
    "result": [
        {
            "rtt": 27.302378
        },
        {
            "rtt": 27.18673
        },
        {
            "rtt": 27.122802
        }
    ],
    "sent": 3,
    "size": 20,
    "src_addr": "192.168.1.36",
    "step": 240,
    "timestamp": 1646175950,
    "ttl": 55,
    "type": "ping"
</pre>

# IPv4 Data

In [8]:
import pandas

In [9]:
ipv4_df = pandas.read_csv("data/IP2LOCATION-LITE-DB1.CSV")

In [10]:
ipv4_df.shape

(212146, 4)

In [11]:
ipv4_df.head()

Unnamed: 0,0,16777215,-,-.1
0,16777216,16777471,US,United States of America
1,16777472,16778239,CN,China
2,16778240,16779263,AU,Australia
3,16779264,16781311,CN,China
4,16781312,16785407,JP,Japan


**Result description**

Data shows which IP adresses roughly correspond to which country. The first two columns are IP4 adresses (From and to) in an integer format instead of the usual xxx.xxx.xxx.xxx format