In [9]:
import ast
import json
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

## Create nodes dataframe

In [None]:
# Load data with format (node INDEX: IP INTERFACES)
# The file is read line by line rather than with pd.read_csv(delimiter='\n'):
# recent pandas releases refuse '\n' as a separator.
with open('kapar-midar-iff.nodes.txt', encoding='ISO-8859-1') as nodes_file:
    # Mirror read_csv(comment='#'): cut each line at the first '#'.
    stripped_lines = (line.rstrip('\n').split('#', 1)[0] for line in nodes_file)
    # Lines that end up empty (comments, blanks) are skipped.
    node_lines = [line for line in stripped_lines if line]
data = pd.DataFrame({'nodes': node_lines}, dtype=object)

# INDEX: IP INTERFACES  (drop the first 5 characters, i.e. the leading 'node ' tag)
data['nodes'] = data['nodes'].str[5:]

# INDEX | IP INTERFACES
# expand=True returns one column per part; unpacking the `.str` accessor itself
# is no longer supported by pandas. n=1 splits on the first ':' only.
parts = data['nodes'].str.split(':', n=1, expand=True)
data['index'], data['ip'] = parts[0], parts[1]

# Convert IP interfaces from string to list:
# drop the two characters following ':' and the final character, then split on spaces.
data['ip'] = data['ip'].str[2:-1].str.split(' ')

# Final frame: one row per node, indexed by node id, with the list of interfaces.
nodes = data[['index', 'ip']].set_index('index')

In [None]:
# Preview the first 10 rows of the nodes frame.
nodes.head(10)

In [25]:
# Write the nodes frame to nodes.csv; a later cell reads this file back.
nodes.to_csv('nodes.csv', sep=',', encoding='ISO-8859-1')

In [34]:
# Reload the cached nodes frame from nodes.csv and report how long it takes.
start = time.time()

# index_col restores the node id as the index (it is written as the 'index'
# column by to_csv); the joins with nodes_as and geo align on that index.
nodes = pd.read_csv('nodes.csv', sep=',', encoding='ISO-8859-1', index_col='index')
# The 'ip' lists come back as their textual representation. literal_eval parses
# them into lists again and, unlike eval, cannot execute arbitrary code.
nodes['ip'] = nodes['ip'].apply(ast.literal_eval)

end = time.time()
print(end - start)

## Create links dataframe

In [4]:
def keep_nodes(x):
    """Return the node part of every entry in ``x``.

    Each entry is cut at its first ':' and only the text before it is kept,
    which removes the IP address from entries of the form ``NODE:IP``.
    Entries without ':' are returned unchanged.
    """
    return [entry.split(':')[0] for entry in x]

In [5]:
# Load data with format (link INDEX: IP INTERFACES)
# The file is read line by line rather than with pd.read_csv(delimiter='\n'):
# recent pandas releases refuse '\n' as a separator.
with open('kapar-midar-iff.links.txt', encoding='ISO-8859-1') as links_file:
    # Mirror read_csv(comment='#'): cut each line at the first '#'.
    stripped_lines = (line.rstrip('\n').split('#', 1)[0] for line in links_file)
    # Lines that end up empty (comments, blanks) are skipped.
    link_lines = [line for line in stripped_lines if line]
data = pd.DataFrame({'links': link_lines}, dtype=object)

# INDEX: IP INTERFACES  (drop the first 5 characters, i.e. the leading 'link ' tag)
data['links'] = data['links'].str[5:]

# INDEX | IP INTERFACES
# Split on the first ':' only (entries further right may contain ':' themselves).
# expand=True returns one column per part; unpacking the `.str` accessor itself
# is no longer supported by pandas, and `n` must be passed by keyword.
parts = data['links'].str.split(':', n=1, expand=True)
data['index'] = parts[0]
# Drop the two characters following ':' and the final character.
data['links'] = parts[1].str[2:-1]

# INDEX: [IP INTERFACES]
# Split on spaces, then keep only the node part of every entry.
data['links'] = data['links'].str.split(' ').apply(keep_nodes)

# Index the selected columns by link id (the original re-indexed `data` and
# silently discarded the column selection).
links = data[['index', 'links']].set_index('index')

In [6]:
# Preview the first 10 links.
links.head(10)

Unnamed: 0_level_0,links
index,Unnamed: 1_level_1
L2,"[N19943, N259125, N3071997, N10836, N26186, N2..."
L3,"[N259125, N191763, N286513, N132388, N308194, ..."
L4,"[N259125, N249665, N313955, N313955, N192431]"
L5,"[N111930, N332377, N1921700, N1921701, N192170..."
L6,"[N165312, N165312, N268658, N132226, N332377, ..."
L7,"[N255629, N332377, N255629, N9194, N1319060, N..."
L8,"[N231376, N332377, N305589, N147966, N260350, ..."
L9,"[N260350, N156862, N304749, N332377, N237172, ..."
L11,"[N332377, N67904, N236660, N325509, N256412, N..."
L12,"[N4108, N111930, N1921706, N288969, N247665, N..."


In [7]:
# Preview the last 10 links.
links.tail(10)

Unnamed: 0_level_0,links
index,Unnamed: 1_level_1
L77611856,"[N45428, N74514886]"
L77611857,"[N45428, N74514887]"
L77611858,"[N45428, N74514888]"
L77611859,"[N45428, N74514889]"
L77611860,"[N45428, N74514890]"
L77611861,"[N45428, N74514891]"
L77611862,"[N297106, N74514892]"
L77611863,"[N297106, N74514893]"
L77611864,"[N297106, N74514894]"
L77611865,"[N195630, N74514895]"


## Add AS number to nodes

In [8]:
# Load data with format (node.AS INDEX AS TYPE)
# The file is read line by line rather than with pd.read_csv(delimiter='\n'):
# recent pandas releases refuse '\n' as a separator.
with open('kapar-midar-iff.nodes.as.txt', encoding='ISO-8859-1') as as_file:
    # Mirror read_csv(comment='#'): cut each line at the first '#'.
    stripped_lines = (line.rstrip('\n').split('#', 1)[0] for line in as_file)
    # Lines that end up empty (comments, blanks) are skipped.
    as_lines = [line for line in stripped_lines if line]
data = pd.DataFrame({'nodes': as_lines}, dtype=object)

# INDEX AS TYPE  (drop the first 8 characters, i.e. the leading 'node.AS ' tag)
data['nodes'] = data['nodes'].str[8:]

# INDEX | AS
# expand=True returns one column per space-separated field; only the first two
# are used. Unpacking the `.str` accessor itself is no longer supported by pandas.
parts = data['nodes'].str.split(' ', expand=True)
data['node_index'], data['AS_number'] = parts[0], parts[1]

# AS_number is kept as a string, exactly as read from the file.
nodes_as = data[['node_index', 'AS_number']].set_index('node_index')

In [9]:
# Add AS column to nodes dataframe.
# DataFrame.join aligns on the index and defaults to a left join, so every node
# is kept and nodes absent from nodes_as get NaN in AS_number.
nodes = nodes.join(nodes_as)

In [10]:
# Preview the first 10 nodes together with their AS number.
nodes.head(10)

Unnamed: 0_level_0,ip,AS_number
index,Unnamed: 1_level_1,Unnamed: 2_level_1
N1,"[59.152.193.141, 59.152.196.33, 59.152.200.125...",9381
N3,"[59.152.193.69, 59.152.193.73, 59.152.193.85, ...",9381
N4,"[42.61.0.89, 42.61.1.13, 42.61.1.57, 42.61.1.2...",3758
N6,"[181.129.43.145, 181.143.0.49, 181.143.2.225, ...",13489
N7,"[179.27.16.97, 179.27.16.153, 179.27.16.189, 1...",6057
N8,"[36.66.32.81, 36.66.32.89, 36.66.32.113, 36.66...",17974
N9,"[42.61.0.173, 42.61.0.217, 42.61.1.105, 42.61....",3758
N10,"[59.152.196.133, 59.152.196.205, 59.152.196.22...",9381
N11,"[179.27.0.37, 179.27.0.185, 179.27.0.229, 179....",6057
N12,"[5.2.116.0, 5.2.116.8, 5.2.116.10, 5.2.116.18,...",31655


## Add geo data to nodes

In [None]:
# Load data with format (node.geo INDEX: COUNTRY LATITUDE LONGITUDE)
# The file is read line by line rather than with pd.read_csv(delimiter='\n'):
# recent pandas releases refuse '\n' as a separator.
with open('kapar-midar-iff.nodes.geo.txt', encoding='ISO-8859-1') as geo_file:
    # Mirror read_csv(comment='#'): cut each line at the first '#'.
    stripped_lines = (line.rstrip('\n').split('#', 1)[0] for line in geo_file)
    # Lines that end up empty (comments, blanks) are skipped.
    geo_lines = [line for line in stripped_lines if line]
data = pd.DataFrame({'nodes': geo_lines}, dtype=object)

# INDEX: COUNTRY LATITUDE LONGITUDE
# (drop the first 9 characters, i.e. the leading 'node.geo ' tag, and the final character)
data['nodes'] = data['nodes'].str[9:-1]

# INDEX: | CONTINENT | COUNTRY | REGION | CITY | LATITUDE | LONGITUDE | (unused fields)
# expand=True returns one column per tab-separated field; the first seven are
# named, any remaining ones are ignored. Unpacking the `.str` accessor itself is
# no longer supported by pandas.
geo_fields = ['node_index', 'continent', 'country', 'region', 'city', 'latitude', 'longitude']
parts = data['nodes'].str.split('\t', expand=True)
for position, field in enumerate(geo_fields):
    data[field] = parts[position]

# INDEX | COUNTRY | LATITUDE | LONGITUDE
# Drop the last character of the node id field (the ':' shown in the format above).
data['node_index'] = data['node_index'].str[:-1]

# Latitude and longitude are kept as strings, exactly as read from the file.
geo = data[['node_index', 'country', 'latitude', 'longitude']].set_index('node_index')

In [None]:
# Add the country / latitude / longitude columns from geo.
# The join is index-aligned and left-sided: nodes without geo data get NaN.
nodes = nodes.join(geo)

In [None]:
# Preview the first 10 nodes with their AS number and geo columns.
nodes.head(10)

## Observations

In [None]:
# Coverage summary: total nodes, and how many lack an AS number or a country.
without_as = nodes['AS_number'].isna()
without_geo = nodes['country'].isna()

print('Number of nodes : {}'.format(len(nodes)))
print('Number of nodes without AS : {}'.format(int(without_as.sum())))
print('Number of nodes without geo data : {}'.format(int(without_geo.sum())))

In [None]:
# Number of IP interfaces listed for each node.
temp = nodes['ip'].map(len)
# Summary statistics, rendered in fixed-point notation.
temp.describe().map(lambda stat: format(stat, 'f'))

In [None]:
# Histogram of the number of IP interfaces per node (`temp`), using temp.max()
# equal-width bins and a logarithmic count axis.
fig, ax = plt.subplots()
ax.hist(temp, bins=int(temp.max()))
ax.set_xlim(0, temp.max())
ax.set_yscale('log')
# Labels so the figure can be read on its own.
ax.set_xlabel('Number of IP interfaces per node')
ax.set_ylabel('Number of nodes (log scale)')
ax.set_title('Distribution of IP interfaces per node');

## Create AS-level links dataframe

In [None]:
# Start the AS-level frame from a copy of `links`, so that `links` itself is not
# modified when the 'links' column of the copy is replaced.
links_nodes = links.copy()

In [None]:
# Lookup table: node id -> AS number.
# NOTE(review): no cell of this notebook defines `dict_nodes_to_as`, so on a
# fresh kernel apply_list raised NameError. It is rebuilt here from nodes_as
# (index = node id, column = AS_number) — confirm this matches the mapping that
# was used originally.
dict_nodes_to_as = nodes_as['AS_number'].to_dict()


def apply_list(x):
    """Return the set of AS numbers of the nodes in ``x``.

    Every element of ``x`` is looked up in ``dict_nodes_to_as``; elements that
    are not in the table contribute ``np.nan`` to the result.
    """
    return {dict_nodes_to_as.get(elem, np.nan) for elem in x}

In [None]:
# Replace each link's list of node ids by the set of AS numbers of those nodes.
# The input is read from `links` (which links_nodes was copied from) rather than
# from links_nodes itself, so re-running the cell gives the same result instead
# of looking up AS numbers as if they were node ids.
links_nodes['links'] = links['links'].apply(apply_list)

In [None]:
# Display the AS-level links frame.
links_nodes

### Save the new dataframes to files

In [None]:
# Write the three frames to CSV files in the working directory.
# nodes.csv is overwritten with the current `nodes` frame.
nodes.to_csv('nodes.csv', sep=',', encoding='ISO-8859-1')
# Node-level links.
links.to_csv('links.csv', sep=',', encoding='ISO-8859-1')
# AS-level links.
links_nodes.to_csv('links_as.csv', sep=',', encoding='ISO-8859-1')

In [42]:
# Reload the AS-level links from links_as.csv. No index column is specified, so
# 'index' comes back as a regular column; the cells below treat 'links' as text.
links_as = pd.read_csv('links_as.csv',delimiter=',',encoding='ISO-8859-1')

In [43]:
# Remove the 'nan' entries (nodes without a known AS) from the textual sets,
# whether they are followed or preceded by the separating comma.
links_as['links'] = (
    links_as['links']
    .str.replace('nan, ', '', regex=False)
    .str.replace(', nan', '', regex=False)
)

In [44]:
# Keep only the links whose textual set still contains a comma, i.e. lists more
# than one entry. The match is literal (regex=False), and .copy() makes the
# result an independent frame so that later column assignments do not trigger
# SettingWithCopyWarning.
links_as = links_as[links_as['links'].str.contains(',', regex=False)].copy()

In [45]:
# Export the filtered AS-level links, indexed by the 'index' column, to
# caida-asn-links.csv. links_as itself is left unchanged by this line.
links_as.set_index('index').to_csv('caida-asn-links.csv', sep=',', encoding='ISO-8859-1')

In [49]:
# Parse the textual sets back into Python sets. literal_eval only accepts
# literals, so unlike eval it cannot execute arbitrary code read from the file.
links_as['links'] = links_as['links'].apply(ast.literal_eval)

In [52]:
# Use the 'index' column as the frame's index.
# The column is consumed by set_index, so running this cell a second time
# raises KeyError.
links_as = links_as.set_index('index')

In [56]:
# Collect every distinct AS number that appears in at least one link.
nodes_set = {asn for as_group in links_as['links'] for asn in as_group}
nodes_set

{'53612',
 '203410',
 '22319',
 '202789',
 '1606',
 '28775',
 '17019',
 '25629',
 '62903',
 '49719',
 '36019',
 '30678',
 '42729',
 '36924',
 '1746',
 '197848',
 '201782',
 '60781',
 '39311',
 '46575',
 '16010',
 '37296',
 '40173',
 '39376',
 '29208',
 '200687',
 '13372',
 '18746',
 '55775',
 '293',
 '5474',
 '44616',
 '46294',
 '55481',
 '57339',
 '327756',
 '22042',
 '202629',
 '203386',
 '20655',
 '55677',
 '201377',
 '44396',
 '57275',
 '40732',
 '206315',
 '8894',
 '40485',
 '263108',
 '54393',
 '196870',
 '394618',
 '33268',
 '27334',
 '19963',
 '50910',
 '201347',
 '53010',
 '54062',
 '49259',
 '265462',
 '33169',
 '40076',
 '14909',
 '200719',
 '9911',
 '41582',
 '12829',
 '49369',
 '39098',
 '3252',
 '18295',
 '39134',
 '52875',
 '48137',
 '8569',
 '133982',
 '52486',
 '19744',
 '200171',
 '264656',
 '16769',
 '39956',
 '56094',
 '48464',
 '49529',
 '45935',
 '11069',
 '31720',
 '394277',
 '41390',
 '52418',
 '12705',
 '35124',
 '57671',
 '7477',
 '9756',
 '44136',
 '133716',


In [57]:
# Number of distinct AS numbers collected in nodes_set.
len(nodes_set)

51764