In [1]:
import requests as r
import traceback
import re
import pandas as pd
import time
import json

### Download RIPE data about the bund-networks

In [2]:
# Load the mapping institution -> list of address ranges. Later cells look up
# element 0 and element 1 of every range.
# The context manager closes the file handle as soon as parsing is done.
with open('../bund-networks.json') as bund_nets_file:
    bund_nets = json.load(bund_nets_file)

In [3]:
# Collects one (RIPE docs, institution) tuple per range, queried by the range's first address.
bund_data = list()

In [7]:
def params_no_enc(params):
    """Serialise ``params`` into a ``key=value&key=value`` query string.

    Keys and values are inserted verbatim: no URL encoding is applied, so
    characters such as ``+``, ``(`` and ``)`` stay exactly as written.
    """
    pairs = []
    for key, value in params.items():
        pairs.append("{}={}".format(key, value))
    return "&".join(pairs)

In [8]:
def get_data_for_ip(ip_addr):
    """Download every RIPE full-text-search document matching ``ip_addr``.

    The query matches ``ip_addr`` against ``inetnum`` or ``netname`` and is
    restricted to country ``DE`` and to the object types ``inetnum`` and
    ``inet6num``. Further pages are requested through the ``start`` parameter.

    Args:
        ip_addr: Value interpolated into the ``inetnum``/``netname`` clauses.

    Returns:
        list: The ``docs`` entries of all pages, or ``[]`` when nothing matched.

    Raises:
        requests.HTTPError: If the service answers with an error status.
    """
    page_size = 10  # paging step; the endpoint is assumed to return 10 docs per page
    headers = {'Accept': 'application/json'}
    params = {
        'facet': 'true',
        'hl': 'true',
        'q': '(inetnum:({})+OR+netname:({}))+AND+(country:(DE))+AND+(object-type:inet6num+OR+object-type:inetnum)'.format(ip_addr, ip_addr),
        'start': '0',
        'wt': 'json'
    }
    url = 'https://apps.db.ripe.net/db-web-ui/api/rest/fulltextsearch/select'

    def fetch_page():
        # The query string is passed pre-built so '+' and parentheses are not escaped.
        resp = r.get(url, headers=headers, params=params_no_enc(params))
        # Fail with a clear HTTP error instead of a confusing JSON/Key error.
        resp.raise_for_status()
        return resp.json()['result']

    result = fetch_page()
    new_data = result['docs']
    numFound = result['numFound']
    print("Number of entries found for {}:{}".format(ip_addr, numFound))
    if numFound == 0:
        return []
    # Stop *before* numFound: with an inclusive bound a total that is a
    # multiple of the page size triggered one extra, empty request.
    for start in range(page_size, int(numFound), page_size):
        params['start'] = str(start)
        new_data.extend(fetch_page()['docs'])
    print("Results downloaded: ", len(new_data))
    # Be polite to the service between successive look-ups.
    time.sleep(1)
    return new_data

In [9]:
def get_dict_from_res(doc):
    """Flatten one ``(docs, institution)`` tuple into a plain dict.

    The institution is stored under ``'bund_networks'``. When ``docs`` is an
    empty list nothing else is added. Otherwise only the first document is
    used: every ``descr`` value is appended to ``'descr'`` followed by ``'|'``,
    and every other attribute is stored under its own name (a later value
    replaces an earlier one with the same name).
    """
    hits, institution = doc[0], doc[1]
    record = {'bund_networks': institution}
    if hits == []:
        return record
    attributes = hits[0]['doc']['strs']
    record['descr'] = ''
    for entry in attributes:
        field = entry['str']
        if field['name'] == 'descr':
            record['descr'] += field['value'] + '|'
        else:
            record[field['name']] = field['value']
    return record

In [7]:
# Query RIPE with the first address (index 0) of every range and remember which
# institution the range belongs to.
# The list is rebuilt in one expression instead of appended to, so re-running
# this cell does not duplicate the entries downloaded by a previous run.
bund_data = [
    (get_data_for_ip(ip_range[0]), institution)
    for institution, range_list in bund_nets.items()
    for ip_range in range_list
]

Number of entries found for 212.122.48.192:1
Results downloaded:  1
Number of entries found for 193.24.128.0:1
Results downloaded:  1
Number of entries found for 80.245.144.0:2
Results downloaded:  2
Number of entries found for 80.245.148.0:1
Results downloaded:  1
Number of entries found for 217.5.178.16:1
Results downloaded:  1
Number of entries found for 193.102.16.0:0
Number of entries found for 77.74.239.64:1
Results downloaded:  1
Number of entries found for 193.17.232.0:1
Results downloaded:  1
Number of entries found for 194.31.194.0:0
Number of entries found for 139.11.0.0:1
Results downloaded:  1
Number of entries found for 62.159.95.48:1
Results downloaded:  1
Number of entries found for 62.159.102.80:0
Number of entries found for 80.149.172.0:1
Results downloaded:  1
Number of entries found for 80.150.149.24:1
Results downloaded:  1
Number of entries found for 80.154.114.160:1
Results downloaded:  1
Number of entries found for 80.157.135.16:1
Results downloaded:  1
Number o

In [8]:
# Flatten each (docs, institution) tuple into a dict, then tabulate the dicts.
bund_data_parsed = list(map(get_dict_from_res, bund_data))

bund_data_df = pd.DataFrame(data=bund_data_parsed)

In [9]:
# Collects one (RIPE docs, institution) tuple per range, queried by the range's last address.
bund_data_end = list()

In [11]:
# Same look-up as for the start addresses, but using the last address
# (index 1) of every range.
# The list is rebuilt in one expression instead of appended to, so re-running
# this cell does not duplicate the entries downloaded by a previous run.
bund_data_end = [
    (get_data_for_ip(ip_range[1]), institution)
    for institution, range_list in bund_nets.items()
    for ip_range in range_list
]

Number of entries found for 212.122.48.199:1
Results downloaded:  1
Number of entries found for 193.24.191.255:1
Results downloaded:  1
Number of entries found for 80.245.159.255:2
Results downloaded:  2
Number of entries found for 80.245.151.255:1
Results downloaded:  1
Number of entries found for 217.5.178.31:1
Results downloaded:  1
Number of entries found for 193.102.16.255:0
Number of entries found for 77.74.239.79:1
Results downloaded:  1
Number of entries found for 193.17.247.255:1
Results downloaded:  1
Number of entries found for 194.31.194.255:0
Number of entries found for 139.11.255.255:1
Results downloaded:  1
Number of entries found for 62.159.95.55:1
Results downloaded:  1
Number of entries found for 62.159.102.87:0
Number of entries found for 80.149.172.15:1
Results downloaded:  1
Number of entries found for 80.150.149.31:1
Results downloaded:  1
Number of entries found for 80.154.114.175:1
Results downloaded:  1
Number of entries found for 80.157.135.23:1
Results downlo

In [12]:
# Flatten each (docs, institution) tuple of the end-address look-ups and tabulate them.
bund_data_end_parsed = list(map(get_dict_from_res, bund_data_end))

bund_data_end_df = pd.DataFrame(data=bund_data_end_parsed)

#### Compare the beginning and the end of each range

In [13]:
# Put the primary key found for each range's end address next to the one found
# for its start address. The assignment aligns on the index; both frames were
# built by iterating bund_nets in the same order, so row i describes the same range.
bund_data_df['primary-key2'] = bund_data_end_df['primary-key']

In [14]:
# Likewise copy the netname found for the end address (aligned on the index).
bund_data_df['netname2'] = bund_data_end_df['netname']

In [15]:
# bund_data_df[bund_data_df['primary-key'] != bund_data_df['primary-key2']][['bund_networks', 'netname', 'descr', 'inetnum', 'netname2']]

#### Wrong entries

In [16]:
# Hand-picked row numbers considered wrong.
# NOTE(review): presumably positions in bund_data_df found by manual inspection;
# the list is not used by any visible cell — confirm what it refers to.
wrong_ones = [13,14,15,16,25,26,27,35,41,46,56,68,70,88]

In [17]:
# bund_data_df[['bund_networks', 'descr', 'netname', 'org', 'inetnum']].head(53)

In [18]:
# bund_data_df[['bund_networks', 'descr', 'netname', 'org', 'inetnum']].tail(52)

In [19]:
# bund_data_df[pd.isnull(bund_data_df.country)]

### Query RIPE organizations

In [20]:
# Read one organisation id per line. Blank lines are skipped: an empty string
# would later be used both as a search term and as a regex pattern, and an
# empty regex matches every input.
with open('RIPE_organizations.txt') as query_t:
    organizations = [line.strip() for line in query_t if line.strip()]

In [21]:
# Read one search term per line. Blank lines are skipped: an empty term would
# produce an empty search query and, when reused as a regex pattern, would
# match every input.
with open('RIPE_api_query_terms.txt') as api_query_t:
    keywords = [line.strip() for line in api_query_t if line.strip()]

In [10]:
def get_org_data(keyword):
    """Download every RIPE ``organisation`` object matching ``keyword``.

    ``keyword`` is interpolated verbatim into the full-text query, which is
    restricted to ``object-type:organisation``. Further pages are requested
    through the ``start`` parameter.

    Args:
        keyword: Search term placed inside the query's first parentheses.

    Returns:
        list: The ``docs`` entries of all pages, or ``[]`` when nothing matched.

    Raises:
        requests.HTTPError: If the service answers with an error status.
    """
    page_size = 10  # paging step; the endpoint is assumed to return 10 docs per page
    headers = {'Accept': 'application/json'}
    params = {
        'facet': 'true',
        'hl': 'true',
        'q': '({})+AND+(object-type:organisation)'.format(keyword),
        'start': '0',
        'wt': 'json'
    }
    url = 'https://apps.db.ripe.net/db-web-ui/api/rest/fulltextsearch/select'

    def fetch_page():
        # The query string is passed pre-built so '+' and parentheses are not escaped.
        resp = r.get(url, headers=headers, params=params_no_enc(params))
        # Fail with a clear HTTP error instead of a confusing JSON/Key error.
        resp.raise_for_status()
        return resp.json()['result']

    result = fetch_page()
    new_data = result['docs']
    numFound = result['numFound']
    print("Number of entries found for {}:{}".format(keyword, numFound))
    if numFound == 0:
        return []
    # Stop *before* numFound: with an inclusive bound a total that is a
    # multiple of the page size triggered one extra, empty request.
    for start in range(page_size, int(numFound), page_size):
        params['start'] = str(start)
        new_data.extend(fetch_page()['docs'])
    print("Results downloaded: ", len(new_data))
    # Be polite to the service between successive look-ups.
    time.sleep(1)
    return new_data

In [11]:
def get_org_info(doc):
    """Flatten one organisation search document into a plain dict.

    All ``address`` values are concatenated under ``'address'``, each followed
    by ``'|'``. Every other attribute is stored under its own name; a repeated
    name keeps only the last value.
    """
    attributes = doc['doc']['strs']
    org_info = {'address': ''}
    for wrapper in attributes:
        name = wrapper['str']['name']
        if name != 'address':
            org_info[name] = wrapper['str']['value']
            continue
        org_info['address'] = org_info['address'] + wrapper['str']['value'] + '|'
    return org_info

In [24]:
# Pool of raw organisation documents returned by the searches in the following cells.
org_raw_data = list()

In [25]:
%%time
# Search for every known organisation id and pool the returned documents.
for org in organizations:
    org_raw_data += get_org_data(org)

Number of entries found for ORG-DB45-RIPE:1
Results downloaded:  1
Number of entries found for ORG-CA1591-RIPE:1
Results downloaded:  1
Number of entries found for ORG-KBS6-RIPE:1
Results downloaded:  1
Number of entries found for ORG-BD44-RIPE:1
Results downloaded:  1
Number of entries found for ORG-LW49-RIPE:1
Results downloaded:  1
Number of entries found for ORG-BL250-RIPE:1
Results downloaded:  1
Number of entries found for ORG-LFA24-RIPE:1
Results downloaded:  1
Number of entries found for ORG-BdI1-RIPE:1
Results downloaded:  1
Number of entries found for ORG-FA511-RIPE:1
Results downloaded:  1
Number of entries found for ORG-BFSU1-RIPE:1
Results downloaded:  1
Number of entries found for ORG-BAMF1-RIPE:1
Results downloaded:  1
Number of entries found for ORG-BA202-RIPE:1
Results downloaded:  1
Number of entries found for ORG-BDR2-RIPE:1
Results downloaded:  1
Number of entries found for ORG-BfKu1-RIPE:1
Results downloaded:  1
Number of entries found for ORG-BNTA1-RIPE:1
Results 

In [26]:
%%time
# Search organisations by keyword as well and add the hits to the same pool.
for kw in keywords:
    org_raw_data += get_org_data(kw)

Number of entries found for Bundesverfassungsgericht*:0
Number of entries found for Bundesliegenschaft*:0
Number of entries found for Bundesministerium*:9
Results downloaded:  9
Number of entries found for Bundes*:56
Results downloaded:  56
Number of entries found for Zoll*:38
Results downloaded:  38
Number of entries found for ministerium*:3
Results downloaded:  3
Number of entries found for bundes*amt:14
Results downloaded:  14
Number of entries found for bundes*institut:1
Results downloaded:  1
Number of entries found for bundes*anstalt:3
Results downloaded:  3
Number of entries found for bundes*agentur:3
Results downloaded:  3
Number of entries found for bundes*gericht:0
Number of entries found for bundesamt:11
Results downloaded:  11
Number of entries found for bundesinstitut:0
Number of entries found for bundesanstalt:2
Results downloaded:  2
Number of entries found for bundesagentur:1
Results downloaded:  1
Number of entries found for bundesgericht:0
Number of entries found for 

In [27]:
# Flatten every raw organisation document into a dict.
orgs = list(map(get_org_info, org_raw_data))

In [28]:
# One row per organisation document; columns are the union of the attribute names seen.
orgs_df = pd.DataFrame(orgs)

In [33]:
# Distinct organisation ids among the downloaded organisation objects.
org_query = list(orgs_df['organisation'].unique())

In [34]:
# Merge the ids discovered through the searches with the hand-maintained list
# and de-duplicate them. Sorting gives a stable order: a bare set() yields a
# different ordering from run to run, which made the written file and the
# order of the follow-up queries non-reproducible.
org_query = sorted(set(org_query) | set(organizations))

In [35]:
# Persist the organisation ids, one per line, so the list can be reloaded later.
with open('RIPE_orgs_query.txt', 'w') as fp:
    fp.writelines(org + '\n' for org in org_query)

In [36]:
# org_query = []
# with open('RIPE_orgs_query.txt') as fp:
#     for line in fp:
#         org_query.append(line.strip())

### Query RIPE for keywords

In [2]:
def get_raw_data(keyword):
    """Download every German ``inetnum``/``inet6num`` object matching ``keyword``.

    ``keyword`` is interpolated verbatim into the full-text query, which is
    restricted to country ``DE`` and to the object types ``inetnum`` and
    ``inet6num``. Further pages are requested through the ``start`` parameter.

    Args:
        keyword: Search term placed inside the query's first parentheses.

    Returns:
        list: The ``docs`` entries of all pages, or ``[]`` when nothing matched.

    Raises:
        requests.HTTPError: If the service answers with an error status.
    """
    page_size = 10  # paging step; the endpoint is assumed to return 10 docs per page
    headers = {'Accept': 'application/json'}
    params = {
        'facet': 'true',
        'hl': 'true',
        'q': '({})+AND+(country:(DE))+AND+(object-type:inet6num+OR+object-type:inetnum)'.format(keyword),
        # Narrower alternative that only looks at descr/netname:
#         'q': '(descr:({})+OR+netname:({}))+AND+(country:(DE))+AND+(object-type:inet6num+OR+object-type:inetnum)'.format(keyword, keyword),
        'start': '0',
        'wt': 'json'
    }
    url = 'https://apps.db.ripe.net/db-web-ui/api/rest/fulltextsearch/select'

    def fetch_page():
        # The query string is passed pre-built so '+' and parentheses are not escaped.
        resp = r.get(url, headers=headers, params=params_no_enc(params))
        # Fail with a clear HTTP error instead of a confusing JSON/Key error.
        resp.raise_for_status()
        return resp.json()['result']

    result = fetch_page()
    new_data = result['docs']
    numFound = result['numFound']
    print("Number of entries found for {}:{}".format(keyword, numFound))
    if numFound == 0:
        return []
    # Stop *before* numFound: with an inclusive bound a total that is a
    # multiple of the page size triggered one extra, empty request.
    for start in range(page_size, int(numFound), page_size):
        params['start'] = str(start)
        new_data.extend(fetch_page()['docs'])
    print("Results downloaded: ", len(new_data))
    # Be polite to the service between successive look-ups.
    time.sleep(1)
    return new_data

In [5]:
# Pool of raw inetnum/inet6num documents returned by the searches in the following cells.
raw_res = list()

In [3]:
# NOTE(review): this rebinds `keywords`, replacing the terms read from
# RIPE_api_query_terms.txt earlier in the notebook. match_against_api_terms
# reads the same global, so its result depends on which cell ran last — confirm intent.
keywords = ['CDU']

In [12]:
%%time
# Search address ranges for every keyword and pool the returned documents.
for kw in keywords:
    raw_res += get_raw_data(kw)

Number of entries found for CDU:33
Results downloaded:  33
CPU times: user 64 ms, sys: 0 ns, total: 64 ms
Wall time: 1.69 s


In [40]:
%%time
# Search address ranges for every organisation id and add the hits to the same pool.
for org in org_query:
    raw_res += get_raw_data(org)

Number of entries found for ORG-BMI1-RIPE:0
Number of entries found for ORG-DSUE1-RIPE:1
Results downloaded:  1
Number of entries found for ORG-XA107-RIPE:0
Number of entries found for ORG-BfA7-RIPE:2
Results downloaded:  2
Number of entries found for ORG-ANKD1-RIPE:1
Results downloaded:  1
Number of entries found for ORG-MV12-RIPE:1
Results downloaded:  1
Number of entries found for ORG-BL254-RIPE:1
Results downloaded:  1
Number of entries found for ORG-KB22-RIPE:2
Results downloaded:  2
Number of entries found for ORG-BFMU2-RIPE:0
Number of entries found for ORG-BA381-RIPE:1
Results downloaded:  1
Number of entries found for ORG-BMV1-RIPE:62
Results downloaded:  62
Number of entries found for ORG-BE20-RIPE:2
Results downloaded:  2
Number of entries found for ORG-TNEG1-RIPE:1
Results downloaded:  1
Number of entries found for ORG-XG14-RIPE:14
Results downloaded:  14
Number of entries found for ORG-SFPS1-RIPE:0
Number of entries found for ORG-HK8-RIPE:0
Number of entries found for ORG-

In [13]:
def get_ip_range(doc):
    """Flatten one inetnum/inet6num search document into a plain dict.

    All ``descr`` values are concatenated under ``'descr'``, each followed by
    ``'|'``. Every other attribute is stored under its own name; a repeated
    name keeps only the last value.
    """
    fields = [entry['str'] for entry in doc['doc']['strs']]
    ip_range = {'descr': ''}
    descr_values = []
    for field in fields:
        if field['name'] == 'descr':
            descr_values.append(field['value'])
        else:
            ip_range[field['name']] = field['value']
    ip_range['descr'] = ''.join(value + '|' for value in descr_values)
    return ip_range

In [14]:
# Flatten every raw range document into a dict.
ranges = list(map(get_ip_range, raw_res))

In [15]:
# One row per downloaded range document; columns are the union of the attribute names seen.
ranges_df = pd.DataFrame(ranges)

In [16]:
ranges_df.shape

(33, 17)

In [45]:
# Peek at the first rows whose description contains 'Bundes' (case-sensitive match).
ranges_df[ranges_df.descr.str.contains('Bundes')][['descr', 'inet6num', 'inetnum', 'netname']].head()

Unnamed: 0,descr,inet6num,inetnum,netname
0,"Bundesverfassungsgericht|Karlsruhe, Germany|",,193.197.84.0 - 193.197.84.7,BVG-KARLSRUHE
16,TSI fuer Bundeskanzleramt|,2003:44:e03b::/48,,BUNDESLIEGENSCHAFT-BERLIN-NET10
99,TSI fuer Bundeskanzleramt|,,62.159.122.232 - 62.159.122.239,BUNDESLIEGENSCHAFT-BERLIN-NET10
100,TSI fuer Bundesministerium fuerWirtschaft und ...,2003:45:c05c::/48,,BUNDESMINISTERIUM-FUER-WIRTSCHAFT-U-TECHNIK-BO...
101,TSI fuer Bundesministerium derFinanzen|,2003:51:603c::/48,,ZID-DER-BUNDESFINANZVERWALTUNG-FRANKFURT-NET


In [17]:
# Keep a single row per RIPE primary key (the first occurrence wins). The shape
# is printed before and after to show how many duplicate rows were removed.
print(ranges_df.shape)

ranges_df = ranges_df.drop_duplicates(subset='primary-key')

print(ranges_df.shape)

(33, 17)
(25, 17)


In [47]:
# Read one regular expression per line. Blank lines are skipped: an empty
# pattern matches every string, which would mark every row as matched.
with open('RIPE_regex_terms_RIPE.txt') as query_t:
    regexes = [line.strip() for line in query_t if line.strip()]

In [48]:
def match_against_regexes(descr):
    """Tell whether any pattern of the module-level ``regexes`` list occurs in ``descr``.

    Patterns are applied case-insensitively with ``search`` semantics. A value
    whose type is not exactly ``str`` yields ``False``. A ``TypeError`` raised
    while matching is reported (traceback, pattern, input) and the remaining
    patterns are still tried.
    """
    if type(descr) is not str:
        return False
    for pattern in regexes:
        try:
            found = re.search(pattern, descr, re.IGNORECASE)
        except TypeError:
            traceback.print_exc()
            print(pattern)
            print(descr)
            continue
        if found:
            return True
    return False

In [49]:
def match_against_api_terms(descr):
    """Tell whether any entry of the module-level ``keywords`` list occurs in ``descr``.

    Every keyword is used as a case-insensitive regular expression with
    ``search`` semantics.

    A value whose type is not exactly ``str`` (for example NaN or None coming
    from a DataFrame) yields ``False`` straight away, the same guard the other
    ``match_against_*`` helpers use. Without it every keyword raised a
    ``TypeError`` and printed a traceback for such a value.

    NOTE(review): the keywords look like search-API terms in which ``*`` is a
    wildcard. As a regular expression ``*`` means "zero or more of the
    preceding character", so e.g. ``bundes*amt`` does not match
    ``Bundeskriminalamt`` — confirm that regex semantics are intended here.
    """
    if type(descr) != str:
        return False
    for reg in keywords:
        try:
            if re.compile(reg, re.IGNORECASE).search(descr):
                return True
        except TypeError:
            # Kept as a safety net for non-string patterns in `keywords`.
            traceback.print_exc()
            print(reg)
            print(descr)
    return False

In [50]:
def match_against_organizations(descr):
    """Tell whether any id of the module-level ``organizations`` list occurs in ``descr``.

    Each id is used as a case-insensitive regular expression with ``search``
    semantics. A value whose type is not exactly ``str`` yields ``False``. A
    ``TypeError`` raised while matching is reported (traceback, id, input) and
    the remaining ids are still tried.
    """
    if type(descr) == str:
        for org in organizations:
            try:
                hit = re.compile(org, re.IGNORECASE).search(descr)
            except TypeError:
                traceback.print_exc()
                print(org)
                print(descr)
            else:
                if hit:
                    return True
    return False

In [51]:
print(ranges_df.shape)

(1445, 23)


In [52]:
# Flag rows whose description matches at least one of the regexes.
ranges_df['matched_descr'] = ranges_df['descr'].apply(match_against_regexes)

In [53]:
# Flag rows whose netname matches at least one of the regexes.
ranges_df['matched_netname'] = ranges_df['netname'].apply(match_against_regexes)

In [54]:
# Flag rows whose org attribute contains one of the known organisation ids.
ranges_df['matched_org'] = ranges_df['org'].apply(match_against_organizations)

In [55]:
# A row counts as matched when any of the three individual checks succeeded.
ranges_df['matched'] = ranges_df['matched_descr'] | ranges_df['matched_netname'] | ranges_df['matched_org']

In [56]:
# Spot-check five random rows. The draw is unseeded, so the rows shown differ
# on every run (pass random_state=... to sample() for a reproducible view).
ranges_df[['descr', 'netname', 'org', 'matched_descr', 'matched_netname', 'matched_org', 'matched']].sample(5)

Unnamed: 0,descr,netname,org,matched_descr,matched_netname,matched_org,matched
658,Bundesanstalt fuer Gewaesserkunde|Koblenz|,BFG-NET,,True,False,False,True
1140,Bildungszentrum der Bundesfinanzverwaltung Mue...,BWZ-VIT-MUENSTER-NET,,False,False,False,False
1497,Zollkriminalamt|Bergisch-Gladbacher-Str. 837|D...,ZKAIB-NET,,True,False,False,True
181,Volleyball Bundesliga GmbH|,VOLLEYBALL-BUNDESLIGA-BERLIN-NET,,False,False,False,False
113,"Bundesministerium fuer Umwelt, Naturschutz, Ba...",BMU-LAN,,True,False,False,True


In [18]:
def create_name(row):
    """Build a display name from the fields of ``row`` that produced a match.

    For each of ``descr``, ``netname`` and ``org`` (in that order) whose
    ``matched_*`` flag is truthy, the field value followed by ``'|'`` is
    appended. Returns ``''`` when no flag is set.
    """
    flag_and_field = (
        ('matched_descr', 'descr'),
        ('matched_netname', 'netname'),
        ('matched_org', 'org'),
    )
    parts = []
    for flag_col, value_col in flag_and_field:
        if row[flag_col]:
            parts.append(row[value_col] + '|')
    return ''.join(parts)

In [None]:
# Build the display name row by row; create_name receives each row directly.
ranges_df['name'] = ranges_df.apply(create_name, axis=1)

In [59]:
# Export how often each generated name occurs among the matched rows.
ranges_df.loc[ranges_df.matched == True, 'name'].value_counts().to_csv('RIPE_matched_descriptions.csv')

In [60]:
# Export description and netname of the rows no check matched, for manual review.
ranges_df.loc[ranges_df.matched == False, ['descr', 'netname']].to_csv('RIPE_unmatched_descriptions.csv')

In [61]:
print(ranges_df[ranges_df.matched==True].shape)

(876, 28)


### Check if the regexes are good

In [62]:
# Download the Wikipedia list of German federal authorities.
# Despite its name, wiki_df is a list of DataFrames: read_html returns one per HTML table.
wiki_df = pd.read_html('https://de.wikipedia.org/wiki/Liste_der_deutschen_Bundesbeh%C3%B6rden')

In [63]:
# Only the first two tables are used. Row 0 of each holds the column titles,
# so it is promoted to the header and then dropped from the data.
wiki_df1 = wiki_df[0]
wiki_df1.columns = wiki_df1.loc[0]

wiki_df2 = wiki_df[1]
wiki_df2.columns = wiki_df2.loc[0]

wiki_df1 = wiki_df1.drop(0, axis=0)
wiki_df2 = wiki_df2.drop(0, axis=0)

# Collect the authority names and their supervising authorities
# ("Aufsichtsbehörde") from both tables into one flat list.
to_match = list(wiki_df1.Name)
to_match.extend(list(wiki_df1.Aufsichtsbehörde))
to_match.extend(list(wiki_df2.Name))
to_match.extend(list(wiki_df2.Aufsichtsbehörde))

# Single-column frame (column label 0) without missing values.
to_match = pd.DataFrame(to_match).dropna()

to_match['matched_regex'] = to_match[0].apply(match_against_regexes)

# Names that no regex recognises; an empty array means every entry is covered.
to_match[to_match.matched_regex == False][0].values

array([], dtype=object)

### Check if the query terms cover all wikipedia entries

In [64]:
# Same coverage check, but against the search terms in `keywords`.
# NOTE(review): the terms are applied as regular expressions, so a `*` in a term
# is a regex quantifier, not a wildcard — confirm this reflects what the API matches.
to_match['matched_api'] = to_match[0].apply(match_against_api_terms)

# Names that none of the search terms covers.
to_match[to_match.matched_api == False][0].values

array(['Deutsches Patent- und Markenamt'], dtype=object)

### Save end result to file

In [65]:
# Final result: address range (IPv6 and/or IPv4), generated name and RIPE
# primary key of every matched row.
ranges_df.loc[ranges_df.matched == True, ['inet6num', 'name', 'inetnum', 'primary-key']].to_csv("RIPE_ip_ranges.csv")

In [66]:
ranges_df.loc[ranges_df.matched == True].shape

(876, 28)

In [67]:
# (890, 29)

In [23]:
ranges_df.head()

Unnamed: 0,admin-c,country,created,descr,inet6num,inetnum,last-modified,lookup-key,mnt-by,mnt-lower,netname,notify,object-type,primary-key,remarks,status,tech-c
0,GB19250-RIPE,DE,2016-01-25T13:31:49Z,TSI fuer CDU-Bundesgeschaefts- st. Konrad-Aden...,2003:46:e033::/48,,2016-01-25T13:31:49Z,2003:46:e033::/48,DTAG-NIC,,UNION-BETRIEBS-RHEINBACH-NET,auftrag@nic.telekom.de,inet6num,15743813,,ASSIGNED,GB19250-RIPE
1,VTO-RIPE,DE,1970-01-01T00:00:00Z,CDU Landesverband Rheinland-Pfalz|Rheinallee 1...,,213.139.133.208 - 213.139.133.223,2009-03-20T13:45:51Z,213.139.133.208 - 213.139.133.223,VT-MNT,,CUSTOMER-NET-MZK-10150,,inetnum,421117,,ASSIGNED PA,VTO-RIPE
2,VTO-RIPE,DE,2003-05-20T15:31:28Z,CDU Landesverband Hessen|Frankfurter Strasse 6...,,213.139.151.128 - 213.139.151.143,2008-10-30T16:10:52Z,213.139.151.128 - 213.139.151.143,VT-MNT,VT-GARBAGE-MNT,VT-CUSTOMER-S-6515014,,inetnum,2187984,,ASSIGNED PA,VTO-RIPE
3,HGN12-RIPE,DE,2003-01-16T04:00:15Z,CDU FRAKTION DER BREMISCHEN BUERGERSCHAFT|,,217.110.149.48 - 217.110.149.55,2010-10-26T12:54:57Z,217.110.149.48 - 217.110.149.55,DE-COLT-MNT,,NET-DE-CDU-FRAKTION-DER-BREMISCHEN-BUERGERSCHAFT,,inetnum,1992719,notify eu-ripemaster@colt.net,ASSIGNED PA,HGN12-RIPE
4,QSC1-RIPE,DE,2005-08-05T01:31:43Z,CDU-Hamburg Landesverband|,,83.236.232.216 - 83.236.232.223,2017-12-02T02:32:39Z,83.236.232.216 - 83.236.232.223,QSC-NOC,QSC-NOC,QSC-CUSTOMER-552415-1025787,,inetnum,3856603,,ASSIGNED PA,QSC1-RIPE


In [24]:
# Export all rows currently in ranges_df, without filtering on `matched`.
# NOTE(review): the file name suggests ranges_df holds only the 'CDU' keyword results at this point — confirm.
ranges_df[['inet6num', 'inetnum', 'primary-key']].to_csv("CDU_ip_ranges.csv")