# ICLab Demo Data Generation

Generate demo data in the following JSON format for convenient substitution in the blocked websites frontend demo:

```
{
    country_code_a1 : {
        country_code_b1 : {
            similarity: (float),
            domains_blocked: [list of strings],
        },
        country_code_b2 : {
            similarity: (float),
            domains_blocked: [list of strings],
        },
    }
}
```

Similarity = (# of sites blocked in A and B) / (# of sites tested in A and B)

In [1]:
import pandas as pd
import tldextract
import numpy as np
import json
import math
import pycountry
import itertools

In [3]:
iclab = pd.read_csv("../data/iclab_cleaned_data/iclab_2018-09.csv")
iclab.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,filename,server_t,country,as_number,schedule_name,url,dns,dns_reason,dns_all,dns_reason_all,http_status,block,body_len,http_reason,packet_updated,packet_reason,censored_updated
0,baseline-2018-09-01T000154.371069.json.bz2,2018-09-01T00:01:54.354Z,US,1249.0,country-sensitive-at,http://kinox.to/,,,,,403.0,False,3094.0,{},,,False
1,baseline-2018-09-01T000154.371069.json.bz2,2018-09-01T00:01:54.354Z,US,1249.0,country-sensitive-at,http://movie4k.to/,,,,,403.0,False,3098.0,{},,,False
2,baseline-2018-10-02T011527.678225.json.bz2,2018-09-01T00:02:11.575Z,KR,4766.0,country-sensitive-kr,4shared.com,False,,False,,,,,,,,False
3,baseline-2018-10-02T011527.678225.json.bz2,2018-09-01T00:02:11.575Z,KR,4766.0,country-sensitive-kr,news.bbc.co.uk,False,,False,,,,,,,,False
4,baseline-2018-10-02T011527.678225.json.bz2,2018-09-01T00:02:11.575Z,KR,4766.0,country-sensitive-kr,ngt.jinbo.net,False,,False,,,,,,,,False


### Calculate Similarities, Blocked Domains

In [11]:
# sites blocked in the US according to the iclab dataset (we need to filter these out across all countries)

blocked_us = iclab.loc[(iclab['country'] == 'US') & (iclab['censored_updated'] == True)].url.unique()
blocked_us

array(['https://www.netflix.com/', 'http://warc.jalb.de/',
       'http://bittornado.com/', 'http://netflix.com/',
       'http://anonymouse.org/', 'http://milanuncios.com/',
       'http://www.hrcr.org/', 'http://advocacy.globalvoicesonline.org/',
       'http://globalvoicesonline.org/', '17ok.com', 'http://1905.com/',
       'netflix.com', 'www.netflix.com', 'http://www.schwarzreport.org/',
       'http://www.nato.int/', 'livedoor.com', 'yandex.ru',
       'http://www.realstreaming.net/', 'www.realstreaming.net'],
      dtype=object)

In [14]:
blocked_us = [[tldextract.extract(i).domain, tldextract.extract(i).suffix] for i in blocked_us]
blocked_us

[['netflix', 'com'],
 ['jalb', 'de'],
 ['bittornado', 'com'],
 ['netflix', 'com'],
 ['anonymouse', 'org'],
 ['milanuncios', 'com'],
 ['hrcr', 'org'],
 ['globalvoicesonline', 'org'],
 ['globalvoicesonline', 'org'],
 ['17ok', 'com'],
 ['1905', 'com'],
 ['netflix', 'com'],
 ['netflix', 'com'],
 ['schwarzreport', 'org'],
 ['nato', 'int'],
 ['livedoor', 'com'],
 ['yandex', 'ru'],
 ['realstreaming', 'net'],
 ['realstreaming', 'net']]

In [20]:
iclab['domain'] = iclab['url']
iclab['suffix'] = iclab['url']

iclab['domain'] = iclab['domain'].apply(lambda x: tldextract.extract(x).domain)
iclab['suffix'] = iclab['suffix'].apply(lambda x: tldextract.extract(x).suffix)

iclab.head()

Unnamed: 0,filename,server_t,country,as_number,schedule_name,url,dns,dns_reason,dns_all,dns_reason_all,http_status,block,body_len,http_reason,packet_updated,packet_reason,censored_updated,domain,suffix
0,baseline-2018-09-01T000154.371069.json.bz2,2018-09-01T00:01:54.354Z,US,1249.0,country-sensitive-at,http://kinox.to/,,,,,403.0,False,3094.0,{},,,False,kinox,to
1,baseline-2018-09-01T000154.371069.json.bz2,2018-09-01T00:01:54.354Z,US,1249.0,country-sensitive-at,http://movie4k.to/,,,,,403.0,False,3098.0,{},,,False,movie4k,to
2,baseline-2018-10-02T011527.678225.json.bz2,2018-09-01T00:02:11.575Z,KR,4766.0,country-sensitive-kr,4shared.com,False,,False,,,,,,,,False,4shared,com
3,baseline-2018-10-02T011527.678225.json.bz2,2018-09-01T00:02:11.575Z,KR,4766.0,country-sensitive-kr,news.bbc.co.uk,False,,False,,,,,,,,False,bbc,co.uk
4,baseline-2018-10-02T011527.678225.json.bz2,2018-09-01T00:02:11.575Z,KR,4766.0,country-sensitive-kr,ngt.jinbo.net,False,,False,,,,,,,,False,jinbo,net


In [33]:
iclab['combined_domain_suffix'] = iclab['domain'] + '.' + iclab['suffix']

combined_blocked_us = [i[0] + '.' + i[1] for i in blocked_us]

combined_blocked_us

['netflix.com',
 'jalb.de',
 'bittornado.com',
 'netflix.com',
 'anonymouse.org',
 'milanuncios.com',
 'hrcr.org',
 'globalvoicesonline.org',
 'globalvoicesonline.org',
 '17ok.com',
 '1905.com',
 'netflix.com',
 'netflix.com',
 'schwarzreport.org',
 'nato.int',
 'livedoor.com',
 'yandex.ru',
 'realstreaming.net',
 'realstreaming.net']

In [35]:
iclab.head()

Unnamed: 0,filename,server_t,country,as_number,schedule_name,url,dns,dns_reason,dns_all,dns_reason_all,http_status,block,body_len,http_reason,packet_updated,packet_reason,censored_updated,domain,suffix,combined_domain_suffix
0,baseline-2018-09-01T000154.371069.json.bz2,2018-09-01T00:01:54.354Z,US,1249.0,country-sensitive-at,http://kinox.to/,,,,,403.0,False,3094.0,{},,,False,kinox,to,kinox.to
1,baseline-2018-09-01T000154.371069.json.bz2,2018-09-01T00:01:54.354Z,US,1249.0,country-sensitive-at,http://movie4k.to/,,,,,403.0,False,3098.0,{},,,False,movie4k,to,movie4k.to
2,baseline-2018-10-02T011527.678225.json.bz2,2018-09-01T00:02:11.575Z,KR,4766.0,country-sensitive-kr,4shared.com,False,,False,,,,,,,,False,4shared,com,4shared.com
3,baseline-2018-10-02T011527.678225.json.bz2,2018-09-01T00:02:11.575Z,KR,4766.0,country-sensitive-kr,news.bbc.co.uk,False,,False,,,,,,,,False,bbc,co.uk,bbc.co.uk
4,baseline-2018-10-02T011527.678225.json.bz2,2018-09-01T00:02:11.575Z,KR,4766.0,country-sensitive-kr,ngt.jinbo.net,False,,False,,,,,,,,False,jinbo,net,jinbo.net


In [43]:
iclab_dropped_us = iclab.loc[~(iclab['combined_domain_suffix'].isin(combined_blocked_us))]

iclab_dropped_us.head()

Unnamed: 0,filename,server_t,country,as_number,schedule_name,url,dns,dns_reason,dns_all,dns_reason_all,http_status,block,body_len,http_reason,packet_updated,packet_reason,censored_updated,domain,suffix,combined_domain_suffix
0,baseline-2018-09-01T000154.371069.json.bz2,2018-09-01T00:01:54.354Z,US,1249.0,country-sensitive-at,http://kinox.to/,,,,,403.0,False,3094.0,{},,,False,kinox,to,kinox.to
1,baseline-2018-09-01T000154.371069.json.bz2,2018-09-01T00:01:54.354Z,US,1249.0,country-sensitive-at,http://movie4k.to/,,,,,403.0,False,3098.0,{},,,False,movie4k,to,movie4k.to
2,baseline-2018-10-02T011527.678225.json.bz2,2018-09-01T00:02:11.575Z,KR,4766.0,country-sensitive-kr,4shared.com,False,,False,,,,,,,,False,4shared,com,4shared.com
3,baseline-2018-10-02T011527.678225.json.bz2,2018-09-01T00:02:11.575Z,KR,4766.0,country-sensitive-kr,news.bbc.co.uk,False,,False,,,,,,,,False,bbc,co.uk,bbc.co.uk
4,baseline-2018-10-02T011527.678225.json.bz2,2018-09-01T00:02:11.575Z,KR,4766.0,country-sensitive-kr,ngt.jinbo.net,False,,False,,,,,,,,False,jinbo,net,jinbo.net


In [44]:
print(iclab.shape, iclab_dropped_us.shape)

(1052887, 20) (1045475, 20)


In [48]:
grouped_iclab = iclab_dropped_us.groupby('country')

grouped_iclab.get_group("US")

Unnamed: 0,filename,server_t,country,as_number,schedule_name,url,dns,dns_reason,dns_all,dns_reason_all,http_status,block,body_len,http_reason,packet_updated,packet_reason,censored_updated,domain,suffix,combined_domain_suffix
0,baseline-2018-09-01T000154.371069.json.bz2,2018-09-01T00:01:54.354Z,US,1249.0,country-sensitive-at,http://kinox.to/,,,,,403.0,False,3094.0,{},,,False,kinox,to,kinox.to
1,baseline-2018-09-01T000154.371069.json.bz2,2018-09-01T00:01:54.354Z,US,1249.0,country-sensitive-at,http://movie4k.to/,,,,,403.0,False,3098.0,{},,,False,movie4k,to,movie4k.to
4351,baseline-2018-09-01T000348.154035.json.bz2,2018-09-01T00:03:48.075Z,US,12989.0,citizenlab-global,http://www.acquisitionx.com/,,,false,sameip,200.0,False,8419.0,{},,,False,acquisitionx,com,acquisitionx.com
4352,baseline-2018-09-01T000348.154035.json.bz2,2018-09-01T00:03:48.075Z,US,12989.0,citizenlab-global,http://stackexchange.com/,,,-2,no_control_resp,200.0,False,126145.0,{},,,False,stackexchange,com,stackexchange.com
4353,baseline-2018-09-01T000348.154035.json.bz2,2018-09-01T00:03:48.075Z,US,12989.0,citizenlab-global,http://www.lingerieatlarge.com/,,,false,sameip,200.0,False,29212.0,{},False,Handshake conflict,False,lingerieatlarge,com,lingerieatlarge.com
4354,baseline-2018-09-01T000348.154035.json.bz2,2018-09-01T00:03:48.075Z,US,12989.0,citizenlab-global,http://www.pokerstars.net/,,,false,sameip,200.0,False,23591.0,{},,,False,pokerstars,net,pokerstars.net
4355,baseline-2018-09-01T000348.154035.json.bz2,2018-09-01T00:03:48.075Z,US,12989.0,citizenlab-global,http://slickdeals.net/,,,-2,no_control_resp,200.0,False,679915.0,{},,,False,slickdeals,net,slickdeals.net
4356,baseline-2018-09-01T000348.154035.json.bz2,2018-09-01T00:03:48.075Z,US,12989.0,citizenlab-global,http://flipkart.com/,,,-2,no_control_resp,200.0,False,903841.0,{},,,False,flipkart,com,flipkart.com
4357,baseline-2018-09-01T000348.154035.json.bz2,2018-09-01T00:03:48.075Z,US,12989.0,citizenlab-global,http://tmall.com/,,,-2,no_control_resp,200.0,False,226701.0,{},,,False,tmall,com,tmall.com
4358,baseline-2018-09-01T000348.154035.json.bz2,2018-09-01T00:03:48.075Z,US,12989.0,citizenlab-global,http://www.911truth.org/,,,false,sameip,406.0,False,300.0,{},,,False,911truth,org,911truth.org


In [49]:
unique_countries = iclab_dropped_us['country'].unique()

unique_countries

array(['US', 'KR', 'ES', 'ZA', 'CZ', 'PL', 'MY', 'RU', 'CN', 'TW', 'BG',
       'HK', 'RO', 'PE', 'HU', 'NO', 'MX', 'UA', 'NL', 'VN', 'JP', 'LT',
       'RS', 'AU', 'KE', 'SK', 'IN', 'CL', 'CA', 'LI', 'SG', 'ID', 'NZ',
       'LU', 'BZ', 'CO', 'TR', 'BR', 'SE', 'IS', 'FI', 'DZ', 'PT', 'DK',
       'IL', 'MD', 'AT', 'SC'], dtype=object)

In [62]:
demo_data_json = {}

for c in range(len(unique_countries)):
    curr_country_a = unique_countries[c]
    curr_group_a = grouped_iclab.get_group(curr_country_a)
    demo_data_json[curr_country_a] = {}
    for d in range(c + 1, len(unique_countries)):
        curr_country_b = unique_countries[d]
        curr_group_b = grouped_iclab.get_group(curr_country_b)
        
        # calculate similarity index
        sites_tested_a = curr_group_a['combined_domain_suffix'].unique()
        sites_blocked_a = curr_group_a.loc[curr_group_a['censored_updated'] == True]['combined_domain_suffix'].unique()
        
        sites_tested_b = curr_group_b['combined_domain_suffix'].unique()
        sites_blocked_b = curr_group_b.loc[curr_group_b['censored_updated'] == True]['combined_domain_suffix'].unique()
        
        print(curr_country_a, curr_country_b, len(sites_tested_a), len(sites_tested_b), len(sites_blocked_a), len(sites_blocked_b))
        
        both_sites_tested = np.intersect1d(sites_tested_a, sites_tested_b) # tested in both A AND B
        # calculated overlapping blocked
        both_sites_blocked = list(np.intersect1d(sites_tested_b, sites_blocked_b)) # blocked in both A AND B
        
        similarity = 0
        if len(both_sites_tested) != 0:
            similarity = len(both_sites_blocked) / len(both_sites_tested)
        
        if curr_country_b not in demo_data_json:
            demo_data_json[curr_country_b] = {}
        
        demo_data_json[curr_country_a][curr_country_b] = { 
            "similarity" : similarity, 
            "domains_blocked" : both_sites_blocked 
        } 

        demo_data_json[curr_country_b] = { 
            curr_country_a : { 
                "similarity" : similarity, 
                "domains_blocked" : both_sites_blocked 
            } 
        }

US KR 2313 2754 0 100
US ES 2313 2195 0 0
US ZA 2313 2237 0 2
US CZ 2313 2199 0 0
US PL 2313 2260 0 1
US MY 2313 2529 0 2
US RU 2313 2403 0 2
US CN 2313 2363 0 8
US TW 2313 2529 0 0
US BG 2313 2199 0 2
US HK 2313 2316 0 1
US RO 2313 2360 0 0
US PE 2313 2328 0 0
US HU 2313 2399 0 2
US NO 2313 2201 0 0
US MX 2313 2360 0 0
US UA 2313 3343 0 2
US NL 2313 2241 0 4
US VN 2313 2458 0 2
US JP 2313 2223 0 3
US LT 2313 2150 0 0
US RS 2313 2199 0 0
US AU 2313 2222 0 1
US KE 2313 2321 0 0
US SK 2313 2199 0 2
US IN 2313 2521 0 143
US CL 2313 2337 0 0
US CA 2313 2178 0 0
US LI 2313 2150 0 0
US SG 2313 2215 0 0
US ID 2313 2240 0 0
US NZ 2313 2234 0 3
US LU 2313 2183 0 0
US BZ 2313 2197 0 0
US CO 2313 2273 0 0
US TR 2313 2483 0 69
US BR 2313 4283 0 0
US SE 2313 2167 0 0
US IS 2313 2167 0 0
US FI 2313 2199 0 0
US DZ 2313 146 0 0
US PT 2313 2167 0 0
US DK 2313 2194 0 0
US IL 2313 2177 0 0
US MD 2313 2508 0 0
US AT 2313 2 0 0
US SC 2313 534 0 0
KR ES 2754 2195 100 0
KR ZA 2754 2237 100 2
KR CZ 2754 2199 

BG PT 2199 2167 2 0
BG DK 2199 2194 2 0
BG IL 2199 2177 2 0
BG MD 2199 2508 2 0
BG AT 2199 2 2 0
BG SC 2199 534 2 0
HK RO 2316 2360 1 0
HK PE 2316 2328 1 0
HK HU 2316 2399 1 2
HK NO 2316 2201 1 0
HK MX 2316 2360 1 0
HK UA 2316 3343 1 2
HK NL 2316 2241 1 4
HK VN 2316 2458 1 2
HK JP 2316 2223 1 3
HK LT 2316 2150 1 0
HK RS 2316 2199 1 0
HK AU 2316 2222 1 1
HK KE 2316 2321 1 0
HK SK 2316 2199 1 2
HK IN 2316 2521 1 143
HK CL 2316 2337 1 0
HK CA 2316 2178 1 0
HK LI 2316 2150 1 0
HK SG 2316 2215 1 0
HK ID 2316 2240 1 0
HK NZ 2316 2234 1 3
HK LU 2316 2183 1 0
HK BZ 2316 2197 1 0
HK CO 2316 2273 1 0
HK TR 2316 2483 1 69
HK BR 2316 4283 1 0
HK SE 2316 2167 1 0
HK IS 2316 2167 1 0
HK FI 2316 2199 1 0
HK DZ 2316 146 1 0
HK PT 2316 2167 1 0
HK DK 2316 2194 1 0
HK IL 2316 2177 1 0
HK MD 2316 2508 1 0
HK AT 2316 2 1 0
HK SC 2316 534 1 0
RO PE 2360 2328 0 0
RO HU 2360 2399 0 2
RO NO 2360 2201 0 0
RO MX 2360 2360 0 0
RO UA 2360 3343 0 2
RO NL 2360 2241 0 4
RO VN 2360 2458 0 2
RO JP 2360 2223 0 3
RO LT 

SK SG 2199 2215 2 0
SK ID 2199 2240 2 0
SK NZ 2199 2234 2 3
SK LU 2199 2183 2 0
SK BZ 2199 2197 2 0
SK CO 2199 2273 2 0
SK TR 2199 2483 2 69
SK BR 2199 4283 2 0
SK SE 2199 2167 2 0
SK IS 2199 2167 2 0
SK FI 2199 2199 2 0
SK DZ 2199 146 2 0
SK PT 2199 2167 2 0
SK DK 2199 2194 2 0
SK IL 2199 2177 2 0
SK MD 2199 2508 2 0
SK AT 2199 2 2 0
SK SC 2199 534 2 0
IN CL 2521 2337 143 0
IN CA 2521 2178 143 0
IN LI 2521 2150 143 0
IN SG 2521 2215 143 0
IN ID 2521 2240 143 0
IN NZ 2521 2234 143 3
IN LU 2521 2183 143 0
IN BZ 2521 2197 143 0
IN CO 2521 2273 143 0
IN TR 2521 2483 143 69
IN BR 2521 4283 143 0
IN SE 2521 2167 143 0
IN IS 2521 2167 143 0
IN FI 2521 2199 143 0
IN DZ 2521 146 143 0
IN PT 2521 2167 143 0
IN DK 2521 2194 143 0
IN IL 2521 2177 143 0
IN MD 2521 2508 143 0
IN AT 2521 2 143 0
IN SC 2521 534 143 0
CL CA 2337 2178 0 0
CL LI 2337 2150 0 0
CL SG 2337 2215 0 0
CL ID 2337 2240 0 0
CL NZ 2337 2234 0 3
CL LU 2337 2183 0 0
CL BZ 2337 2197 0 0
CL CO 2337 2273 0 0
CL TR 2337 2483 0 69
CL BR

In [63]:
demo_data_json['US']

{'AT': {'domains_blocked': [], 'similarity': 0.0},
 'AU': {'domains_blocked': ['kdnet.net'], 'similarity': 0.00045004500450045},
 'BG': {'domains_blocked': ['allegro.pl', 'rarbg.to'],
  'similarity': 0.0009095043201455207},
 'BR': {'domains_blocked': [], 'similarity': 0.0},
 'BZ': {'domains_blocked': [], 'similarity': 0.0},
 'CA': {'domains_blocked': [], 'similarity': 0.0},
 'CL': {'domains_blocked': [], 'similarity': 0.0},
 'CN': {'domains_blocked': ['360.cn',
   'address.com',
   'cefc.org',
   'chinaz.com',
   'people.com.cn',
   'weebly.com',
   'whitepages.com',
   'wikiwiki.jp'],
  'similarity': 0.003616636528028933},
 'CO': {'domains_blocked': [], 'similarity': 0.0},
 'CZ': {'domains_blocked': [], 'similarity': 0.0},
 'DK': {'domains_blocked': [], 'similarity': 0.0},
 'DZ': {'domains_blocked': [], 'similarity': 0.0},
 'ES': {'domains_blocked': [], 'similarity': 0.0},
 'FI': {'domains_blocked': [], 'similarity': 0.0},
 'HK': {'domains_blocked': ['360.cn'], 'similarity': 0.0004587

In [64]:
with open('../data/iclab_new_data_format.json', 'w') as outfile:
    json.dump(demo_data_json, outfile)