In [1]:
from pathlib import Path
from tqdm.notebook import tqdm
import multiprocessing as mp
import pickle
import datetime
import braveblock

# Use the public IPython.display module: importing `display` from the internal
# IPython.core.display module is deprecated and fails on recent IPython releases.
from IPython.display import display, HTML
# Widen the classic-notebook page container to 80% of the browser window.
display(HTML("<style>.container { width: 80% !important; }</style>"))

def parallel(func, data, process_num=None, chunksize=None, total=None, desc=None, maxtasksperchild=None):
    """Map ``func`` over ``data`` in a process pool, yielding results as they finish.

    This is a generator: the pool is only created once iteration starts, and
    results arrive in completion order (``imap_unordered``), not input order.

    Args:
        func: Callable applied to each item; it must be picklable by
            ``multiprocessing``.
        data: Iterable of work items.
        process_num: Number of worker processes; defaults to ``mp.cpu_count()``.
        chunksize: Items handed to a worker per task. When omitted it is derived
            from ``total`` so that each worker receives roughly 32 chunks.
        total: Number of items, used for the progress bar and the default
            chunk size. When omitted it is taken from ``len(data)`` if ``data``
            is sized.
        desc: Progress-bar label.
        maxtasksperchild: Forwarded to ``mp.Pool``.

    Yields:
        Each value returned by ``func``.
    """
    process_num = mp.cpu_count() if process_num is None else process_num
    if total is None:
        # Previously a missing `total` crashed on `None // int` below; recover
        # it from the data when possible.
        try:
            total = len(data)
        except TypeError:
            total = None
    if chunksize is None:
        # Unsized input with no explicit total: fall back to one item per task.
        chunksize = 1 if total is None else total // (process_num * 32) + 1
    print(f'Parallel {process_num=} {chunksize=}')
    with mp.Pool(process_num, maxtasksperchild=maxtasksperchild) as p:
        for res in tqdm(p.imap_unordered(func, data, chunksize=chunksize), total=total, desc=desc):
            yield res

In [2]:
# Build one merged filter list per day under history/merged.
#
# For every day in [start, end) the newest-known snapshot of each source list
# is concatenated into a single file named after the day's starting timestamp.
merged_path = Path('history/merged')

# Decide whether to build BEFORE creating the directory. The previous version
# called mkdir() first and then tested `not merged_path.is_dir()`, which is
# always False, so the merge never ran.
needs_merge = not merged_path.is_dir() or not any(merged_path.iterdir())
merged_path.mkdir(parents=True, exist_ok=True)

if needs_merge:
    history_root = Path('history')
    # Source lists, in the order their rules are concatenated into each file.
    list_names = [
        'peterlowe',
        'adguard',
        'easylist',
        'easylistchina',
        'easylistdutch',
        'easylistfr',
        'easylistgermany',
        'easylisthebrew',
        'easylistitaly',
        'easylistportuguese',
        'easylistspanish',
        'KoreanList',
    ]
    # Snapshots that must never be selected for a given list.
    skipped_snapshots = {'adguard': {1496233144}}  # patch

    # Snapshot timestamps per list (file names are integer timestamps). They are
    # sorted so that, when a list has several snapshots on one day, the earliest
    # one is picked deterministically instead of depending on listing order.
    snapshots = {
        name: sorted(int(entry.name) for entry in (history_root / name).iterdir())
        for name in list_names
    }

    # Naive datetimes: `.timestamp()` below interprets them in local time.
    start = datetime.datetime(2015, 3, 1, 0, 0, 0)
    end = datetime.datetime(2021, 3, 1, 0, 0, 0)

    # Snapshot used for each list on the previous day; seeded with the oldest one.
    last_used = {name: min(stamps) for name, stamps in snapshots.items()}

    for day_index in tqdm(range((end - start).days)):
        day_start = (start + datetime.timedelta(days=day_index)).timestamp()
        day_end = (start + datetime.timedelta(days=day_index + 1)).timestamp()

        # Snapshots of each list taken during this day.
        todays = {
            name: [
                stamp for stamp in stamps
                if day_start <= stamp < day_end
                and stamp not in skipped_snapshots.get(name, ())
            ]
            for name, stamps in snapshots.items()
        }
        # Sanity check: some list changed today, and the total stays within 12.
        assert 1 <= sum(len(found) for found in todays.values()) <= 12

        lines = []
        for name in list_names:
            # No snapshot today: carry the previous day's snapshot forward.
            chosen = todays[name][0] if todays[name] else last_used[name]
            rules = (history_root / name / str(chosen)).read_text().strip().splitlines()
            if name == 'peterlowe':
                # Each peterlowe line is wrapped as an `||...^` rule.
                rules = [f'||{entry}^' for entry in rules]
            lines += rules
            last_used[name] = chosen

        (merged_path / str(int(day_start))).write_text('\n'.join(lines))


In [3]:
# generate merged_history.json
#
# merged_history maps each domain to the date of the oldest merged snapshot
# that contains it; domain_list collects every domain seen in any snapshot.
import json  # was used below without ever being imported (NameError)

import convertlists

if not Path("../../data/merged_history.json").exists():
    merged_history = dict()
    domain_list = set()
    # Walk snapshots from newest to oldest. Directory listing order is
    # arbitrary, so sort on the integer timestamp in the file name; the logic
    # below relies on the oldest snapshot being processed last.
    snapshot_files = sorted(
        Path('history/merged').iterdir(),
        key=lambda snapshot: int(snapshot.name),
        reverse=True,
    )
    history_cnt = len(snapshot_files)
    for i, snapshot in enumerate(tqdm(snapshot_files)):
        with open(snapshot) as fd:
            adms = convertlists.convertlist_adbp(fd, {'verbosity': 0, 'supportedoptions': {'third-party': ''}})
        if i != history_cnt - 1:
            # Older snapshots overwrite newer ones, so the final value is the
            # date of the oldest snapshot containing the domain.
            for dm in adms:
                domain_list.add(dm)
                merged_history[dm] = datetime.datetime.utcfromtimestamp(int(snapshot.name))
        else:
            # Oldest snapshot: its domains are kept in domain_list but dropped
            # from merged_history.
            for dm in adms:
                domain_list.add(dm)
                if dm in merged_history:
                    del merged_history[dm]
    # default=str serialises the datetime values.
    with open('../../data/merged_history.json', 'w', encoding='utf-8') as out_file:
        json.dump(merged_history, out_file, ensure_ascii=False, indent=4, default=str)
    with open('../../data/domain_list.json', 'w', encoding='utf-8') as out_file:
        json.dump(list(domain_list), out_file, ensure_ascii=False, indent=4, default=str)

In [2]:
# Load the pickled site2time_parsed mapping (indexed by site in later cells).
# NOTE(review): pickle executes arbitrary code on load; only use trusted files.
site2time_parsed = pickle.loads(
    Path('../../data').joinpath('site2time_parsed.pickle').read_bytes()
)

In [3]:
def load_blocker(path):
    """Create a braveblock.Adblocker from the rule file at ``path``.

    The file is read as text and split into lines (one rule per line); the
    bundled EasyList and EasyPrivacy rules are disabled so that only the
    file's own rules apply.
    """
    rule_lines = Path(path).read_text().strip().splitlines()
    return braveblock.Adblocker(
        rules=rule_lines,
        include_easylist=False,
        include_easyprivacy=False,
    )

def load_latest_blocker(path):
    """Load the blocker for the newest snapshot in directory ``path``.

    Entries are compared by the integer value of their file name, so the
    entry with the largest numeric name is loaded.
    """
    list_dir = Path(path)
    snapshot_names = [entry.name for entry in list_dir.iterdir()]
    newest_name = max(snapshot_names, key=int)
    return load_blocker(str(list_dir / newest_name))

# Blockers consulted by is_blocked(); currently only the newest merged snapshot.
latest_blockers = [load_latest_blocker('history/merged')]
    
def is_blocked(url, source_url, request_type=''):
    """Return True if any blocker in ``latest_blockers`` blocks the request.

    Args:
        url: URL of the request being checked.
        source_url: URL of the page issuing the request.
        request_type: Request type forwarded to ``check_network_urls``.
    """
    for blocker in latest_blockers:
        if blocker.check_network_urls(url=url, source_url=source_url, request_type=request_type):
            return True
    return False

In [4]:
# site2ad_parsed: site -> set of parsed request URLs blocked by the latest
# blocker(s). Computed once in parallel and cached on disk as a pickle.
cache = Path('../../data/site2ad_parsed.pickle')

if not cache.is_file():
    site2ad_parsed = {site: set() for site in site2time_parsed}

    def func(site):
        """Return (site, set of this site's parsed URLs that are blocked)."""
        source_url = f'http://{site}/'
        blocked = set()
        # The per-timestamp grouping is irrelevant here: union over all of them.
        for parsed_urls in site2time_parsed[site].values():
            for parsed in parsed_urls:
                if is_blocked(parsed.geturl(), source_url):
                    blocked.add(parsed)
        return site, blocked

    # Iterating the dict passes each site key to the workers.
    for site, ad_parsed in parallel(func, site2time_parsed, total=len(site2time_parsed)):
        site2ad_parsed[site] |= ad_parsed

    cache.write_bytes(pickle.dumps(site2ad_parsed))
else:
    site2ad_parsed = pickle.loads(cache.read_bytes())

In [15]:
# Names of all merged snapshots, ordered by their integer timestamp.
time_blockers = sorted(
    (snapshot.name for snapshot in Path('history/merged').iterdir()),
    key=int,
)

# Snapshot indices whose blockers are preloaded.
# NOTE(review): these look like the most frequently probed indices printed by
# the simulation cell at the end of the notebook (1096 == (1 + 2192) // 2),
# plus index 0 -- confirm before changing the number of snapshots.
cache_i = [1096, 1095, 548, 547, 1644, 1643, 274, 273, 822, 821, 1370, 1369, 1918, 1917, 137, 136, 411, 410, 685, 684, 959, 958, 1233, 1232, 1507, 1506, 1781, 1780, 2055, 2054, 69, 68, 206, 205, 343, 342, 480, 479, 617, 616, 754, 753, 891, 890, 1028, 1027, 1165, 1164, 1302, 1301, 1439, 1438, 1576, 1575, 1713, 1712, 1850, 1849, 1987, 1986, 2124, 2123, 35, 34, 172, 171, 309, 308, 446, 445, 583, 582, 720, 719, 857, 856, 994, 993, 1131, 1130, 1268, 1267, 1405, 1404, 1542, 1541, 1679, 1678, 1816, 1815, 1953, 1952, 2090, 2089, 103, 102, 240, 239, 377, 376, 0]

# index -> preloaded blocker for every index listed in cache_i.
time_blocker_cache = dict(
    (idx, load_blocker(Path('history/merged') / time_blockers[idx]))
    for idx in cache_i
)

def get_time_blocker(i):
    """Return the blocker for snapshot index ``i``.

    Preloaded blockers come from ``time_blocker_cache``; any other index is
    loaded from disk on every call and is not added to the cache.
    """
    if i not in time_blocker_cache:
        return load_blocker(Path('history/merged') / time_blockers[i])
    return time_blocker_cache[i]

def binary_search(url, site):
    """Find the first snapshot at which ``url`` becomes blocked on ``site``.

    Returns the name of a snapshot that blocks the request while the snapshot
    just before it does not (or the first snapshot's name if that one already
    blocks it). Returns None when the search finds no such transition.

    NOTE(review): the bisection presumably assumes that a URL stays blocked
    once it has been blocked -- confirm for rules that were removed and
    later re-added.
    """
    first_snapshot = time_blockers[0]
    source_url = f'http://{site}/'

    def blocked_at(index):
        # Whether the snapshot at `index` blocks the request.
        return get_time_blocker(index).check_network_urls(url=url, source_url=source_url, request_type='')

    if blocked_at(0):
        return first_snapshot

    lo, hi = 1, len(time_blockers)
    while lo < hi:
        mid = (lo + hi) // 2
        if not blocked_at(mid):
            # Not blocked yet at mid: the transition is later.
            lo = mid + 1
        elif blocked_at(mid - 1):
            # Already blocked before mid: the transition is earlier.
            hi = mid
        else:
            # Blocked at mid but not at mid - 1: this is the transition.
            return time_blockers[mid]
    return None

In [6]:
from urllib.parse import urlparse
# Spot check: find the first merged snapshot that blocks this sample request.
# The commented-out call is the equivalent check against the latest snapshot only.
#is_blocked("https://i.legendas.tv/equipe/204x25/legendas_tv_201801102138530.gif", "legendas.tv")
binary_search("https://i.legendas.tv/equipe/204x25/legendas_tv_201801102138530.gif", "legendas.tv")

In [7]:
# Spot check with a tracking URL; the cell output is the matching snapshot's name.
binary_search("https://insight.adsrvr.org/track/conv/?adv=hip6dvm&ct=0:gzluor6k&fmt=3", "www.buffalowildwings.com")

'1425168000'

In [16]:
import gc
def func(s_req):
    """Date every blocked request of one site and checkpoint the result.

    Note: this redefines the ``func`` worker of the earlier caching cell.

    Args:
        s_req: ``(site, t2requests)`` pair, where ``t2requests`` maps a
            timestamp to an iterable of parsed request URLs.

    Returns:
        The site name. The results are written to
        ``checkpoints/<site>.pickle`` as a list of
        ``(site, parsed_url, snapshot_name_or_None)`` tuples.
    """
    site, t2requests = s_req

    checkpoint_dir = Path("checkpoints")
    checkpoint = checkpoint_dir / f'{site}.pickle'
    if checkpoint.is_file():
        # Already processed in an earlier run.
        return site
    # The directory was never created before, so the first write failed.
    checkpoint_dir.mkdir(parents=True, exist_ok=True)

    # Deduplicate requests across all timestamps.
    requests_set = set()
    for requests in t2requests.values():
        requests_set.update(requests)

    source_url = f'http://{site}/'
    results = []
    for parsed in requests_set:
        url = parsed.geturl()
        # Only requests blocked by the latest list are worth dating.
        if is_blocked(url, source_url):
            results.append((site, parsed, binary_search(url, site)))

    # Write to a temporary file and rename it into place, so an interrupted
    # worker cannot leave a truncated checkpoint that later runs would skip.
    tmp_checkpoint = checkpoint_dir / f'{site}.pickle.tmp'
    tmp_checkpoint.write_bytes(pickle.dumps(results))
    tmp_checkpoint.replace(checkpoint)
    return site

# Drain the generator: the workers write their own checkpoints, so the yielded
# site names are not needed here.
for site in parallel(
    func,
    [*site2time_parsed.items()],
    total=len(site2time_parsed),
    process_num=30,
    maxtasksperchild=5,
):
    continue

Parallel process_num=30 chunksize=1


  0%|          | 0/292 [00:00<?, ?it/s]

In [None]:
# Drop the preloaded blockers. get_time_blocker() reads this global, so it
# cannot be called again until the cell that builds the cache is re-run.
del time_blocker_cache

In [5]:
# site2_ad_parsed2t: site -> {parsed URL -> integer name of the snapshot
# returned by binary_search}, rebuilt from the per-site checkpoint files.
site2_ad_parsed2t = {site: {} for site in site2ad_parsed}

# Only read `*.pickle` entries: unpickling every directory entry would fail on
# any stray file or sub-directory inside checkpoints/.
for checkpoint in tqdm([*Path('checkpoints').glob('*.pickle')]):
    results = pickle.loads(checkpoint.read_bytes())
    for site, ad_parsed, t in results:
        # t is None when binary_search found no transition; skip those entries.
        if t is not None:
            site2_ad_parsed2t[site][ad_parsed] = int(t)

  0%|          | 0/46922 [00:00<?, ?it/s]

In [8]:
# Consistency check: every blocked URL of every site must have been dated.
for site in tqdm(site2ad_parsed):
    ad_parsed, ad_parsed2t = site2ad_parsed[site], site2_ad_parsed2t[site]
    assert set(ad_parsed2t) == set(ad_parsed)

  0%|          | 0/46922 [00:00<?, ?it/s]

In [9]:
# Persist the dated mapping; write_bytes returns the byte count, which the
# notebook echoes as the cell output.
Path('../../data/site2_ad_parsed2t.pickle').write_bytes(pickle.dumps(site2_ad_parsed2t))

190362479

In [None]:
# Every index probed by b(), in probe order (used to pick indices worth caching).
count = []
def b(v, n=2192):
    """Simulate binary_search for a URL first blocked at snapshot index ``v``.

    Snapshot ``i`` is treated as blocking when ``v <= i``. Each iteration
    records both probed indices (``mid`` and ``mid - 1``) in the module-level
    ``count`` list.

    Args:
        v: Index of the first blocking snapshot.
        n: Number of snapshots, i.e. the exclusive upper bound of the search.
            Defaults to 2192, the value previously hard-coded.

    Returns:
        The index found, or None when the search range is exhausted.
    """
    lo, hi = 1, n
    while lo < hi:
        mid = (lo + hi) // 2
        count.append(mid)
        count.append(mid - 1)
        if v > mid:
            # Not blocked at mid: look later.
            lo = mid + 1
        elif v <= mid - 1:
            # Already blocked at mid - 1: look earlier.
            hi = mid
        else:
            return mid
    return None
# Check that the simulated search finds every index in 1..2191.
for i in range(1, 2192):
    assert i == b(i)
from collections import Counter
# Print the 100 most frequently probed indices, most common first.
print([index for index, _ in Counter(count).most_common(100)])