In [None]:
from elasticsearch import Elasticsearch
from elasticsearch.helpers import parallel_bulk, bulk
import os
from tqdm import tqdm
import pandas as pd
import random
import json
from urllib.parse import urlparse

# Build the third-crawl records (kept under the `_2022_policies` name), keyed
# by content hash. Only metadata rows whose 'final' flag is truthy are kept.
_2022_policies = {}
with open('/data/privseer/third-crawl/metadata_2023') as f:
    # One JSON object per line; iterate the file instead of loading every
    # line into memory first.
    for raw in f:
        line = json.loads(raw)
        if not line['final']:
            continue
        _2022_policies[line['hash']] = {
            'domain': urlparse(line['url']).netloc,
            'url': line['url'],
            'hash': line['hash'],
            'pagerank': line['pagerank'],
            'readability': line['readability'],
            # The original literal repeated the 'industry' key; a single
            # entry yields the same dict.
            'industry': line['industry'],
            'probability': line['proba'],
            'tracking_tech': [],
            'self_regulatory_bodies': [],
            'agreements_regulations': [],
            # Keep only the part of the timestamp before the first space.
            'display_date': line['timestamp'].split(' ')[0],
            'html_location': '/data/privseer/third-crawl/third_crawl/third_crawl/urls/' + line['folder_number'] + '/',
        }

# Seed the 2019 records, keyed by hash, from line-delimited JSON metadata.
# Only the display date and HTML location are filled in at this point.
_2019_policies = {}
with open('/data/privseer/data/updated_privaseer_metadata1') as f:
    for raw in f:
        record = json.loads(raw)
        # Reverse the order of the '-'-separated fields of 'crawl-date'.
        date_fields = record['crawl-date'].split('-')
        date_fields.reverse()
        entry = {
            'display_date': '-'.join(date_fields),
            'html_location': '/data/privseer/urls/' + record['path'] + '/',
        }
        _2019_policies[record['hash']] = entry

# Seed the 2020 records, keyed by 'file_hash', from line-delimited JSON
# metadata: industry, display date and HTML location.
_2020_policies = {}
with open('/data/privseer/data/updated_linkedin_metadata1') as f:
    for raw in f:
        record = json.loads(raw)
        industry = record['industry']
        # Take the text before the first space, then reverse its
        # '-'-separated fields.
        day = record['crawl_date_time'].split(' ')[0]
        display_date = '-'.join(reversed(day.split('-')))
        location = '/data/sxn5310/privacysearch/privacy_policies/pages/' + record['folder_num'] + '/'
        _2020_policies[record['file_hash']] = {
            'industry': industry,
            'display_date': display_date,
            'html_location': location,
        }

# Seed the 2021 records, keyed by hash. A row is kept only when its
# probability exceeds 0.5, it is neither a duplicate nor a near-duplicate,
# and it is verified.
_2021_policies = {}
with open('/data/privseer/data/2021_crawl/2021_metadata') as f:
    for raw in f:
        record = json.loads(raw)
        # Guard clauses, evaluated in the same order as the original `and` chain.
        if not (record['proba'] > 0.5):
            continue
        if record['duplicate'] or record['near_duplicate']:
            continue
        if not record['verified']:
            continue
        response_url = record['response']
        _2021_policies[record['hash']] = {
            'domain': urlparse(response_url).netloc,
            'url': response_url,
            'probability': record['proba'],
            'display_date': record['timestamp'].split(' ')[0],
            'html_location': '/data/privseer/crawl-update/crawlupdate/crawlupdate/urls/' + record['folder_number'] + '/',
            # NOTE(review): vagueness is a uniform random draw in [0, 1], not a
            # computed score — looks like a placeholder; confirm before indexing.
            'vagueness': random.uniform(0, 1),
        }

# Attach PageRank values to the 2019 records. Each line must hold exactly two
# comma-separated fields: "<hash>,<value>".
with open('/data/privseer/data/pageranks') as f:
    for row in f:
        key, rank = row.strip().split(',')
        record = _2019_policies.get(key)
        if record is not None:
            # NOTE(review): stored as the raw string (no float conversion),
            # unlike the readability/probability loaders — confirm intent.
            record['pagerank'] = rank

# Attach PageRank values to the 2020 records. Each line must hold exactly two
# comma-separated fields: "<hash>,<value>".
with open('/data/privseer/data/pageranks-linkedin') as f:
    for row in f:
        key, rank = row.strip().split(',')
        record = _2020_policies.get(key)
        if record is not None:
            # NOTE(review): stored as the raw string (no float conversion),
            # unlike the readability/probability loaders — confirm intent.
            record['pagerank'] = rank

# Attach PageRank values to the 2021 records. Each line must hold exactly two
# comma-separated fields: "<hash>,<value>".
with open('/data/privseer/data/2021_crawl/pageranks') as f:
    for row in f:
        key, rank = row.strip().split(',')
        record = _2021_policies.get(key)
        if record is not None:
            # NOTE(review): stored as the raw string (no float conversion),
            # unlike the readability/probability loaders — confirm intent.
            record['pagerank'] = rank

# Attach readability scores to the 2019 records. Only the first two
# comma-separated fields of each line are used: hash, then score.
with open('/data/privseer/data/readability-output') as f:
    for row in f:
        fields = row.strip().split(',')
        key = fields[0]
        # Converted before the membership test, so a malformed score raises
        # even for hashes that are not in the 2019 set.
        score = float(fields[1])
        record = _2019_policies.get(key)
        if record is not None:
            record['readability'] = score

# Attach readability scores to the 2021 records. Each line must hold exactly
# two comma-separated fields: "<hash>,<score>".
with open('/data/privseer/data/2021_crawl/readability') as f:
    for row in f:
        key, raw_score = row.strip().split(',')
        # Converted before the membership test, as in the original.
        score = float(raw_score)
        record = _2021_policies.get(key)
        if record is not None:
            record['readability'] = score

# Attach classifier probabilities to the 2020 records. Each line must hold
# exactly two single-space-separated fields: "<hash> <probability>".
with open('/data/sxn5310/document_classifier_output/probabilities') as f:
    for row in f:
        key, raw_proba = row.split(' ')
        record = _2020_policies.get(key)
        if record is not None:
            # float() tolerates the trailing newline left on raw_proba.
            record['probability'] = float(raw_proba)

# Attach classifier probabilities to the 2019 records. Each line must hold
# exactly two single-space-separated fields: "<hash> <probability>".
with open('/data/privseer/python-files/probabilities') as f:
    for row in f:
        key, raw_proba = row.split(' ')
        record = _2019_policies.get(key)
        if record is not None:
            # float() tolerates the trailing newline left on raw_proba.
            record['probability'] = float(raw_proba)

# Give every 2019 record a 'tracking_tech' list naming each technology whose
# hash collection contains the record's hash.
# NOTE(review): presumably the JSON maps technology name -> list of hashes — confirm.
with open('/data/privseer/data/tracking-tech.json') as f:
    tech_to_hashes = json.load(f)
# Pair each label (hyphens removed) with a set of its hashes, in file order,
# so membership tests are constant-time.
tech_index = [
    (tech.replace('-', ''), set(hashes))
    for tech, hashes in tech_to_hashes.items()
]
for record_hash, record in _2019_policies.items():
    record['tracking_tech'] = [
        label for label, hashes in tech_index if record_hash in hashes
    ]


# Give every 2021 record a 'tracking_tech' list naming each technology whose
# hash collection contains the record's hash.
# NOTE(review): presumably the JSON maps technology name -> list of hashes — confirm.
with open('/data/privseer/data/2021_crawl/tracking-tech.json') as f:
    tech_to_hashes = json.load(f)
# Pair each label (hyphens removed) with a set of its hashes, in file order,
# so membership tests are constant-time.
tech_index = [
    (tech.replace('-', ''), set(hashes))
    for tech, hashes in tech_to_hashes.items()
]
for record_hash, record in _2021_policies.items():
    record['tracking_tech'] = [
        label for label, hashes in tech_index if record_hash in hashes
    ]


# Read "<hash> <score>" lines into `vagueness`, then give every 2019 record a
# 'vagueness' value, falling back to 0.3 when the hash has no score.
with open('/data/privseer/data/vagueness_output') as f:
    vagueness = {}
    for row in f:
        key, raw_score = row.split(' ')
        vagueness[key] = float(raw_score)
for record_hash, record in _2019_policies.items():
    record['vagueness'] = vagueness.get(record_hash, 0.3)


# Translate the per-policy boolean flags of the regulations analysis into the
# 'self_regulatory_bodies' and 'agreements_regulations' lists of 2020 records.
with open('/data/sxn5310/regulations-analysis/privaseer2') as f:
    rows = list(f)

# (flag in the input row, target list on the record, label appended), in the
# order the flags are checked.
# NOTE(review): the 'nai' flag is recorded under the label 'nia' — possibly a
# typo, kept as-is because consumers of the records may rely on it.
_REGULATION_LABELS_2020 = (
    ('nai', 'self_regulatory_bodies', 'nia'),
    ('daa', 'self_regulatory_bodies', 'daa'),
    ('edaa', 'self_regulatory_bodies', 'edaa'),
    ('gdpr', 'agreements_regulations', 'gdpr'),
    ('coppa', 'agreements_regulations', 'coppa'),
    ('caloppa', 'agreements_regulations', 'caloppa'),
    ('privacyshield', 'agreements_regulations', 'privacyshield'),
)

# Every record gets both lists, even when no row mentions it.
for record in _2020_policies.values():
    record['agreements_regulations'] = []
    record['self_regulatory_bodies'] = []

for raw in rows:
    flags = json.loads(raw)
    record = _2020_policies.get(flags['file_hash'])
    if record is None:
        continue
    for flag, field, label in _REGULATION_LABELS_2020:
        if flags[flag]:
            record[field].append(label)

# Translate the per-policy boolean flags of the 2021 regulations file into the
# 'self_regulatory_bodies' and 'agreements_regulations' lists of 2021 records.
with open('/data/privseer/data/2021_crawl/regulations.json') as f:
    rows = list(f)

# (flag in the input row, target list on the record, label appended), in the
# order the flags are checked.
# NOTE(review): 'nai' is recorded as 'nia' and 'hipaa' as 'hippa' — possibly
# typos, kept as-is because consumers of the records may rely on them.
_REGULATION_LABELS_2021 = (
    ('nai', 'self_regulatory_bodies', 'nia'),
    ('daa', 'self_regulatory_bodies', 'daa'),
    ('edaa', 'self_regulatory_bodies', 'edaa'),
    ('gdpr', 'agreements_regulations', 'gdpr'),
    ('privacyshield', 'agreements_regulations', 'privacyshield'),
    ('coppa', 'agreements_regulations', 'coppa'),
    ('caloppa', 'agreements_regulations', 'caloppa'),
    ('ccpa', 'agreements_regulations', 'ccpa'),
    ('hipaa', 'agreements_regulations', 'hippa'),
)

# Every record gets both lists, even when no row mentions it.
for record in _2021_policies.values():
    record['agreements_regulations'] = []
    record['self_regulatory_bodies'] = []

for raw in rows:
    flags = json.loads(raw)
    record = _2021_policies.get(flags['file_hash'])
    if record is None:
        continue
    for flag, field, label in _REGULATION_LABELS_2021:
        if flags[flag]:
            record[field].append(label)

# Load the 2021 hash-to-industry JSON file into `_2021_industries`.
# NOTE(review): presumably maps policy hash -> industry label (going by the
# file name); nothing in this part of the script reads it — confirm where it
# is consumed.
with open('/data/privseer/data/2021_crawl/hash_to_industry') as f:
    _2021_industries = json.load(f)


# Attach extracted text and title to the third-crawl records. Every file in
# the directory holds one JSON object per line with 'hash', 'text' and 'title'.
files = os.listdir('/data/privseer/third-crawl/boilerpipe-policy-text/')

for file in tqdm(files):
    # The original reset an unused `actions` list per file and loaded every
    # line up front; both are dropped and lines are streamed instead.
    with open('/data/privseer/third-crawl/boilerpipe-policy-text/'+file, encoding='utf-8') as f:
        for raw in f:
            line = json.loads(raw)
            record = _2022_policies.get(line['hash'])
            if record is None:
                # Not one of the retained ('final') policies.
                continue
            record['text'] = line['text']
            record['title'] = line['title']
            # Constant value; same number the 2019 loader uses as its fallback.
            record['vagueness'] = 0.3
            record['crawl_date'] = ['o2022']

# Attach crawl tag, extracted text and title to the 2020 records. Every file
# in the directory holds one JSON object per line with 'hash', 'text', 'title'.
files = os.listdir('/data/privseer/linkedin-boilerpipe-policy-text/')

for file in tqdm(files):
    # The original reset an unused `actions` list and bound an unused `_hash`
    # per line; both are dropped and lines are streamed instead.
    with open('/data/privseer/linkedin-boilerpipe-policy-text/'+file, encoding='utf-8') as f:
        for raw in f:
            line = json.loads(raw)
            record = _2020_policies.get(line['hash'])
            if record is None:
                continue
            record['crawl_date'] = ['m2020']
            record['text'] = line['text']
            record['title'] = line['title']


# Attach crawl tag, extracted text and title to the 2019 records. Every file
# in DIRECTORY holds one JSON object per line with 'hash', 'text' and 'title'.
DIRECTORY = '/data/privseer/boilerpipe-policy-text/'
file_dump = os.listdir(DIRECTORY)

for file in tqdm(file_dump):
    # Read as UTF-8 explicitly, like the other boilerpipe text loaders in this
    # script, instead of depending on the platform's default encoding.
    with open(DIRECTORY+file, encoding='utf-8') as f:
        for raw in f:
            line = json.loads(raw)
            # The original also set line['industry'] = 'nan' here; that value
            # was never read afterwards, so the dead store is removed.
            record = _2019_policies.get(line['hash'])
            if record is None:
                continue
            record['crawl_date'] = ['j2019']
            record['text'] = line['text']
            record['title'] = line['title']


# Attach crawl tag, extracted text and title to the 2021 records. Every file
# in DIRECTORY holds one JSON object per line with 'hash', 'text' and 'title'.
DIRECTORY = '/data/privseer/crawl-update/boilerpipe-policy-text/'
file_dump = os.listdir(DIRECTORY)
for file in tqdm(file_dump):
    # Read as UTF-8 explicitly, like the other boilerpipe text loaders in this
    # script, instead of depending on the platform's default encoding. The
    # unused per-file `actions` list from the original is dropped.
    with open(DIRECTORY+file, encoding='utf-8') as f:
        for raw in f:
            line = json.loads(raw)
            # The original also set line['industry'] = 'nan' here; that value
            # was never read afterwards, so the dead store is removed.
            record = _2021_policies.get(line['hash'])
            if record is None:
                continue
            record['crawl_date'] = ['a2021']
            record['text'] = line['text']
            record['title'] = line['title']


# Sanity check: print up to the first three (hash, record) pairs of each
# crawl's dict, with a blank line between consecutive crawls.
_preview_sets = (_2019_policies, _2020_policies, _2021_policies, _2022_policies)
for position, policies in enumerate(_preview_sets):
    if position > 0:
        print()
    for shown, key in enumerate(policies):
        if shown == 3:
            break
        print((key, policies[key]))
