In [1]:
import json
import pandas as pd
import copy
import datetime
import time
import math

In [2]:
""" The function takes in the search history and list of cookies
    and shorten the urls for comparisons and graph node outputs. """
def shorten_urls(short_history, short_cookies):
    for i in short_history:
        if 'url' in i:
            i['url'] = i['url'].split('/')[2]
    
    for i in short_cookies:
        if 'domain' in i:
            i['domain'] = i['domain'].split('.')[-2]

In [3]:
#load in the json file with cookie and history information
test_data = json.load(open('data.js'))

# needed for making calls to the api to delete cookies
history = copy.deepcopy(test_data['history'])
cookies = copy.deepcopy(test_data['cookies'])


#get shortened urls for cookies and history
short_history = copy.deepcopy(history)
short_cookies = copy.deepcopy(cookies)

#shorten urls
shorten_urls(short_history, short_cookies)

In [4]:
""" Return the number of days (x) in seconds to use for time comparisons. """
def calc_days_in_sec(x):
    return x * 24 * 60 *60

In [5]:
""" Takes in the average time a cookie is placed on a computer for
    (avg_time) and the list of cookies (cookies) to add the estimated
    date the cookie was placed. """
def add_place_date(avg_time, cookies):   
    min_place_time = calc_days_in_sec(avg_time)
    for i in cookies:
        if 'expirationDate' in i:
            i['placeDate'] = i['expirationDate'] - min_place_time
add_place_date(90, short_cookies)

In [6]:
""" Goes through the list of cookies (cookies) for a given site (STR)
    and finds the minimum (oldest) place and maximum (most recent) expiration
    date to find the range in which the cookies could collect data. """
def get_place_and_expir_date(str, cookies):
    place_date = math.inf
    expir_date = -math.inf
    for i in cookies:
        if 'domain' in i and str in i['domain'].lower() and 'expirationDate' in i:
            place_date = min(place_date, i['placeDate'])
            expir_date = max(expir_date, i['expirationDate'])
    return place_date, expir_date

In [7]:
""" Takes in the the date the first cookie was placed (place_date) and the date
    the cookie will expire (expir_date) and the browsing history (history) to 
    create a list of websites visited in time frame (in order from most recent to
    oldest). """
def get_list_of_websites_visited(place_date, expir_date, history):
    visit_List = []
    for i in history:
        visit_time = i['lastVisitTime'] 
        if place_date <= visit_time and visit_time >= expir_date:
            visit_List.append(i)
    return visit_List 

In [8]:
""" Takes a list of websites visited (visit_list) which is from most to least
    recent and return edge pairs for the graph in order of oldest to newest. """
def make_pairs(visit_list):
    len_list = len(visit_list) - 1
    pairs = []
    while len_list > 0:
        curr_url = visit_list[len_list]['url']
        next_url = visit_list[len_list - 1]['url']
        if curr_url != next_url:
            pairs.append([curr_url, next_url])
        len_list -= 1
    return pairs

In [9]:
""" Takes in the website name (str) and returns a list of pairs of websites 
    visited while the cookie was in place. """
def get_nodes_for_cookie(str):
    place_date, expir_date = get_place_and_expir_date(str, short_cookies)
    visit_list = get_list_of_websites_visited(place_date, expir_date, short_history)
    pairs_list = make_pairs(visit_list)
    return pairs_list, place_date, expir_date

In [10]:
get_nodes_for_cookie('youtube')

([['bcourses.berkeley.edu', 'berkeleyforum.slack.com'],
  ['berkeleyforum.slack.com', 'slack.com'],
  ['slack.com', 'mail.google.com'],
  ['mail.google.com', 'ieorpicnicgroup.slack.com'],
  ['ieorpicnicgroup.slack.com', 'www.facebook.com'],
  ['www.facebook.com', 'www.google.com'],
  ['www.google.com', 'scholar.google.com'],
  ['scholar.google.com', 'ieeexplore.ieee.org'],
  ['ieeexplore.ieee.org', 'dl.acm.org'],
  ['dl.acm.org', 'scholar.google.com'],
  ['scholar.google.com', 'link.springer.com'],
  ['link.springer.com', 'www.springer.com'],
  ['www.springer.com', 'www.lib.berkeley.edu'],
  ['www.lib.berkeley.edu', 'libproxy.berkeley.edu'],
  ['libproxy.berkeley.edu', 'login.libproxy.berkeley.edu'],
  ['login.libproxy.berkeley.edu', 'shib.berkeley.edu'],
  ['shib.berkeley.edu', 'p8888-ucelinks.cdlib.org.libproxy.berkeley.edu'],
  ['p8888-ucelinks.cdlib.org.libproxy.berkeley.edu', 'www.springer.com'],
  ['www.springer.com', 'link.springer.com'],
  ['link.springer.com', 'p8888-ucelinks.

In [11]:
""" Return the number of cookies that aren't restricted to the website
    that placed the cookie. """
def calc_num_restrictionless_cookies(cookies):    
    no_restrict = 0
    for i in cookies:
        if i['hostOnly'] == False and i['sameSite'] == 'no_restriction':
            no_restrict += 1
    return no_restrict
no_restrict = calc_num_restrictionless_cookies(short_cookies)
print("Number of cookies that are restrictionless", no_restrict)  

Number of cookies that are restrictionless 290


In [12]:
""" Calculate the percentage of websites that may contain sensitive 
    information based on the website url. """
def percentage_of_sensitive_information(history):
    websites = 0
    for i in history:
        if 'edu' in i['url'] or 'berkeley' in i['url']:
            websites += 1
    return websites/len(test_data['history']) * 100
per = percentage_of_sensitive_information(short_history)
print("Percentage of websites that might contain sensitive data", per)        

Percentage of websites that might contain sensitive data 17.0


In [13]:
""" Returns the number of cookies that are not restricted to HTTPS sites. """
def insecure_num_cookies(cookies):    
    insecure = 0
    for i in cookies:
        if i['secure'] == False:
            insecure += 1
    return insecure
insecure = insecure_num_cookies(short_cookies)
print("Number of insecure cookies", insecure)  

Number of insecure cookies 392


In [14]:
visit, place_date, expir_date = get_nodes_for_cookie('facebook')
print(len(visit))

63


In [15]:
""" Takes in the expiration date of a cookie and returns the number of days
    the cookie will remain on the laptop. """
def how_much_longer(expir_date):
    current = time.time()
    days_left = (expir_date - current)/(60*60*24)
    return days_left
print(how_much_longer(expir_date))  

716.1493789215524


In [16]:
#print format of history items
len(test_data)
print(short_history[0])

{'id': '47348', 'url': 'stackoverflow.com', 'visitCount': 1, 'title': 'How can I pretty-print JSON using JavaScript? - Stack Overflow', 'lastVisitTime': 1524550229203.5789, 'typedCount': 0}


In [17]:
#print format of cookies items
print(test_data['cookies'][0])

{'domain': 'reserve.jacobshall.org', 'path': '/schedule/login/Reservations', 'expirationDate': 1587585931.903761, 'secure': False, 'httpOnly': False, 'sameSite': 'no_restriction', 'name': 'SS_181449_username', 'storeId': '0', 'value': 'skudtarkar%40berkeley.eduK', 'hostOnly': True, 'session': False}


In [18]:
#print the comparison to determine order of history
last_time = 0
for i in short_history:
    #print(i['url'])
    #print(datetime.datetime.utcfromtimestamp(i['lastVisitTime']))
    print(last_time)
    if last_time <= i['lastVisitTime']:
        print('higher than last time')
    else:
        print('lower than last time')
    last_time = i['lastVisitTime']    

0
higher than last time
1524550229203.5789
lower than last time
1524550224310.732
lower than last time
1524550199405.6611
lower than last time
1524550121931.438
lower than last time
1524549593781.129
lower than last time
1524549481207.16
lower than last time
1524549476682.011
lower than last time
1524549399355.885
lower than last time
1524549399336.76
lower than last time
1524549219714.112
lower than last time
1524549219261.932
lower than last time
1524549134500.521
lower than last time
1524549123955.048
lower than last time
1524549122001.3708
lower than last time
1524549052496.623
lower than last time
1524549028480.4631
lower than last time
1524548794132.3918
lower than last time
1524548280645.653
lower than last time
1524548274840.4282
lower than last time
1524548269479.605
lower than last time
1524548107472.334
lower than last time
1524547912707.244
lower than last time
1524547894129.2268
lower than last time
1524547873993.402
lower than last time
1524547733573.968
lower than last t