In [1]:
# noexport

import os
# Run the external `export_notebook` command on analysis_base.ipynb through the shell.
# NOTE(review): the `# noexport` marker above presumably tells that tool to leave this
# cell out of the exported module — confirm against the exporter.
os.system('export_notebook analysis_base.ipynb')

0

In [2]:
import urlparse
from glob import glob
import os
from os import path

from memoized import memoized
import numpy

# Use ujson when it is installed, otherwise fall back to the standard json module.
# Only ImportError is caught so that unrelated failures (for example a
# KeyboardInterrupt during import) are not silently swallowed.
try:
  import ujson as json
except ImportError:
  import json

from collections import Counter
from operator import itemgetter

import pyximport
pyximport.install(setup_args={"include_dirs":numpy.get_include()})
from decompress_lzstring_base64_cython import decompressFromBase64

In [3]:
# Manual overrides for the loaders below; get_basedir() returns 'basedir'
# directly whenever it is not None.
tmi_overrides = dict(basedir=None)

@memoized
def get_basedir():
  """Return the directory that holds the survey data files.

  If tmi_overrides['basedir'] is not None it is returned as-is. Otherwise the
  paths matching 'browsingsurvey_*' in the current working directory are
  listed and the one that sorts last is returned.

  NOTE(review): because of @memoized the first result is presumably cached, so
  changing tmi_overrides['basedir'] after the first call would have no effect
  — confirm.

  Raises:
    IndexError: if there is no override and nothing matches 'browsingsurvey_*'.
  """
  override = tmi_overrides['basedir']
  if override is not None:
    return override
  # '__file__' is a string literal here, not the module attribute, so this
  # resolves to the (real path of the) current working directory.
  pathbase = path.dirname(path.realpath('__file__'))
  candidates = sorted(glob(pathbase + '/browsingsurvey_*'), reverse=True)
  if not candidates:
    # Same exception type as indexing an empty list, but with a usable message.
    raise IndexError('no browsingsurvey_* directory found in ' + pathbase)
  return candidates[0]

def get_basedir_file(filename):
  """Return the path of `filename` inside the directory given by get_basedir()."""
  return '/'.join((get_basedir(), filename))

def jsonload_basedir_file(filename):
  """Parse `filename` from the data directory as JSON and return the result.

  The file is opened in a `with` block so the handle is closed as soon as
  parsing finishes (or fails), instead of being left to the garbage collector.
  """
  with open(get_basedir_file(filename)) as infile:
    return json.load(infile)


In [4]:
def decompress_data_lzstring_base64(data):
  """Decode `data` when it is a string, otherwise hand it back untouched.

  A value whose exact type is unicode or str is passed through
  decompressFromBase64 and the result is parsed as JSON. Any other value is
  returned as-is.
  """
  if type(data) not in (unicode, str):
    return data
  return json.loads(decompressFromBase64(data))

def uncompress_data_subfields(x):
  """Decode the 'windows' and 'data' entries of record `x` in place.

  For each of the two keys that is present and whose value has exact type
  unicode or str, the value is replaced by the JSON parsed from
  decompressFromBase64(value). The same dict `x` is returned.
  """
  for field in ('windows', 'data'):
    if field not in x:
      continue
    if type(x[field]) in (unicode, str):
      x[field] = json.loads(decompressFromBase64(x[field]))
  return x

@memoized
def get_history_pages():
  """Load history_pages.json from the data directory and decode each record's subfields."""
  records = jsonload_basedir_file('history_pages.json')
  return [uncompress_data_subfields(record) for record in records]

@memoized
def get_history_visits():
  """Load history_visits.json from the data directory and decode each record's subfields."""
  records = jsonload_basedir_file('history_visits.json')
  return [uncompress_data_subfields(record) for record in records]

@memoized
def get_survey_results():
  """Load and return the parsed content of surveyresults.json from the data directory."""
  survey_results = jsonload_basedir_file('surveyresults.json')
  return survey_results

@memoized
def get_user_to_hist_pages():
  """Group the history_pages records by their 'user' field.

  Returns a dict mapping each user to the list of that user's records, in
  file order. Records without a 'user' key are ignored.
  """
  by_user = {}
  for record in get_history_pages():
    if 'user' in record:
      by_user.setdefault(record['user'], []).append(record)
  return by_user

@memoized
def get_user_to_hist_visits():
  """Group the history_visits records by their 'user' field.

  Returns a dict mapping each user to the list of that user's records, in
  file order. Records without a 'user' key are ignored.
  """
  by_user = {}
  for record in get_history_visits():
    if 'user' in record:
      by_user.setdefault(record['user'], []).append(record)
  return by_user

def iterate_hist_pages_for_user(user):
  """Return the list of history_pages records for `user` (KeyError if there are none)."""
  pages_by_user = get_user_to_hist_pages()
  return pages_by_user[user]

def iterate_hist_visits_for_user(user):
  """Return the list of history_visits records for `user` (KeyError if there are none)."""
  visits_by_user = get_user_to_hist_visits()
  return visits_by_user[user]


In [5]:
@memoized
def get_history_valid_hids_for_user(user):
  """Return the sorted list of history ids (hids) that are usable for `user`.

  A hid is usable when it appears in at least one history_pages record and
  its history_visits upload is complete, i.e. the number of distinct 'idx'
  values seen for that hid equals the record's 'totalparts'.

  Raises:
    ValueError: if a record has totalparts < 1, if records of the same hid
      disagree on totalparts, or if more distinct parts than totalparts show up.
    KeyError: if `user` has no history_pages or history_visits records.
  """
  hid_with_history_pages = set(line['hid'] for line in iterate_hist_pages_for_user(user))
  hid_to_totalparts = {}
  hid_to_seenparts = {}
  hid_with_complete_history_visits = set()
  for line in iterate_hist_visits_for_user(user):
    hid = line['hid']
    totalparts = line['totalparts']
    idx = line['idx']
    # Real exception objects are raised below: `raise '<string>'` is itself a
    # TypeError on Python 2.6+, which would hide the intended message.
    if totalparts < 1:
      raise ValueError('have totalparts value less than one of %s for user %s' % (totalparts, user))
    if hid not in hid_to_totalparts:
      hid_to_totalparts[hid] = totalparts
    elif hid_to_totalparts[hid] != totalparts:
      raise ValueError('inconsistent totalparts for user %s on hid %s with values %s and %s' % (user, hid, totalparts, hid_to_totalparts[hid]))
    # A set is used so that a part delivered twice is only counted once.
    seenparts = hid_to_seenparts.setdefault(hid, set())
    seenparts.add(idx)
    num_parts_seen_so_far = len(seenparts)
    if num_parts_seen_so_far > totalparts:
      raise ValueError('num parts seen so far %s is greater than totalparts %s for user %s' % (num_parts_seen_so_far, totalparts, user))
    if num_parts_seen_so_far == totalparts:
      hid_with_complete_history_visits.add(hid)
  return sorted(hid for hid in hid_with_complete_history_visits if hid in hid_with_history_pages)

In [6]:
@memoized
def get_history_pages_for_user(user):
  """Return the 'data' of the first history_pages record for `user`'s highest valid hid.

  An empty list is returned when the user has no valid hid or when no record
  carries that hid.
  """
  valid_hids = get_history_valid_hids_for_user(user)
  if not valid_hids:
    return []
  newest_hid = max(valid_hids)
  matching_data = (
    record['data']
    for record in iterate_hist_pages_for_user(user)
    if record['hid'] == newest_hid
  )
  return next(matching_data, [])

@memoized
def get_history_visits_for_user(user):
  """Merge the history_visits 'data' dicts for `user` into one dict.

  Records whose hid is lower than the user's highest valid hid are skipped.
  The 'data' of the remaining records is merged in record order, so a key
  from a later record replaces the same key from an earlier one. An empty
  dict is returned when the user has no valid hid.
  """
  valid_hids = get_history_valid_hids_for_user(user)
  if not valid_hids:
    return {}
  newest_hid = max(valid_hids)
  merged = {}
  for record in iterate_hist_visits_for_user(user):
    # NOTE(review): records with hid above newest_hid are also merged here,
    # while get_history_pages_for_user only uses hid == target — confirm
    # that this difference is intended.
    if record['hid'] < newest_hid:
      continue
    merged.update(record['data'])
  return merged


In [7]:

@memoized
def get_survey_results_by_user():
  """Index the survey results by their 'id' field.

  Entries without an 'id' key are ignored; if an id occurs more than once the
  last entry wins.
  """
  return {entry['id']: entry for entry in get_survey_results() if 'id' in entry}

def get_survey_results_for_user(user):
  """Return the survey result entry whose 'id' is `user` (KeyError if there is none)."""
  results_by_user = get_survey_results_by_user()
  return results_by_user[user]

@memoized
def list_users():
  """Return the survey users that also have history_pages and history_visits records."""
  pages_by_user = get_user_to_hist_pages()
  visits_by_user = get_user_to_hist_visits()
  users = []
  for user in get_survey_results_by_user().keys():
    if user in pages_by_user and user in visits_by_user:
      users.append(user)
  return users

@memoized
def get_results_by_user():
  """Collect, for every user from list_users(), a dict with that user's data.

  Each per-user dict has the keys 'history_pages', 'history_visits' and
  'survey_results', filled from the matching *_for_user function.
  """
  getters = {
    'history_pages': get_history_pages_for_user,
    'history_visits': get_history_visits_for_user,
    'survey_results': get_survey_results_for_user
  }
  return {
    user: {name: getter(user) for name, getter in getters.viewitems()}
    for user in list_users()
  }


In [8]:
def url_to_domain(url):
  """Return the netloc component of `url` as parsed by urlparse ('' if it has none)."""
  parsed = urlparse.urlparse(url)
  return parsed.netloc

def print_counter(counter, **kwargs):
  num = kwargs.get('num', 100)
  keys_and_values = [{'key': k, 'val': v} for k,v in counter.viewitems()]
  keys_and_values.sort(key=itemgetter('val'), reverse=True)
  for item in keys_and_values[:num]:
    print item['key'], item['val']

In [9]:
def compute_per_user(func):
  """Call `func(user)` for every user from list_users() and return {user: result}."""
  # The return value is not used in this function.
  # NOTE(review): presumably called to populate the memoized per-user results
  # up front — confirm.
  get_results_by_user()
  return {user: func(user) for user in list_users()}

@memoized
def get_history_ordered_visits_for_user(user):
  """Return all of `user`'s visits as a single list sorted by 'visitTime'.

  Every returned visit is a shallow copy of the stored visit dict with a 'url'
  key added. Copies are used so that the dicts owned by the memoized
  get_history_visits_for_user() result are not modified as a side effect.
  """
  ordered_visits = []
  for url, visits in get_history_visits_for_user(user).viewitems():
    # dict(visit, url=url) copies the visit and sets/overrides its 'url' key.
    ordered_visits.extend(dict(visit, url=url) for visit in visits)
  ordered_visits.sort(key=itemgetter('visitTime'))
  return ordered_visits

@memoized
def get_domain_to_num_history_visits_for_user(user):
  """Return a Counter mapping url_to_domain(url) to `user`'s number of visits there."""
  visits_per_domain = Counter()
  url_to_visits = get_history_visits_for_user(user)
  for url in url_to_visits:
    visits_per_domain[url_to_domain(url)] += len(url_to_visits[url])
  return visits_per_domain

In [10]:
# Users that have survey results as well as history_pages and history_visits records.
print list_users()

[u'jess', u'shivaal']


In [11]:
#print get_history_visits_for_user('jess').keys()

In [12]:
#print get_history_pages()[0].keys()

In [13]:
#print get_history_visits()[0].keys()

In [14]:
# Per-domain visit counts for user 'jess', largest first (print_counter prints at most 100 entries by default).
print_counter(get_domain_to_num_history_visits_for_user('jess'))

www.facebook.com 16377
docs.google.com 9812
www.youtube.com 3560
www.google.com 2402
mail.google.com 2103
localhost:8080 1818
www.instagram.com 1300
localhost:3000 1239
web.stanford.edu 390
photos.google.com 309
calendar.google.com 286
www.hulu.com 267
www.airbnb.com 255
www.linkedin.com 244
 217
piazza.com 217
github.com 210
personalityfactors.net 200
www.express.com 184
orgsync.com 175
www.yelp.com 150
drive.google.com 138
stackoverflow.com 135
scholar.google.com 134
web.groupme.com 129
www.lulus.com 127
www.amazon.com 118
www.ae.com 114
www.pandora.com 111
www.missguidedus.com 107
www.netflix.com 106
www.forever21.com 93
gradescope.com 92
www.picmonkey.com 87
soundcloud.com 82
canvas.stanford.edu 76
weblogin.stanford.edu 75
groupme.com 70
axess.sahr.stanford.edu 69
en.wikipedia.org 67
mvideox.stanford.edu 66
localhost:8000 51
therighthairstyles.com 49
www.w3schools.com 47
browsingsurvey.herokuapp.com 46
dashboard.heroku.com 45
docs.angularjs.org 44
www.solarmovie.ph 43
secure.bankof

In [15]:
#print iterate_hist_pages_for_user('jess').keys()

In [16]:
# Per-domain visit counts for user 'shivaal', largest first (print_counter prints at most 100 entries by default).
print_counter(get_domain_to_num_history_visits_for_user('shivaal'))

docs.google.com 19582
www.google.com 5599
www.facebook.com 5456
www.messenger.com 2666
web.stanford.edu 2176
drive.google.com 1484
www.youtube.com 1304
localhost:3000 1233
piazza.com 1038
mail.google.com 880
github.com 733
i.imgur.com 694
 689
www.amazon.com 441
canvas.stanford.edu 395
developers.google.com 390
imgur.com 388
www.reddit.com 363
stackoverflow.com 297
m.facebook.com 272
keep.google.com 252
localhost:8000 186
explorecourses.stanford.edu 162
weblogin.stanford.edu 160
app.asana.com 160
gradescope.com 154
www.bibme.org 147
www.netflix.com 143
en.wikipedia.org 143
www.instagram.com 127
www.glassdoor.com 126
www.yikyak.com 120
secure.bankofamerica.com 118
www.linkedin.com 116
materializecss.com 108
calendar.google.com 106
www.applyweb.com 101
axess.sahr.stanford.edu 95
console.developers.google.com 91
www.rescuetime.com 90
mvideox.stanford.edu 87
25live.collegenet.com 86
quip.com 85
www.twilio.com 83
devpost.com 81
www.treehacks.com 80
xsearch.stanford.edu 79
us11.admin.mailchi