In [1]:
from tmilib import *
import csv
from itertools import izip

In [2]:

import plotly.graph_objs as go
import plotly.offline as py

py.init_notebook_mode()



In [3]:
import scipy
import scipy.stats

from timeactive_algorithm_utils import *

In [4]:
user_to_time_to_domain_id = {}
user_to_time_to_nextdomain_id = {}
user_to_time_to_prev7_domains_id = {}
user_to_time_to_immediate_prev7_domains_id = {}

In [5]:
def predict_domain_id_baseline2_algorithm(test_data_line, predictions_line):
  user = test_data_line[1]
  most_popular_domain = get_most_popular_domain_for_user(user)
  return domain_to_id(most_popular_domain)


In [6]:
def predict_domain_id_baseline_algorithm(test_data_line, predictions_line):
  user = test_data_line[1]
  time_to_domain_id = user_to_time_to_domain_id.get(user, None)
  if time_to_domain_id == None:
    time_to_domain_id = get_recent_domain_id_at_seconds_for_user(user)
    user_to_time_to_domain_id[user] = time_to_domain_id
  time_sec = test_data_line[0] # needs to remain string so we can index into things
  return time_to_domain_id[time_sec]

In [7]:
def predict_domain_id_reference_algorithm(test_data_line, predictions_line):
  ref_domain = test_data_line[2]
  if ref_domain == 'none':
    return -1
  ref_domain_id = domain_to_id(ref_domain)
  return ref_domain_id

In [8]:


def predict_domain_id_our_algorithm(test_data_line, predictions_line):
  predict = predictions_line[0]
  label = test_data_line[3]
  ref_domain = test_data_line[2]
  ref_domain_id = domain_to_id(ref_domain)
  if predict == label:
    return ref_domain_id
  time_sec = test_data_line[0] # needs to remain string so we can index into things
  user = test_data_line[1]
  time_to_domain_id = user_to_time_to_domain_id.get(user, None)
  if time_to_domain_id == None:
    time_to_domain_id = get_recent_domain_id_at_seconds_for_user(user)
    user_to_time_to_domain_id[user] = time_to_domain_id
  time_to_nextdomain_id = user_to_time_to_nextdomain_id.get(user, None)
  if time_to_nextdomain_id == None:
    time_to_nextdomain_id = get_next_domain_id_at_seconds_for_user(user)
    user_to_time_to_nextdomain_id[user] = time_to_nextdomain_id
  time_to_prev7_domains_id = user_to_time_to_prev7_domains_id.get(user, None)
  if time_to_prev7_domains_id == None:
    time_to_prev7_domains_id = get_prev7_domains_id_at_seconds_for_user(user)
    user_to_time_to_prev7_domains_id[user] = time_to_prev7_domains_id
  time_to_immediate_prev7_domains_id = user_to_time_to_prev7_domains_id.get(user, None)
  if time_to_immediate_prev7_domains_id == None:
    time_to_immediate_prev7_domains_id = get_immediate_prev7_domains_id_at_seconds_for_user(user)
    user_to_time_to_immediate_prev7_domains_id[user] = time_to_immediate_prev7_domains_id
  pred_domain_id = -1
  if predict == 'c':
    pred_domain_id = time_to_domain_id[time_sec]
  elif predict == 'n':
    pred_domain_id = time_to_nextdomain_id[time_sec]
  elif predict[0] == 'p':
    num = int(predict[1])
    pred_domain_id = time_to_prev7_domains_id[time_sec][num]
  elif predict[0] == 'i':
    num = int(predict[1])
    pred_domain_id = time_to_immediate_prev7_domains_id[time_sec][num]
  if pred_domain_id == -1:
    pred_domain_id = time_to_domain_id[time_sec]
  return pred_domain_id

In [9]:
def get_user_to_time_to_predicted_domain_id(domain_prediction_algorithm): #, all_insession=False):
  # if all_insession = True, then we are doing it on all domains
  # if false, we are doing it only on active times
  classifier_num = 78
  all_insession = True
  if not all_insession:
    predictions_csv_file = 'domainclass_cpn_v' + str(classifier_num) + '_randomforest_v1_predictions.csv'
  else:
    predictions_csv_file = 'domainclass_cpn_v' + str(classifier_num) + '_randomforest_v1_all_insession_predictions.csv'
  if not all_insession:
    test_data_csv_file = 'domainclass_cpn_test_all_withdomain_v' + str(classifier_num) + '.csv'
  else:
    test_data_csv_file = 'domainclass_cpn_test_all_withdomain_v' + str(classifier_num) + '_all_insession.csv'
  predictions_csv = csv.reader(sdir_open(predictions_csv_file))
  predictions_header = next(predictions_csv)
  assert predictions_header[0] == 'predict'

  test_data_csv = csv.reader(sdir_open(test_data_csv_file))
  test_data_header = next(test_data_csv)
  assert test_data_header[0] == 'time_sec'
  assert test_data_header[1] == 'user'
  assert test_data_header[2] == 'ref_domain'
  assert test_data_header[3] == 'label'

  output = {}

  for predictions_line,test_data_line in izip(predictions_csv, test_data_csv):
    user = test_data_line[1]
    if user not in output:
      output[user] = {}
    ref_domain = test_data_line[2]
    ref_domain_id = domain_to_id(ref_domain)
    pred_domain_id = domain_prediction_algorithm(test_data_line, predictions_line)
    time_sec = int(test_data_line[0])
    output[user][time_sec] = pred_domain_id
  return output
  

In [10]:
@jsonmemoized
def get_user_to_time_to_predicted_domain_id_baseline_algorithm():
  return get_user_to_time_to_predicted_domain_id(predict_domain_id_baseline_algorithm)

In [11]:
@jsonmemoized
def get_user_to_time_to_predicted_domain_id_baseline2_algorithm():
  return get_user_to_time_to_predicted_domain_id(predict_domain_id_baseline2_algorithm)

In [12]:
@jsonmemoized
def get_user_to_time_to_predicted_domain_id_our_algorithm():
  return get_user_to_time_to_predicted_domain_id(predict_domain_id_our_algorithm)

In [13]:
@jsonmemoized
def get_user_to_time_to_predicted_domain_id_reference_algorithm(): # note this will include domain_id=-1 for the inactive times!
  return get_user_to_time_to_predicted_domain_id(predict_domain_id_reference_algorithm)

In [14]:
@jsonmemoized
def get_user_to_domain_id_to_total_time_spent_reference_domainalgorithm_on_referenceactive():
  output = {}
  user_to_time_to_domain_id = get_user_to_time_to_predicted_domain_id_reference_algorithm()
  for user,time_to_domain_id in user_to_time_to_domain_id.viewitems():
    active_set = set(get_active_insession_seconds_for_user(user))
    output[user] = Counter()
    for time,domain_id in time_to_domain_id.viewitems():
      if int(time) not in active_set:
        continue
      if domain_id == -1:
        continue
      output[user][domain_id] += 1
  return output


In [15]:
@jsonmemoized
def get_user_to_domain_id_to_total_time_spent_our_domainalgorithm_on_referenceactive():
  output = {}
  user_to_time_to_domain_id = get_user_to_time_to_predicted_domain_id_our_algorithm()
  for user,time_to_domain_id in user_to_time_to_domain_id.viewitems():
    active_set = set(get_active_insession_seconds_for_user(user))
    output[user] = Counter()
    for time,domain_id in time_to_domain_id.viewitems():
      if int(time) not in active_set:
        continue
      if domain_id == -1:
        continue
      output[user][domain_id] += 1
  return output


In [16]:
@jsonmemoized
def get_user_to_domain_id_to_total_time_spent_baseline_domainalgorithm_on_referenceactive():
  output = {}
  user_to_time_to_domain_id = get_user_to_time_to_predicted_domain_id_baseline_algorithm()
  for user,time_to_domain_id in user_to_time_to_domain_id.viewitems():
    active_set = set(get_active_insession_seconds_for_user(user))
    output[user] = Counter()
    for time,domain_id in time_to_domain_id.viewitems():
      if int(time) not in active_set:
        continue
      if domain_id == -1:
        continue
      output[user][domain_id] += 1
  return output


In [17]:
@jsonmemoized
def get_user_to_domain_id_to_total_time_spent_baseline2_domainalgorithm_on_baseline2active():
  output = {}
  user_to_time_to_domain_id = get_user_to_time_to_predicted_domain_id_baseline2_algorithm()
  active_set_for_user = get_user_to_predicted_times_active_baseline_algorithm()
  for user,time_to_domain_id in user_to_time_to_domain_id.viewitems():
    active_set = set(active_set_for_user[user])
    output[user] = Counter()
    for time,domain_id in time_to_domain_id.viewitems():
      if int(time) not in active_set:
        continue
      if domain_id == -1:
        continue
      output[user][domain_id] += 1
  return output


In [18]:
@jsonmemoized
def get_user_to_domain_id_to_total_time_spent_baseline_domainalgorithm_on_baselineactive():
  output = {}
  user_to_time_to_domain_id = get_user_to_time_to_predicted_domain_id_baseline_algorithm()
  active_set_for_user = get_user_to_predicted_times_active_baseline_algorithm()
  for user,time_to_domain_id in user_to_time_to_domain_id.viewitems():
    active_set = set(active_set_for_user[user])
    output[user] = Counter()
    for time,domain_id in time_to_domain_id.viewitems():
      if int(time) not in active_set:
        continue
      if domain_id == -1:
        continue
      output[user][domain_id] += 1
  return output


In [19]:
@jsonmemoized
def get_user_to_domain_id_to_total_time_spent_our_domainalgorithm_on_ouractive():
  output = {}
  user_to_time_to_domain_id = get_user_to_time_to_predicted_domain_id_our_algorithm()
  active_set_for_user = get_user_to_predicted_times_active_our_algorithm()
  for user,time_to_domain_id in user_to_time_to_domain_id.viewitems():
    active_set = set(active_set_for_user[user])
    output[user] = Counter()
    for time,domain_id in time_to_domain_id.viewitems():
      if int(time) not in active_set:
        continue
      if domain_id == -1:
        continue
      output[user][domain_id] += 1
  return output


In [20]:
baseline_domainalgorithm_on_baselineactive = get_user_to_domain_id_to_total_time_spent_baseline_domainalgorithm_on_baselineactive()

In [21]:
baseline2_domainalgorithm_on_baseline2active = get_user_to_domain_id_to_total_time_spent_baseline2_domainalgorithm_on_baseline2active()

In [22]:
print id_to_domain(51285)

thejigsawpuzzles.com


In [23]:
our_domainalgorithm_on_ouractive = get_user_to_domain_id_to_total_time_spent_our_domainalgorithm_on_ouractive()

In [24]:
#our_domainalgorithm_on_ouractive

In [25]:
our_domainalgorithm_on_refactive = get_user_to_domain_id_to_total_time_spent_our_domainalgorithm_on_referenceactive()


In [26]:
baseline_domainalgorithm_on_refactive = get_user_to_domain_id_to_total_time_spent_baseline_domainalgorithm_on_referenceactive()


In [27]:
ref_domainalgorithm_on_refactive = get_user_to_domain_id_to_total_time_spent_reference_domainalgorithm_on_referenceactive()
#a=get_user_to_domain_id_to_total_time_spent_reference_domainalgorithm_on_referenceactive()

In [51]:
def compute_mean_absolute_error_mean_over_all_users(user_to_domain_to_reference_time, user_to_domain_to_predicted_time):
  output = []
  for user,domain_to_reference_time in user_to_domain_to_reference_time.viewitems():
    total = 0
    error = 0
    for domain_id,reference_time in domain_to_reference_time.viewitems():
      total += reference_time
      predicted_time = user_to_domain_to_predicted_time[user].get(domain_id, 0)
      error += abs(reference_time - predicted_time)
    output.append(error/float(total))
  return numpy.mean(output), numpy.std(output)

In [29]:
def compute_mean_absolute_error_over_all_users(user_to_domain_to_reference_time, user_to_domain_to_predicted_time):
  total = 0
  error = 0
  for user,domain_to_reference_time in user_to_domain_to_reference_time.viewitems():
    for domain_id,reference_time in domain_to_reference_time.viewitems():
      total += reference_time
      predicted_time = user_to_domain_to_predicted_time[user].get(domain_id, 0)
      error += abs(reference_time - predicted_time)
  return float(error) / total

In [30]:
print compute_mean_absolute_error_mean_over_all_users(ref_domainalgorithm_on_refactive, ref_domainalgorithm_on_refactive)
print compute_mean_absolute_error_mean_over_all_users(ref_domainalgorithm_on_refactive, our_domainalgorithm_on_ouractive)
print compute_mean_absolute_error_mean_over_all_users(ref_domainalgorithm_on_refactive, baseline_domainalgorithm_on_baselineactive)


0.0
0.305610103095
0.3436877928


In [52]:
print compute_mean_absolute_error_mean_over_all_users(ref_domainalgorithm_on_refactive, ref_domainalgorithm_on_refactive)
print compute_mean_absolute_error_mean_over_all_users(ref_domainalgorithm_on_refactive, our_domainalgorithm_on_ouractive)
print compute_mean_absolute_error_mean_over_all_users(ref_domainalgorithm_on_refactive, baseline_domainalgorithm_on_baselineactive)
#print compute_mean_absolute_error_mean_over_all_users(ref_domainalgorithm_on_refactive, baseline2_domainalgorithm_on_baseline2active)

(0.0, 0.0)
(0.30561010309527176, 0.12914463015027952)
(0.34368779279990042, 0.12832414425695793)


In [50]:
print compute_mean_absolute_error_over_all_users(ref_domainalgorithm_on_refactive, ref_domainalgorithm_on_refactive)
print compute_mean_absolute_error_over_all_users(ref_domainalgorithm_on_refactive, our_domainalgorithm_on_ouractive)
print compute_mean_absolute_error_over_all_users(ref_domainalgorithm_on_refactive, baseline_domainalgorithm_on_baselineactive)
#print compute_mean_absolute_error_over_all_users(ref_domainalgorithm_on_refactive, baseline2_domainalgorithm_on_baseline2active)

0.0
0.319592250853
0.361415956364


In [32]:
print compute_mean_absolute_error_over_all_users(ref_domainalgorithm_on_refactive, ref_domainalgorithm_on_refactive)
print compute_mean_absolute_error_over_all_users(ref_domainalgorithm_on_refactive, our_domainalgorithm_on_refactive)
print compute_mean_absolute_error_over_all_users(ref_domainalgorithm_on_refactive, baseline_domainalgorithm_on_refactive)
#print compute_mean_absolute_error_over_all_users(ref_domainalgorithm_on_refactive, baseline2_domainalgorithm_on_baseline2active)

0.0
0.280670817626
0.316068048091


In [33]:
def compute_pearsonr_unweighted_over_all_users(user_to_domain_to_reference_time, user_to_domain_to_predicted_time):
  vector1 = []
  vector2 = []
  for user,domain_to_reference_time in user_to_domain_to_reference_time.viewitems():
    for domain_id,reference_time in domain_to_reference_time.viewitems():
      predicted_time = user_to_domain_to_predicted_time[user].get(domain_id, 0)
      vector1.append(reference_time)
      vector2.append(predicted_time)
  return scipy.stats.pearsonr(vector1, vector2)

In [34]:
def compute_pearsonr_unweighted_mean_all_users(user_to_domain_to_reference_time, user_to_domain_to_predicted_time):
  output = []
  for user,domain_to_reference_time in user_to_domain_to_reference_time.viewitems():
    vector1 = []
    vector2 = []
    for domain_id,reference_time in domain_to_reference_time.viewitems():
      predicted_time = user_to_domain_to_predicted_time[user].get(domain_id, 0)
      vector1.append(reference_time)
      vector2.append(predicted_time)
    #if len(vector1) == 0:
    #  continue
    output.append(scipy.stats.pearsonr(vector1, vector2)[0])
  return numpy.mean(output)

In [53]:
def compute_pearsonrsquared_mean_all_users(user_to_domain_to_reference_time, user_to_domain_to_predicted_time):
  output = []
  for user,domain_to_reference_time in user_to_domain_to_reference_time.viewitems():
    vector1 = []
    vector2 = []
    for domain_id,reference_time in domain_to_reference_time.viewitems():
      predicted_time = user_to_domain_to_predicted_time[user].get(domain_id, 0)
      vector1.append(reference_time)
      vector2.append(predicted_time)
    #if len(vector1) == 0:
    #  continue
    output.append(scipy.stats.pearsonr(vector1, vector2)[0]**2)
  return numpy.mean(output), numpy.std(output)

In [55]:
print compute_pearsonrsquared_mean_all_users(ref_domainalgorithm_on_refactive, ref_domainalgorithm_on_refactive)
print compute_pearsonrsquared_mean_all_users(ref_domainalgorithm_on_refactive, our_domainalgorithm_on_ouractive)
print compute_pearsonrsquared_mean_all_users(ref_domainalgorithm_on_refactive, baseline_domainalgorithm_on_baselineactive)


(1.0, 0.0)
(0.92016494015988848, 0.12255978348233418)
(0.9138236807003588, 0.12298661247600788)


In [35]:
print compute_pearsonr_unweighted_mean_all_users(ref_domainalgorithm_on_refactive, ref_domainalgorithm_on_refactive)
print compute_pearsonr_unweighted_mean_all_users(ref_domainalgorithm_on_refactive, our_domainalgorithm_on_ouractive)
print compute_pearsonr_unweighted_mean_all_users(ref_domainalgorithm_on_refactive, baseline_domainalgorithm_on_baselineactive)
#print compute_pearsonr_unweighted_mean_all_users(ref_domainalgorithm_on_refactive, baseline2_domainalgorithm_on_baseline2active)


1.0
0.95649883722
0.953141800098


In [36]:
print compute_pearsonr_unweighted_mean_all_users(ref_domainalgorithm_on_refactive, ref_domainalgorithm_on_refactive)
print compute_pearsonr_unweighted_mean_all_users(ref_domainalgorithm_on_refactive, our_domainalgorithm_on_ouractive)
print compute_pearsonr_unweighted_mean_all_users(ref_domainalgorithm_on_refactive, baseline_domainalgorithm_on_baselineactive)


1.0
0.95649883722
0.953141800098


In [37]:
print compute_pearsonr_unweighted_over_all_users(ref_domainalgorithm_on_refactive, ref_domainalgorithm_on_refactive)
print compute_pearsonr_unweighted_over_all_users(ref_domainalgorithm_on_refactive, our_domainalgorithm_on_ouractive)
print compute_pearsonr_unweighted_over_all_users(ref_domainalgorithm_on_refactive, baseline_domainalgorithm_on_baselineactive)


(1.0, 0.0)
(0.94878807005381693, 0.0)
(0.93350928467882033, 0.0)


In [38]:
print compute_pearsonr_unweighted_over_all_users(ref_domainalgorithm_on_refactive, ref_domainalgorithm_on_refactive)
print compute_pearsonr_unweighted_over_all_users(ref_domainalgorithm_on_refactive, our_domainalgorithm_on_refactive)
print compute_pearsonr_unweighted_over_all_users(ref_domainalgorithm_on_refactive, baseline_domainalgorithm_on_refactive)


(1.0, 0.0)
(0.95739290779737296, 0.0)
(0.94364953894957604, 0.0)


In [39]:
def compute_pearsonr_squared_unweighted_over_all_users(a, b):
  return compute_pearsonr_unweighted_over_all_users(a, b)[0]**2

In [40]:
print compute_pearsonr_squared_unweighted_over_all_users(ref_domainalgorithm_on_refactive, ref_domainalgorithm_on_refactive)
print compute_pearsonr_squared_unweighted_over_all_users(ref_domainalgorithm_on_refactive, our_domainalgorithm_on_ouractive)
print compute_pearsonr_squared_unweighted_over_all_users(ref_domainalgorithm_on_refactive, baseline_domainalgorithm_on_baselineactive)


1.0
0.900198801876
0.871439584582


In [41]:
print compute_pearsonr_squared_unweighted_over_all_users(ref_domainalgorithm_on_refactive, ref_domainalgorithm_on_refactive)
print compute_pearsonr_squared_unweighted_over_all_users(ref_domainalgorithm_on_refactive, our_domainalgorithm_on_refactive)
print compute_pearsonr_squared_unweighted_over_all_users(ref_domainalgorithm_on_refactive, baseline_domainalgorithm_on_refactive)




1.0
0.916601179901
0.89047445236


In [48]:
top_20_domains = top_n_domains_by_visits(20)
top_20_domain_ids = [domain_to_id(x) for x in top_20_domains]
domain_id_to_idxnum = {}
for i,x in enumerate(top_20_domain_ids):
  domain_id_to_idxnum[x] = i+1
#twentyone_colors = ["#c62514","#4bcc4d","#902b16","#c95486","#a1c1ba","#7e51d0","#994e49","#564460","#d365b0","#8b21e9","#8560f","#3f3aff","#4fa549","#684bf7","#68234a","#6dbb4e","#650000","#8430a2","#86909","#777cf8","#4198c3"]
twentyone_colors = ["rgb(75,95,78)","rgb(253,220,172)","rgb(187,15,134)","rgb(165,113,175)","rgb(143,34,54)","rgb(202,27,32)","rgb(130,243,171)","rgb(113,214,236)","rgb(235,59,126)","rgb(78,13,233)","rgb(112,13,43)","rgb(46,250,139)","rgb(210,209,59)","rgb(98,137,155)","rgb(123,93,103)","rgb(63,201,189)","rgb(1,75,21)","rgb(69,168,117)","rgb(0,87,43)","rgb(66,130,136)","rgb(50,155,189)"]

def plot_domain_reconstructions(user_to_domain_to_reference_time, user_to_domain_to_predicted_time):
  vector1 = []
  vector2 = []
  point_labels = []
  colors = []
  for user,domain_to_reference_time in user_to_domain_to_reference_time.viewitems():
    for domain_id,reference_time in domain_to_reference_time.viewitems():
      predicted_time = user_to_domain_to_predicted_time[user].get(domain_id, 0)
      vector1.append(reference_time)
      vector2.append(predicted_time)
      domain_id_num = int(domain_id)
      domain = id_to_domain(domain_id_num)
      point_labels.append(domain)
      colors.append(domain_id_to_idxnum.get(domain_id_num, 0))
  # Create a trace
  trace = go.Scatter(
    x = [x/3600.0 for x in vector1],
    y = [x/3600.0 for x in vector2],
    #z = colors,
    mode = 'markers',
    text = point_labels,
    marker=dict(
      #size=5,
      #color=[i/20.0 for i in colors],
      #colorscale=list(enumerate(twentyone_colors)),
      #cmin=0,
      #cmax=20,
      #colorscale=[[i/20.0, x] for i,x in enumerate(twentyone_colors)],
    )
  )
  layout = go.Layout(
    autosize=False,
    width=500,
    height=500,
    margin=go.Margin(
        l=50,
        r=50,
        b=100,
        t=100,
        pad=4
    ),
    xaxis=dict(
      title='Actual time user spent on domain (hours)',
      range=[0,70],
    ),
    yaxis=dict(
      title='Estimated time user spent on domain (hours)',
      range=[0,70],
    ),
  )
  
  py.iplot(go.Figure(data=[trace], layout=layout))

In [49]:
plot_domain_reconstructions(ref_domainalgorithm_on_refactive, our_domainalgorithm_on_ouractive)