In [13]:
import csv
import lzstring
from collections import namedtuple, Counter
import json
from memoize import memoize
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
import sys
import json

In [14]:
# Raise the csv module's per-field size cap to sys.maxsize
# (presumably because the compressed history column is very large — TODO confirm).
csv.field_size_limit(sys.maxsize)

# One record per domain; the field order mirrors the decoded array layout:
# [num unique urls, num unique urls typed, total visits, total typed, first visit time, last visit time]
domaininfo = namedtuple('domaininfo', ['num_unique_urls', 'num_unique_urls_typed', 'total_visits', 'total_typed', 'first_visit_time', 'last_visit_time'])

decompressFromEncodedURIComponent = lzstring.LZString().decompressFromEncodedURIComponent

#filepath = 'difficultyselectionexp_may31_11am.csv'
filepath = 'data/JUL17.csv'
# Read every row while the file is open so the handle is closed deterministically.
# `reader` stays iterable (now a list of row dicts), so code that loops over it is unaffected.
# newline='' lets the csv module do its own newline handling, as its documentation recommends.
with open(filepath, newline='') as csv_file:
  reader = list(csv.DictReader(csv_file))

def extract_domain_visit_info(domain_visit_info_compressed):
  """Decode a compressed per-domain history blob into a dict of `domaininfo` records.

  The argument is decompressed with `decompressFromEncodedURIComponent`, parsed as
  JSON, and each value is unpacked positionally into a `domaininfo` namedtuple
  stored under the same key.
  """
  decoded_json = decompressFromEncodedURIComponent(domain_visit_info_compressed)
  per_domain = json.loads(decoded_json)
  return {domain: domaininfo(*fields) for domain, fields in per_domain.items()}

# Keep only rows that carry a recognised difficulty label and a non-empty
# compressed history; each kept row gains a decoded 'domain_visit_info' entry.
valid_difficulties = ('nothing', 'easy', 'medium', 'hard')
alldata = []

for alldata_item in reader:
  if alldata_item['selected_difficulty'] not in valid_difficulties:
    continue
  compressed_history = alldata_item['domain_visit_info_compressed']
  has_history = compressed_history is not None and len(compressed_history) > 0
  if has_history:
    alldata_item['domain_visit_info'] = extract_domain_visit_info(compressed_history)
    alldata.append(alldata_item)


In [15]:
#np.random.shuffle(alldata)
# The first 80% of rows (in their current order) form the training set,
# the remainder the test set; the size of each is printed.
split_index = round(len(alldata)*0.8)
training_data, test_data = alldata[:split_index], alldata[split_index:]
for subset in (training_data, test_data):
  print(len(subset))

605
151


In [16]:
def extract_labels_alldata(data):
  """Return the 'selected_difficulty' value of every row in `data` as a NumPy array."""
  labels = []
  for row in data:
    labels.append(row['selected_difficulty'])
  return np.array(labels)

@memoize
def get_most_common_label():
  """Return the 'selected_difficulty' label that occurs most often in `training_data`.

  Raises IndexError when `training_data` is empty.
  """
  label_counts = Counter(line['selected_difficulty'] for line in training_data)
  return label_counts.most_common(1)[0][0]

@memoize
def get_most_visited_domains():
  """Return up to 100 domains with the highest summed `total_visits` over `training_data`.

  Domains are ordered from most to least visited.
  """
  visit_totals = Counter()
  for line in training_data:
    for domain, info in line['domain_visit_info'].items():
      visit_totals[domain] += info.total_visits
  return [domain for domain, _ in visit_totals.most_common(100)]

cnt = 0
@memoize
def get_most_common_domains():
  """Return up to 100 domains that appear in the most `training_data` rows.

  Each row contributes one count per domain present in its 'domain_visit_info',
  regardless of how many visits that domain received.
  """
  row_counts = Counter()
  for line in training_data:
    row_counts.update(line['domain_visit_info'].keys())
  return [domain for domain, _ in row_counts.most_common(100)]

def get_all_domains():
  """Return every domain seen in `training_data`, ordered by how many rows contain it.

  The most widespread domain comes first. The result is recomputed on every call.
  """
  row_counts = Counter(
    domain
    for line in training_data
    for domain in line['domain_visit_info']
  )
  return [domain for domain, _ in row_counts.most_common()]



def get_num_visits_for_domain(domain_visit_info, domain):
  """Return `total_visits` recorded for `domain`, or 0 when there is no record for it."""
  record = domain_visit_info.get(domain)
  return 0 if record is None else record.total_visits

def get_productivity():
    """Load 'domain_to_productivity.json' from the working directory and return the parsed JSON."""
    with open ('domain_to_productivity.json') as productivity_file:
        return json.load(productivity_file)

# Loaded once at cell execution; the feature extraction code reads this module-level mapping.
domain_to_productivity = get_productivity()

def extract_features_for_user(domain_visit_info, domains=None):
  """Build a 5-element productivity feature vector for one user.

  Each domain in `domains` that has an entry in `domain_to_productivity` adds the
  user's visit count for it to slot `rating + 2`. The slot index implies ratings
  are integers in -2..2 — TODO confirm against the JSON file.
  When the total is at least 1 the vector is normalised to sum to 1.

  Args:
    domain_visit_info: dict mapping domain -> record exposing `total_visits`.
    domains: optional iterable of domains to consider. Defaults to
      `get_all_domains()`; pass a precomputed list to avoid rebuilding it per user.

  Returns:
    A float NumPy array of length 5 (all zeros when no rated visits were found).
  """
  if domains is None:
    domains = get_all_domains()
  visits_per_rating = [0, 0, 0, 0, 0]
  for domain in domains:
    if domain in domain_to_productivity:
      visits_per_rating[domain_to_productivity[domain] + 2] += get_num_visits_for_domain(domain_visit_info, domain)

  # Always hand back an ndarray so callers get one consistent type.
  features = np.array(visits_per_rating, dtype=float)
  total_visits = features.sum()
  if total_visits >= 1:
    features = features / total_visits
  return features

def extract_features_alldata(data):
  """Return a 2-D array holding one `extract_features_for_user` vector per row of `data`."""
  # The domain list depends only on the training data, so compute it once for the batch.
  domains = get_all_domains()
  return np.array([extract_features_for_user(line['domain_visit_info'], domains) for line in data])


In [17]:
def get_percent_correct(predicted_labels, actual_labels):
  """Return the fraction of positions where the predicted label equals the actual one.

  Args:
    predicted_labels: sequence of predicted labels.
    actual_labels: sequence of true labels, same length as `predicted_labels`.

  Returns:
    A float in [0, 1].

  Raises:
    ValueError: if the two sequences differ in length.
    ZeroDivisionError: if both sequences are empty.
  """
  if len(predicted_labels) != len(actual_labels):
    # Raising a bare string is itself a TypeError in Python 3; use a real exception.
    raise ValueError('need predicted and actual labels to have same lengths')
  total = len(actual_labels)
  correct = sum(1 for p, a in zip(predicted_labels, actual_labels) if p == a)
  return correct / total

def test_baseline_classifier():
  """Print the test-set accuracy of always predicting the most common training label."""
  actual = extract_labels_alldata(test_data)
  predictions = [get_most_common_label()] * len(test_data)
  print('baseline classifier accuracy:', get_percent_correct(predictions, actual))

def test_classifier(clf,features_test, actual, str=None):
  """Return the fraction of `features_test` rows that `clf.predict` labels as in `actual`.

  `str` is accepted for signature compatibility and is not used.
  """
  return get_percent_correct(clf.predict(features_test), actual)

def training_error_classifier(clf, str=None):
  """Print the accuracy of `clf` on the training data, rounded to two decimals.

  Args:
    clf: fitted classifier exposing `predict(features)`.
    str: optional name printed before 'classifier training accuracy:'. When
      omitted, no name is printed (the default used to crash on `None + '...'`).
  """
  actual = extract_labels_alldata(training_data)
  features_train = extract_features_alldata(training_data)
  predictions = clf.predict(features_train)
  percent_correct = get_percent_correct(predictions, actual)
  prefix = '' if str is None else str + ' '
  print(prefix + 'classifier training accuracy:', round(percent_correct, 2))

def to_int_categorical(dt):
  """Map difficulty labels to integer codes and return them as a NumPy array.

  'nothing' -> 0, 'easy' -> 1, 'medium' -> 2; every other value
  (including 'hard') -> 3.
  """
  explicit_codes = {'nothing': 0, 'easy': 1, 'medium': 2}
  return np.array([explicit_codes.get(item, 3) for item in dt])
    
# Print the accuracy obtained on the test rows by always predicting the most common training label.
test_baseline_classifier()

baseline classifier accuracy: 0.5231788079470199


In [20]:
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.optimizers import SGD
from keras.utils import to_categorical
from keras import regularizers

def _one_hot_difficulty(data):
  """Return the difficulty labels of `data` one-hot encoded over 4 classes."""
  return to_categorical(to_int_categorical(extract_labels_alldata(data)), num_classes=4)

# Held-out rows.
labels_test = _one_hot_difficulty(test_data)
features_test = extract_features_alldata(test_data)

# Training rows.
labels_train = _one_hot_difficulty(training_data)
features_train = extract_features_alldata(training_data)

In [21]:
# Relate each training user's visit-weighted mean productivity rating to the
# difficulty they selected.
#   stats[label]  -> number of users with that label and their mean score
#   users[bucket] -> per-score-bucket share (in percent) of each label
stats = {'nothing': {'count': 0, 'avg':0}, 'easy' : {'avg': 0, 'count': 0}, 'medium' : {'avg': 0, 'count': 0},
         'hard' : {'avg': 0, 'count': 0}}
users = [{'nothing': 0, 'easy': 0, 'medium': 0, 'hard': 0, 'count': 0}, 
       {'nothing': 0, 'easy': 0, 'medium': 0, 'hard': 0, 'count': 0}, 
       {'nothing': 0, 'easy': 0, 'medium': 0, 'hard': 0, 'count': 0}, 
       {'nothing': 0, 'easy': 0, 'medium': 0, 'hard': 0, 'count': 0}]

sm1 = []
sm2 = []

# The domain list and the rated/unrated counts depend only on the training set
# as a whole, so they are computed once instead of once per user.
domains = get_all_domains()
rated_domains = [x for x in domains if x in domain_to_productivity]
cnt1 = len(rated_domains)           # domains that have a productivity rating
cnt2 = len(domains) - cnt1          # domains without a rating

for line in training_data:
    domain_visit_info = line['domain_visit_info']
    cnt = 0   # total visits to rated domains
    sm = 0    # sum of rating * visits, turned into a mean below
    for x in rated_domains:
        visits = get_num_visits_for_domain(domain_visit_info, x)
        sm += domain_to_productivity[x] * visits
        cnt += visits
    sm1.append(cnt1)
    sm2.append(cnt2)

    if cnt > 0:
        sm /= cnt
    #print(sm)
    label = line['selected_difficulty']
    stats[label]['count'] += 1
    stats[label]['avg'] += sm

    # Bucket by mean score: [1, ...) -> 3, [0, 1) -> 2, [-1, 0) -> 1, below -1 -> 0.
    if sm >= 1:
        bucket = 3
    elif sm >= 0:
        bucket = 2
    elif sm >= -1:
        bucket = 1
    else:
        bucket = 0
    users[bucket][label] += 1
    users[bucket]['count'] += 1

print(sm1)
print(sm2)

#hard medium easy nothing
# Turn the accumulated sums into means. Each label is divided by its OWN count
# ('nothing' used to be divided by the 'easy' count), and labels with no rows
# are skipped rather than dividing by zero.
for label in ('hard', 'medium', 'easy', 'nothing'):
    if stats[label]['count'] > 0:
        stats[label]['avg'] /= stats[label]['count']

# Convert per-bucket label counts to percentages; empty buckets stay at zero.
for bucket_counts in users:
    if bucket_counts['count'] > 0:
        for label in ('nothing', 'easy', 'medium', 'hard'):
            bucket_counts[label] /= bucket_counts['count'] / 100

print(stats)
print ("based on productivity: ")
print ("peope who rank between -2 and -1 (least productive): ")
print (users[0])

print ("people who rank between -1 and 0:")
print (users[1])

print ("people who rank between 0 and 1:")
print (users[2])
print ("people who rank between 1 and 2:")
print (users[3])
    

[18575, 18575, 18575, 18575, 18575, 18575, 18575, 18575, 18575, 18575, 18575, 18575, 18575, 18575, 18575, 18575, 18575, 18575, 18575, 18575, 18575, 18575, 18575, 18575, 18575, 18575, 18575, 18575, 18575, 18575, 18575, 18575, 18575, 18575, 18575, 18575, 18575, 18575, 18575, 18575, 18575, 18575, 18575, 18575, 18575, 18575, 18575, 18575, 18575, 18575, 18575, 18575, 18575, 18575, 18575, 18575, 18575, 18575, 18575, 18575, 18575, 18575, 18575, 18575, 18575, 18575, 18575, 18575, 18575, 18575, 18575, 18575, 18575, 18575, 18575, 18575, 18575, 18575, 18575, 18575, 18575, 18575, 18575, 18575, 18575, 18575, 18575, 18575, 18575, 18575, 18575, 18575, 18575, 18575, 18575, 18575, 18575, 18575, 18575, 18575, 18575, 18575, 18575, 18575, 18575, 18575, 18575, 18575, 18575, 18575, 18575, 18575, 18575, 18575, 18575, 18575, 18575, 18575, 18575, 18575, 18575, 18575, 18575, 18575, 18575, 18575, 18575, 18575, 18575, 18575, 18575, 18575, 18575, 18575, 18575, 18575, 18575, 18575, 18575, 18575, 18575, 18575, 18575