In [63]:
import itertools
import math
import random
import numpy as np
import copy

In [2]:
nr_documents = 5
relevances_cats = ['N', 'R', 'HR']
relevances_vals = [0,1,5]

STEP 1

In [3]:
def split_list(a_list):
    half = len(a_list)/2
    return [a_list[:half], a_list[half:]]

In [4]:
def get_combinations_list(relevances):  
    combinations = []
    for i in itertools.product(relevances, repeat = nr_documents*2):
        i = list(i)
        i = split_list(i)
        combinations.append(i)
    return combinations

STEP 2

Precision

Calculates precision at rank k with a list with 3 relevance levels (R, HR and N). 'Precision at rank k' though, asks for a binary classication problem, so HR and R is counted as relevant (1) and N as non-relevant(0).

k must be 5 or smaller

In [10]:
def precision_at(k, combinations_cats):
    countTP = 0 # amount of true positives
    countFP = 0 # amount of false positives
    precisionList =[]
    for j in combinations_cats:
        kcounter = 0
        for m in range(0, k):
            l = j[0][m]
            if l == 'R': countTP+=1
            elif l == 'HR': countTP+=1
            else : countFP+=1
            precisionP = countTP/float(countTP+countFP)
        countTP=0
        countFP=0
        for m in range(0,k):
            l = j[1][m]
            if l == 'R': countTP+=1
            elif l == 'HR': countTP+=1
            else : countFP+=1
            precisionE = countTP / float(countTP + countFP)
        precisions = [precisionP, precisionE]
        precisionList.append(precisions)
    return precisionList

DCG

k must be 5 or smaller

In [11]:
def dcg_at(k, combinations_vals):
    EP_results = []
    for relevances in combinations_vals:
        rank_dcgs = []
        for algorithm in relevances:
            dcg = 0
            for r in range(1,k+1):
                dcg += ((2**algorithm[r-1])-1)/(math.log(1+r,2))
            rank_dcgs.append(dcg)
        EP_results.append(rank_dcgs)
    return EP_results

ERR


In [12]:
def R(seq, g): # mapping from relevance grades g to probability of relevance           
    return ((2**seq[g-1])-1)/float((2**max(relevances_vals)))

In [13]:
def P(seq, r): # probability that user stops at position r
    P = 1
    for i in range(1,(r-1)+1):
        P *= (1-R(seq, i)) * R(seq, r)
    return P 

In [14]:
def err(combinations_vals): # a cascade based metric with x(r) = 1/r
    ERR_results = []
    for relevances in combinations_vals:
        rank_err = []
        for algorithm in relevances:
            err = 0
            for r in range(1, len(algorithm)+1):
                err += (1/float(r))*P(algorithm, r)
            rank_err.append(err)
        ERR_results.append(rank_err)
    return ERR_results 

STEP 3

In [15]:
def calculate_differences(results):
    difference_measures=[]
    for algo in results:
        a = algo[0]
        b = algo[1]
        difference = b - a
        if (difference > 0 ): difference_measures.append(difference)
    return difference_measures

In [16]:
combinations_cats = get_combinations_list(relevances_cats)
combinations_vals = get_combinations_list(relevances_vals)

precision = precision_at(5, combinations_cats)
dcg = dcg_at(5, combinations_vals)
err = err(combinations_vals)

difference_measures = calculate_differences(precision)

STEP 4

In [17]:
def flip_coin():
    random.seed()
    return random.getrandbits(1)

In [18]:
# generates a list with 10 random bits. 1 represents a click.

def generate_random_clicks():
    clicks = []
    for i in range(nr_documents*2):
        clicks.append(flip_coin())
    return clicks

Team-draft interleaving

In [52]:
def team_draft_interleaving(rankings, clicks):
    print 'team draft interleaving'
    print 'clicks: ', clicks
    random.seed()
    credits = [0,0]
    new_ranking = []
    for i in range(nr_documents):
        winner = flip_coin()
        print 'winner: ', winner
        print 'rankings: ', rankings
        print 'i: ', i
        new_ranking.append(rankings[winner][i])
        print 'finihsed'
        if clicks[len(new_ranking)-1] == 1:
            print 'winning click'
            credits[winner] += 1
            
        new_ranking.append(rankings[1-winner][i])
        if clicks[len(new_ranking)-1] == 1:
            print 'winning click'
            credits[1-winner] += 1
    
    print 'credits: ', credits
    print 'interleaved ranking team draft: ', new_ranking
    return credits

Probabilistic interleaving

In [46]:
def init_softmaxes(rankings, tau):
    denominator = 0
    for i in range(1, (len(rankings[0])+1)):
        denominator += 1 / float(i ** tau)
    index = 1
    softmax1 = []
    softmax2 = []
    for ranking in rankings[0]:
        prob = (1/float(index**tau))/float(denominator)
        softmax1.append(prob)
        softmax2.append(prob)
        index += 1
    return softmax1, softmax2

In [47]:
def recalculate_softmax(softmax, pick):
    softmax.remove(softmax[pick])
    softmax[:] = [x/float(sum(softmax)) for x in softmax]
    return softmax

In [61]:
def probabilistic_interleaving(rankings1, tau, clicks):
    probrankings = copy.deepcopy(rankings1)
    credits = [0,0]
    conc_list = []
    s1, s2 = init_softmaxes(probrankings, tau)
    
    while probrankings[0] or probrankings[1]:
        
        winner = flip_coin() # keep flipping until all lists are empty
        
        if winner == 0 and probrankings[0]:
            
            pick = np.random.choice(len(probrankings[0]), 1, p=s1)
            conc_list.append(probrankings[winner][pick]) # add pick to the concatenated list
            probrankings[winner].remove(probrankings[winner][pick]) # remove the pick from the document list
#             print 'this ones: ', rankings
            s1 = recalculate_softmax(s1, pick) # recalculate the softmax of that list to normalise
            if clicks[len(conc_list)-1] == 1: credits[0] += 1
                
        elif winner == 1 and probrankings[1]:
            
            pick = np.random.choice(len(probrankings[1]), 1, p=s2)
            conc_list.append(probrankings[winner][pick]) # add pick to the concatenated list
            probrankings[winner].remove(probrankings[winner][pick]) # remove the pick from the document list
            s2 = recalculate_softmax(s2, pick) # recalculate the softmax of that list to normalise
            if (clicks[len(conc_list)-1] == 1): credits[1] += 1
                
    print 'interleaved ranking probabilistic: ', conc_list
    return credits

In [23]:
# probrankings = [['N','R','HR','R','HR'], ['R','R','HR','N','N']]
# clicks1 = [0,1,0,1,1,1,0,0,1,1]

# random.seed()
# rankings = [['N','R','HR','R','HR'], ['R','R','HR','N','N']]
# # utility = rankings1
# clicks = generate_random_clicks()

# ### Team-draft interleaved ranking
# team_draft_interleaved_ranking, credits_team = team_draft_interleaving(rankings, clicks)
# print 'Ranking P: ', rankings[0]
# print 'Ranking E: ', rankings[1]
# print 'Team-Draft Interleaved ranking: ', team_draft_interleaved_ranking
# print 'P credits: ', credits_team[0]
# print 'E credits: ', credits_team[1]
# team_winning_algo = credits_team.index(max(credits_team))

# ## Probabilistic interleaved ranking
# prob_interleaved_ranking, credits_prob = probabilistic_interleaving(probrankings, 3, clicks1)
# print 'Ranking P: ', rankings[0]
# print 'Ranking E: ', rankings[1]
# print 'Probabilistic Interleaved ranking: ', prob_interleaved_ranking
# print 'P credits: ', credits_prob[0]
# print 'E credits: ', credits_prob[1]
# prob_winning_algo = credits_prob.index(max(credits_prob))

STEP 5

In [24]:
def process_data(filename):
    data = []
    with open(filename) as f:  
        f = f.readlines()
        for line in f:
            data.append(line.split())
    return data

data = process_data("training_data.txt")

In [25]:
def rand():
    return random.random()

In [27]:
# required method (c)
def is_clicked(P):
    result = 1 if rand() < P else 0
    return result

RCM

In [28]:
# required method (b)
def predict_click_probabilities_RCM(nr_clicks, nr_docs):
    return nr_clicks / float(nr_docs)

In [29]:
# required method (a)
def get_parameter_RCM():
    nr_clicks = 0
    nr_docs = 0
    for row in data:
        if 'C' in row:
            nr_clicks += 1
        nr_docs += len(row) - 5 # 5 because first 5 elements is other information
    nr_docs -= nr_clicks
    return nr_clicks, nr_docs

In [30]:
nr_clicks, nr_docs = get_parameter_RCM()
click_probability_RCM = predict_click_probabilities_RCM(nr_clicks, nr_docs)
print 'final click_probability: ', round(click_probability_RCM,2)

final click_probability:  0.18


In [31]:
# printing the simulated clicks
def get_clicks_RCM(nr_docs):
    sim_clicks_RCM = []   
    
    
    for document in range(nr_docs):
        sim_clicks_RCM.append(is_clicked(click_probability_RCM))
    print 'simulated clicks RCM: ', sim_clicks_RCM
    return sim_clicks_RCM



Position-based model PBM

In [32]:
def predict_click_probabilities_PBM(a, g):
    click_probabilities = []
    for i in range(len(a)):
        P = a[i] * g[i]
        click_probabilities.append(P)
    return click_probabilities
    

In [33]:
def new_gammas(g, a, queries):
    all_query_gammas = [0] * 50 
    for values in queries.values(): #[[docid, boolean],[...,...]] 
        query_gammas = g[:] # [0.5, 0.5, ...]
        index = 0
        for doc in values: # [docid, boolean]  
            clicked = doc[1] # boolean
            gamma = clicked + ((1-clicked)*(((1-a[index])*query_gammas[index])/(1-query_gammas[index]*a[index])))
            query_gammas[index] = gamma
            index +=1   
        for i in range(len(query_gammas)):
            all_query_gammas[i] += query_gammas[i]
    
    for i in range(len(all_query_gammas)):
        all_query_gammas[i] /= len(queries)
        
    return all_query_gammas

In [34]:
def get_click_results(session):
    query_results = {}
    last_query = 0
    for row in session:
        # query action
        if 'Q' in row: 
            last_query = row[3]
            retrieved_docs = row[5:]
            for docid in range(len((retrieved_docs))):
                retrieved_docs[docid] = [retrieved_docs[docid], 0]
            if row[3] not in query_results:     
                query_results[row[3]] = retrieved_docs
            else:
                for i in range(len(retrieved_docs)):
                    exists = False
                    for document in query_results[row[3]]:
                        if retrieved_docs[i][0] in document:
                            exists = True
                            break
                    if not exists:
                        query_results[row[3]] += [retrieved_docs[i]]
                        
        # click action
        else:
            found = False
            while not found:
                # check if its in the last query (most likely the correct query page)
                for values in query_results[last_query]:
                    if row[3] == values[0]:
                        values[1] = 1
                        found = True
                        
                # otherwise, check in other query pages
                for queries in query_results.values():
                    for values in queries:
                        
                        if row[3] == values[0]:
                            values[1] = 1
                            found = True
                            
    return query_results

In [35]:
# required method (a)
def get_parameters_PBM(data_slice):
    alphas = [0.90] * 50 #list(np.arange(0.9,0.8,-0.001))
    gammas = [0.5] * 50
    learned_gammas = [0] * 50
    
    # get examination probabilities
    sessions = set(map(lambda x:x[0], data_slice))
    sessions_data = [[y for y in data_slice if y[0]==x] for x in sessions]
    session_nr = 1
    for session in sessions_data:
        session_nr += 1
        query_results = get_click_results(session)
        session_gammas = new_gammas(gammas, alphas, query_results)
        for i in range(len(session_gammas)):
            learned_gammas[i] += session_gammas[i]
    for i in range(len(learned_gammas)):
        learned_gammas[i] /= len(sessions_data)
        learned_gammas[i] = learned_gammas[i]
    return alphas[:10], learned_gammas[:10]
    
            

In [36]:
data_slice = data[0:20000]
alphas, gammas = get_parameters_PBM(data_slice)
click_probabilities_PBM = predict_click_probabilities_PBM(alphas[:10], gammas[:10])

print '\nfinal click probabilities PBM: \n'#, learned_gammas
for i in click_probabilities_PBM:
    print round(i,2),


final click probabilities PBM: 

0.56 0.26 0.21 0.18 0.16 0.15 0.14 0.13 0.13 0.13


In [37]:
# printing the simulated clicks
def get_clicks_PBM(nr_docs):
    sim_clicks_PBM = []      
    for document_index in range(nr_docs):
        sim_clicks_PBM.append(is_clicked(click_probabilities_PBM[document_index]))
    print 'simulated clicks PBM: ', sim_clicks_PBM
    return sim_clicks_PBM



STEP 6 

Experiments.


This method runs N experiments with a list of categorical rankings E and P with categories (HR/R/N).
It takes one of the rankings and runs both a team_draft and probabilistic interleave with different random click values, but with the same ranking.


We should think of what it means to use different ranking combinations or the same click values for each simulation.


It prints out the E proportion for both the team-draft and the probabilistic interleave.


We should make a measure for which we can compare results between on- and offline evaluation, between the 2 interleaves and the different click models.

In [38]:
# # printing the simulated clicks
# def get_simulated_clicks():
#     sim_clicks_RCM = []
#     nr_clicks, nr_docs = get_parameter_RCM()
#     click_probability = predict_click_probabilities_RCM(nr_clicks, nr_docs)
#     for i in range(10):
#         sim_clicks_RCM.append(is_clicked(click_probability))
#     return sim_clicks_RCM
# #print 'simulated clicks: ', sim_clicks_RCM

In [None]:
def interleaving_experiment(ranking_pairs, N):
    wins_P_E = [0,0]
    ties = 0
    
    for pair in ranking_pairs:
        print 'pair: ', pair
        for i in range(N):
            print 'i: ', i
            PBM_clicks = get_clicks_PBM(10)
            RCM_clicks = get_clicks_RCM(10)
            click_models = [PBM_clicks, RCM_clicks]
            for clicks in click_models:
                interleaf_credits = [team_draft_interleaving(pair, clicks), probabilistic_interleaving(pair, 3, clicks)]
                print 'interlieav fcerctidt ', interleaf_credits
                for credits in interleaf_credits:
                    if credits[0] != credits[1]: # if not a tie
                        wins_P_E[credits.index(max(credits))] += 1
                    else:
                        ties += 1
#     total_games = ties + sum(wins_P_E)
    print 'win counts: ', wins_P_E
    print 'ties: ', ties
    print 'total_games: ', total_games
     
    #     for clicks in click_models:    
#         for i in range(N):
            
#             clicksTeam = get_simulated_clicks() # random clicks
#             clicksProb = get_simulated_clicks() # random clicks
#             team_draft_interleaved_ranking, credits_team = team_draft_interleaving(combinations_cats[i], clicksTeam)
#             prob_interleaved_ranking, credits_prob = probabilistic_interleaving(combinations_cats[i], 3, clicksProb)
#             EpropTeam.append(credits_team[1]/float(credits_team[0]+credits_team[1]+0.000001))
#             EpropProb.append(credits_team[1]/float(credits_prob[0]+credits_prob[1]+0.000001))
#         print 'E proportion Team-draft Interleave: ', EpropTeam
#         print 'E proportion Probabilistic Interleave: ', EpropProb
    
combinations_cats = get_combinations_list(relevances_cats)
interleaving_experiment(combinations_cats, 1)

pair:  [['N', 'N', 'N', 'N', 'N'], ['N', 'N', 'N', 'N', 'N']]
i:  0
simulated clicks PBM:  [0, 1, 1, 0, 0, 0, 0, 0, 0, 0]
simulated clicks RCM:  [1, 0, 0, 1, 1, 1, 0, 0, 0, 0]
team draft interleaving
clicks:  [0, 1, 1, 0, 0, 0, 0, 0, 0, 0]
winner:  1
rankings:  [['N', 'N', 'N', 'N', 'N'], ['N', 'N', 'N', 'N', 'N']]
i:  0
finihsed
winning click
winner:  0
rankings:  [['N', 'N', 'N', 'N', 'N'], ['N', 'N', 'N', 'N', 'N']]
i:  1
finihsed
winning click
winner:  0
rankings:  [['N', 'N', 'N', 'N', 'N'], ['N', 'N', 'N', 'N', 'N']]
i:  2
finihsed
winner:  0
rankings:  [['N', 'N', 'N', 'N', 'N'], ['N', 'N', 'N', 'N', 'N']]
i:  3
finihsed
winner:  1
rankings:  [['N', 'N', 'N', 'N', 'N'], ['N', 'N', 'N', 'N', 'N']]
i:  4
finihsed
credits:  [2, 0]
interleaved ranking team draft:  ['N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N']
interleaved ranking probabilistic:  ['N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N']
interlieav fcerctidt  [[2, 0], [2, 0]]
team draft interleaving
clicks:  [1, 0, 0, 1,

  from ipykernel import kernelapp as app


interleaved ranking probabilistic:  ['N', 'N', 'N', 'N', 'N', 'N', 'R', 'N', 'HR', 'N']
interlieav fcerctidt  [[3, 0], [1, 2]]
pair:  [['N', 'N', 'N', 'N', 'N'], ['N', 'N', 'N', 'HR', 'N']]
i:  0
simulated clicks PBM:  [1, 0, 0, 0, 0, 1, 0, 0, 0, 0]
simulated clicks RCM:  [0, 0, 1, 0, 1, 0, 0, 0, 0, 0]
team draft interleaving
clicks:  [1, 0, 0, 0, 0, 1, 0, 0, 0, 0]
winner:  1
rankings:  [['N', 'N', 'N', 'N', 'N'], ['N', 'N', 'N', 'HR', 'N']]
i:  0
finihsed
winning click
winner:  1
rankings:  [['N', 'N', 'N', 'N', 'N'], ['N', 'N', 'N', 'HR', 'N']]
i:  1
finihsed
winner:  0
rankings:  [['N', 'N', 'N', 'N', 'N'], ['N', 'N', 'N', 'HR', 'N']]
i:  2
finihsed
winning click
winner:  1
rankings:  [['N', 'N', 'N', 'N', 'N'], ['N', 'N', 'N', 'HR', 'N']]
i:  3
finihsed
winner:  1
rankings:  [['N', 'N', 'N', 'N', 'N'], ['N', 'N', 'N', 'HR', 'N']]
i:  4
finihsed
credits:  [0, 2]
interleaved ranking team draft:  ['N', 'N', 'N', 'N', 'N', 'N', 'HR', 'N', 'N', 'N']
interleaved ranking probabilistic:  [