In [1]:
import numpy as np
import cPickle
from collections import OrderedDict
import collections
from nltk.metrics import *
import operator
import os
from PST_engine import PSTInfer, PST
from scipy.stats import rv_discrete
"""
    we assume this ipython notebook resides in the following git-repo directory structure:
    ir2
    |---src
        |----preprocessing
                ipython notebook
        |----data
             | -- bg_session.ctx
                  tr_session.ctx
                  ...
        |----baseline (directory of Allesandro programs)
        

"""

DATA_PATH = '../data/'
# raw session files, query words, tab separated queries, space separated words, one line=one session
bg_session_filename = os.path.join(DATA_PATH, 'bg_session.ctx')
val_session_filename = os.path.join(DATA_PATH,'val_session.ctx')
test_session_filename = os.path.join(DATA_PATH,'test_session.ctx')
tr_session_filename = os.path.join(DATA_PATH,'tr_session.ctx')
# query frequency dict of the background data
bg_query_freq_file = os.path.join(DATA_PATH, 'bg_query_freq.pkl')
bg_query_dict_file = os.path.join(DATA_PATH, 'bg_query_dict.pkl')
# ADJ model filename of background data (generated with Allesandro programs)
bg_ADJ_model_filename = os.path.join(DATA_PATH, 'bg_session.ctx_ADJ.mdl')
# VMM model filename of background data (generated with Allesandro programs)
VMM_model_file = os.path.join(DATA_PATH, 'bg_session.ctx_VMM.mdl')
# after loading it once, we can save it to pickle and load pickle file next time...which is quicker
VMM_model_pickle = os.path.join(DATA_PATH, 'bg_pstreeVMM.pkl')
# candidate filename, used to store the dict that holds the sessions & candidate queries for test/val/train
tr_sess_candid_f = os.path.join(DATA_PATH, 'tr_suggest.pkl')
val_sess_candid_f = os.path.join(DATA_PATH, 'val_suggest.pkl')
test_sess_candid_f = os.path.join(DATA_PATH, 'test_suggest.pkl')

In [2]:
"""
Func to print the suggested query id's as strings using the id_to_query map
"""
def print_suggestion(suggestions):
    for suggest in suggestions:
        print id_to_query[suggest[0]]
        
        
"""
    Save a dictionary to file
"""
def save_pickle_dict(a_dict, output_file):
    f = open(output_file, 'wb')
    cPickle.dump(a_dict, f)
    print("Successfully saved dict to %s" % output_file)
    f.close()

"""
    make a inverted version of the query to id dict
""" 
def make_id_to_query_dict(q_to_id_dict):
    return {v: k for k, v in q_to_id_dict.iteritems()}

"""
Make a query frequency dictionary of the background data set
we need this dict for one of the features: 
    -- the frequency of an anchor query in the background data set
Input: session file with string queries
Output: dict with the query frequencies 
"""
def make_query_frequencies(session_file):
    global query_dict
    
    query_freq = {}
    total_freq = 0
    for num, session in enumerate(session_file):
        session = session.strip().split('\t')
        for query in session:
            query_freq[query] = query_freq.get(query, 0.) + 1.
            total_freq += 1
    
    # Determine the 100 most frequent queries. These will be used later for the
    # perturbation of a session context (exp 4.5)
    query_noisy = sorted(query_freq.items(), key=operator.itemgetter(1), reverse=True)[:100]
    # convert list of tuples [('google', 1908839), ('com', 982398)...] into list of query indexes
    # we will later sample from the query indices
    noisy_query_ids = np.array([query_dict[x[0]] for x in query_noisy])
    noisy_query_counts = np.array([x[1] for x in query_noisy])
    noisy_probs = noisy_query_counts * 1./np.sum(noisy_query_counts)
    noisy_query_dist = rv_discrete(name='noisy_query_prob', values=(noisy_query_ids, noisy_probs))
    
    print("Successfully made background frequency dictionary")
    return query_freq, noisy_query_dist

"""
    load the 2 dicts from the ADJ model that we generated with Allesandro programs
    we need these dicts to generate the candidate queries for a test/train/val files
"""
def load_ADJ_model_dicts(filename):
    print("Loading ADJ model from file %s" % filename)
    input_handle = open(filename, 'r')
    tuple_dict = cPickle.load(input_handle)
    query_to_id = cPickle.load(input_handle)
    print("Successfully loaded ADJ model dicts")
    print("\t %d entries in tuple dict" % len(tuple_dict))
    print("\t %d entries in query_to_id dict" % len(query_to_id))
    
    return tuple_dict, query_to_id

"""
make a new dict with key anchor query, as value we have a new dict with keys previous query and 
their value count 

dict[anchor_query] = { previous_query: count_value}

"""
def make_search_dict(tuple_dict, gen_v2=False):
    search_dict = collections.defaultdict(dict)
    # use the keys (tuples with two query id's) of the tuple dict to make a new dict 
    tuple_pairs = tuple_dict.keys()
    
    print("Start making search dict...")
    for _tuple in tuple_pairs:
        search_dict[_tuple[1]][_tuple[0]] = tuple_dict[_tuple] 
    
    # search dict for Allesandro way of generating candidates
    """
        JUST FOR TEST PURPOSES
        GENERATING THE SEARCH_DICt thAT ALLESANDRO USES IN PSINFER class
    """
    search_dict_v2 = collections.defaultdict(dict)
    if gen_v2:
        for key, freq in tuple_dict.items():
            search_dict_v2[key[:-1]][key[-1]] = freq
            
    print("Successfully made search dict")
    del tuple_pairs
    
    return search_dict, search_dict_v2

ADJ_tuple_dict, query_dict = load_ADJ_model_dicts(bg_ADJ_model_filename)
# returns background query frequency dict
# returns background noisy query distribution, contains 100 noisy query ID's you can sample from
# e.g bg_noisy_query_dist.rvs(size=10) samples 10 query ID's from the dist
bg_query_freq, bg_noisy_query_dist = make_query_frequencies(open(bg_session_filename, 'r'))
save_pickle_dict(bg_query_freq, bg_query_freq_file)
save_pickle_dict(query_dict, bg_query_dict_file)

# finally make queryID to query-words dict
id_to_query = make_id_to_query_dict(query_dict)
search_dict, search_dict_v2 = make_search_dict(ADJ_tuple_dict)
print(" ---------->>> READY let's start <<<-----------")

Loading ADJ model from file ../data/bg_session.ctx_ADJ.mdl
Successfully loaded ADJ model dicts
	 5455244 entries in tuple dict
	 3949947 entries in query_to_id dict
Successfully made background frequency dictionary
Successfully saved dict to ../data/bg_query_freq.pkl
Successfully saved dict to ../data/bg_query_dict.pkl
Start making search dict...
Successfully made search dict
 ---------->>> READY let's start <<<-----------


In [3]:
"""
    THIS IS ONLY FOR TEST PURPOSES...TO DEBUG THE GENERATION OF THE SUGGESTIONS
    THESE METHODS COME FROM PROGRAMS OF ALLESANDRO
    PART OF THE PSINFER class
"""
def _find(suffix, exact_match=False):
        global query_dict
        global search_dict_v2
        
        _suffix = [query_dict.get(x, -1) for x in suffix]
       
        # Back off to shorter suffixes,
        for i in range(len(_suffix)):
            key = tuple(_suffix[i:])
            # print("Search keys: ", id_to_query[key[0]])
            if key in search_dict_v2:
                
                return {'last_node': key, \
                        'is_found': i==0 and len(_suffix)==len(suffix), \
                        'empty': False, \
                        'probs': search_dict_v2[key]}
        # and if nothing is found
        
        return {'last_node': (0,), \
                'is_found': False, \
                'empty': True, \
                'probs' : {}}
    
def suggest(suffix, N=100, exact_match=False):
        global id_to_query
        
        result = _find(suffix)

        node = result['last_node']
        probs = result['probs']

        data = {'last_node_id' : node[0],
                'last_node_query': id_to_query[node[0]],
                'found' :   result['is_found'],
                'suggestions' : [],
                'scores' : []}
        if node[0] == 0 or (exact_match and not data['found']):
            return data
        # Get top N
        id_sugg_probs = sorted(probs.items(), key=operator.itemgetter(1), reverse=True)[:N]
        string_sugg_probs = [(id_to_query[sugg_id], sugg_score) for sugg_id, sugg_score in id_sugg_probs]
        sugg, score = map(list, zip(*string_sugg_probs))
        data['suggestions'] = sugg
        data['scores'] = score
        
        return data

In [12]:
"""
    THIS IS ONLY FOR TEST PURPOSES...TO DEBUG THE GENERATION OF THE SUGGESTIONS
"""
with open(tr_session_filename, 'r') as tr:
    s = 0
    for session in tr:
        
        suffix = session.strip().split('\t')
        target_query = suffix[-1]
        anchor_query = suffix[-2]
        suggestions = suggest(suffix[:-1], N=20)
        if len(suggestions['scores']) > 20:
            print("anchor_query ", anchor_query)
            print("target_query ", target_query)
            print(suggestions['suggestions'])
        if s > 5000:
            break
        s += 1
        
print("finish")

finish


In [3]:
def shorten_query(query):
    global query_dict
    splitted_query = query.split()
    for i in range(0,len(splitted_query)):
        shorted_query = splitted_query[:i] + splitted_query[i+1 :]
        shorted_query = ' '.join(shorted_query)
        if shorted_query in query_dict:
            return shorted_query
   
    if len(splitted_query) >= 1:
        shorted_query = splitted_query[:i] + splitted_query[i+1 :]
        shorted_query = ' '.join(shorted_query[:-1])
        return shorten_query(shorted_query)
    elif len(splitted_query) == 1:
        if splitted_query[0] in query_dict:
            return splitted_query[0]
        else:
            return None
    else:
        return None

"""
Function that makes suggestions for a session

Input: session file, *.ctx
Output: dict with key:session_idx value: (target_query,anchor_query, session, suggestions)

"""

def print_suggestions(session, anchor_query, candidates):
    global id_to_query
    
    print("session ", session)
    print("anchor_query ", anchor_query)
    for query_1, query_2 in candidates:
        print("query 1 ", id_to_query[query_1])
        

def make_suggestions(session_file, min_sess_length=1, max_sess_length=50, num_suggestions=20, 
                     early_stop=False, long_tail_queries=False):
    global query_dict
    global search_dict
    # make a dict to save all the results
    suggestion_dict = {}
    c = 1
    num_sessions = 0
    # loop over every session in the *.ctx file
    for idx, line in enumerate(session_file):
        # queries are tab-separated 
        session = line.strip().split('\t')
        
        # we also have to limit the session length because we can't generate a HRED log-likehood score
        # for sessions that are too long (memory problems)
        if len(session) >= min_sess_length+1 and len(session) <= max_sess_length:
            target_query = session[-1] # target query is the last query Qm
            anchor_query = session[-2] # Anchor query is the query Qm-1
            context = session[:-1] # Qm-1 till Q1 are the context queries
            orig_anchor_query = ""
            
            if long_tail_queries:
                # when doing long-tail-prediction we only want queries that are not in the bg-set
                if anchor_query not in query_dict:
                    # make a shorter version of the query that is in the bg-set
                    orig_anchor_query = anchor_query
                    anchor_query = shorten_query(anchor_query)
                    if anchor_query is None: # when it is not possible to make a smaller version
                        continue
                else:
                    # if the query is in the query dict, we do not want to use this query
                    continue
            
            if anchor_query in query_dict:
                anchor_q_key =  query_dict[anchor_query] # the key of the query in the bg-set 
                # check if target query and anchor query are in the background set
                # if key in search_dict and target_query in query_dict:
                if anchor_query in query_dict and target_query in query_dict:
                    """
                    We could use the search dict to find all the queries that follow the anchor query 
                    in the bg set, we use this queries as suggestions
                    """
                    suggestions = search_dict[anchor_q_key]
                    if len(suggestions) > num_suggestions: # we need at least 20 suggestions 
                        # print("suggestions ", suggestions)
                        target_key = query_dict[target_query] # find the key of the target query
                        list_suggestions = [(sugg_key, suggestions[sugg_key] ) \
                                            for sugg_key in suggestions.keys()]
                        # sort list of tuples by second tuple entry which is the frequency count
                        # also reverse order so it is in descending order
                        sorted_suggestions = sorted(list_suggestions, key=lambda x: x[1])[::-1]
                        #take only the top 20 suggestions based on counts 
                        
                        suggestions = sorted_suggestions[0:num_suggestions]
                        
                        # final check, is the target query really in the set of suggestions? 
                        if target_key in (x[0] for x in suggestions): 
                            # we have a valid session, now we list all the suggestions and sort them
                            # save this in the dict key(idx):(target_query,anchor_query, session, suggestions)
                            suggestion_dict[idx] = (target_query, anchor_query, session, suggestions)
                            # print_suggestions(session, anchor_query, suggestions)
                            
                            if early_stop:
                                print("target_query ", target_query)
                                print("original anchor query ", orig_anchor_query)
                                print("anchor_query ", anchor_query)
                                print("Sorted suggestions: ", suggestions)
                            num_sessions += 1
        if num_sessions > 3 and early_stop:
            print("Break")
            break
            
    return suggestion_dict

In [4]:
def generate_suggestions(p_type="tr", min_sess_length=1, max_sess_length=50, 
                         num_suggestions=20, load_existing=False, 
                         early_stop=False, long_tail_queries=False):
    """
        find for a specific session file (train/test/val) all the corresponding
        suggestions and store result in a dict for later processing
        This procedures differs between experiments 4.4, 4.5, 4.6 in the paper
    """
    global test_session_filename, tr_session_filename, val_session_filename
    global tr_sess_candid_f, test_sess_candid_f, val_sess_candid_f
    
    if p_type == 'tr':
        # training
        session_file = tr_session_filename
        output_file = tr_sess_candid_f
    elif p_type == 'val':
        # validation
        session_file = val_session_filename
        output_file = val_sess_candid_f
    else:
        # test sessions
        session_file = test_session_filename
        output_file = test_sess_candid_f
        
    if not load_existing:
    
        print("Generating suggestion queries for session file %s" % session_file)
        suggestion_dict = make_suggestions(open(session_file, 'r'), 
                                           min_sess_length=min_sess_length,
                                           max_sess_length=max_sess_length,
                                           num_suggestions=num_suggestions,
                                           early_stop=early_stop,
                                           long_tail_queries=long_tail_queries)

        print("Successfully generated suggestions for %d sessions" % len(suggestion_dict))
        save_pickle_dict(suggestion_dict, output_file)
    else:
        print("Loading suggestion queries from file %s" % output_file)
        suggestion_dict = cPickle.load(open(output_file ,'rb'))
        print("Successfully loaded suggestions")
        
    return suggestion_dict


In [5]:
def load_VMM_model(filename, load_saved_model=False):
    # load the VMM model made with Allesandro's Probabilistic Suffix Tree (PST)
    # currently the context scope is limited to D=2 which means the tuple dict contains
    # tuples with max lenght of 3 (so the memory span is look 2 queries ahead)
    global VMM_model_pickle
    
    if not load_saved_model:
        print("Loading VMM model from %s" % filename)
        print("Patient, this will take a while (approx 5 minutes)")
        pstree = PSTInfer()
        pstree.load(filename)
        save_pickle_dict(pstree, VMM_model_pickle)
    else:
        print("Loading VMM model from pickle %s" % VMM_model_pickle)
        pstree = cPickle.load(open(VMM_model_pickle, 'rb'))
        
    print("======== READY ===========")
    return pstree

pstreeVMM = load_VMM_model(VMM_model_file, load_saved_model=False)

Loading VMM model from ../data/bg_session.ctx_VMM.mdl
Patient, this will take a while (approx 5 minutes)
Loading inference engine
Preparing internal structures
Loaded inference engine
Successfully saved dict to ../data/bg_pstreeVMM.pkl


In [9]:
"""
    we use the following output files
    (1) training
        tr_sess_candid_f
        
    (2) validation
        val_sess_candid_f
        
    (3) test
        test_sess_candid_f
        
    if called with load_existing=True the dict will be loaded from an earlier saved pickle file
    
    Experiments 4.4:
        Paper says they are getting the following number of candidate sessions (brackets our numbers):
            Training = 18,882 (17960)
            Test = 9,348 (7313)
            Validation = 6,988 (11,330)
            Note: the split of the original AOL data file is dan on query dttm.
                  the paper does not specify the exact separation between validation and test.
                  they only mention that they separate the last 2 weeks of May 2006 between both 
                  sets
"""

# suggest_train = generate_suggestions(p_type="tr"
# suggest_test = generate_suggestions(p_type="test"
# sugget_val = generate_suggestions(p_type="val"
suggest_val = generate_suggestions(p_type="val", min_sess_length=1, 
                                       max_sess_length = 50,
                                       load_existing=False, 
                                       early_stop=False,
                                       long_tail_queries=False)


Generating suggestion queries for session file ../data/val_session.ctx
Successfully generated suggestions for 11330 sessions
Successfully saved dict to ../data/val_suggest.pkl


In [8]:
print(len(suggest_train))
print(len(suggest_test))
print(len(suggest_val))

17960
7313


In [9]:
def count_letter_ngram(sentence, n=3):
    """
    How many n-grams fits in this sentenec 
    """
    if len(sentence) < n:
        return set(sentence)
    local_counts = set()
    for k in range(len(sentence.strip()) - n + 1): 
        local_counts.add(sentence[k:k+n])
    return local_counts

def matches(ng1, ng2):
    """
    For both n-gram sets how many sim elements they contain
    """
    return len(ng1 & ng2)

def n_gram_sim(query1, query2,n=3):
    """
    return n-gram similarity between two queries 
    """
    return matches(count_letter_ngram(query1, n), count_letter_ngram(query2, n))

def make_n_gram_sim_features(context_queries,suggestion):
    """
    For every suggestion make the n-gram similarity for the context queries (at most 10)
    """
    n_sim = [0] * 10
    for idx, context_query in enumerate(context_queries):
        if idx >=10:
            """
            only do this for at most 10 context queries 
            """
            break
        n_sim[idx] = n_gram_sim(suggestion, context_query,n=3)
        
    return n_sim


def get_VMM_score(session, suggestion, no_normalize=False, fallback=False):
    global pstreeVMM
    """
    For every suggestion determine the VMM score (variable memory Markov score)
    """
    
    _, scores = pstreeVMM.rerank(session, suggestion, no_normalize=no_normalize, fallback=fallback)
    
    return scores[0]

In [173]:
def prepare_files_hred_score(suggestion_dict, out_dir, suffix='tr', num_context=None, corrupt_context=False):
    """
        in order to obtain the HRED scores from Allesandro's model we need to feed the score.py 
        with two input files (note raw data = the acutal words)
        (1) sessions, tab separated queries, space separated words
        (2) candidates belonging to that session, so on each line tab-separated the candidate queries 
        
        the procedures generates both output files based on the input suggestion dictionary.
        Remember, that dicionary contains per entry all necessary information:
        
        out_dir is just the directory where to write to
        
    """
    global pstreeVMM
    """
        contains 100 noisy query ID's, you can sample with bg_noisy_query_dist.rvs(size=1)
        we need this for experiments 4.5 
        where we pertubate the sessions with a randomly sampled noisy query
        we just inject/corrupt the noisy query at a random point in the session
    """
    global bg_noisy_query_dist
    global id_to_query
    
    assert (num_context == None or num_context <= 3)
    if num_context == None:
        if not corrupt_context:
            c_suffix = "_all"
        else:
            # perturbate context with noisy query
            c_suffix = "_noisy"
    else:
        c_suffix = "_c" + str(num_context)
    session_f = os.path.join(out_dir, suffix + c_suffix + "_hred_sess.ctx")
    candid_f = os.path.join(out_dir, suffix + c_suffix + "_hred_cand.ctx")
    print("Writing 2 output files")
    print(session_f)
    print(candid_f)
    
    with open(session_f, 'w') as sess, open(candid_f, 'w') as cand:
        for session_key in suggestion_dict.keys():
            # tuple 
            session_tuple = suggestion_dict[session_key]
            target_query = session_tuple[0]
            # all queries except the target query (full context)
            context_queries = session_tuple[2][:-1]
            
            if num_context == None:
                # complete session context without the target query
                # check whether we're corrupting the context with a noisy query
                if corrupt_context:
                    noisy_query = id_to_query[int(bg_noisy_query_dist.rvs(size=1))]
                    pos = np.random.randint(0, len(context_queries))
                    if pos == 0:
                        # in front
                        context_queries = [noisy_query] + context_queries
                    elif pos == len(context_queries):
                        # at the end
                        context_queries = context_queries + [noisy_query]
                    else:
                        # somewhere in between
                        context_queries = context_queries[0:pos] + [noisy_query] + context_queries[pos:]
                        
                else:
                    # full query context and no corruption of session context
                    pass
            else:
                # only the last num_context queries as context for the HRED score generation
                # remember CORRUPTION OF CONTEXT only applicable if num_context == None == full context
                # experiments 4.5 paper
                context_queries = context_queries[-num_context:]
            
            suggestions = session_tuple[3]
            queries = []
            for idx, suggestion in enumerate(suggestions):
                suggestion_id = suggestion[0]
                query_words = pstreeVMM.id_to_query[suggestion_id]
                queries.append(query_words)

            sess.write("\t".join(context_queries) + "\n")
            cand.write("\t".join(queries) + "\n")
        
    print("-----> Ready <------")
    
"""
    make the files that we need to generate the HRED log-likelihood scores
    
    For experiment 4.4.2 we need to generate different query contexts for the 
    generation of the HRED score
    1) with only Q_(m-1) i.e. only anchor query
    2) with Q_(m-2), Q(m-1) i.e. 2 queries in context
    3) with Q_(m-3), Q_(m-2), Q(m-1) 3 queries in context
    
    - For all three context-options we need to determine the HRED score
    - produce the feature matrix
    - score with LambdaMart
    
    For experiment 4.5 we inject a noisy query in to the session context.
    So we can't use the option num_context because that restricts session context
    the valid combination for this experiment is:  num_context=None, corrupt_context=True
    
    
"""      
# prepare_files_hred_score(suggest_train, DATA_PATH, suffix='tr', num_context=None, corrupt_context=True) 
# prepare_files_hred_score(suggest_test, DATA_PATH, suffix='test', num_context=3, corrupt_context=False) 
prepare_files_hred_score(suggest_val_long, DATA_PATH, suffix='val', num_context=None, corrupt_context=False) 

Writing 2 output files
../data/val_all_hred_sess.ctx
../data/val_all_hred_cand.ctx
-----> Ready <------


In [164]:
"""
    count number of lines in a file
"""
def file_len(fname):
    with open(fname) as f:
        for i, l in enumerate(f):
            pass
    return i + 1

"""
Function that returens a feature vector for every suggestion 

Input: suggestion_dict
Output: per session a matrix [17,20] with the feature vectors 
"""

def make_suggestion_features(suggestion_dict, hred_ll_file, num_features=18, do_test=False):
    global pstreeVMM
    global bg_query_freq
    global query_dict
    
    num_of_candidates = 20
    # Number of lines in HRED file must be equal to, -1 because of header line
    lines_in_file = file_len(hred_ll_file)  - 1
    expected_count = len(suggestion_dict) * num_of_candidates
    print("Lines match? %d = %d" % (lines_in_file, expected_count))
    
    assert lines_in_file == expected_count
    
    c = 0
    """
        matrix_out is the final numpy matrix. The layout is as follows:
        dim0 = number of sessions which is actually equal to the size of the suggestions dict
               because we determined for each session from the tr, test, val session file if it passes
               the requirements, 20 suggestions
        dim1 = col00: anchor query ID
               col01: suggestion/candidate query ID
               col02: length of session (including target query!)
               col03: how many times does the candidate/suggestion follow the anchor query in the background set
               col04: Additionally, we use the frequency of the anchor query in the background data
               col05: levenshtein_distance
               col06: # of characters of suggestion query + # of words suggestion query
               col07: frequency of suggestion query in background data
               col08: 10 n-gram similarities
               .
               .
               col17: 10 n-gram similarities
               col18: VMM score
               col19: HRED log-likelihood score
               col20: label, which basically is zero except for the target query ID, which is one of the 
                      suggestion queries.
    """
    
    feature_dim = num_features + 3
    matrix_out = np.zeros((len(suggestion_dict) * 20, feature_dim))
    session_id = 0
    sess_less_cand = 0
    
    with open(hred_ll_file, 'r') as hred_ll:
        # read header line
        print("HRED-header ", hred_ll.readline() )
        for session_key in suggestion_dict.keys():
            # tuple 
            session_tuple = suggestion_dict[session_key]
            target_query = session_tuple[0]
            target_id = query_dict[target_query]
            context_queries = session_tuple[2][:-1]

            """
                because we are also handling situations in which the anchor query was changed (exp 4.6)
                in that case session_tuple[1] = shortened anchor query (which exists in the bg)
                but we want to calculate anchor query features based on the long-tail original query
                therefore we get the last query from the context queries (which does not contain target query)
                
                before we did: anchor_query = session_tuple[1]
            """
            anchor_query = context_queries[-1]
            anchor_query_id = pstreeVMM.query_to_id.get(anchor_query, -1)
            """
                If   anchor_query_id = -1   after this operation
                then we're dealing with experiment 4.6, the anchor query does not exist in the background data
                so we can't calculate frequencies
            """
            # frequency of anchor query in background data, same for all 20 suggestions, so determine here
            bg_anchor_freq = bg_query_freq.get(anchor_query, 0)
            
            suggestions = session_tuple[3]
            VMM_scores = []
            candidates = []
            # create an empty matrix for this session, which stores the num_of_candidates rows
            # we're doing this because at the end of the session we will sort this matrix
            # so that the target query (the positive candidate) is the first row in the session matrix
            session_matrix = np.zeros((num_of_candidates, feature_dim))
            
            if len(suggestions) == 20:
                for idx, suggestion in enumerate(suggestions):

                    suggestion_id = suggestion[0]
                    query_string = pstreeVMM.id_to_query[suggestion_id]

                    session_matrix[idx, 0] = anchor_query_id
                    session_matrix[idx, 1] = suggestion_id
                    # add a feature with the session length (needed in Experiment 4.4.2 for classifying
                    # the sessions in short, medium, long). Add one because context queries misses the target query
                    session_matrix[idx, 2] = len(context_queries) + 1
                    """"
                    For each candidate suggestion, we count how many times it follows 
                    the anchor query in the background data and add this count as a feature.
                    This should be the ADJ-score!!!
                    """
                    follow_anchor_count = suggestion[1]
                    """
                        TODO: Experiment 4.6:
                            the "follow_anchor_count" is still based on the shortened anchor query
                            which is actually 
                    """ 
                    if anchor_query_id == -1:
                        # Experiment 4.6, anchor query has no ID and suggestion never follows
                        # the real long-tail anchor query
                        session_matrix[idx, 3] = 0
                    else:
                        session_matrix[idx, 3] = follow_anchor_count
                    """
                    Additionally, we use the frequency of the anchor query in the background data.
                    """
                    session_matrix[idx, 4] = bg_anchor_freq
                    """
                    We also add the Levenshtein distance between the anchor and the suggestion.
                    """
                    levenshtein_distance = edit_distance(anchor_query, query_string)
                    session_matrix[idx, 5] = levenshtein_distance
                    """
                    The suggestion length (characters and words)
                    """
                    chars_leng = len(query_string) 
                    word_leng = len(query_string.split())
                    session_matrix[idx, 6] = chars_leng + word_leng
                    """
                        Frequency of suggestion query in background data
                    """
                    session_matrix[idx, 7] = bg_query_freq[query_string]
                    
                    """
                    We add 10 features corresponding to the character n-gram similarity 
                    between the suggestion and the 10 most recent queries in the context.
                    """
                    n_gram_sim =  make_n_gram_sim_features(context_queries, query_string)
                    session_matrix[idx, 8:18] = n_gram_sim
                    

                    VMM_score = get_VMM_score(context_queries, [query_string])
                    VMM_scores.append(VMM_score)
                    session_matrix[idx, 18] = VMM_score

                    """
                    HRED Score
                    """
                    hred_score = float(hred_ll.readline())
                    session_matrix[idx, 19] = hred_score

                    if target_id == suggestion_id:
                        session_matrix[idx, 20] = 1
                    else:
                        session_matrix[idx, 20] = 0
                    
                    candidates.append(query_string)
                # ok, let's sort the session matrix first on the last column, the label (0/1) so that the target
                # query is the first row
                # session_matrix = session_matrix[session_matrix[:, feature_dim-1].argsort()[::-1]]
                # let's parse the session_matrix into our final output matrix
                start = session_id * num_of_candidates
                end   = start + num_of_candidates

                matrix_out[start:end, :] = session_matrix
                if do_test:
                    print("session ", context_queries)
                    print("anchor_query ", anchor_query)
                    # print("candidates ", candidates)
                    # print("VMM scores ", VMM_scores)
                    # print(matrix_out[session_id:20,feature_dim-1])
                    # break
                assert np.sum(matrix_out[start:end, 20] == 1) == 1, query_string + " " + str(target_id) + " " + str(anchor_query_id)
                session_id += 1
                if session_id % 5000 == 0:
                    print("Progress, session id %d" % session_id)
            else:
                sess_less_cand += 1
                
            if session_id > 10 and do_test:
                break

        
    print("Session with less than 20 candidates %d" % sess_less_cand)
    return matrix_out

In [181]:
"""
    Please note, the output files of HRED with the ll-score are called:
    Experiments 4.4.1 (base)
        tr_hred_score_exp4_4_1.f
        val_hred_score_exp4_4_1.f
        test_hred_score_exp4_4_1.f
        
    Experiments 4.4.2 (with different context)
        1 query context:
            'tr_hred_score_exp_4_4_c1.f'
            'test_hred_score_exp_4_4_c1.f'
            'val_hred_score_exp_4_4_c1.f'
        2 queries in context:
            'tr_hred_score_exp_4_4_c2.f'
            'test_hred_score_exp_4_4_c2.f'
            'val_hred_score_exp_4_4_c2.f'
        3 queries in context:
            'tr_hred_score_exp_4_4_c3.f'
            'test_hred_score_exp_4_4_c3.f'
            'val_hred_score_exp_4_4_c3.f'
            
    Experiment 4.5 (perturbate session with ONE noisy query from background data):
        tr_hred_score_exp_4_5.f
        test_hred_score_exp_4_5.f
        val_hred_score_exp_4_5.f
        
    Experiment 4.6 (Long-tail prediction) aka anchor query does not exist in background data
        tr_hred_score_exp_4_6.f
        test_hred_score_exp_4_6.f
        val_hred_score_exp_4_6.f
"""

hred_ll_file = os.path.join(DATA_PATH, 'val_hred_score_exp_4_6.f')
output_matrix = make_suggestion_features(suggest_val_long, hred_ll_file, num_features=18, do_test=False)
np.savez(os.path.join(DATA_PATH, "val_longtail_suggest_matrix"), output_matrix)

Lines match? 32620 = 32620
('HRED-header ', '0_HED_1479568981.18\n')
Session with less than 20 candidates 0


In [179]:
print(output_matrix[100:120, 0])
# print(output_matrix[120:160, 20])

[-1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1.
 -1. -1.]


In [39]:
np.sum(np.argmax(output_matrix[:, 20]) == np.argmax(output_matrix[:, 3]))

0

In [122]:
with open(os.path.join(DATA_PATH, 'tr_c3_suggest_matrix.npz'), 'r') as m:
    npz = np.load(m)
    print(type(npz))
    tr_c3 = npz['arr_0']

<class 'numpy.lib.npyio.NpzFile'>


In [95]:
val_c3.shape

(226600, 21)

In [97]:
val_c2.shape

(226600, 21)

In [101]:
print(np.sum(tr_c3[:, 19]))
print(np.sum(tr_c1[:, 19]))

3429056.77195
3271546.39725


In [123]:
num_cand = 20
correct = 0
sessions = 0
for idx in np.arange(tr_c3.shape[0] / num_cand):
    start = idx * num_cand
    end = start + num_cand
    hred = tr_c3[start:end, 19]
    label = tr_c3[start:end, 20]
    correct += np.sum(np.argmin(hred) == np.argmax(label))
    sessions += 1

print(correct, " ", sessions, " ", float(correct)/sessions)

(1984, ' ', 17960, ' ', 0.11046770601336303)


In [124]:
num_cand = 20
correct = 0
sessions = 0
for idx in np.arange(tr_c1.shape[0] / num_cand):
    start = idx * num_cand
    end = start + num_cand
    hred = tr_c1[start:end, 19]
    label = tr_c1[start:end, 20]
    correct += np.sum(np.argmin(hred) == np.argmax(label))
    sessions += 1

print(correct, " ", sessions, " ", float(correct)/sessions)

(1957, ' ', 17960, ' ', 0.10896436525612473)
