In [1]:
import pandas as pd
import numpy as np


def get_ground_truth(test_output):
    """Collect the true ``skip_2`` labels of each session's second half.

    Rows where ``session_position * 2`` exceeds ``session_length`` are kept and
    then walked in row order. Each chunk starts at the current row and spans
    ``session_length - session_position + 1`` rows, computed from that first
    row. Chunk boundaries come from this arithmetic only; ``session_id`` is
    selected but never consulted, so rows are expected to arrive grouped by
    session and ordered by position.

    Args:
        test_output: DataFrame with ``session_id``, ``skip_2``,
            ``session_position`` and ``session_length`` columns. The frame is
            copied, not modified.

    Returns:
        A list holding one list of ``skip_2`` values per chunk.
    """
    kept_columns = ['session_id', 'skip_2', 'session_position', 'session_length']

    frame = test_output.copy()
    frame[['session_position', 'session_length']] = (
        frame[['session_position', 'session_length']].astype('int64')
    )
    in_second_half = frame['session_position'] * 2 > frame['session_length']

    # Renumber the surviving rows 0..n-1 so that the label-based slices below
    # behave as plain row ranges.
    second_half = frame[kept_columns].loc[in_second_half].reset_index()
    lengths = second_half['session_length']
    positions = second_half['session_position']
    skips = second_half['skip_2']

    ground_truths = []
    total_rows = len(second_half)
    start = 0
    while start < total_rows:
        # Rows left in this session, counting the current one.
        remaining = lengths.iloc[start] - positions.iloc[start] + 1
        # .loc slices include the end label, hence the -1.
        ground_truths.append(skips.loc[start:start + remaining - 1].tolist())
        start += remaining
    return ground_truths


def get_submission(test_output):
    """Split the ``pred`` column into one array per session.

    Args:
        test_output: DataFrame with a ``session_id`` column and a ``pred``
            column holding the per-row predictions.

    Returns:
        A list with one numpy array per distinct ``session_id``, ordered by the
        first appearance of each session. Each array holds that session's
        ``pred`` values multiplied by 1 (so booleans become integers), in row
        order.
    """
    # One groupby pass instead of a full-column comparison per session, which
    # was quadratic in the number of sessions. sort=False keeps the sessions in
    # first-appearance order, matching the order unique() produced.
    # Rows whose session_id is missing are left out by groupby (the previous
    # implementation emitted an empty array for them).
    per_session = test_output.groupby('session_id', sort=False)['pred']
    return [np.array(session_pred * 1) for _, session_pred in per_session]


def evaluate(submission,groundtruth):
    """Score a submission against the ground truth, session by session.

    Args:
        submission: iterable of per-session prediction sequences (0/1 values).
        groundtruth: iterable of per-session true label sequences, in the same
            session order as ``submission``.

    Returns:
        A tuple ``(ap, first_pred_acc)``: the mean of ``ave_pre`` over all
        sessions, and the fraction of sessions whose first prediction equals
        the first true label.

    Raises:
        Exception: if the two inputs hold a different number of sessions, if a
            session's prediction count differs from its label count, or (from
            ``ave_pre``) if a prediction is neither 0 nor 1.
        ZeroDivisionError: if there are no sessions to evaluate.
    """
    from itertools import zip_longest

    # Sentinel marking the shorter input; plain zip() would silently drop the
    # surplus sessions and report a score for only part of the data.
    missing = object()

    ap_sum = 0.0
    first_pred_acc_sum = 0.0
    counter = 0
    for sub, tru in zip_longest(submission, groundtruth, fillvalue=missing):
        line = counter + 1  # 1-based, used in every error message
        if sub is missing or tru is missing:
            raise Exception('Submission and ground truth contain a different number of '
                            'sessions (first unmatched line: {})'.format(line))
        if len(sub) != len(tru):
            raise Exception('Line {} should contain {} predictions, but instead contains '
                            '{}'.format(line,len(tru),len(sub)))
        # Pass the same 1-based line number so ave_pre's message agrees with
        # the one above.
        ap_sum += ave_pre(sub,tru,line)
        first_pred_acc_sum += sub[0] == tru[0]
        counter += 1

    if counter == 0:
        # Same exception type the bare division raised, with a usable message.
        raise ZeroDivisionError('cannot evaluate: submission and ground truth are empty')

    ap = ap_sum/counter
    first_pred_acc = first_pred_acc_sum/counter
    return ap,first_pred_acc


def ave_pre(submission,groundtruth,counter):
    """Average precision of one session's predictions.

    Walks predictions and labels in parallel. At every position where the
    prediction equals the label, the running fraction (matches so far divided
    by the 1-based position) is added to a total; the total is finally divided
    by the number of labels.

    Args:
        submission: sequence of predictions, each 0 or 1.
        groundtruth: sequence of true labels.
        counter: line number, used only in the error message.

    Returns:
        The accumulated total divided by ``len(groundtruth)``.

    Raises:
        Exception: if a prediction is neither 0 nor 1.
    """
    matches = 0.0
    precision_total = 0.0
    for rank, (pred, actual) in enumerate(zip(submission, groundtruth), start=1):
        is_binary = pred == 0 or pred == 1
        if not is_binary:
            raise Exception('Invalid prediction in line {}, should be 0 or 1'.format(counter))
        if pred == actual:
            matches += 1.0
            precision_total += matches / rank
    return precision_total / len(groundtruth)

def precise_weight_perSession(df_test_pred):
    """Weight of every second-half row towards the session-averaged metric.

    A table of relative position weights is built for a second half of 10
    tracks: all 2**10 binary prediction vectors are scored with ``ave_pre``
    against an all-ones ground truth, and the weight of a position is the
    difference between the mean score with that position predicted 1 and with
    it predicted 0, divided by the same difference for the first position.
    This 10-position table is used for every row, whatever its session length.

    Each row's weight is the table entry at its offset into the second half
    (``session_position - floor(session_length / 2) - 1``) divided by
    ``floor(session_length / 2)``, so tracks of shorter sessions weigh more.

    Args:
        df_test_pred: DataFrame with ``session_position`` and
            ``session_length`` columns, holding second-half rows only.

    Returns:
        A list with one float per row of ``df_test_pred``, in row order.

    Raises:
        Exception: if any row has a negative offset, i.e. lies in the first
            half of its session. An offset of 10 or more has no entry in the
            table, so the lookup fails for such rows.
    """
    import itertools

    # Size of the weight table. Only this length was ever used by the original
    # code (it built tables for 5..10 and then read the last one).
    table_length = 10

    def new_weight(n):
        """Relative weight of each of n predicted positions (first one is 1)."""
        truth = [1] * n
        patterns = list(itertools.product([0, 1], repeat=n))
        table = pd.DataFrame(patterns)
        table['ap'] = [ave_pre(pattern, truth, 0) for pattern in patterns]

        # Per position: mean ap where it is predicted 1 minus mean ap where it
        # is predicted 0 (diff() of the two group means, read at label 1).
        mean_diff = [table.groupby(by=pos)['ap'].mean().diff()[1] for pos in range(n)]

        # Explicit array: dividing a plain list only worked through numpy's
        # reflected operator.
        return np.asarray(mean_diff) / mean_diff[0]

    half_length = np.floor(df_test_pred['session_length']/2)
    # For example, the 6th track of a 10-track session gives 6 - (10/2) - 1 = 0.
    a = df_test_pred['session_position'] - half_length - 1
    # Validate before the enumeration so bad input fails fast. Offset 0 is the
    # first predicted track and is valid; only negative offsets are rejected.
    if (a<0).sum()>0:
        raise Exception('***a number below 0***')

    ave_weight_list = pd.Series(new_weight(table_length))

    # The final metric averages the per-session scores, so each track of a
    # shorter session contributes more: divide by the half length.
    # NOTE(review): for odd session lengths the rows kept by the
    # `session_position > session_length / 2` filter number ceil(length / 2),
    # while the divisor here is floor(length / 2) -- confirm this is intended.
    weight = [i / j for i, j in zip(ave_weight_list[a], half_length)]
    return weight

In [5]:
# Load one training log file and keep only the rows in the second half of
# their session (position strictly greater than half the session length).
log_df = pd.read_csv('../data/raw/training_set/log_0_20180715_000000000000.csv')
in_second_half = log_df['session_position'] > (log_df['session_length'] / 2)
log_df = log_df.loc[in_second_half]

In [6]:
# One weight per second-half row kept in log_df.
weight = precise_weight_perSession(log_df)

In [7]:
# Display the computed weights (a list with one value per row of log_df).
weight

[0.1,
 0.07454802545197456,
 0.06182203817796182,
 0.05333804666195337,
 0.046975053024946964,
 0.04188465811534187,
 0.03764266235733764,
 0.03400666599333402,
 0.030825169174830847,
 0.027997172002828002,
 0.1,
 0.07454802545197456,
 0.06182203817796182,
 0.05333804666195337,
 0.046975053024946964,
 0.04188465811534187,
 0.03764266235733764,
 0.03400666599333402,
 0.030825169174830847,
 0.027997172002828002,
 0.1,
 0.07454802545197456,
 0.06182203817796182,
 0.05333804666195337,
 0.046975053024946964,
 0.04188465811534187,
 0.03764266235733764,
 0.03400666599333402,
 0.030825169174830847,
 0.027997172002828002,
 0.1,
 0.07454802545197456,
 0.06182203817796182,
 0.05333804666195337,
 0.046975053024946964,
 0.04188465811534187,
 0.03764266235733764,
 0.03400666599333402,
 0.030825169174830847,
 0.027997172002828002,
 0.2,
 0.14909605090394912,
 0.12364407635592364,
 0.10667609332390673,
 0.09395010604989393,
 0.08376931623068375,
 0.14285714285714285,
 0.10649717921710652,
 0.088317197

In [8]:
# First three terms of a geometric sequence with ratio 0.75.
# NOTE(review): presumably a rough stand-in to compare with the weights above -- confirm.
[1, 1*0.75, 1*0.75**2]

[1, 0.75, 0.5625]