In [1]:
import glob
import io
import tarfile
import warnings
from timeit import default_timer as timer  # wall-clock timing of each scoring batch

import dask.dataframe as dd
import lightgbm as lgb
import numpy as np
import pandas as pd
# import xgboost as xgb
# from xgboost import XGBClassifier
from bayes_opt import BayesianOptimization
from bayes_opt.event import Events
from bayes_opt.logger import JSONLogger
from sklearn.metrics import accuracy_score
warnings.simplefilter("ignore")



# Load the track features straight out of the compressed archive. The context
# manager closes the tarball as soon as the CSVs are read, even if a read fails.
with tarfile.open('../data/raw/20181120_track_features.tar.gz', 'r:gz') as tar:
    csv_files = tar.getnames()
    # NOTE(review): members are selected by position (2 and 4); presumably these are
    # the two feature CSVs and the other members are not data files — confirm
    # against the archive listing.
    tf_df_list = [
        pd.read_csv(io.BytesIO(tar.extractfile(csv_file).read()), encoding='utf8')
        for csv_file in [csv_files[2], csv_files[4]]
    ]

tf_df = pd.concat(tf_df_list, ignore_index=True)
# Use the same track key name as the session logs so later merges line up.
tf_df = tf_df.rename(columns={'track_id':'track_id_clean'})

# Cluster assignment ('clus') per track.
kmean300_df = pd.read_csv('../data/interim/all_data/mbKMeans300clusters.csv', usecols=['track_id','clus'])
kmean300_df = kmean300_df.rename(columns={'track_id':'track_id_clean'})



In [2]:
def get_sim(df_hist, df_lookup, sim_file_list, score_name_list):
    """Add one similarity-score column per similarity matrix to ``df_lookup``.

    Each session's history is condensed into a per-cluster vote (+1 for every
    track with ``skip_2 == False``, -1 otherwise, summed per cluster). That vote
    vector is multiplied by each cluster-similarity matrix and divided by the
    matrix's column sums; the value at (session, cluster) is then looked up for
    every row of ``df_lookup``.

    Parameters
    ----------
    df_hist : DataFrame
        History rows with columns ``session_id``, ``clus`` and ``skip_2``.
        It is not modified.
    df_lookup : DataFrame
        Rows to score, with columns ``session_id`` and ``clus``. One column per
        entry of ``score_name_list`` is added to this frame in place.
    sim_file_list : list of str
        CSV files holding square similarity matrices; each file must contain an
        ``'Unnamed: 0'`` column (the saved index), which is discarded.
    score_name_list : list of str
        Output column names, paired positionally with ``sim_file_list``.

    Returns
    -------
    DataFrame
        The same ``df_lookup`` object, with the score columns added.

    Raises
    ------
    KeyError
        If a ``session_id`` of ``df_lookup`` has no history, or a ``clus`` value
        of ``df_lookup`` is not a column of the similarity matrix.
    ValueError
        If the history contains a cluster label that is not a row of the
        similarity matrix.
    """
    # Build the votes on a private frame so the caller's history is left untouched.
    votes = pd.DataFrame({
        'session_id': df_hist['session_id'].to_numpy(),
        'clus': df_hist['clus'].to_numpy(),
        'ListenYes': np.where(df_hist['skip_2'] == False, 1, -1),
    })
    # session x cluster matrix of summed votes; absent pairs count as 0.
    df_pivot = (votes.groupby(['session_id', 'clus'])['ListenYes'].sum()
                     .unstack('clus', fill_value=0))

    # Row positions depend only on the history, so resolve them once.
    row_pos = df_pivot.index.get_indexer(pd.Index(df_lookup['session_id']))
    if (row_pos < 0).any():
        raise KeyError('One or more session_id values in df_lookup have no history rows')
    clus_labels = pd.Index(df_lookup['clus'].astype(str))

    for sim_file, score_name in zip(sim_file_list, score_name_list):
        sim_matrix = pd.read_csv(sim_file).drop(columns=['Unnamed: 0'])
        # Columns are relabelled '0'..'n-1' (strings); the row index stays 0..n-1 (ints).
        sim_matrix.columns = list(map(str, range(0,len(sim_matrix))))

        unknown = df_pivot.columns.difference(sim_matrix.index)
        if len(unknown):
            raise ValueError('History contains clusters missing from {}: {}'.format(
                sim_file, list(unknown)))
        # Clusters that never occur in the history get a zero vote, so the
        # matrix product is defined even when some clusters are absent.
        aligned = df_pivot.reindex(columns=sim_matrix.index, fill_value=0)
        df_sim_session = aligned.dot(sim_matrix)/sim_matrix.sum()

        # Positional replacement for the removed DataFrame.lookup().
        col_pos = df_sim_session.columns.get_indexer(clus_labels)
        if (col_pos < 0).any():
            raise KeyError('One or more clus values in df_lookup are not in {}'.format(sim_file))
        df_lookup[score_name] = df_sim_session.to_numpy()[row_pos, col_pos]

    return df_lookup

In [3]:

# glob returns paths in arbitrary, filesystem-dependent order; sort them so that
# positional slices such as file_list[0:1] pick the same file on every run.
file_list = sorted(glob.glob('../data/raw/training_set/log_9*.csv'))
    

In [4]:
# Columns averaged per session over the session history, listed in the order
# in which they are passed to the groupby aggregation.
col_FA = (
    # skip outcomes and session context
    ['skip_1', 'skip_2', 'skip_3', 'not_skipped', 'context_switch', 'hour_of_day', 'premium']
    # pause before play
    + ['no_pause_before_play', 'short_pause_before_play', 'long_pause_before_play']
    # seek / shuffle behaviour
    + ['hist_user_behavior_n_seekfwd',
       'hist_user_behavior_n_seekback',
       'hist_user_behavior_is_shuffle']
    # one-hot columns: reason the track ended
    + ['hist_user_behavior_reason_end_backbtn',
       'hist_user_behavior_reason_end_clickrow',
       'hist_user_behavior_reason_end_endplay',
       'hist_user_behavior_reason_end_fwdbtn',
       'hist_user_behavior_reason_end_logout',
       'hist_user_behavior_reason_end_remote',
       'hist_user_behavior_reason_end_trackdone']
    # one-hot columns: reason the track started
    + ['hist_user_behavior_reason_start_appload',
       'hist_user_behavior_reason_start_backbtn',
       'hist_user_behavior_reason_start_clickrow',
       'hist_user_behavior_reason_start_endplay',
       'hist_user_behavior_reason_start_fwdbtn',
       'hist_user_behavior_reason_start_playbtn',
       'hist_user_behavior_reason_start_remote',
       'hist_user_behavior_reason_start_trackdone',
       'hist_user_behavior_reason_start_trackerror']
    # one-hot columns: listening context
    + ['context_type_catalog',
       'context_type_charts',
       'context_type_editorial_playlist',
       'context_type_personalized_playlist',
       'context_type_radio',
       'context_type_user_collection']
)


In [30]:
# Everything that does not depend on the log file is prepared once, before the loop.
sim_file_list = ['../models/SVD/similarity/k300_CanbDist.csv',
                 '../models/SVD/similarity/k300_CosSim.csv',
                 '../models/SVD/similarity/k300_LinCorr.csv',
                 '../models/SVD/similarity/k300_ManhDist.csv',
                 '../models/SVD/similarity/k300_HammDist.csv',
                 '../models/SVD/similarity/k300_SpearCorr.csv',
                 '../models/SVD/similarity/k300_KendCorr.csv']
score_name_list = ['CanbDist300', 'CosSim300','LinCorr300','ManhDist300','HammDist300','SpearCorr300','KendCorr300']

# Categorical log columns that are one-hot encoded before averaging.
dummy_cols = ['hist_user_behavior_reason_end', 'hist_user_behavior_reason_start', 'context_type']
dummy_prefixes = tuple(col + '_' for col in dummy_cols)

bst = lgb.Booster(model_file='../models/SVD/LightGBM_BayesOpt/LightGBM_incremental_training/boost410.txt')  # init model

for file in file_list[0:1]:
    start = timer()
    log_df = pd.read_csv(file)
    # Attach the cluster id ('clus') of every track; merges on the shared column(s).
    log_df = log_df.merge(kmean300_df)
    half_cut = log_df['session_length']/2

    # --- first half of each session: aggregated into per-session features ---
    # NOTE(review): for even session lengths the row with
    # session_position == session_length/2 satisfies neither '<' here nor '>'
    # below, so it is used in neither half. Left as is because changing the
    # split changes the features fed to the pre-trained model — confirm intent.
    log_df_1 = log_df.loc[log_df['session_position'] < half_cut].copy()  # copy: we add/convert columns
    log_df_1['hour_of_day'] = log_df_1['hour_of_day'].astype('float')
    log_df_1['premium'] = log_df_1['premium'].astype('bool')
    log_df_1 = pd.get_dummies(log_df_1, columns=dummy_cols)
    # A category that never occurs in this file produces no dummy column;
    # add it as all-zero so the fixed feature list below always resolves.
    for col in col_FA:
        if col.startswith(dummy_prefixes) and col not in log_df_1.columns:
            log_df_1[col] = 0
    log_df_1_summary = log_df_1.groupby(['session_id'])[col_FA].agg(['mean'])
    # Flatten the (column, 'mean') MultiIndex into '<column>_mean'.
    log_df_1_summary.columns = log_df_1_summary.columns.get_level_values(0)+'_'+log_df_1_summary.columns.get_level_values(1)
    log_df_history = log_df_1[['session_id','track_id_clean','skip_2','clus']].copy()

    # --- second half of each session: the rows to predict ---
    log_df_2 = log_df.loc[log_df['session_position'] > half_cut,
                          ['session_id','track_id_clean','skip_2','session_position','session_length','clus']].copy()
    # Inverse distance (in tracks) from the session midpoint, computed from the
    # filtered rows themselves rather than via index alignment with log_df.
    log_df_2['weight'] = 1/(log_df_2['session_position'] - log_df_2['session_length']/2)
    log_df_2 = log_df_2.merge(log_df_1_summary, on='session_id')

    # Similarity scores between each candidate track's cluster and the session history.
    df_lookup = get_sim(log_df_history, log_df_2, sim_file_list, score_name_list)
    df_lookup = df_lookup.merge(tf_df)
    df_lookup = df_lookup.drop(columns = ['key','time_signature','mode'])
    # Sorting keeps the original index labels; later cells select rows by them.
    df_lookup = df_lookup.sort_values(['session_id','session_position'])
    # NOTE(review): the column order of X must match the order used when the
    # booster was trained — not verifiable from this notebook.
    X = df_lookup.drop(columns = ['session_id','track_id_clean','skip_2']).to_numpy()

    ypred = bst.predict(X)
    print('Runtime per batch: %0.2fs' % (timer() - start))





Runtime per batch: 1428.98s


In [31]:
# Observed skip labels, in the same row order as the predictions in `ypred`.
ytrue = df_lookup.loc[:, 'skip_2']
ytrue

0           True
51         False
36         False
27         False
22          True
           ...  
1069270     True
668177      True
28823       True
668178     False
993324     False
Name: skip_2, Length: 1631612, dtype: bool

In [32]:
# Accuracy of the thresholded predictions at several cut-offs.
# accuracy_score expects (y_true, y_pred); the printed values are the same
# either way because accuracy is symmetric in its two arguments.
for threshold in (0.3, 0.4, 0.5, 0.6, 0.7):
    print(accuracy_score(ytrue*1, ypred > threshold))

0.6089223418312687
0.6302913928066232
0.6397317499503559
0.6389307016619147
0.6217158245955533


In [33]:
## evaluation functions

def get_ground_truth(test_output):
    """Collect the true ``skip_2`` labels of each session's second half.

    Parameters
    ----------
    test_output : DataFrame
        Must contain ``session_id``, ``skip_2``, ``session_position`` and
        ``session_length``. The frame is not modified.

    Returns
    -------
    list of list
        One list per session, in order of first appearance of the session in
        ``test_output``; each holds the ``skip_2`` values of the rows whose
        ``session_position * 2 > session_length``, in row order.
    """
    # Work on a copy so the integer cast does not leak into the caller's frame.
    df = test_output[['session_id','skip_2','session_position','session_length']].copy()
    df[['session_position','session_length']] = df[['session_position','session_length']].astype('int64')
    second_half = df.loc[df['session_position']*2 > df['session_length']]
    # Group by session id rather than walking row offsets computed from the
    # positions: a missing row then affects only its own session instead of
    # shifting every session after it.
    return [skips.tolist()
            for _, skips in second_half.groupby('session_id', sort=False)['skip_2']]


def get_submission(test_output):
    """Split the ``pred`` column into one integer array per session.

    Parameters
    ----------
    test_output : DataFrame
        Must contain ``session_id`` and ``pred``.

    Returns
    -------
    list of numpy.ndarray
        One array per session, in order of first appearance of the session;
        each holds ``pred * 1`` for that session's rows, in row order.
    """
    # A single grouped pass replaces one full-frame boolean mask per session.
    # sort=False keeps sessions in order of first appearance, like unique().
    return [np.array(preds*1)
            for _, preds in test_output.groupby('session_id', sort=False)['pred']]


def evaluate(submission,groundtruth):
    """Score per-session predictions against the per-session ground truth.

    ``submission`` and ``groundtruth`` are paired positionally. Returns a tuple
    ``(ap, first_pred_acc)``: the mean of ``ave_pre`` over all pairs, and the
    fraction of pairs whose first prediction equals the first true label.

    Raises ``Exception`` when a pair has differing lengths.
    """
    precision_total = 0.0
    first_hit_total = 0.0
    n_scored = 0
    for idx, (predicted, actual) in enumerate(zip(submission, groundtruth)):
        n_pred, n_true = len(predicted), len(actual)
        if n_pred != n_true:
            # Line numbers in the message are 1-based.
            raise Exception('Line {} should contain {} predictions, but instead contains '
                            '{}'.format(idx+1,n_true,n_pred))
        precision_total += ave_pre(predicted,actual,idx)
        first_hit_total += predicted[0] == actual[0]
        n_scored = idx + 1
    ap = precision_total/n_scored
    first_pred_acc = first_hit_total/n_scored
    return ap,first_pred_acc


def ave_pre(submission,groundtruth,counter):
    """Average precision of one session's predictions.

    Walks the predictions in order; every time a prediction equals the true
    label, the running precision (correct so far / rank) is added to a total.
    The total is divided by ``len(groundtruth)``.

    ``counter`` is only used to identify the line in the error message raised
    when a prediction is neither 0 nor 1.
    """
    correct_so_far = 0.0
    precision_sum = 0.0
    for rank, (pred, truth) in enumerate(zip(submission, groundtruth), start=1):
        if not (pred == 0 or pred == 1):
            raise Exception('Invalid prediction in line {}, should be 0 or 1'.format(counter))
        if pred == truth:
            correct_so_far += 1.0
            precision_sum += correct_so_far / rank
    return precision_sum/len(groundtruth)

def spotify_eval(y_true, y_pred, input_df):
    """Evaluate predictions for the rows of ``input_df`` labelled by ``y_true``.

    Rows are selected from ``input_df`` by the index labels of ``y_true``;
    ``y_pred`` is attached to them as the ``pred`` column. Returns the tuple
    ``(ap, first_pred_acc)`` produced by ``evaluate``.
    """
    needed_cols = ['session_id','skip_2','session_position','session_length']
    df_temp = input_df.loc[y_true.index.values, needed_cols]
    df_temp['pred'] = y_pred
    truths_per_session = get_ground_truth(df_temp)
    preds_per_session = get_submission(df_temp)
    return evaluate(preds_per_session, truths_per_session)

In [34]:
# Restrict the evaluation to the first 2000 distinct sessions (in row order).
s_list = df_lookup['session_id'].unique()[:2000]
# Boolean row mask over df_lookup for those sessions.
sel_row = df_lookup['session_id'].isin(s_list)

In [35]:
# Score the selected sessions with predictions thresholded at 0.5.
ap, first_pred_acc = spotify_eval(
    ytrue[sel_row],
    ypred[sel_row]>0.5,
    df_lookup.sort_values(['session_id','session_position']).loc[sel_row],
)

In [36]:
# Mean average precision over the evaluated sessions (first value returned by spotify_eval).
ap

0.5344512774785841

In [37]:
# Fraction of evaluated sessions whose first prediction matches the first true label.
first_pred_acc

0.7165

In [51]:
# Display the scored rows ordered by session and position within the session.
df_lookup.sort_values(['session_id','session_position'])

Unnamed: 0,session_id,track_id_clean,skip_2,session_position,session_length,clus,weight,skip_1_mean,skip_2_mean,skip_3_mean,...,tempo,valence,acoustic_vector_0,acoustic_vector_1,acoustic_vector_2,acoustic_vector_3,acoustic_vector_4,acoustic_vector_5,acoustic_vector_6,acoustic_vector_7
13,16_000018ea-e3cf-429c-b703-c0d4bf47dfd3,t_ad379e0e-9900-4f77-a987-22ab58ff534b,False,10,20,10,inf,0.666667,0.888889,1.0,...,77.442001,0.463010,-0.574928,0.405364,0.393322,-0.140053,-0.467671,0.146365,-0.253449,0.140439
0,16_000018ea-e3cf-429c-b703-c0d4bf47dfd3,t_c73a234b-75fa-467a-b4ff-c16d58542987,True,11,20,89,1.000000,0.666667,0.888889,1.0,...,167.035995,0.469646,-0.594298,0.442224,0.433947,-0.145637,-0.406807,0.116554,-0.340862,-0.015801
56,16_000018ea-e3cf-429c-b703-c0d4bf47dfd3,t_07119c40-33ec-4e54-97a5-8d463034585a,False,12,20,89,0.500000,0.666667,0.888889,1.0,...,73.510002,0.425829,-0.577018,0.417023,0.409219,-0.118601,-0.355002,0.129817,-0.272819,-0.006615
40,16_000018ea-e3cf-429c-b703-c0d4bf47dfd3,t_8e862b46-12e3-4d9f-9c61-d927cf1bec06,False,13,20,2,0.333333,0.666667,0.888889,1.0,...,72.973000,0.455954,-0.649986,0.372158,0.413511,-0.045339,-0.375019,0.094400,-0.341108,-0.004756
30,16_000018ea-e3cf-429c-b703-c0d4bf47dfd3,t_764c1d6d-8421-436f-a1d8-d6460fa17a2f,False,14,20,142,0.250000,0.666667,0.888889,1.0,...,77.814003,0.733364,-0.619060,0.417586,0.440102,-0.078921,-0.483829,0.115740,-0.238342,0.057817
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1191739,16_ffff3da9-96aa-4033-9923-eeaa80500343,t_23693da0-5ab0-455f-95db-a14bca368bf7,True,6,10,89,1.000000,1.000000,1.000000,1.0,...,133.819000,0.114424,-0.782381,0.339352,0.220222,0.048448,-0.325022,0.008673,-0.400867,0.176368
764446,16_ffff3da9-96aa-4033-9923-eeaa80500343,t_0b971195-768a-48b1-af21-d05d7db9ac5f,True,7,10,75,0.500000,1.000000,1.000000,1.0,...,152.968994,0.470737,-0.917530,0.316887,0.237263,0.130751,-0.362103,-0.051154,-0.521288,0.182506
34156,16_ffff3da9-96aa-4033-9923-eeaa80500343,t_afbf66ce-44de-46ac-a179-6fbd70d04225,True,8,10,1,0.333333,1.000000,1.000000,1.0,...,77.477997,0.081636,-0.672178,0.437069,0.380465,-0.074219,-0.306887,0.118911,-0.231941,0.108530
764447,16_ffff3da9-96aa-4033-9923-eeaa80500343,t_0b971195-768a-48b1-af21-d05d7db9ac5f,False,9,10,75,0.250000,1.000000,1.000000,1.0,...,152.968994,0.470737,-0.917530,0.316887,0.237263,0.130751,-0.362103,-0.051154,-0.521288,0.182506


In [28]:
# Number of rows that belong to the selected sessions.
len(df_lookup.loc[sel_row, 'skip_2'])

19195