In [1]:
import pandas as pd
import numpy as np
import tarfile
import io
import glob
import dask.dataframe as dd

# import xgboost as xgb
# from xgboost import XGBClassifier
from bayes_opt import BayesianOptimization
from bayes_opt.logger import JSONLogger
from bayes_opt.event import Events

import lightgbm as lgb
import warnings
warnings.simplefilter("ignore")



# Load the track-level audio features from the gzipped tar archive and the
# pre-computed cluster assignment for every track.
# The archive is opened in a context manager so the file handle is released
# even if reading one of the members fails.
with tarfile.open('../data/raw/20181120_track_features.tar.gz', 'r:gz') as tar:
    csv_files = tar.getnames()

    tf_df_list = []

    # NOTE(review): members 2 and 4 are presumably the two track-feature CSVs;
    # the positions depend on how the archive was packed -- confirm against csv_files.
    for csv_file in [csv_files[2], csv_files[4]]:
        member = tar.extractfile(csv_file)
        if member is None:
            # extractfile() returns None for members that are not regular files
            raise ValueError('Archive member is not a readable file: ' + str(csv_file))
        csv_contents = member.read()
        tf_df_list.append(pd.read_csv(io.BytesIO(csv_contents), encoding='utf8'))

# One frame with the features of all tracks; the id column is renamed so it
# matches the `track_id_clean` key used by the session logs.
tf_df = pd.concat(tf_df_list, ignore_index=True)
tf_df.rename(columns={'track_id':'track_id_clean'}, inplace=True)

# Cluster label (`clus`) per track, keyed the same way as tf_df.
kmean300_df = pd.read_csv('../data/interim/all_data/mbKMeans300clusters.csv', usecols=['track_id','clus'])
kmean300_df.rename(columns={'track_id':'track_id_clean'}, inplace=True)



In [16]:
import json

# Bayesian-optimisation logs: one JSON object per line, each holding the
# `target` score and the `params` dict of a single evaluation.
list_bayesOpt = glob.glob('../models/SVD/LightGBM_BayesOpt_dec17/for20180918/logs_shuffle10.json')

# Collect the five best evaluations of every log file in a list and
# concatenate once at the end (instead of growing a DataFrame inside the loop).
best_runs = []
for jsonFile in list_bayesOpt:
    with open(jsonFile) as f:
        # skip blank lines so a stray empty line does not break json.loads
        optList = [json.loads(jsonObj) for jsonObj in f if jsonObj.strip()]

    if not optList:
        # an empty log has no 'params'/'target' columns to work with
        continue

    opt_df = pd.DataFrame(optList)
    # expand the nested `params` dict into one column per hyper-parameter
    opt_df = pd.concat([opt_df.drop(['params'], axis=1), opt_df['params'].apply(pd.Series)], axis=1)
    best_runs.append(opt_df.sort_values('target',ascending=False).iloc[0:5])

opt_best_df = pd.concat(best_runs) if best_runs else pd.DataFrame()

# Best evaluations across all logs, highest target first.
opt_best_df.sort_values('target',ascending=False)

Unnamed: 0,target,datetime,bagging_fraction,bagging_freq,batch_size,feature_fraction,learning_rate,min_gain_to_split,nFile,num_iterations,num_leaves
20,0.720543,"{'datetime': '2022-12-20 20:47:08', 'elapsed':...",0.8,2.0,10.0,0.731797,0.15348,0.089333,10.0,850.978738,25.156269
33,0.72036,"{'datetime': '2022-12-21 03:42:12', 'elapsed':...",0.8,2.0,10.0,0.435503,0.226245,0.088026,10.0,859.312532,29.996526
24,0.720333,"{'datetime': '2022-12-20 22:40:18', 'elapsed':...",0.8,2.0,10.0,0.798139,0.153018,0.028336,10.0,842.720473,25.606681
15,0.720302,"{'datetime': '2022-12-20 18:55:13', 'elapsed':...",0.8,2.0,10.0,0.673718,0.346856,0.013443,10.0,842.212623,27.897216
27,0.720035,"{'datetime': '2022-12-21 00:07:55', 'elapsed':...",0.8,2.0,10.0,0.728087,0.355348,0.078096,10.0,858.86716,27.280231


In [3]:
# Average of the best runs. numeric_only=True is required because the frame
# still carries the dict-valued `datetime` column; pandas >= 2.0 no longer
# drops non-numeric columns silently and would raise a TypeError here.
opt_best_df.mean(numeric_only=True)

target                 0.707934
bagging_fraction       0.873855
bagging_freq           1.954484
batch_size            10.000000
feature_fraction       0.711096
learning_rate          0.182898
min_gain_to_split      0.281763
nFile                 15.000000
num_iterations       828.763990
num_leaves            26.770270
dtype: float64

In [4]:
# Median of the best runs. numeric_only=True is required because the frame
# still carries the dict-valued `datetime` column; pandas >= 2.0 no longer
# drops non-numeric columns silently and would raise a TypeError here.
opt_best_df.median(numeric_only=True)

target                 0.708171
bagging_fraction       0.881030
bagging_freq           1.358203
batch_size            10.000000
feature_fraction       0.737053
learning_rate          0.186463
min_gain_to_split      0.322700
nFile                 15.000000
num_iterations       908.125834
num_leaves            28.884808
dtype: float64

In [5]:
def get_sim(df_hist, df_lookup, sim_file_list, score_name_list):
    """Score every (session, cluster) pair in ``df_lookup`` against the session history.

    For each session the history is condensed into a vector with one entry per
    cluster: +1 for every history track that was listened to (``skip_2`` is
    False) and -1 for every skipped one. That vector is multiplied with each
    cluster-similarity matrix and divided by the matrix's column sums; the
    value at the cluster of the looked-up track becomes the score.

    Parameters
    ----------
    df_hist : DataFrame
        History rows with at least ``session_id``, ``clus`` and ``skip_2``.
        It is not modified.
    df_lookup : DataFrame
        Rows to score, with at least ``session_id`` and ``clus``. One column per
        entry of ``score_name_list`` is added to this frame in place.
    sim_file_list : list of str
        CSV files holding square cluster-similarity matrices. Each file must
        contain an ``Unnamed: 0`` column (the saved index), which is dropped.
    score_name_list : list of str
        Name of the output column for the matching entry of ``sim_file_list``.

    Returns
    -------
    DataFrame
        ``df_lookup`` with the score columns added.

    Raises
    ------
    ValueError
        If the history contains a cluster label that is not a row of the
        similarity matrix.
    KeyError
        If a session or cluster in ``df_lookup`` has no entry in the scores.
    """
    # +1 for a listened track, -1 for a skipped one; computed on a derived
    # Series so the caller's frame is left untouched.
    listen_score = (df_hist['skip_2'] == False) * 2 - 1
    net_listens = (
        df_hist.assign(ListenYes=listen_score)
        .groupby(['session_id', 'clus'], as_index=False)['ListenYes']
        .sum()
    )
    # sessions x clusters; a cluster never played in a session counts as 0
    df_pivot = pd.pivot_table(net_listens, values='ListenYes', index='session_id', columns='clus')
    df_pivot = df_pivot.fillna(0)

    for sim_file, score_name in zip(sim_file_list, score_name_list):
        sim_matrix = pd.read_csv(sim_file).drop(columns=['Unnamed: 0'])
        sim_matrix.columns = list(map(str, range(0, len(sim_matrix))))

        # Clusters absent from this batch of history get an all-zero column so
        # the product is always defined; clusters unknown to the matrix are an error.
        unknown = df_pivot.columns.difference(sim_matrix.index)
        if len(unknown) > 0:
            raise ValueError('matrices are not aligned: clusters %s are missing from %s'
                             % (list(unknown), sim_file))
        aligned = df_pivot.reindex(columns=sim_matrix.index, fill_value=0)

        df_sim_session = aligned.dot(sim_matrix) / sim_matrix.sum()

        # Label-based element lookup (DataFrame.lookup was removed in pandas 2.0).
        row_pos = df_sim_session.index.get_indexer(df_lookup['session_id'])
        col_pos = df_sim_session.columns.get_indexer(df_lookup['clus'].astype(str))
        if (row_pos == -1).any():
            raise KeyError('One or more row labels was not found')
        if (col_pos == -1).any():
            raise KeyError('One or more column labels was not found')
        df_lookup[score_name] = df_sim_session.to_numpy()[row_pos, col_pos]

    return df_lookup

In [6]:
import random

# Training logs are consumed newest date first. Within each date the ten log
# files are shuffled with a fixed seed, so the order is reproducible and every
# date uses the same permutation.
TRAINING_LOG_DATES = ['20180917', '20180916', '20180915', '20180914',
                      '20180913', '20180912', '20180911']
N_LOGS_PER_DATE = 10
SHUFFLE_SEED = 23

file_list = []
for log_date in TRAINING_LOG_DATES:
    temp_list = ['../data/raw/training_set/log_'+str(logN)+'_'+log_date+'_000000000000.csv'
                 for logN in range(N_LOGS_PER_DATE)]
    # a fresh Random instance per date keeps the permutation identical across dates
    random.Random(SHUFFLE_SEED).shuffle(temp_list)
    file_list += temp_list

file_list

['../data/raw/training_set/log_7_20180917_000000000000.csv',
 '../data/raw/training_set/log_8_20180917_000000000000.csv',
 '../data/raw/training_set/log_5_20180917_000000000000.csv',
 '../data/raw/training_set/log_6_20180917_000000000000.csv',
 '../data/raw/training_set/log_3_20180917_000000000000.csv',
 '../data/raw/training_set/log_2_20180917_000000000000.csv',
 '../data/raw/training_set/log_9_20180917_000000000000.csv',
 '../data/raw/training_set/log_0_20180917_000000000000.csv',
 '../data/raw/training_set/log_1_20180917_000000000000.csv',
 '../data/raw/training_set/log_4_20180917_000000000000.csv',
 '../data/raw/training_set/log_7_20180916_000000000000.csv',
 '../data/raw/training_set/log_8_20180916_000000000000.csv',
 '../data/raw/training_set/log_5_20180916_000000000000.csv',
 '../data/raw/training_set/log_6_20180916_000000000000.csv',
 '../data/raw/training_set/log_3_20180916_000000000000.csv',
 '../data/raw/training_set/log_2_20180916_000000000000.csv',
 '../data/raw/training_s

In [7]:
# Sanity check on the queue of training logs: 7 dates x 10 log files per date.
len(file_list)

70

In [8]:
def _summarise_history(history, skipped, col_bool, col_nonbool):
    """Per-session statistics of the history rows whose ``skip_2`` equals ``skipped``.

    Boolean columns are averaged; the remaining feature columns get mean, std
    and median. Output columns are named ``<column>_sk2<skipped>_<statistic>``.
    """
    grouped = history.loc[history['skip_2'] == skipped].groupby(['session_id'])
    summary = pd.concat([grouped[col_bool].agg(['mean']),
                         grouped[col_nonbool].agg(['mean', 'std', 'median'])],
                        axis=1)
    summary.columns = (summary.columns.get_level_values(0)
                       + '_sk2' + str(skipped) + '_'
                       + summary.columns.get_level_values(1))
    return summary


def _weighted_second_half(log_df, half_cut):
    """Rows from the second half of every session, with a per-position sample weight.

    The weight decreases with the distance of the track from the session
    midpoint. Rows are returned bucket by bucket (all 1st tracks, then all 2nd
    tracks, ...), each bucket in the original row order.
    """
    target_cols = ['session_id','track_id_clean','skip_2','session_position','session_length','clus']
    position = log_df['session_position']
    # (lower offset, upper offset, weight): a row belongs to a bucket when
    # half_cut + lower < session_position <= half_cut + upper;
    # upper=None means "everything beyond".
    weight_buckets = [(0, 1, 1),       # 1st trial in the 2nd half
                      (1, 2, 0.75),    # 2nd trial
                      (2, 3, 0.62),    # 3rd trial
                      (3, 4, 0.53),    # 4th trial
                      (4, 5, 0.47),    # 5th trial
                      (5, None, 0.35)] # remaining trials
    parts = []
    for lower, upper, weight in weight_buckets:
        in_bucket = position > half_cut + lower
        if upper is not None:
            in_bucket = in_bucket & (position <= half_cut + upper)
        parts.append(log_df.loc[in_bucket, target_cols].assign(weight=weight))
    return pd.concat(parts)


def prep_dfs(file, tf_df):
    """Build the training rows for one session-log file.

    The first half of every session is treated as known history and condensed
    into per-session summary features, computed separately for skipped and
    non-skipped tracks. The second half supplies the rows to predict; each gets
    a sample weight by position, the history summaries of its session, and the
    cluster-similarity scores from ``get_sim``.

    Parameters
    ----------
    file : str
        Path of the session-log CSV.
    tf_df : DataFrame
        Track features keyed by ``track_id_clean``.

    Returns
    -------
    DataFrame
        One row per second-half track. Because both summary merges are inner
        joins, sessions without at least one skipped AND one non-skipped track
        in their first half are dropped.

    Notes
    -----
    Relies on the notebook-level ``kmean300_df`` (track -> cluster) and on
    ``get_sim``.
    """
    log_df = pd.read_csv(file)
    log_df = log_df.merge(kmean300_df)

    half_cut = log_df['session_length']/2

    # ---- first half of each session: the known history ----
    # .copy() so the column assignments below act on an independent frame
    log_df_1 = log_df.loc[log_df['session_position'] <= half_cut].copy()
    log_df_1['hour_of_day'] = log_df_1['hour_of_day'].astype('float')
    log_df_1['premium'] = log_df_1['premium'].astype('bool')
    # `date` is dropped; a weekday feature derived from it was tried and disabled
    log_df_1 = log_df_1.drop(columns = ['date'])
    log_df_1 = pd.get_dummies(log_df_1, columns=['hist_user_behavior_reason_end', 'hist_user_behavior_reason_start', 'context_type'], dtype = 'bool')
    log_df_1 = log_df_1.merge(tf_df.drop(columns = ['time_signature','mode','key']))

    col_bool = log_df_1.columns[log_df_1.dtypes=='bool']
    col_nonbool = log_df_1.columns[log_df_1.dtypes!='bool'].drop(['session_id','track_id_clean','clus'])

    # the non-convertable values will be set to 0
    log_df_1[col_nonbool] = log_df_1[col_nonbool].apply(pd.to_numeric, errors='coerce', downcast = 'float').fillna(0).astype('float32')

    # aggregate the track history separately for skipped and non-skipped tracks
    log_df_1_summary_skip2True = _summarise_history(log_df_1, True, col_bool, col_nonbool)
    log_df_1_summary_skip2False = _summarise_history(log_df_1, False, col_bool, col_nonbool)

    # independent copy: get_sim derives extra columns from the history
    log_df_history = log_df_1[['session_id','track_id_clean','skip_2','clus']].copy()

    # ---- second half of each session: the rows to predict ----
    # need to at least include 2 trials, otherwise the log_df_1_summary will confound with all the tracks in the same session
    log_df_2 = _weighted_second_half(log_df, half_cut)
    log_df_2 = log_df_2.merge(log_df_1_summary_skip2True, on='session_id')
    log_df_2 = log_df_2.merge(log_df_1_summary_skip2False, on='session_id')

    sim_file_list = ['../models/SVD/all_tracks/similarity_for20180918/k300_CanbDist.csv',
                     '../models/SVD/all_tracks/similarity_for20180918/k300_CosSim.csv',
                     '../models/SVD/all_tracks/similarity_for20180918/k300_LinCorr.csv',
                     '../models/SVD/all_tracks/similarity_for20180918/k300_ManhDist.csv',
                     '../models/SVD/all_tracks/similarity_for20180918/k300_HammDist.csv',
                     '../models/SVD/all_tracks/similarity_for20180918/k300_SpearCorr.csv',
                     '../models/SVD/all_tracks/similarity_for20180918/k300_KendCorr.csv',
                     '../models/SVD/all_tracks/similarity_for20180918/k300_ChebDist.csv',
                     '../models/SVD/all_tracks/similarity_for20180918/k300_BrayDist.csv']
    # NOTE: the last two names lack the '300' suffix; they are kept verbatim
    # because they become feature names of the saved models.
    score_name_list = ['CanbDist300', 'CosSim300','LinCorr300','ManhDist300','HammDist300','SpearCorr300','KendCorr300','ChebDist','BrayDist']

    return get_sim(log_df_history, log_df_2, sim_file_list, score_name_list)

In [9]:
import random
from timeit import default_timer as timer #to see how long the computation will take

# Incremental LightGBM training: the log files are consumed `batch_size` at a
# time; every batch continues from the model saved by the previous batch and
# saves a new model named after the number of files seen so far.
MODEL_PATH_TEMPLATE = '../models/SVD/LightGBM_BayesOpt_dec17/for20180918/boost_alltracks_incrementalTrain_{n_files}_dec23.txt'
# The model of the first batch (5 files) defines the feature schema of the whole chain.
REFERENCE_MODEL_PATH = MODEL_PATH_TEMPLATE.format(n_files=5)
NON_FEATURE_COLS = ['session_id','track_id_clean','skip_2','weight']

# Hyper-parameters rounded from the best Bayesian-optimisation run shown earlier in the notebook.
params = {'num_leaves': 25,
          'learning_rate':0.15,
          'metric': 'binary_error',
          'num_iterations':851,
          'bagging_fraction':0.8,
          'bagging_freq':2,
          'feature_fraction':0.73,
          'min_gain_to_split':0.09,
          'objective': 'binary',
          'force_row_wise': True,
          'num_threads': 5,
          'verbosity': 0,
          'tree_learner': 'voting_parallel'} #https://lightgbm.readthedocs.io/en/latest/Parallel-Learning-Guide.html

nFile = 65      # number of files already trained on; set to 0 to train from scratch
batch_size = 5
while nFile < len(file_list):
    start = timer()
    nFile += batch_size
    df_lookup_list = []
    for file in file_list[(nFile-batch_size):min(nFile, len(file_list))]:
        df_lookup_list.append(prep_dfs(file, tf_df))

    df_lookup = pd.concat(df_lookup_list)
    df_lookup = df_lookup.merge(tf_df.drop(columns = ['key','time_signature','mode']))

    features = df_lookup.drop(columns = NON_FEATURE_COLS)

    # Make the feature matrix match the previous training set.
    # reindex adds missing columns (filled with 0), drops extra ones AND puts
    # the columns in the reference order. The order matters: the Dataset is
    # built from the columns as they appear in the frame, so a column appended
    # at the end would be fed to the continued model in the wrong position.
    if nFile>batch_size:
        prev_feature_names = lgb.Booster(model_file=REFERENCE_MODEL_PATH).feature_name()
        features = features.reindex(columns = prev_feature_names, fill_value = 0)

    dtrain = lgb.Dataset(features.astype('float32'),
                     label=df_lookup['skip_2'],
                     weight = df_lookup['weight'],
                     free_raw_data=False) # https://lightgbm.readthedocs.io/en/latest/FAQ.html#error-messages-cannot-before-construct-dataset

    if nFile == batch_size:
        bst = lgb.train(params, dtrain)
    else: # continue training on the previous model
        bst = lgb.train(params, dtrain, init_model=MODEL_PATH_TEMPLATE.format(n_files=int(nFile-batch_size)))

    bst.save_model(MODEL_PATH_TEMPLATE.format(n_files=int(nFile)))

    print('Runtime per batch: %0.2fs' % (timer() - start))

Runtime per batch: 2644.98s
