In [1]:
import pandas as pd
import numpy as np
import tarfile
import io
import glob
import dask.dataframe as dd

# import xgboost as xgb
# from xgboost import XGBClassifier
from bayes_opt import BayesianOptimization
from bayes_opt.logger import JSONLogger
from bayes_opt.event import Events

import lightgbm as lgb
import warnings
warnings.simplefilter("ignore")



# Load the track-feature tables straight out of the compressed archive.
# The context manager closes the archive handle even if a member fails to parse.
with tarfile.open('../data/raw/20181120_track_features.tar.gz', 'r:gz') as tar:
    csv_files = tar.getnames()

    # NOTE(review): the members are picked by their position (2 and 4) in the
    # archive listing; presumably these are the two track-feature CSVs -- confirm
    # against the archive contents.
    tf_df_list = []
    for csv_file in [csv_files[2], csv_files[4]]:
        csv_contents = tar.extractfile(csv_file).read()
        tf_df_list.append(pd.read_csv(io.BytesIO(csv_contents), encoding='utf8'))

# One table with all track features, keyed by 'track_id_clean' (renamed from 'track_id').
tf_df = pd.concat(tf_df_list, ignore_index=True)
tf_df = tf_df.rename(columns={'track_id': 'track_id_clean'})

# Track -> cluster assignment ('clus'), keyed the same way as tf_df.
kmean300_df = pd.read_csv('../data/interim/all_data/mbKMeans300clusters.csv', usecols=['track_id', 'clus'])
kmean300_df = kmean300_df.rename(columns={'track_id': 'track_id_clean'})



In [2]:
import json

# Every optimisation log is a JSON-lines file: one JSON object per line with a
# 'target' score and a nested 'params' dict.
list_bayesOpt = glob.glob('../models/SVD/LightGBM_BayesOpt/logs_shuffle*.json')

# Collect the two best-scoring rows of every log and concatenate once at the
# end, instead of re-copying a growing frame on every iteration.
best_runs = []
for jsonFile in list_bayesOpt:
    with open(jsonFile) as f:
        # Skip blank lines (e.g. a trailing newline) rather than failing to parse them.
        optList = [json.loads(jsonObj) for jsonObj in f if jsonObj.strip()]
    if not optList:
        continue  # an empty log has nothing to contribute

    opt_df = pd.DataFrame(optList)
    # Expand the nested 'params' dicts into one column per hyper-parameter.
    params_df = pd.DataFrame(opt_df['params'].tolist(), index=opt_df.index)
    opt_df = pd.concat([opt_df.drop(columns=['params']), params_df], axis=1)
    best_runs.append(opt_df.sort_values('target', ascending=False).iloc[0:2])

# With no usable logs, fall back to an empty frame that still has a 'target'
# column so the sort below (and later cells) do not raise KeyError.
opt_best_df = pd.concat(best_runs) if best_runs else pd.DataFrame(columns=['target'])

opt_best_df.sort_values('target', ascending=False)

Unnamed: 0,target,datetime,bagging_fraction,bagging_freq,batch_size,learning_rate,nFile,num_iterations,num_leaves
21,0.740106,"{'datetime': '2022-12-09 21:35:53', 'elapsed':...",0.808382,3.260206,10.0,0.045185,40.0,438.263273,45.704734
3,0.740106,"{'datetime': '2022-12-09 21:15:21', 'elapsed':...",0.663565,1.994299,10.0,0.094274,40.0,479.436874,37.916497
14,0.740093,"{'datetime': '2022-12-10 00:08:06', 'elapsed':...",0.883442,9.491606,10.0,0.018799,90.0,219.284683,93.503262
24,0.740057,"{'datetime': '2022-12-10 00:19:16', 'elapsed':...",0.317533,5.362842,10.0,0.079636,90.0,229.71574,96.115682
24,0.739845,"{'datetime': '2022-12-09 22:12:22', 'elapsed':...",0.882175,2.103547,10.0,0.04975,50.0,422.996505,62.27869
10,0.73982,"{'datetime': '2022-12-09 21:56:38', 'elapsed':...",0.702082,1.710437,10.0,0.088671,50.0,442.575021,46.245643
0,0.739717,"{'datetime': '2022-12-10 00:24:55', 'elapsed':...",0.565568,9.522663,10.0,0.028957,100.0,717.599877,20.878224
3,0.739596,"{'datetime': '2022-12-09 23:23:16', 'elapsed':...",0.663565,1.994299,10.0,0.094274,80.0,479.436874,37.916497
21,0.739565,"{'datetime': '2022-12-09 20:34:26', 'elapsed':...",0.834545,9.822706,10.0,0.067652,20.0,484.308256,89.495379
23,0.739562,"{'datetime': '2022-12-09 23:45:58', 'elapsed':...",0.976386,1.522631,10.0,0.075943,80.0,715.380881,56.672997


In [3]:
# Mean of the numeric columns over the best runs. numeric_only=True skips the
# non-numeric 'datetime' column explicitly; pandas >= 2.0 raises a TypeError
# instead of silently dropping it.
opt_best_df.mean(numeric_only=True)

target                0.739600
bagging_fraction      0.754410
bagging_freq          4.418216
batch_size           10.000000
learning_rate         0.072205
nFile                52.631579
num_iterations      534.327795
num_leaves           57.122572
dtype: float64

In [4]:
# Median of the numeric columns over the best runs. numeric_only=True skips the
# non-numeric 'datetime' column explicitly; pandas >= 2.0 raises a TypeError
# instead of silently dropping it.
opt_best_df.median(numeric_only=True)

target                0.739562
bagging_fraction      0.702082
bagging_freq          2.103547
batch_size           10.000000
learning_rate         0.080182
nFile                50.000000
num_iterations      479.436874
num_leaves           56.672997
dtype: float64

In [5]:
def get_sim(df_hist, df_lookup, sim_file_list, score_name_list):
    """Score (session, cluster) pairs against each session's listening history.

    The history is collapsed to one row per session that holds, for every
    cluster, the net listen count: +1 for each row with ``skip_2 == False`` and
    -1 for every other row. For each similarity matrix, that session profile is
    multiplied by the matrix and divided by the matrix's column sums; the value
    found at (``session_id``, ``clus``) is stored in ``df_lookup`` under the
    corresponding score name.

    Parameters
    ----------
    df_hist : pandas.DataFrame
        Listening history with columns 'session_id', 'clus' and 'skip_2'.
        It is not modified.
    df_lookup : pandas.DataFrame
        Rows to score, with columns 'session_id' and 'clus'. One column per
        score name is added in place.
    sim_file_list : list of str
        Paths of CSV files, each read as a square similarity matrix preceded
        by an 'Unnamed: 0' column; row/column ``i`` is matched to cluster ``i``.
    score_name_list : list of str
        Output column names, paired positionally with ``sim_file_list``.

    Returns
    -------
    pandas.DataFrame
        ``df_lookup`` itself, with the score columns added.

    Raises
    ------
    KeyError
        If a 'session_id' or 'clus' in ``df_lookup`` has no entry in the
        similarity-weighted session table.
    """
    # Build the signed listen indicator on a private frame so the caller's
    # history (typically a slice of a larger frame) is left untouched.
    votes = df_hist[['session_id', 'clus']].copy()
    votes['ListenYes'] = ((df_hist['skip_2'] == False) * 2 - 1).to_numpy()

    # sessions x clusters table of net listens; unseen pairs count as 0.
    df_pivot = (votes.groupby(['session_id', 'clus'])['ListenYes']
                     .sum()
                     .unstack('clus', fill_value=0))

    row_labels = df_lookup['session_id']
    col_labels = df_lookup['clus'].astype(str)

    for sim_file, score_name in zip(sim_file_list, score_name_list):
        sim_matrix = pd.read_csv(sim_file).drop(columns=['Unnamed: 0'])
        sim_matrix.columns = list(map(str, range(0, len(sim_matrix))))

        # Clusters that never occur in this history get an all-zero column so
        # the product stays aligned with the matrix. Clusters unknown to the
        # matrix still make `dot` raise, as before.
        profile = df_pivot.reindex(columns=df_pivot.columns.union(sim_matrix.index),
                                   fill_value=0)
        df_sim_session = profile.dot(sim_matrix) / sim_matrix.sum()

        # Label-based pairwise pick (replacement for the removed DataFrame.lookup).
        row_pos = df_sim_session.index.get_indexer(row_labels)
        col_pos = df_sim_session.columns.get_indexer(col_labels)
        if (row_pos < 0).any():
            raise KeyError('One or more row labels was not found')
        if (col_pos < 0).any():
            raise KeyError('One or more column labels was not found')
        df_lookup[score_name] = df_sim_session.to_numpy()[row_pos, col_pos]

    return df_lookup

In [6]:
# Training log CSVs whose file name starts with log_0 ... log_8, gathered one
# prefix digit at a time.
# NOTE(review): glob returns matches in an unspecified order, while the training
# cell resumes by position in this list -- confirm the order is stable across runs.
file_list = [
    log_path
    for n in range(9)
    for log_path in glob.glob('../data/raw/training_set/log_'+str(n)+'*.csv')
]
    

In [7]:
# Columns that the training cell averages per session (groupby 'session_id' -> mean).
# Grouped by theme; the concatenation order matches the original flat list.

# Plain log columns.
_playback_cols = [
    'skip_1', 'skip_2', 'skip_3', 'not_skipped', 'context_switch', 'hour_of_day', 'premium',
    'no_pause_before_play', 'short_pause_before_play', 'long_pause_before_play',
    'hist_user_behavior_n_seekfwd', 'hist_user_behavior_n_seekback',
    'hist_user_behavior_is_shuffle',
]

# One-hot columns for 'hist_user_behavior_reason_end'.
_reason_end_cols = [
    'hist_user_behavior_reason_end_backbtn',
    'hist_user_behavior_reason_end_clickrow',
    'hist_user_behavior_reason_end_endplay',
    'hist_user_behavior_reason_end_fwdbtn',
    'hist_user_behavior_reason_end_logout',
    'hist_user_behavior_reason_end_remote',
    'hist_user_behavior_reason_end_trackdone',
]

# One-hot columns for 'hist_user_behavior_reason_start'.
_reason_start_cols = [
    'hist_user_behavior_reason_start_appload',
    'hist_user_behavior_reason_start_backbtn',
    'hist_user_behavior_reason_start_clickrow',
    'hist_user_behavior_reason_start_endplay',
    'hist_user_behavior_reason_start_fwdbtn',
    'hist_user_behavior_reason_start_playbtn',
    'hist_user_behavior_reason_start_remote',
    'hist_user_behavior_reason_start_trackdone',
    'hist_user_behavior_reason_start_trackerror',
]

# One-hot columns for 'context_type'.
_context_type_cols = [
    'context_type_catalog',
    'context_type_charts',
    'context_type_editorial_playlist',
    'context_type_personalized_playlist',
    'context_type_radio',
    'context_type_user_collection',
]

col_FA = [*_playback_cols, *_reason_end_cols, *_reason_start_cols, *_context_type_cols]


In [None]:
import random
from timeit import default_timer as timer #to see how long the computation will take

# Incremental LightGBM training over the session logs, `batch_size` files at a
# time. For every file the first half of each session is summarised into
# per-session features, the first two tracks of the second half become training
# rows, and the booster is continued from the checkpoint saved by the previous batch.

nFile = 220       # position in file_list to resume from; the checkpoint 'boost220.txt' must already exist
batch_size = 10

# --- loop invariants (hoisted out of the per-file loop) -----------------------
sim_file_list = ['../models/SVD/similarity/k300_CanbDist.csv',
                 '../models/SVD/similarity/k300_CosSim.csv',
                 '../models/SVD/similarity/k300_LinCorr.csv',
                 '../models/SVD/similarity/k300_ManhDist.csv',
                 '../models/SVD/similarity/k300_HammDist.csv',
                 '../models/SVD/similarity/k300_SpearCorr.csv',
                 '../models/SVD/similarity/k300_KendCorr.csv']
score_name_list = ['CanbDist300', 'CosSim300','LinCorr300','ManhDist300','HammDist300','SpearCorr300','KendCorr300']

# Columns kept for the second-half rows that are scored and trained on.
second_half_cols = ['session_id','track_id_clean','skip_2','session_position','session_length','clus']

model_prefix = '../models/SVD/LightGBM_BayesOpt/LightGBM_incremental_training/boost'

# 'num_iterations' is an alias of lgb.train's num_boost_round and takes
# precedence over that argument, so each batch adds 440 boosting rounds. The
# num_boost_round=100 that used to be passed alongside was ignored and has been
# dropped to avoid suggesting otherwise.
params = {'num_leaves': 45,
          'learning_rate':0.07,
          'metric': 'binary_error',
          'num_iterations':440,
          'bagging_fraction':0.72,
          'bagging_freq':3,
          'objective': 'binary',
          'force_row_wise': True,
          'num_threads': 5,
          'verbosity': 0,
          'tree_learner': 'data'} #https://lightgbm.readthedocs.io/en/latest/Parallel-Learning-Guide.html

while nFile < len(file_list):
    start = timer()
    nFile += batch_size
    df_lookup_list = []
    for file in file_list[(nFile-batch_size):min(nFile, len(file_list))]:

        log_df = pd.read_csv(file)
        # Attach the cluster id of every track (merge on the shared column(s);
        # presumably just 'track_id_clean' -- confirm).
        log_df = log_df.merge(kmean300_df)

        # ---- first half of each session: behaviour summary + listening history ----
        # .copy() so the column assignments below write to an independent frame
        # instead of a slice of log_df.
        log_df_1 = log_df.loc[log_df['session_position']<(log_df['session_length']/2)].copy()
        log_df_1['hour_of_day'] = log_df_1['hour_of_day'].astype('float')
        log_df_1['premium'] = log_df_1['premium'].astype('bool')
        log_df_1['weekday'] = log_df_1['date'].astype('datetime64[ns]').dt.dayofweek
        log_df_1 = log_df_1.drop(columns = ['date'])
        log_df_1 = pd.get_dummies(log_df_1, columns=['hist_user_behavior_reason_end', 'hist_user_behavior_reason_start', 'context_type'])
        # Per-session mean of every col_FA column; resulting names are '<column>_mean'.
        log_df_1_summary = log_df_1.groupby(['session_id'])[col_FA].agg(['mean'])
        log_df_1_summary.columns = log_df_1_summary.columns.get_level_values(0)+'_'+log_df_1_summary.columns.get_level_values(1)
        # Independent copy: get_sim receives it and must not touch log_df_1.
        log_df_history = log_df_1[['session_id','track_id_clean','skip_2','clus']].copy()


        half_cut = log_df['session_length']/2

        # need to at least include 2 trials, otherwise the log_df_1_summary will confound with all the tracks in the same session

        #1st trial in the 2nd half
        log_df_2_1 = log_df.loc[(log_df['session_position']>=half_cut) & (log_df['session_position']<half_cut+1)]
        log_df_2_1 = log_df_2_1[second_half_cols].copy()
        log_df_2_1['weight'] = 1

        #2nd trial in the 2nd half (down-weighted)
        log_df_2_2 = log_df.loc[(log_df['session_position']>=half_cut+1) & (log_df['session_position']<half_cut+2)]
        log_df_2_2 = log_df_2_2[second_half_cols].copy()
        log_df_2_2['weight'] = 0.5

        log_df_2 = pd.concat([log_df_2_1,log_df_2_2])
        log_df_2 = log_df_2.merge(log_df_1_summary, on='session_id')

        # Add one similarity score column per matrix to the training rows.
        df_lookup_list.append(get_sim(log_df_history, log_df_2, sim_file_list, score_name_list))


    df_lookup = pd.concat(df_lookup_list)
    # Attach the audio features of every candidate track.
    df_lookup = df_lookup.merge(tf_df)
    df_lookup = df_lookup.drop(columns = ['key','time_signature','mode'])

    # NOTE(review): 'weight' stays in the feature matrix while also being used
    # as the sample weight. Left as is because changing the feature set may
    # break continuation from the saved checkpoints -- confirm whether intended.
    dtrain = lgb.Dataset(df_lookup.drop(columns = ['session_id','track_id_clean','skip_2']),
                         label=df_lookup['skip_2'],
                         weight = df_lookup['weight'])

    if nFile == batch_size:
        # very first batch: start a fresh model
        bst = lgb.train(params, dtrain)
    else: # continue training on the previous model
        bst = lgb.train(params, dtrain, init_model=model_prefix+str(int(nFile-batch_size))+'.txt')

    bst.save_model(model_prefix+str(int(nFile))+'.txt')

    print('Runtime per batch: %0.2fs' % (timer() - start))

Runtime per batch: 1257.38s
Runtime per batch: 1116.35s
Runtime per batch: 1311.74s
Runtime per batch: 1366.48s
Runtime per batch: 1274.30s
Runtime per batch: 1293.05s
Runtime per batch: 1517.06s
Runtime per batch: 2278.90s
Runtime per batch: 2680.02s
Runtime per batch: 1501.70s
Runtime per batch: 1490.85s
Runtime per batch: 1509.07s
Runtime per batch: 1644.65s
Runtime per batch: 1726.37s
