In [1]:
import pandas as pd
import numpy as np
import tarfile
import io
import glob
import dask.dataframe as dd

# import xgboost as xgb
# from xgboost import XGBClassifier
from bayes_opt import BayesianOptimization
from bayes_opt.logger import JSONLogger
from bayes_opt.event import Events

import lightgbm as lgb
import warnings
warnings.simplefilter("ignore")



# Load the track-feature tables straight from the raw archive. The context
# manager guarantees the tar handle is closed once the members are parsed
# (the handle was previously left open for the lifetime of the kernel).
with tarfile.open('../data/raw/20181120_track_features.tar.gz', 'r:gz') as tar:
    csv_files = tar.getnames()

    tf_df_list = []

    # NOTE(review): the members are selected by position (2 and 4), which
    # depends on the archive's internal ordering -- presumably these are the
    # two track-feature CSVs; confirm against tar.getnames().
    for csv_file in [csv_files[2], csv_files[4]]:
        csv_contents = tar.extractfile(csv_file).read()
        tf_df_list.append(pd.read_csv(io.BytesIO(csv_contents), encoding='utf8'))

# One table of track features, keyed like the session logs (track_id_clean).
tf_df = (pd.concat(tf_df_list, ignore_index=True)
           .rename(columns={'track_id': 'track_id_clean'}))

# Cluster label ('clus') per track, renamed to the same join key.
kmean300_df = (pd.read_csv('../data/interim/all_data/mbKMeans300clusters.csv',
                           usecols=['track_id', 'clus'])
                 .rename(columns={'track_id': 'track_id_clean'}))



In [2]:
def get_sim(df_hist, df_lookup, sim_file_list, score_name_list):
    """Add cluster-similarity scores to ``df_lookup`` based on session history.

    Every history row contributes +1 to its (session_id, clus) cell when
    ``skip_2`` is False and -1 otherwise. The resulting session x cluster
    matrix is multiplied by each similarity matrix and divided by that
    matrix's column sums; the value at (session_id, clus) of every
    ``df_lookup`` row is then stored in the matching score column.

    Parameters
    ----------
    df_hist : DataFrame
        History rows with columns 'session_id', 'clus' and 'skip_2'.
        It is not modified.
    df_lookup : DataFrame
        Rows to score, with columns 'session_id' and 'clus'. One column per
        entry of ``score_name_list`` is added in place.
    sim_file_list : list of str
        CSV files holding square similarity matrices that were written with
        an index column (read back as 'Unnamed: 0'); row/column i is
        treated as cluster i.
    score_name_list : list of str
        Output column names, paired positionally with ``sim_file_list``.

    Returns
    -------
    DataFrame
        ``df_lookup`` itself, with the score columns added.

    Raises
    ------
    KeyError
        If a ``df_lookup`` row refers to a session that has no history or to
        a cluster that is not covered by the similarity matrix.
    """
    # Build the +1 / -1 signal in a private frame so the caller's df_hist is
    # left untouched and no chained in-place replace is needed (that pattern
    # is a silent no-op under pandas Copy-on-Write).
    hist = pd.DataFrame({
        'session_id': df_hist['session_id'].to_numpy(),
        'clus': df_hist['clus'].to_numpy(),
        'ListenYes': np.where((df_hist['skip_2'] == False).to_numpy(), 1, -1),
    })
    df_pivot = (hist.groupby(['session_id', 'clus'])['ListenYes']
                    .sum()
                    .unstack(fill_value=0))

    for sim_file, score_name in zip(sim_file_list, score_name_list):
        sim_matrix = pd.read_csv(sim_file).drop(columns=['Unnamed: 0'])
        sim_matrix.columns = [str(i) for i in range(len(sim_matrix))]

        # Clusters that never occur in the history get an explicit zero
        # column; otherwise DataFrame.dot refuses the product as misaligned.
        # Clusters unknown to the similarity matrix still raise in dot().
        all_clusters = df_pivot.columns.union(sim_matrix.index)
        aligned = df_pivot.reindex(columns=all_clusters, fill_value=0)
        df_sim_session = aligned.dot(sim_matrix) / sim_matrix.sum()

        # Positional replacement for DataFrame.lookup (removed in pandas 2.0).
        row_pos = df_sim_session.index.get_indexer(df_lookup['session_id'])
        col_pos = df_sim_session.columns.get_indexer(df_lookup['clus'].astype(str))
        if (row_pos < 0).any() or (col_pos < 0).any():
            raise KeyError('One or more (session_id, clus) pairs in df_lookup '
                           'have no similarity score')
        df_lookup[score_name] = df_sim_session.to_numpy()[row_pos, col_pos]

    return df_lookup

In [4]:
# Gather every training log whose file name starts with log_0 ... log_8.
file_list = []
for n in range(9):
    pattern = '../data/raw/training_set/log_' + str(n) + '*.csv'
    file_list.extend(glob.glob(pattern))
    

In [None]:
import random
from timeit import default_timer as timer  # to see how long the computation will take

# The similarity matrices and the feature name each one produces are the same
# for every log file, so they are defined once instead of once per file.
sim_file_list = ['../models/SVD/similarity/k300_CanbDist.csv',
                 '../models/SVD/similarity/k300_CosSim.csv',
                 '../models/SVD/similarity/k300_LinCorr.csv',
                 '../models/SVD/similarity/k300_ManhDist.csv',
                 '../models/SVD/similarity/k300_HammDist.csv',
                 '../models/SVD/similarity/k300_SpearCorr.csv',
                 '../models/SVD/similarity/k300_KendCorr.csv']
score_name_list = ['CanbDist300', 'CosSim300', 'LinCorr300', 'ManhDist300',
                   'HammDist300', 'SpearCorr300', 'KendCorr300']

# Ten independent tuning rounds, each on a fresh random sample of log files.
for nShuffle in range(10):
    df_lookup_list = []
    random.shuffle(file_list) # randomly sampled 30 files to do parameter tuning
    for file in file_list[0:30]:
        log_df = pd.read_csv(file)

        log_df = log_df[['session_id','track_id_clean','skip_2','session_position','session_length','hour_of_day','premium']].merge(kmean300_df)

        half_cut = log_df['session_length']/2

        # First half of every session: the listening history handed to get_sim.
        # .copy() keeps the frame independent of log_df.
        log_df_1 = log_df.loc[log_df['session_position'] < half_cut].copy()

        # as the entire dataset will be too big to train,
        # training on the first track of the 2nd half (prediction set) should be enough,
        # as this track contributes most to the spotify metric
        # .copy() so that adding 'weight' below writes to an independent frame
        # rather than to a slice of log_df.
        log_df_2 = log_df.loc[(log_df['session_position']>=half_cut) & (log_df['session_position']<half_cut+1)].copy()
        log_df_2['weight'] = 1

        df_lookup_list.append(get_sim(log_df_1, log_df_2, sim_file_list, score_name_list))


    df_lookup = pd.concat(df_lookup_list)
    df_lookup = df_lookup.merge(tf_df)
    df_lookup = pd.get_dummies(df_lookup, columns=['key','time_signature','mode'])

    # 'weight' is the per-row sample weight, not a predictor, so it is passed
    # via `weight=` only and kept out of the feature matrix.
    dtrain = lgb.Dataset(df_lookup.drop(columns = ['session_id','track_id_clean','skip_2','weight']),
                         label=df_lookup['skip_2'],
                         weight = df_lookup['weight'])

    def bo_tune_lgb(num_leaves, learning_rate, num_iterations):
        """Objective for the optimiser: 1 - mean 4-fold CV binary error on `dtrain`.

        The optimiser proposes floats, so the integer-valued parameters are
        truncated with int() before they are given to LightGBM.
        """
        # NOTE(review): 'num_iterations' is a LightGBM alias for the number of
        # boosting rounds, so it presumably overrides num_boost_round=100
        # below -- confirm against the LightGBM parameter docs.
        params = {'num_leaves': int(num_leaves),
                  'learning_rate':learning_rate,
                  'metric': 'binary_error',
                  'num_iterations':int(num_iterations),
                  'early_stopping_round':5,
                  'objective': 'binary',
                  'force_row_wise': True,
                  'num_threads': 4,
                  'verbosity': 0}
        cv_result = lgb.cv(params, dtrain, num_boost_round=100, nfold=4)
        # LightGBM 4.x prefixes the history keys with "valid "; accept both spellings.
        error_key = ('binary_error-mean' if 'binary_error-mean' in cv_result
                     else 'valid binary_error-mean')
        return 1-cv_result[error_key][-1]

    # learning_rate must be strictly positive for LightGBM, so the lower bound
    # is a small positive value instead of 0 (the optimiser can propose a
    # point exactly on a bound).
    lgb_bo = BayesianOptimization(bo_tune_lgb, {'num_leaves': (5, 40),
                                                'learning_rate':(0.01,1),
                                                'num_iterations': (10,100)
                                                })

    # One JSON log per shuffle round.
    logger = JSONLogger(path='../models/SVD/LightGBM_BayesOpt/logs_shuffle'+str(nShuffle)+'.json')
    lgb_bo.subscribe(Events.OPTIMIZATION_STEP, logger)

    start = timer()
    lgb_bo.maximize(n_iter=1, init_points=1)
    print('Runtime: %0.2fs' % (timer() - start))

In [61]:


def bo_tune_lgb(num_leaves, learning_rate, num_iterations):
    """Objective for the optimiser: 1 - mean 4-fold CV binary error.

    Reads the module-level ``dtrain`` dataset, which must already exist in
    the kernel when this function is called. The optimiser proposes floats,
    so the integer-valued parameters are truncated with int().

    Returns the score of the last recorded CV round (higher is better).
    """
    # NOTE(review): 'num_iterations' is a LightGBM alias for the number of
    # boosting rounds, so it presumably overrides num_boost_round=100 below
    # -- confirm against the LightGBM parameter docs.
    params = {'num_leaves': int(num_leaves),
              'learning_rate':learning_rate,
              'metric': 'binary_error',
              'num_iterations':int(num_iterations),
              'early_stopping_round':5,
              'objective': 'binary',
              'force_row_wise': True,
              'num_threads': 4,
              'verbosity': 0}
    cv_result = lgb.cv(params, dtrain, num_boost_round=100, nfold=4)
    # LightGBM 4.x prefixes the history keys with "valid "; accept both spellings.
    error_key = ('binary_error-mean' if 'binary_error-mean' in cv_result
                 else 'valid binary_error-mean')
    return 1-cv_result[error_key][-1]

from timeit import default_timer as timer #to see how long the computation will take

# Search space for the optimiser. learning_rate must be strictly positive for
# LightGBM, so its lower bound is a small positive value instead of 0 (the
# optimiser can propose a point exactly on a bound).
lgb_bo = BayesianOptimization(bo_tune_lgb, {'num_leaves': (5, 40),
                                            'learning_rate':(0.01,1),
                                            'num_iterations': (10,100)
                                            })

# Record every optimisation step in a JSON log file.
logger = JSONLogger(path="../models/SVD/LightGBM_BayesOpt/logs.json")
lgb_bo.subscribe(Events.OPTIMIZATION_STEP, logger)

# One random initial point followed by one guided step, timed end to end.
start = timer()
lgb_bo.maximize(n_iter=1, init_points=1)
print('Runtime: %0.2fs' % (timer() - start))

Runtime: 190.79s


In [60]:
# Best result recorded by the optimiser so far: 'target' is the value returned
# by bo_tune_lgb (1 - mean CV binary error) and 'params' holds the raw float
# hyper-parameters; bo_tune_lgb truncates num_leaves / num_iterations with int().
lgb_bo.max

{'target': 0.6800784035870492,
 'params': {'learning_rate': 0.573254784005274,
  'num_iterations': 42.51568335725786,
  'num_leaves': 35.97155978086444}}