In [1]:
import pandas as pd
import numpy as np
import tarfile
import io
import glob
import dask.dataframe as dd

# import xgboost as xgb
# from xgboost import XGBClassifier
from bayes_opt import BayesianOptimization
from bayes_opt.logger import JSONLogger
from bayes_opt.event import Events

import lightgbm as lgb
import warnings
warnings.simplefilter("ignore")



# Read the track-feature CSVs straight out of the gzipped tarball. The archive
# is opened in a ``with`` block so the file handle is released even if parsing
# one of the members fails.
with tarfile.open('../data/raw/20181120_track_features.tar.gz', 'r:gz') as tar:
    csv_files = tar.getnames()

    tf_df_list = []

    # Only archive members 2 and 4 are loaded.
    # NOTE(review): presumably these are the two real feature CSVs and the other
    # members are directories/metadata -- this depends on the archive's member
    # order, confirm against the tarball.
    for csv_file in [csv_files[2], csv_files[4]]:
        member = tar.extractfile(csv_file)
        if member is None:
            # extractfile() returns None for members that are not regular files.
            raise ValueError(f'{csv_file!r} is not a regular file in the archive')
        with member:
            csv_contents = member.read()
        tf_df_list.append(pd.read_csv(io.BytesIO(csv_contents), encoding='utf8'))

# One frame with all track features; the id column is renamed so it matches the
# 'track_id_clean' key used by the session logs.
tf_df = pd.concat(tf_df_list, ignore_index=True)
tf_df = tf_df.rename(columns={'track_id': 'track_id_clean'})

# Cluster assignment ('clus') per track, keyed the same way as tf_df.
kmean300_df = pd.read_csv('../data/interim/all_data/mbKMeans300clusters.csv', usecols=['track_id','clus'])
kmean300_df = kmean300_df.rename(columns={'track_id': 'track_id_clean'})



In [2]:
import json

# Every Bayesian-optimisation log file matching the 'logs_shuffle*' pattern.
list_bayesOpt = glob.glob('../models/SVD/LightGBM_BayesOpt/logs_shuffle*.json')

# Each line of a log file is parsed as its own JSON document; all parsed
# records from all files are gathered into a single flat list.
optList = []
for log_path in list_bayesOpt:
    with open(log_path) as log_file:
        optList.extend(json.loads(line) for line in log_file)

# Turn the records into a frame, then replace the nested 'params' dict column
# with one column per key found in those dicts.
opt_df = pd.DataFrame(optList)
param_columns = opt_df['params'].apply(pd.Series)
opt_df = pd.concat([opt_df.drop(columns=['params']), param_columns], axis=1)

# Displayed as the cell output: rows ordered by 'target', highest first.
opt_df.sort_values('target', ascending=False)

Unnamed: 0,target,datetime,learning_rate,num_iterations,num_leaves
34,0.681502,"{'datetime': '2022-12-07 23:44:47', 'elapsed':...",0.358113,289.131177,26.084380
35,0.681493,"{'datetime': '2022-12-07 23:45:47', 'elapsed':...",0.133303,207.760447,33.150367
48,0.681441,"{'datetime': '2022-12-07 23:52:21', 'elapsed':...",0.254124,290.196840,26.016096
45,0.681438,"{'datetime': '2022-12-07 23:51:02', 'elapsed':...",0.078170,206.492684,34.384655
41,0.681308,"{'datetime': '2022-12-07 23:48:25', 'elapsed':...",0.288358,193.105700,28.048837
...,...,...,...,...,...
62,0.678831,"{'datetime': '2022-12-08 00:07:41', 'elapsed':...",0.893276,382.369298,29.126509
54,0.678700,"{'datetime': '2022-12-08 00:02:54', 'elapsed':...",0.996263,99.691578,36.570915
63,0.678698,"{'datetime': '2022-12-08 00:08:00', 'elapsed':...",0.527956,60.654934,6.889188
58,0.678485,"{'datetime': '2022-12-08 00:04:55', 'elapsed':...",0.095424,479.653026,6.649225


In [3]:
def get_sim(df_hist, df_lookup, sim_file_list, score_name_list):
    """Add one similarity-score column per similarity matrix to ``df_lookup``.

    For every session in ``df_hist`` a signed listen count per cluster is built
    (+1 for each row with ``skip_2 == False``, -1 otherwise, summed per
    ``(session_id, clus)``). That session-by-cluster table is multiplied by each
    similarity matrix and divided by the matrix's column sums. Each row of
    ``df_lookup`` then receives the value found at
    ``(row.session_id, str(row.clus))`` of that product.

    Parameters
    ----------
    df_hist : pandas.DataFrame
        Needs the columns ``session_id``, ``clus`` and ``skip_2``. It is not
        modified.
    df_lookup : pandas.DataFrame
        Needs the columns ``session_id`` and ``clus``. The score columns are
        added to this object in place.
    sim_file_list : list of str
        CSV files, each holding a square matrix plus an ``'Unnamed: 0'`` column
        (which is dropped). Row ``i`` / column ``i`` are matched to cluster
        id ``i``.
    score_name_list : list of str
        Name of the output column for the matching entry of ``sim_file_list``.

    Returns
    -------
    pandas.DataFrame
        ``df_lookup`` itself, with the new columns. A row whose session does
        not occur in ``df_hist`` (or whose cluster is not a column of the
        matrix) gets ``NaN`` for that score.

    Raises
    ------
    ValueError
        If ``df_hist`` contains a cluster id that is not a row of a similarity
        matrix.
    """
    # +1 for a listened track, -1 for a skipped one. ``assign`` builds a new
    # frame, so the caller's ``df_hist`` (often a slice of a bigger frame) is
    # never written to.
    listen_sign = np.where(df_hist['skip_2'] == False, 1, -1)
    df_pivot = (
        df_hist.assign(ListenYes=listen_sign)
        .groupby(['session_id', 'clus'])['ListenYes']
        .sum()
        .unstack('clus', fill_value=0)  # sessions x clusters, 0 where unseen
    )

    lookup_sessions = df_lookup['session_id']
    lookup_clusters = df_lookup['clus'].astype(str)

    for sim_file, score_name in zip(sim_file_list, score_name_list):
        sim_matrix = pd.read_csv(sim_file).drop(columns=['Unnamed: 0'])
        # Columns become '0', '1', ... so they can be addressed by str(clus).
        sim_matrix.columns = [str(i) for i in range(len(sim_matrix))]

        unknown = df_pivot.columns.difference(sim_matrix.index)
        if len(unknown):
            raise ValueError(
                'clusters %s are not present in similarity matrix %r'
                % (list(unknown), sim_file)
            )
        # Clusters that never occur in this history get an all-zero column so
        # the matrix product is defined even when not every cluster was heard.
        aligned = df_pivot.reindex(columns=sim_matrix.index, fill_value=0)
        df_sim_session = aligned.dot(sim_matrix) / sim_matrix.sum()

        # Label-based (row, column) pick; replaces the removed DataFrame.lookup.
        # get_indexer returns -1 for labels that are absent.
        row_pos = df_sim_session.index.get_indexer(lookup_sessions)
        col_pos = df_sim_session.columns.get_indexer(lookup_clusters)
        found = (row_pos >= 0) & (col_pos >= 0)

        scores = np.full(len(df_lookup), np.nan)
        score_values = df_sim_session.to_numpy(dtype=float)
        scores[found] = score_values[row_pos[found], col_pos[found]]
        df_lookup[score_name] = scores

    return df_lookup

In [4]:
# Gather the training-log CSVs whose names start with log_0 ... log_8.
# NOTE(review): range(9) never matches 'log_9*' files -- presumably held out on
# purpose; confirm. glob returns paths in arbitrary order.
file_list = []
for n in range(9):
    file_list.extend(glob.glob(f'../data/raw/training_set/log_{n}*.csv'))
    

In [5]:
import random
from timeit import default_timer as timer #to see how long the computation will take

# ---------------------------------------------------------------------------
# Incremental LightGBM training: the log files are processed in batches of
# `batch_size`; each batch continues training from the model saved by the
# previous batch and saves its own model as 'boost<nFile>.txt'.
# ---------------------------------------------------------------------------

# Columns kept from every session-log CSV (order is preserved downstream).
LOG_COLUMNS = ['session_id', 'track_id_clean', 'skip_2', 'session_position',
               'session_length', 'hour_of_day', 'premium']

# Similarity matrices and the name of the feature column each one produces.
# These do not change between files, so they are defined once up front.
sim_file_list = ['../models/SVD/similarity/k300_CanbDist.csv',
                 '../models/SVD/similarity/k300_CosSim.csv',
                 '../models/SVD/similarity/k300_LinCorr.csv',
                 '../models/SVD/similarity/k300_ManhDist.csv',
                 '../models/SVD/similarity/k300_HammDist.csv',
                 '../models/SVD/similarity/k300_SpearCorr.csv',
                 '../models/SVD/similarity/k300_KendCorr.csv']
score_name_list = ['CanbDist300', 'CosSim300', 'LinCorr300', 'ManhDist300',
                   'HammDist300', 'SpearCorr300', 'KendCorr300']

# Saved models are named <prefix><number of files consumed so far>.txt
MODEL_PATH_PREFIX = '../models/SVD/LightGBM_BayesOpt/LightGBM_incremental_training/boost'

# 'early_stopping_round' was removed: no validation set is passed to
# lgb.train below, so there is nothing for early stopping to monitor.
# NOTE(review): 'num_iterations' here presumably takes precedence over the
# num_boost_round=100 argument passed to lgb.train -- confirm which is intended.
params = {'num_leaves': 26,
          'learning_rate': 0.358,
          'metric': 'binary_error',
          'num_iterations': 290,
          'objective': 'binary',
          'force_row_wise': True,
          'num_threads': 5,
          'verbosity': 0}

nFile = 0
batch_size = 20
while nFile < len(file_list):
    # Timer starts here so the printed runtime covers the whole batch, not
    # just the last file of it.
    start = timer()

    batch_files = file_list[nFile:nFile + batch_size]  # slicing clamps at the end
    nFile += batch_size

    df_lookup_list = []
    for file in batch_files:
        # Read only the needed columns, then attach each track's cluster id.
        log_df = pd.read_csv(file, usecols=LOG_COLUMNS)[LOG_COLUMNS].merge(kmean300_df)
        log_df = log_df.astype({'hour_of_day': 'float', 'premium': 'bool'})

        half_cut = log_df['session_length'] / 2
        position = log_df['session_position']

        # History: rows whose position lies before the session midpoint.
        log_df_1 = log_df.loc[position < half_cut]

        # Target: the rows in [half_cut, half_cut + 1), i.e. the first position
        # at or after the midpoint. `assign` returns a new frame, so 'weight'
        # is not written into a slice of log_df.
        log_df_2 = log_df.loc[(position >= half_cut) & (position < half_cut + 1)].assign(weight=1)

        df_lookup_list.append(get_sim(log_df_1, log_df_2, sim_file_list, score_name_list))

    df_lookup = pd.concat(df_lookup_list)
    df_lookup = df_lookup.merge(tf_df)
    # NOTE(review): dummy columns are derived from the values present in this
    # batch; a batch missing a category would yield a different feature set
    # than the model it continues from.
    df_lookup = pd.get_dummies(df_lookup, columns=['key', 'time_signature', 'mode'])

    # NOTE(review): the constant 'weight' column is not dropped, so it is also
    # passed as a feature -- kept as-is so the feature layout is unchanged.
    dtrain = lgb.Dataset(df_lookup.drop(columns=['session_id', 'track_id_clean', 'skip_2']),
                         label=df_lookup['skip_2'],
                         weight=df_lookup['weight'])

    if nFile == batch_size:
        # First batch: train from scratch.
        bst = lgb.train(params, dtrain, num_boost_round=100)
    else:
        # Later batches: continue from the model saved by the previous batch.
        bst = lgb.train(params, dtrain, num_boost_round=100,
                        init_model=MODEL_PATH_PREFIX + str(nFile - batch_size) + '.txt')

    # Passed positionally: Booster.save_model's parameter is `filename`, so the
    # former `path=` keyword is not accepted.
    bst.save_model(MODEL_PATH_PREFIX + str(nFile) + '.txt')

    print('Runtime per batch: %0.2fs' % (timer() - start))

NameError: name 'df_lookup_list' is not defined