In [1]:
import pandas as pd
import numpy as np
import tarfile
import io
import glob
import dask.dataframe as dd

import xgboost as xgb
from xgboost import XGBClassifier
from bayes_opt import BayesianOptimization

import warnings
warnings.simplefilter("ignore")



tar = tarfile.open('../data/raw/20181120_track_features.tar.gz', 'r:gz')
csv_files = tar.getnames()

df_list = []

for csv_file in [csv_files[2], csv_files[4]]:
    csv_contents = tar.extractfile(csv_file).read()
    df_list.append(pd.read_csv(io.BytesIO(csv_contents), encoding='utf8'))

tf_df = pd.concat(df_list, ignore_index=True)
tf_df.rename(columns={'track_id':'track_id_clean'}, inplace=True)

kmean100_df = pd.read_csv('../data/interim/all_data/mbKMeans100clusters.csv', usecols=['track_id','clus'])
kmean100_df.rename(columns={'track_id':'track_id_clean'}, inplace=True)



In [2]:
def get_sim(df_hist, df_lookup, sim_file_list, score_name_list):
    df_hist['ListenYes'] = (df_hist['skip_2'] == False)*1
    df_hist['ListenYes'].replace(0, -1, inplace = True)
    df_hist = df_hist.groupby(['session_id', 'clus']).agg({'ListenYes':['sum']})
    df_hist = df_hist.reset_index()
    df_hist.columns = df_hist.columns.droplevel(level = 1) # take out the unwanted level
    df_pivot = pd.pivot_table(df_hist, values = 'ListenYes',index='session_id', columns='clus')
    df_pivot = df_pivot.fillna(0)
    
    
    for sim_file, score_name in zip(sim_file_list, score_name_list):
        sim_matrix = pd.read_csv(sim_file).drop(columns=['Unnamed: 0'])
        sim_matrix.columns = list(map(str, range(0,len(sim_matrix))))
        df_sim_session = df_pivot.dot(sim_matrix)/sim_matrix.sum()
        
        df_lookup[score_name] = df_sim_session.lookup(df_lookup['session_id'],df_lookup['clus'].astype(str))
    
    return df_lookup

In [3]:
file_list = glob.glob('../data/raw/training_set/log_0*.csv')
file_list = file_list[0:round(len(file_list))]

In [4]:
df_lookup_list = []
for file in file_list:
    print(file)
    log_df = pd.read_csv(file)

    log_df = log_df[['session_id','track_id_clean','skip_2','session_position','session_length']].merge(kmean100_df)

    log_df_1 = log_df.loc[log_df['session_position']<(log_df['session_length']/2)]

    # as the entire dataset will be too big to train, 
    # train the first 2 tracks of the 2nd half (prediction set) should be enough, 
    # as these 2 contribute most to the spotify metric

    half_cut = log_df['session_length']/2

    log_df_2_1 = log_df.loc[(log_df['session_position']>=half_cut) & (log_df['session_position']<half_cut+1)]
    log_df_2_1['weight'] = 2
#     log_df_2_2 = log_df.loc[(log_df['session_position']>=half_cut+1) & (log_df['session_position']<half_cut+2)]
#     log_df_2_2['weight'] = 1

#     log_df_2 = pd.concat([log_df_2_1, log_df_2_2])
    log_df_2 = log_df_2_1


    sim_file_list = ['../models/SVD/similarity/k100_CanbDist.csv',
                     '../models/SVD/similarity/k100_CosSim.csv',
                     '../models/SVD/similarity/k100_LinCorr.csv',
                     '../models/SVD/similarity/k100_ManhDist.csv',
                     '../models/SVD/similarity/k100_HammDist.csv',
                     '../models/SVD/similarity/k100_SpearCorr.csv',
                     '../models/SVD/similarity/k100_KendCorr.csv']
    score_name_list = ['CanbDist100', 'CosSim100','LinCorr100','ManhDist100','HammDist100','SpearCorr100','KendCorr100']

    df_lookup_list.append(get_sim(log_df_1, log_df_2, sim_file_list, score_name_list))
    

df_lookup = pd.concat(df_lookup_list)

../data/raw/training_set/log_0_20180722_000000000000.csv
../data/raw/training_set/log_0_20180821_000000000000.csv
../data/raw/training_set/log_0_20180918_000000000000.csv
../data/raw/training_set/log_0_20180907_000000000000.csv
../data/raw/training_set/log_0_20180717_000000000000.csv
../data/raw/training_set/log_0_20180729_000000000000.csv
../data/raw/training_set/log_0_20180814_000000000000.csv
../data/raw/training_set/log_0_20180913_000000000000.csv
../data/raw/training_set/log_0_20180831_000000000000.csv
../data/raw/training_set/log_0_20180908_000000000000.csv
../data/raw/training_set/log_0_20180917_000000000000.csv
../data/raw/training_set/log_0_20180810_000000000000.csv
../data/raw/training_set/log_0_20180804_000000000000.csv
../data/raw/training_set/log_0_20180903_000000000000.csv
../data/raw/training_set/log_0_20180825_000000000000.csv
../data/raw/training_set/log_0_20180726_000000000000.csv
../data/raw/training_set/log_0_20180718_000000000000.csv
../data/raw/training_set/log_0_

In [None]:
df_lookup = df_lookup.merge(tf_df)
df_lookup = pd.get_dummies(df_lookup, columns=['key','time_signature','mode'])

In [5]:
dtrain = xgb.DMatrix(df_lookup.drop(columns = ['session_id','track_id_clean','skip_2']), 
                     label=df_lookup['skip_2'],
                     weight = df_lookup['weight'])

In [None]:
def bo_tune_xgb(max_depth, gamma ,learning_rate, subsample, eta):
    params = {'max_depth': int(max_depth),
              'gamma': gamma,
              'learning_rate':learning_rate,
              'subsample': subsample,
              'eta': eta,
              'eval_metric': 'error'}
    #Cross validating with the specified parameters in 5 folds and 70 iterations
    cv_result = xgb.cv(params, dtrain, num_boost_round=100, nfold=5)
    #Return the negative RMSE
    return 1-cv_result['test-error-mean'].iloc[-1]

#Invoking the Bayesian Optimizer with the specified parameters to tune
xgb_bo = BayesianOptimization(bo_tune_xgb, {'max_depth': (3, 10),
                                             'gamma': (0, 1),
                                             'learning_rate':(0,0.01),
                                             'subsample': (0.3, 0.7),
                                             'eta':(0.1,0.3)
                                            })

from timeit import default_timer as timer #to see how long the computation will take
start = timer()
xgb_bo.maximize(n_iter=5, init_points=8, acq='ei')
print('Runtime: %0.2fs' % (timer() - start))

|   iter    |  target   |    eta    |   gamma   | learni... | max_depth | subsample |
-------------------------------------------------------------------------------------
| [0m1        [0m | [0m0.6744   [0m | [0m0.2931   [0m | [0m0.629    [0m | [0m0.0005772[0m | [0m5.554    [0m | [0m0.4406   [0m |
| [95m2        [0m | [95m0.6771   [0m | [95m0.1596   [0m | [95m0.0728   [0m | [95m0.00119  [0m | [95m7.945    [0m | [95m0.4848   [0m |
| [0m3        [0m | [0m0.6764   [0m | [0m0.1992   [0m | [0m0.6796   [0m | [0m0.007437 [0m | [0m4.475    [0m | [0m0.577    [0m |
| [95m4        [0m | [95m0.6775   [0m | [95m0.111    [0m | [95m0.9902   [0m | [95m0.005649 [0m | [95m7.75     [0m | [95m0.3952   [0m |
| [0m5        [0m | [0m0.6771   [0m | [0m0.2256   [0m | [0m0.7947   [0m | [0m0.001575 [0m | [0m6.171    [0m | [0m0.4506   [0m |
| [95m6        [0m | [95m0.6783   [0m | [95m0.2642   [0m | [95m0.7488   [0m | [95m0.004421 [0m

In [None]:
xgb_bo.max['params']

In [38]:
params = {'max_depth': 8,
              'gamma': 0.5899964093512567,
              'learning_rate':0.009417981929522201,
              'subsample': 0.8,
              'eta': 0.1,
              'eval_metric': 'error'}
cv_result = xgb.cv(params, dtrain, num_boost_round=70, nfold=5)


In [39]:
cv_result

Unnamed: 0,train-error-mean,train-error-std,test-error-mean,test-error-std
0,0.330065,0.000556,0.336290,0.001781
1,0.327947,0.000455,0.335058,0.001734
2,0.327521,0.000556,0.334215,0.001634
3,0.327208,0.000382,0.334010,0.001586
4,0.326928,0.000401,0.333675,0.001769
...,...,...,...,...
65,0.324187,0.000580,0.332452,0.001705
66,0.324150,0.000591,0.332451,0.001782
67,0.324103,0.000569,0.332460,0.001714
68,0.324076,0.000612,0.332460,0.001717
