# Imports

In [1]:
import os
os.chdir(os.getcwd() + '/..')


In [2]:
print(os.getcwd())

/home/giovanni/Desktop/RecSys


In [3]:
import data.data as data
import inout.importexport as imp
import numpy as np
import pandas as pd
import xgboost as xgb
import random
import math
from pandas.api.types import CategoricalDtype
from scipy.sparse import load_npz
import gc

# Create the complete dataframe

#### Read the recommendations from csv

In [163]:
raw_recs = imp.importcsv('reranking/train_als_4coll_xgboost25_23-58-10_25_tracks.csv', check_len=-1)

print(raw_recs[0:2])

[[7, 4492, 15779, 14714, 11257, 20242, 15167, 15908, 3648, 4469, 8749, 17154, 8985, 11641, 5309, 11636, 14400, 4189, 8275, 6615, 8362, 13123, 17780, 20443, 17495, 1063], [25, 4202, 699, 1900, 19134, 14338, 10438, 7077, 12768, 16869, 4899, 800, 2018, 4720, 11633, 10414, 11393, 18567, 11943, 15261, 19390, 12716, 11347, 1422, 7545, 1726]]


#### Explode each row into multiple rows (one per interaction)

In [164]:
recs_tracks = []
cutoff = 20
for rec in raw_recs:
    playlist_id = rec[0]
    for i in range(len(rec[1:])):
        if i == cutoff:
            break
        t = rec[1:][i]
        recs_tracks.append([playlist_id, t])
recs_df = pd.DataFrame(recs_tracks, columns=['playlist_id','track_id'])

print(recs_df)

        playlist_id  track_id
0                 7      4492
1                 7     15779
2                 7     14714
3                 7     11257
4                 7     20242
5                 7     15167
6                 7     15908
7                 7      3648
8                 7      4469
9                 7      8749
10                7     17154
11                7      8985
12                7     11641
13                7      5309
14                7     11636
15                7     14400
16                7      4189
17                7      8275
18                7      6615
19                7      8362
20               25      4202
21               25       699
22               25      1900
23               25     19134
24               25     14338
25               25     10438
26               25      7077
27               25     12768
28               25     16869
29               25      4899
...             ...       ...
199970        50417     11241
199971    

#### Append the 'profile_length' column to the recommendation dataframe

In [165]:
target_ids = data.get_target_playlists()
targetURM = data.get_urm_train_1()[target_ids]
user_profile_lengths = np.array(targetURM.sum(axis=1)).flatten()
profile_lengths_df = pd.DataFrame({'playlist_id': target_ids, 'profile_length': user_profile_lengths})

print(profile_lengths_df)

      playlist_id  profile_length
0               7              28
1              25              13
2              29              18
3              34              24
4              50               8
5              52              16
6              60               8
7              64              12
8              77              29
9              80              18
10            106              11
11            138              38
12            143              36
13            149              15
14            169               8
15            180              35
16            210               9
17            231              12
18            241              22
19            244              22
20            256              13
21            258              13
22            265              16
23            268              36
24            271              25
25            272               6
26            275              14
27            277              20
28            

In [196]:
rec_lengths_df = recs_df.merge(profile_lengths_df, on='playlist_id')
print(rec_lengths_df)

        playlist_id  track_id  profile_length
0                 7      4492              28
1                 7     15779              28
2                 7     14714              28
3                 7     11257              28
4                 7     20242              28
5                 7     15167              28
6                 7     15908              28
7                 7      3648              28
8                 7      4469              28
9                 7      8749              28
10                7     17154              28
11                7      8985              28
12                7     11641              28
13                7      5309              28
14                7     11636              28
15                7     14400              28
16                7      4189              28
17                7      8275              28
18                7      6615              28
19                7      8362              28
20               25      4202     

#### Position among the recs feature 

In [201]:
def assign_pos_feature(x):
    f = np.flip(np.arange(len(x)))
    return pd.DataFrame({'pos_score': f})

In [202]:
position_scores = rec_lengths_df.groupby('playlist_id', as_index=False, sort=False).apply(assign_pos_feature)
rec_position_df = pd.concat([rec_lengths_df, position_scores.reset_index().drop(['level_0', 'level_1'], axis=1)], axis=1)

In [203]:
rec_position_df

Unnamed: 0,playlist_id,track_id,profile_length,pos_score
0,7,4492,28,19
1,7,15779,28,18
2,7,14714,28,17
3,7,11257,28,16
4,7,20242,28,15
5,7,15167,28,14
6,7,15908,28,13
7,7,3648,28,12
8,7,4469,28,11
9,7,8749,28,10


In [186]:
rec_position_df

Unnamed: 0,playlist_id,track_id,profile_length,pos_score
0,7,4492,28,1.000000
1,7,15779,28,0.947368
2,7,14714,28,0.894737
3,7,11257,28,0.842105
4,7,20242,28,0.789474
5,7,15167,28,0.736842
6,7,15908,28,0.684211
7,7,3648,28,0.631579
8,7,4469,28,0.578947
9,7,8749,28,0.526316


#### Popularity feature

In [204]:
df = data.get_playlists_df()
popularity = df.groupby(['track_id']).size().reset_index(name='popularity')

In [205]:
rec_pop_df = rec_position_df.join(popularity.set_index('track_id'), on='track_id')
print(rec_pop_df)

        playlist_id  track_id  profile_length  pos_score  popularity
0                 7      4492              28         19         265
1                 7     15779              28         18         417
2                 7     14714              28         17         483
3                 7     11257              28         16         358
4                 7     20242              28         15        1095
5                 7     15167              28         14         387
6                 7     15908              28         13         145
7                 7      3648              28         12         432
8                 7      4469              28         11         195
9                 7      8749              28         10        1016
10                7     17154              28          9         458
11                7      8985              28          8         344
12                7     11641              28          7         188
13                7      5309     

#### Append the 'label' column 

In [206]:
urm_test = data.get_urm_test_1()
test_labels = []

last_playlist_id = -1
for idx,row in recs_df.iterrows():
    current_playlist_id = row['playlist_id']
    track_id = row['track_id']
    # cache the row of the urm test if same playlist of the previous iteration
    if not current_playlist_id == last_playlist_id:
        # tracks ids in the t row of urm test
        tracks_ids = urm_test.getrow(current_playlist_id).nonzero()[1]
        last_playlist_id = current_playlist_id
    
    test_labels.append(1 if track_id in tracks_ids else 0)

test_labels_df = pd.DataFrame({'label': test_labels})

In [207]:
rec_label_df = pd.concat([rec_pop_df, test_labels_df], axis=1)
print(rec_label_df)

        playlist_id  track_id  profile_length  pos_score  popularity  label
0                 7      4492              28         19         265      0
1                 7     15779              28         18         417      0
2                 7     14714              28         17         483      1
3                 7     11257              28         16         358      0
4                 7     20242              28         15        1095      0
5                 7     15167              28         14         387      0
6                 7     15908              28         13         145      0
7                 7      3648              28         12         432      0
8                 7      4469              28         11         195      0
9                 7      8749              28         10        1016      0
10                7     17154              28          9         458      0
11                7      8985              28          8         344      0
12          

In [208]:
matrix = [load_npz('raw_data/saved_r_hat_evaluation/4collaborative_l2.npz'), load_npz('raw_data/saved_r_hat_evaluation/als_l2.npz')]
scores = [[], []]
n = len(matrix)
for idx,row in rec_label_df.iterrows():
    for i in range(len(matrix)):
        scores[i].append(matrix[i][row['playlist_id'], row['track_id']])

dfn1 = pd.DataFrame({'score_4_coll': scores[0]/max(scores[0])})
dfn2 = pd.DataFrame({'score_als': scores[1]/max(scores[1])})
rec_scores_df = pd.concat([rec_label_df, dfn1, dfn2], axis=1)
print(rec_scores_df)

        playlist_id  track_id  profile_length  pos_score  popularity  label  \
0                 7      4492              28         19         265      0   
1                 7     15779              28         18         417      0   
2                 7     14714              28         17         483      1   
3                 7     11257              28         16         358      0   
4                 7     20242              28         15        1095      0   
5                 7     15167              28         14         387      0   
6                 7     15908              28         13         145      0   
7                 7      3648              28         12         432      0   
8                 7      4469              28         11         195      0   
9                 7      8749              28         10        1016      0   
10                7     17154              28          9         458      0   
11                7      8985              28       

In [209]:
del matrix
gc.collect()

84

#### Append the tracks features (album, artist, duration)

In [210]:
tdf = data.get_tracks_df()
rec_feature_track_df = rec_scores_df.join(tdf.set_index('track_id'), on='track_id')
print(rec_feature_track_df)

        playlist_id  track_id  profile_length  pos_score  popularity  label  \
0                 7      4492              28         19         265      0   
1                 7     15779              28         18         417      0   
2                 7     14714              28         17         483      1   
3                 7     11257              28         16         358      0   
4                 7     20242              28         15        1095      0   
5                 7     15167              28         14         387      0   
6                 7     15908              28         13         145      0   
7                 7      3648              28         12         432      0   
8                 7      4469              28         11         195      0   
9                 7      8749              28         10        1016      0   
10                7     17154              28          9         458      0   
11                7      8985              28       

# I'm happy with the features gathered

In [211]:
full = rec_feature_track_df.copy()

# Split into train and test dataframes

In [212]:
def func(x):
    n = x['label'].sum()
    ones = x.loc[x['label'] == 1]
    zeros = x.loc[x['label'] == 0].sample(min(n, x.shape[0]-n))
    return pd.concat([ones,zeros])

In [213]:
tgt = data.get_target_playlists()
train_tgt = random.sample(tgt, math.floor(len(tgt)*0.8))
test_tgt = list(set(tgt) - set(train_tgt))
train = full.loc[full['playlist_id'].isin(train_tgt)]
test = full.loc[full['playlist_id'].isin(test_tgt)]

In [214]:
train = train.groupby(['playlist_id'], as_index=False).apply(func)
train = train.reset_index().drop(['level_0', 'level_1'], axis=1)

In [234]:
train

Unnamed: 0,playlist_id,track_id,profile_length,pos_score,popularity,label,score_4_coll,score_als,album_id,artist_id,duration_sec
0,3,11102,8,14,53,1,0.086523,0.336496,4755,3518,261
1,3,8426,8,0,95,0,0.042393,0.291074,7979,484,201
2,6,15770,24,9,502,1,0.141490,0.233898,2698,3413,156
3,6,8145,24,8,959,0,0.132030,0.238927,10637,275,275
4,7,14714,28,17,483,1,0.263038,0.268332,9580,1111,184
5,7,5309,28,6,227,0,0.168165,0.263514,743,5954,179
6,18,14547,8,5,294,1,0.068176,0.341717,1640,3440,228
7,18,5640,8,11,83,0,0.067975,0.386570,1327,4709,256
8,19,5069,11,19,174,1,0.144245,0.529448,9231,49,305
9,19,4466,11,7,152,0,0.073247,0.335596,5133,2529,226


In [235]:
test

Unnamed: 0,playlist_id,track_id,profile_length,pos_score,popularity,label,score_4_coll,score_als,album_id,artist_id,duration_sec
180,80,9168,18,19,153,1,0.539081,0.624306,3489,3688,170
181,80,11063,18,18,205,1,0.493552,0.667141,3489,3688,242
182,80,13987,18,17,56,1,0.404099,0.441687,3489,3688,218
183,80,14683,18,16,74,0,0.339273,0.526728,8685,3688,164
184,80,6,18,15,262,1,0.293821,0.541153,4426,2029,151
185,80,18026,18,14,316,0,0.315829,0.478588,7364,1781,216
186,80,10203,18,13,81,0,0.241104,0.518926,5649,3688,223
187,80,9670,18,12,319,0,0.246196,0.510507,1513,1914,178
188,80,16391,18,11,81,0,0.213839,0.552323,5649,3688,212
189,80,10694,18,10,152,0,0.225251,0.478268,5827,3932,199


# One hot encodings of non ordered features

In [236]:
to_onehot = []
to_drop = ['album_id', 'artist_id', 'playlist_id', 'track_id']
to_concat_train = []
to_concat_test = []

In [237]:
def onehotize(full, df, str):
    exp = full[str].unique()
    print(len(exp))
    df.loc[:, (str)] = df[str].astype(CategoricalDtype(categories = exp))
    oh = pd.get_dummies(df[str], prefix=str).to_sparse(fill_value=0)
    return oh

In [238]:
for name in to_onehot:
    to_concat_train.append(onehotize(full, train, name))
    to_concat_test.append(onehotize(full, test, name))
    train = train.drop(name, axis=1)
    test = test.drop(name, axis=1)
to_concat_train.insert(0, train)
to_concat_test.insert(0, test)

In [239]:
train = pd.concat(to_concat_train, axis=1)
test = pd.concat(to_concat_test, axis=1)

In [240]:
traind = train.copy()
testd = test.copy()
for j in to_drop:
    traind = traind.drop(j, axis=1)
    testd = testd.drop(j, axis=1)

In [241]:
traind

Unnamed: 0,profile_length,pos_score,popularity,label,score_4_coll,score_als,duration_sec
0,8,14,53,1,0.086523,0.336496,261
1,8,0,95,0,0.042393,0.291074,201
2,24,9,502,1,0.141490,0.233898,156
3,24,8,959,0,0.132030,0.238927,275
4,28,17,483,1,0.263038,0.268332,184
5,28,6,227,0,0.168165,0.263514,179
6,8,5,294,1,0.068176,0.341717,228
7,8,11,83,0,0.067975,0.386570,256
8,11,19,174,1,0.144245,0.529448,305
9,11,7,152,0,0.073247,0.335596,226


In [242]:
testd

Unnamed: 0,profile_length,pos_score,popularity,label,score_4_coll,score_als,duration_sec
180,18,19,153,1,0.539081,0.624306,170
181,18,18,205,1,0.493552,0.667141,242
182,18,17,56,1,0.404099,0.441687,218
183,18,16,74,0,0.339273,0.526728,164
184,18,15,262,1,0.293821,0.541153,151
185,18,14,316,0,0.315829,0.478588,216
186,18,13,81,0,0.241104,0.518926,223
187,18,12,319,0,0.246196,0.510507,178
188,18,11,81,0,0.213839,0.552323,212
189,18,10,152,0,0.225251,0.478268,199


# Train and save the model

In [222]:
import xgboost as xgb

label_train = traind.label
trainu = traind.drop(['label'], axis=1)

label_test = testd.label
testu = testd.drop(['label'], axis=1)

dtrain = xgb.DMatrix(trainu, label=label_train)
dtest = xgb.DMatrix(testu, label=label_test)

In [225]:
param = {'max_depth': 16, 'eta': 0.1, 'silent': 1, 'objective': 'binary:logistic',
         'lambda': 2, 'alpha': 2, 'tree_method': 'auto', 'grow_policy':'lossguide', 'subsample': 0.5}
param['nthread'] = 4
param['eval_metric'] = 'auc'

evallist = [(dtest, 'eval'), (dtrain, 'train')]

num_round = 100
model_trained = xgb.train(param, dtrain, num_round, evallist)

[0]	eval-auc:0.623676	train-auc:0.692352
[1]	eval-auc:0.634729	train-auc:0.723929
[2]	eval-auc:0.640843	train-auc:0.73785
[3]	eval-auc:0.64318	train-auc:0.745828
[4]	eval-auc:0.644223	train-auc:0.754353
[5]	eval-auc:0.64662	train-auc:0.762586
[6]	eval-auc:0.64919	train-auc:0.768222
[7]	eval-auc:0.647754	train-auc:0.774252
[8]	eval-auc:0.649078	train-auc:0.778943
[9]	eval-auc:0.647644	train-auc:0.78334
[10]	eval-auc:0.649119	train-auc:0.788011
[11]	eval-auc:0.649396	train-auc:0.793262
[12]	eval-auc:0.649757	train-auc:0.797136
[13]	eval-auc:0.649516	train-auc:0.801585
[14]	eval-auc:0.647662	train-auc:0.80696
[15]	eval-auc:0.647119	train-auc:0.81274
[16]	eval-auc:0.648184	train-auc:0.815546
[17]	eval-auc:0.648203	train-auc:0.820646
[18]	eval-auc:0.648305	train-auc:0.823356
[19]	eval-auc:0.647248	train-auc:0.828483
[20]	eval-auc:0.646328	train-auc:0.832831
[21]	eval-auc:0.646326	train-auc:0.837376
[22]	eval-auc:0.646093	train-auc:0.841767
[23]	eval-auc:0.646063	train-auc:0.845316
[24]	eval

In [29]:
# model_trained.save_model('0001.model')

# Evaluate the improvement in MAP

In [226]:
def evaluate(recommendations, test_urm, at_k=10):
    aps = 0.0
    for r in recommendations:
        row = test_urm.getrow(r[0]).indices
        m = min(at_k, len(row))

        ap = 0.0
        n_elems_found = 0.0
        for j in range(1, m+1):
            if r[j] in row:
                n_elems_found += 1
                ap = ap + n_elems_found/j
        if m > 0:
            ap = ap/m
            aps += ap

    return aps/len(recommendations)

#### Predictions for test users

In [227]:
ypred = model_trained.predict(xgb.DMatrix(testu))

#### From the predictions, reccomend the most probable rearrenged tracks

In [228]:
df_preds = pd.DataFrame({'score': ypred})
df_boosted = pd.concat([test, df_preds], axis=1)
def fu(x):
    x = x.sort_values(by=['score'], ascending=False).head(10)
    return np.concatenate((x.playlist_id.unique(), x.track_id.unique())).astype(np.int32)
df_boosted_sorted = df_boosted.groupby(['playlist_id'], as_index=False).apply(fu)
recs = list(df_boosted_sorted.values)

#### Evaluate on the URM test

In [229]:
new_map = evaluate(recs, data.get_urm_test_1())

#### Evaluate the old MAP on just the test users

In [230]:
ids = test.playlist_id.unique()
old_recs = []
for j in raw_recs:
    if j[0] in ids:
        old_recs.append(j[:11])

In [246]:
old_map = evaluate(old_recs, data.get_urm_test_1())
print('old map: {} new map: {}. difference of: {} percent'.format(old_map, new_map, (new_map/old_map)-1))

old map: 0.0828687503149407 new map: 0.07808035667044591. difference of: -0.05778286297665425 percent
