### Imports

In [1]:
import os
# Switch to the project root before the project-local imports below.
# NOTE(review): presumably required so `data` / `inout` and the relative paths
# used later in the notebook resolve — confirm. The path is a hard-coded
# absolute location and must be adapted to the local checkout.
os.chdir('/home/giovanni/Desktop/RecSys')
import data.data as data
import inout.importexport as imp
import numpy as np
import pandas as pd
import xgboost as xgb
import random
import math
from pandas.api.types import CategoricalDtype
from scipy.sparse import load_npz
import gc


### Create the complete dataframe

#### Read the recommendations from csv

In [2]:
# Candidate recommendations to re-rank. Each row starts with a playlist id
# followed by the candidate track ids for that playlist.
raw_recs = imp.importcsv(
    'reranking/to_boost_als_4coll_gxboost_23-27-25.csv',
    check_len=-1,
)

print(raw_recs[:2])

[[7, 4492, 15779, 20443, 10300, 5042, 17154, 4052, 1316, 11641, 11636, 15908, 3648, 15167, 4189, 10100, 11257, 8749, 13123, 14765, 4542, 20242, 17276, 358, 11233, 19638], [25, 699, 4202, 1900, 16869, 19134, 4649, 14338, 800, 5620, 7077, 10839, 2018, 16432, 10438, 10414, 12768, 9731, 3788, 4899, 18991, 18567, 1726, 15261, 12716, 16866]]


#### Explode each row into multiple rows (one per interaction)

In [3]:
# Flatten every recommendation row [playlist_id, t1, t2, ...] into
# [playlist_id, track_id] pairs, one pair per candidate track.
recs_tracks = [
    [rec[0], track]
    for rec in raw_recs
    for track in rec[1:]
]
recs_df = pd.DataFrame(recs_tracks, columns=['playlist_id', 'track_id'])

print(recs_df)

        playlist_id  track_id
0                 7      4492
1                 7     15779
2                 7     20443
3                 7     10300
4                 7      5042
5                 7     17154
6                 7      4052
7                 7      1316
8                 7     11641
9                 7     11636
10                7     15908
11                7      3648
12                7     15167
13                7      4189
14                7     10100
15                7     11257
16                7      8749
17                7     13123
18                7     14765
19                7      4542
20                7     20242
21                7     17276
22                7       358
23                7     11233
24                7     19638
25               25       699
26               25      4202
27               25      1900
28               25     16869
29               25     19134
...             ...       ...
249970        50417     13862
249971    

#### Append the 'profile_length' column to the recommendation dataframe

In [4]:
# Profile length of a target playlist = sum of its row in the train URM.
# NOTE(review): this equals the number of tracks only if the URM is binary — confirm.
target_ids = data.get_target_playlists()
targetURM = data.get_urm_train_1()[target_ids]
user_profile_lengths = np.asarray(targetURM.sum(axis=1)).ravel()
profile_lengths_df = pd.DataFrame(
    {
        'playlist_id': target_ids,
        'profile_length': user_profile_lengths,
    }
)

print(profile_lengths_df.head(10))

   playlist_id  profile_length
0            7              28
1           25              13
2           29              18
3           34              24
4           50               8
5           52              16
6           60               8
7           64              12
8           77              29
9           80              18


In [5]:
# Attach the profile length to every (playlist, track) candidate.
# Inner merge (the pandas default): candidates of playlists without a profile
# length row are dropped.
rec_lengths_df = pd.merge(recs_df, profile_lengths_df, how='inner', on='playlist_id')
print(rec_lengths_df)

        playlist_id  track_id  profile_length
0                 7      4492              28
1                 7     15779              28
2                 7     20443              28
3                 7     10300              28
4                 7      5042              28
5                 7     17154              28
6                 7      4052              28
7                 7      1316              28
8                 7     11641              28
9                 7     11636              28
10                7     15908              28
11                7      3648              28
12                7     15167              28
13                7      4189              28
14                7     10100              28
15                7     11257              28
16                7      8749              28
17                7     13123              28
18                7     14765              28
19                7      4542              28
20                7     20242     

#### Popularity feature

In [6]:
# Track popularity = number of rows of the playlists dataframe that mention the track.
df = data.get_playlists_df()
popularity = (
    df.groupby(by=['track_id'])
    .size()
    .reset_index(name='popularity')
)
print(popularity)

       track_id  popularity
0             0          26
1             1          51
2             2          56
3             3           8
4             4          12
5             5          35
6             6         262
7             7          42
8             8           9
9             9           2
10           10          18
11           11           2
12           12          50
13           13           1
14           14           2
15           15         106
16           16          20
17           17           2
18           18           1
19           19          33
20           20          67
21           21         113
22           22          24
23           23          82
24           24         148
25           25           8
26           26         153
27           27         178
28           28         161
29           29           1
...         ...         ...
20605     20605         239
20606     20606           1
20607     20607          28
20608     20608     

In [7]:
# Left join on track_id: every candidate row is kept and gains its track's popularity.
rec_pop_df = rec_lengths_df.join(
    popularity.set_index('track_id'),
    on='track_id',
    how='left',
)
print(rec_pop_df)

        playlist_id  track_id  profile_length  popularity
0                 7      4492              28         265
1                 7     15779              28         417
2                 7     20443              28         551
3                 7     10300              28         351
4                 7      5042              28         200
5                 7     17154              28         458
6                 7      4052              28         406
7                 7      1316              28         242
8                 7     11641              28         188
9                 7     11636              28         211
10                7     15908              28         145
11                7      3648              28         432
12                7     15167              28         387
13                7      4189              28         281
14                7     10100              28         419
15                7     11257              28         358
16            

In [8]:
# Saved score matrices of the two base recommenders (4-collaborative, ALS).
matrix = [
    load_npz('raw_data/saved_r_hat/4_collaborative_l2.npz'),
    load_npz('raw_data/saved_r_hat/als_l2.npz'),
]

# Look up the score of every (playlist, track) candidate with ONE fancy-indexing
# call per matrix instead of a Python-level iterrows() loop doing a scalar
# sparse lookup per row.
playlist_idx = rec_pop_df['playlist_id'].values
track_idx = rec_pop_df['track_id'].values
scores = [np.asarray(m[playlist_idx, track_idx]).ravel() for m in matrix]

# Scale each score column by its maximum over the candidate rows.
# The score frames reuse rec_pop_df's index so the column-wise concat below
# aligns row by row regardless of how rec_pop_df is indexed.
dfn1 = pd.DataFrame({'score_4_coll': scores[0] / scores[0].max()}, index=rec_pop_df.index)
dfn2 = pd.DataFrame({'score_als': scores[1] / scores[1].max()}, index=rec_pop_df.index)
rec_scores_df = pd.concat([rec_pop_df, dfn1, dfn2], axis=1)
print(rec_scores_df)

        playlist_id  track_id  profile_length  popularity  score_4_coll  \
0                 7      4492              28         265      0.329752   
1                 7     15779              28         417      0.261610   
2                 7     20443              28         551      0.277379   
3                 7     10300              28         351      0.257043   
4                 7      5042              28         200      0.259011   
5                 7     17154              28         458      0.311490   
6                 7      4052              28         406      0.293608   
7                 7      1316              28         242      0.190845   
8                 7     11641              28         188      0.245369   
9                 7     11636              28         211      0.228529   
10                7     15908              28         145      0.223109   
11                7      3648              28         432      0.264806   
12                7     1

In [9]:
# The score matrices are no longer needed once the scores are in rec_scores_df;
# drop the reference and ask the garbage collector to run.
del matrix
gc.collect()

0

#### Append the tracks features (album, artist, duration)

In [10]:
# Left join on track_id: each candidate row gains the columns of the tracks dataframe.
tdf = data.get_tracks_df()
rec_feature_track_df = rec_scores_df.join(
    tdf.set_index('track_id'),
    on='track_id',
    how='left',
)
print(rec_feature_track_df)

        playlist_id  track_id  profile_length  popularity  score_4_coll  \
0                 7      4492              28         265      0.329752   
1                 7     15779              28         417      0.261610   
2                 7     20443              28         551      0.277379   
3                 7     10300              28         351      0.257043   
4                 7      5042              28         200      0.259011   
5                 7     17154              28         458      0.311490   
6                 7      4052              28         406      0.293608   
7                 7      1316              28         242      0.190845   
8                 7     11641              28         188      0.245369   
9                 7     11636              28         211      0.228529   
10                7     15908              28         145      0.223109   
11                7      3648              28         432      0.264806   
12                7     1

### I'm happy with the features gathered

In [11]:
# Work on a copy so the feature-gathering frame above stays untouched.
full = rec_feature_track_df.copy()

### Build the feature matrix for the booster

#### One-hot encodings

In [12]:
# Columns to one-hot encode (currently none) and columns to discard afterwards.
to_onehot = []
to_drop = ['album_id', 'artist_id']
# Frames that will be concatenated column-wise into the final feature table.
to_concat = []

In [13]:
def onehotize(df, string):
    """Return the one-hot encoding of column `string` of `df` as a sparse frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains the column to encode.
    string : str
        Name of the column; also used as prefix of the generated columns
        (``<string>_<value>``).

    Returns
    -------
    pandas.DataFrame
        One 0/1 column per distinct value, stored as sparse uint8 columns
        with fill value 0.
    """
    # DataFrame.to_sparse() was removed in pandas 1.0; asking get_dummies for
    # sparse output directly works on both old and current pandas versions.
    return pd.get_dummies(df[string], prefix=string, sparse=True, dtype=np.uint8)

In [14]:
# Build the list of frames to concatenate: the plain features first, then one
# one-hot frame per column listed in `to_onehot`.
# `to_concat` is rebuilt from scratch and `full` is left unmodified, so
# re-running this cell gives the same result instead of inserting `full` a
# second time (which would duplicate every column after the concat).
encoded_frames = []
for name in to_onehot:
    print(name)
    encoded_frames.append(onehotize(full, name))
to_concat = [full.drop(to_onehot, axis=1)] + encoded_frames

In [15]:
# Glue the plain features and the one-hot frames side by side (aligned on the index).
to_boost = pd.concat(to_concat, axis='columns')

In [16]:
# Remove the columns listed in `to_drop` in a single call.
to_boost = to_boost.drop(to_drop, axis=1)

In [17]:
# Display the feature table (ids + features) that will be scored.
to_boost

Unnamed: 0,playlist_id,track_id,profile_length,popularity,score_4_coll,score_als,duration_sec
0,7,4492,28,265,0.329752,0.442740,232
1,7,15779,28,417,0.261610,0.390388,181
2,7,20443,28,551,0.277379,0.329203,215
3,7,10300,28,351,0.257043,0.346657,214
4,7,5042,28,200,0.259011,0.335101,274
5,7,17154,28,458,0.311490,0.274686,235
6,7,4052,28,406,0.293608,0.280296,181
7,7,1316,28,242,0.190845,0.380661,281
8,7,11641,28,188,0.245369,0.315944,204
9,7,11636,28,211,0.228529,0.317780,174


In [18]:
# The booster is fed only the features: strip the two identifier columns.
id_columns = ['playlist_id', 'track_id']
to_boost_u = to_boost.drop(id_columns, axis='columns')
to_boost_u

Unnamed: 0,profile_length,popularity,score_4_coll,score_als,duration_sec
0,28,265,0.329752,0.442740,232
1,28,417,0.261610,0.390388,181
2,28,551,0.277379,0.329203,215
3,28,351,0.257043,0.346657,214
4,28,200,0.259011,0.335101,274
5,28,458,0.311490,0.274686,235
6,28,406,0.293608,0.280296,181
7,28,242,0.190845,0.380661,281
8,28,188,0.245369,0.315944,204
9,28,211,0.228529,0.317780,174


# Boost the predictions

In [19]:
# `xgb` is already imported in the imports cell at the top of the notebook,
# so it is not re-imported here.

# Feature matrix to score. NOTE: despite its name, `dtrain` is only used for
# prediction in this notebook; no training happens here.
dtrain = xgb.DMatrix(to_boost_u)

# Create an empty booster and load the previously trained model from disk.
bst = xgb.Booster({'nthread': 4})  # init model
bst.load_model('reranking/0001.model')

#### Get the boosted predictions

In [20]:
# One predicted score per row of the feature matrix, in the same row order.
ypred = bst.predict(dtrain)
ypred

array([9.8363334e-01, 5.5404943e-01, 2.9238046e-04, ..., 3.8335010e-01,
       1.1833406e-04, 2.2381176e-01], dtype=float32)

#### Rerank the starting predictions according to the boosting

In [83]:
# Wrap the predictions in a single-column frame so they can be concatenated.
df_preds = pd.DataFrame(ypred, columns=['score'])

In [84]:
# Put the boosted score next to the ids and features of each candidate row.
df_boosted = pd.concat([to_boost, df_preds], axis='columns')
df_boosted

Unnamed: 0,playlist_id,track_id,profile_length,popularity,score_4_coll,score_als,duration_sec,score
0,7,4492,28,265,0.329752,0.442740,232,9.836333e-01
1,7,15779,28,417,0.261610,0.390388,181,5.540494e-01
2,7,20443,28,551,0.277379,0.329203,215,2.923805e-04
3,7,10300,28,351,0.257043,0.346657,214,1.292331e-02
4,7,5042,28,200,0.259011,0.335101,274,3.022513e-01
5,7,17154,28,458,0.311490,0.274686,235,7.337776e-07
6,7,4052,28,406,0.293608,0.280296,181,4.641293e-05
7,7,1316,28,242,0.190845,0.380661,281,6.256528e-01
8,7,11641,28,188,0.245369,0.315944,204,9.950534e-03
9,7,11636,28,211,0.228529,0.317780,174,1.159180e-02


In [89]:
def fu(x, k=10):
    """Return the `k` highest-scoring rows of `x`, best first.

    Parameters
    ----------
    x : pandas.DataFrame
        Frame with a 'score' column (one playlist's candidates when the
        function is used with ``groupby(...).apply``).
    k : int, optional
        Number of rows to keep. Defaults to 10.

    Returns
    -------
    pandas.DataFrame
        At most `k` rows of `x`, all columns kept, ordered by descending 'score'.
    """
    return x.sort_values(by=['score'], ascending=False).head(k)

In [90]:
# Keep, for every playlist, only the rows selected by `fu` (best boosted scores first).
# The result is indexed by (group number, original row index), as the
# displayed output shows.
df_boosted_sorted = df_boosted.groupby(['playlist_id'], as_index=False).apply(fu)

In [91]:
# Display the re-ranked candidates, grouped by playlist.
df_boosted_sorted

Unnamed: 0,Unnamed: 1,playlist_id,track_id,profile_length,popularity,score_4_coll,score_als,duration_sec,score
0,125001,3,10293,8,113,0.166257,0.506911,201,9.977565e-01
0,125002,3,3321,8,31,0.150796,0.352289,304,9.723908e-01
0,125004,3,12111,8,56,0.134493,0.350621,196,7.807191e-01
0,125000,3,344,8,318,0.200120,0.520604,139,5.752509e-01
0,125006,3,13960,8,29,0.132169,0.299893,224,5.485993e-02
0,125008,3,18344,8,19,0.128185,0.269791,297,2.979531e-02
0,125010,3,6579,8,21,0.118964,0.261678,113,4.075164e-03
0,125005,3,752,8,136,0.110032,0.418694,121,3.702970e-03
0,125003,3,9740,8,152,0.120064,0.412946,223,3.687916e-03
0,125021,3,6153,8,41,0.065812,0.234561,221,1.937961e-05


In [88]:
# Export one row per playlist: [playlist_id, track_1, ..., track_k], with the
# tracks in descending boosted-score order (groupby keeps the row order inside
# each group). Exporting `df_boosted_sorted.values` directly would instead
# write one row per (playlist, track) pair containing every feature column as
# floats.
# NOTE(review): assumes imp.exportcsv expects the same row layout that
# imp.importcsv returned for the input recommendations — confirm.
boosted_recs = [
    [int(playlist_id)] + [int(track_id) for track_id in group['track_id']]
    for playlist_id, group in df_boosted_sorted.groupby('playlist_id', sort=False)
]
imp.exportcsv(boosted_recs, name='boosted_als_l2_4coll_l2')