### Imports

In [1]:
import os
os.chdir(os.getcwd() + '/..')
print(os.getcwd())

import data.data as data
import inout.importexport as imp
import numpy as np
import pandas as pd
import xgboost as xgb
import random
import math
from pandas.api.types import CategoricalDtype


/home/giovanni/Desktop/RecSys


### Create the complete dataframe

#### Read the recommendations from csv

In [2]:
raw_recs = imp.importcsv('submission/13-12-2018/gxboost25recommendations_12-17-23.csv', check_len=-1)

print(raw_recs[0:2])

[[7, 4492, 15779, 14714, 11257, 20242, 15167, 15908, 3648, 4469, 8749, 17154, 8985, 11641, 5309, 11636, 14400, 4189, 8275, 6615, 8362, 13123, 17780, 20443, 17495, 1063], [25, 4202, 699, 1900, 19134, 14338, 10438, 7077, 12768, 16869, 4899, 800, 2018, 4720, 11633, 10414, 11393, 18567, 11943, 15261, 19390, 12716, 11347, 1422, 7545, 1726]]


#### Explode each row into multiple rows (one per interaction)

In [3]:
recs_tracks = []
for rec in raw_recs:
    playlist_id = rec[0]
    for t in rec[1:]:
        recs_tracks.append([playlist_id, t])
recs_df = pd.DataFrame(recs_tracks, columns=['playlist_id','track_id'])

print(recs_df)

        playlist_id  track_id
0                 7      4492
1                 7     15779
2                 7     14714
3                 7     11257
4                 7     20242
5                 7     15167
6                 7     15908
7                 7      3648
8                 7      4469
9                 7      8749
10                7     17154
11                7      8985
12                7     11641
13                7      5309
14                7     11636
15                7     14400
16                7      4189
17                7      8275
18                7      6615
19                7      8362
20                7     13123
21                7     17780
22                7     20443
23                7     17495
24                7      1063
25               25      4202
26               25       699
27               25      1900
28               25     19134
29               25     14338
...             ...       ...
249970        50417      4952
249971    

#### Append the 'profile_length' column to the recommendation dataframe

In [4]:
target_ids = data.get_target_playlists()
targetURM = data.get_urm_train_1()[target_ids]
user_profile_lengths = np.array(targetURM.sum(axis=1)).flatten()
profile_lengths_df = pd.DataFrame({'playlist_id': target_ids, 'profile_length': user_profile_lengths})

print(profile_lengths_df.head(10))

   playlist_id  profile_length
0            7              28
1           25              13
2           29              18
3           34              24
4           50               8
5           52              16
6           60               8
7           64              12
8           77              29
9           80              18


In [5]:
rec_lengths_df = recs_df.merge(profile_lengths_df, on='playlist_id')
print(rec_lengths_df)

        playlist_id  track_id  profile_length
0                 7      4492              28
1                 7     15779              28
2                 7     14714              28
3                 7     11257              28
4                 7     20242              28
5                 7     15167              28
6                 7     15908              28
7                 7      3648              28
8                 7      4469              28
9                 7      8749              28
10                7     17154              28
11                7      8985              28
12                7     11641              28
13                7      5309              28
14                7     11636              28
15                7     14400              28
16                7      4189              28
17                7      8275              28
18                7      6615              28
19                7      8362              28
20                7     13123     

#### Popularity feature

In [6]:
df = data.get_playlists_df()
popularity = df.groupby(['track_id']).size().reset_index(name='popularity')
print(popularity)

       track_id  popularity
0             0          26
1             1          51
2             2          56
3             3           8
4             4          12
5             5          35
6             6         262
7             7          42
8             8           9
9             9           2
10           10          18
11           11           2
12           12          50
13           13           1
14           14           2
15           15         106
16           16          20
17           17           2
18           18           1
19           19          33
20           20          67
21           21         113
22           22          24
23           23          82
24           24         148
25           25           8
26           26         153
27           27         178
28           28         161
29           29           1
...         ...         ...
20605     20605         239
20606     20606           1
20607     20607          28
20608     20608     

In [7]:
rec_pop_df = rec_lengths_df.join(popularity.set_index('track_id'), on='track_id')
print(rec_pop_df)

        playlist_id  track_id  profile_length  popularity
0                 7      4492              28         265
1                 7     15779              28         417
2                 7     14714              28         483
3                 7     11257              28         358
4                 7     20242              28        1095
5                 7     15167              28         387
6                 7     15908              28         145
7                 7      3648              28         432
8                 7      4469              28         195
9                 7      8749              28        1016
10                7     17154              28         458
11                7      8985              28         344
12                7     11641              28         188
13                7      5309              28         227
14                7     11636              28         211
15                7     14400              28         184
16            

#### Append the 'label' column 

In [8]:
urm_test = data.get_urm_test_1()
test_labels = []

last_playlist_id = -1
for idx,row in recs_df.iterrows():
    current_playlist_id = row['playlist_id']
    track_id = row['track_id']
    # cache the row of the urm test if same playlist of the previous iteration
    if not current_playlist_id == last_playlist_id:
        # tracks ids in the t row of urm test
        tracks_ids = urm_test.getrow(current_playlist_id).nonzero()[1]
        last_playlist_id = current_playlist_id
    
    test_labels.append(1 if track_id in tracks_ids else 0)

test_labels_df = pd.DataFrame({'label': test_labels})

In [9]:
rec_label_df = pd.concat([rec_pop_df, test_labels_df], axis=1)
print(rec_label_df)

        playlist_id  track_id  profile_length  popularity  label
0                 7      4492              28         265      0
1                 7     15779              28         417      0
2                 7     14714              28         483      1
3                 7     11257              28         358      0
4                 7     20242              28        1095      0
5                 7     15167              28         387      0
6                 7     15908              28         145      0
7                 7      3648              28         432      0
8                 7      4469              28         195      0
9                 7      8749              28        1016      0
10                7     17154              28         458      0
11                7      8985              28         344      0
12                7     11641              28         188      0
13                7      5309              28         227      0
14                7     1

#### Append the tracks features (album, artist, duration)

In [10]:
tdf = data.get_tracks_df()
rec_feature_track_df = rec_label_df.join(tdf.set_index('track_id'), on='track_id')
print(rec_feature_track_df)

        playlist_id  track_id  profile_length  popularity  label  album_id  \
0                 7      4492              28         265      0      6189   
1                 7     15779              28         417      0      2480   
2                 7     14714              28         483      1      9580   
3                 7     11257              28         358      0     11635   
4                 7     20242              28        1095      0     11149   
5                 7     15167              28         387      0      7077   
6                 7     15908              28         145      0      8706   
7                 7      3648              28         432      0     11588   
8                 7      4469              28         195      0     10636   
9                 7      8749              28        1016      0      2186   
10                7     17154              28         458      0      3112   
11                7      8985              28         344      0

### I'm happy with the features gathered

In [11]:
feature_df = rec_feature_track_df

### Split into train and test dataframes

In [12]:
def func(x):
    n = x['label'].sum()
    ones = x.loc[x['label'] == 1]
    zeros = x.loc[x['label'] == 0].sample(n)
    return pd.concat([ones,zeros])

In [13]:
full = feature_df.groupby(['playlist_id'], as_index=False).apply(func)

In [14]:
full = full.reset_index().drop(['level_0', 'level_1'], axis=1)

In [15]:
tgt = data.get_target_playlists()
train_tgt = random.sample(tgt, math.floor(len(tgt)*0.8))
test_tgt = list(set(tgt) - set(train_tgt))
train = full.loc[full['playlist_id'].isin(train_tgt)]
test = full.loc[full['playlist_id'].isin(test_tgt)]

#### One hot encodings

In [16]:
to_concat_train = []
to_concat_test = []
to_onehot = ['album_id', 'artist_id', 'track_id', 'playlist_id']

In [17]:
def onehotize(full, df, str):
    exp = full[str].unique()
    print(len(exp))
    df.loc[:, (str)] = df[str].astype(CategoricalDtype(categories = exp))
    oh = pd.get_dummies(df[str], prefix=str).to_sparse(fill_value=0)
    return oh

In [18]:
for name in to_onehot:
    to_concat_train.append(onehotize(full, train, name))
    to_concat_test.append(onehotize(full, test, name))
    train = train.drop(name, axis=1)
    test = test.drop(name, axis=1)
to_concat_train.insert(0, train)
to_concat_test.insert(0, test)

3613


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


3613
1940
1940
6101
6101
7428
7428


In [19]:
train = pd.concat(to_concat_train, axis=1)
test = pd.concat(to_concat_test, axis=1)



In [22]:
test

Unnamed: 0,profile_length,popularity,label,duration_sec,album_id_4755,album_id_621,album_id_2698,album_id_2739,album_id_9580,album_id_1211,...,playlist_id_50374,playlist_id_50382,playlist_id_50395,playlist_id_50405,playlist_id_50410,playlist_id_50412,playlist_id_50416,playlist_id_50424,playlist_id_50428,playlist_id_50431
4,28,483,1,184,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
5,28,202,1,250,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
6,28,638,1,235,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,28,358,0,237,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,28,223,0,224,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,28,281,0,235,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
28,13,646,1,263,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
29,13,220,1,169,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
30,13,60,0,350,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
31,13,246,0,222,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
