### Imports

In [None]:
import os
# Move the working directory one level up from wherever the notebook was started.
# NOTE(review): presumably this is what lets the project-local `data` / `inout`
# packages imported below resolve — confirm before reordering these lines.
os.chdir(os.getcwd() + '/..')
# Echo the resulting working directory as a sanity check.
print(os.getcwd())

import data.data as data
import inout.importexport as imp
import numpy as np
import pandas as pd
import xgboost as xgb


### Create the complete dataframe

#### Read the recommendations from csv

In [None]:
# Load the raw recommendations from the submission CSV.
# NOTE(review): the cells below read each row as [playlist_id, track, track, ...];
# the meaning of check_len=-1 is defined by importcsv — confirm there.
raw_recs = imp.importcsv(
    'submission/13-12-2018/gxboost25recommendations_12-17-23.csv',
    check_len=-1,
)

# Peek at the first two rows.
print(raw_recs[:2])

#### Explode each row into multiple rows (one per interaction)

In [None]:
# Flatten the recommendations: one [playlist_id, track_id] pair per recommended track.
recs_tracks = []
for rec in raw_recs:
    # First field is the playlist id, the remaining fields are its recommended tracks.
    playlist_id, tracks = rec[0], rec[1:]
    recs_tracks.extend([playlist_id, t] for t in tracks)

recs_df = pd.DataFrame(recs_tracks, columns=['playlist_id','track_id'])

# Show the first 50 interactions.
print(recs_df.head(50))

#### Append the 'profile_length' column to the recommendation dataframe

In [None]:
# Build a (playlist_id, profile_length) table for the target playlists, where
# profile_length is the row sum of the training URM restricted to those playlists.
target_ids = data.get_target_playlists()
urm_train = data.get_urm_train_1()
targetURM = urm_train[target_ids]

# sum(axis=1) yields one value per selected row; flatten it to a 1-D array.
row_sums = targetURM.sum(axis=1)
user_profile_lengths = np.array(row_sums).flatten()

profile_lengths_df = pd.DataFrame(
    {'playlist_id': target_ids, 'profile_length': user_profile_lengths}
)

print(profile_lengths_df.head(10))

In [None]:
# Rebuild recs_df from raw_recs: one (playlist_id, track_id) row per recommended
# track. Nothing is grouped here; this recomputes the same exploded dataframe so
# the cell can be run on its own once raw_recs is loaded.
# Rows with no fields at all contribute nothing instead of raising IndexError.
recs_tracks = [[rec[0], t] for rec in raw_recs for t in rec[1:]]
recs_df = pd.DataFrame(recs_tracks, columns=['playlist_id','track_id'])

print(recs_df.head(50))

In [None]:
# Attach 'profile_length' to every recommendation row by joining on playlist_id.
# pandas merges with how='inner' by default, so rows whose playlist_id is missing
# from profile_lengths_df are dropped.
rec_lengths_df = pd.merge(recs_df, profile_lengths_df, on='playlist_id')

print(rec_lengths_df.head(40))

#### Popularity feature

In [None]:
# Popularity of a track = number of rows it has in the playlists dataframe.
df = data.get_playlists_df()
popularity = df.groupby('track_id').size()  # Series indexed by track_id

# Look the count up by track_id for every recommendation row. A positional
# pd.concat(axis=1) would instead pair row i with the i-th distinct track and
# add NaN-filled rows whenever the two lengths differ.
# Tracks that never appear in the playlists dataframe get popularity 0.
# NOTE(review): assumes track_id has the same dtype in both frames — confirm.
rec_lengths_df = rec_lengths_df.assign(
    popularity=rec_lengths_df['track_id'].map(popularity).fillna(0).astype(int)
)


#### One hot encodings

In [None]:
# One-hot encode the playlist ids.
# sparse=True builds sparse columns directly, so the dense rows-by-playlists
# matrix is never materialised (and DataFrame.to_sparse, removed in pandas 1.0,
# is not needed).
# prefix='playlist' keeps these column names distinct from the track one-hot
# columns: both are built from integer-like ids and would otherwise collide.
ohp = pd.get_dummies(rec_lengths_df['playlist_id'], prefix='playlist', sparse=True)
# 'playlist_id' is deliberately left in rec_lengths_df: later cells group and
# filter the assembled frame on that column.

In [None]:
# One-hot encode the track ids.
# sparse=True builds sparse columns directly, so the dense rows-by-tracks matrix
# is never materialised (and DataFrame.to_sparse, removed in pandas 1.0, is not
# needed).
# prefix='track' keeps these column names distinct from the playlist one-hot
# columns: both are built from integer-like ids and would otherwise collide.
oht = pd.get_dummies(rec_lengths_df['track_id'], prefix='track', sparse=True)
# The raw id is now redundant with its one-hot encoding, so drop it.
rec_lengths_df = rec_lengths_df.drop(['track_id'], axis=1)

In [None]:
# Assemble the feature table: base columns, then playlist dummies, then track
# dummies, placed side by side (rows are aligned on the index).
oh1 = pd.concat((rec_lengths_df, ohp, oht), axis='columns')

#### Append the 'label' column 

In [None]:
# Label every recommended (playlist, track) pair: 1 if the track has a non-zero
# entry in that playlist's row of the test URM, 0 otherwise.
urm_test = data.get_urm_test_1()
test_labels = []

# Consecutive rows of recs_df belong to the same playlist, so the set of test
# tracks is recomputed only when the playlist changes.
last_playlist_id = None
tracks_ids = set()
# Iterating the two columns directly avoids the per-row Series that iterrows() builds.
for current_playlist_id, track_id in zip(recs_df['playlist_id'], recs_df['track_id']):
    if current_playlist_id != last_playlist_id:
        # Column indices of the non-zero entries of this playlist's test row,
        # kept as a set so the membership test below is O(1).
        tracks_ids = set(urm_test.getrow(current_playlist_id).nonzero()[1].tolist())
        last_playlist_id = current_playlist_id

    test_labels.append(1 if track_id in tracks_ids else 0)

test_labels_df = pd.DataFrame({'label': test_labels})

In [None]:
# Sanity check: total number of labels and the first 30 values.
print(len(test_labels), test_labels[:30])

In [None]:
# Append the 'label' column to the feature table. The concat aligns rows on the
# index, and test_labels_df carries a default 0..n-1 index.
# NOTE(review): this only lines up if oh1 has the same row order and index as the
# frame the labels were computed from — confirm.
oh1 = pd.concat((oh1, test_labels_df), axis='columns')

### Split into train and test dataframes

In [None]:
def func(x, random_state=None):
    """Down-sample the negatives of one group to balance it against its positives.

    Keeps every row whose 'label' is 1 and adds a random sample of rows whose
    'label' is 0. The sample size is the sum of the 'label' column, capped at
    the number of negative rows available so that a group with more positives
    than negatives no longer raises ValueError.

    Args:
        x: DataFrame with a 'label' column (typically one groupby group).
        random_state: Optional seed forwarded to DataFrame.sample so the
            selection can be made reproducible. Defaults to None (unseeded).

    Returns:
        DataFrame with all positive rows followed by the sampled negative rows.
    """
    ones = x.loc[x['label'] == 1]
    zeros = x.loc[x['label'] == 0]
    # Sampling without replacement cannot return more rows than exist.
    n = min(int(x['label'].sum()), len(zeros))
    return pd.concat([ones, zeros.sample(n=n, random_state=random_state)])

In [None]:
# Balance positives and negatives within each playlist by applying func per group.
# NOTE(review): only the first 100 rows of oh1 are used here — this looks like a
# debugging limit; confirm whether the whole frame should be processed.
head_rows = oh1.head(100)
by_playlist = head_rows.groupby(['playlist_id'], as_index=False)
full = by_playlist.apply(func)

In [None]:
# Discard the index left behind by the grouped apply and renumber rows 0..n-1.
# drop=True does not depend on the auto-generated 'level_0' / 'level_1' column
# names, so re-running this cell is a no-op instead of raising KeyError.
full = full.reset_index(drop=True)

In [None]:
import data.data as d
import random
import math

# Randomly assign half of the target playlists (rounded down) to the training
# side; every remaining target playlist goes to the test side.
tgt = d.get_target_playlists()
half = math.floor(len(tgt)/2)
train_tgt = random.sample(tgt, half)
remaining = set(tgt) - set(train_tgt)
test_tgt = list(remaining)

# Split the sampled rows by which side their playlist was assigned to.
in_train = full['playlist_id'].isin(train_tgt)
in_test = full['playlist_id'].isin(test_tgt)
train = full.loc[in_train]
test = full.loc[in_test]

In [None]:
# Bare expression: in a notebook this renders the `test` dataframe as the cell output.
test

### Train with XGBoost

In [None]:
# Build the XGBoost training matrix from the training split only.
# The 'label' column is passed as the target and removed from the features:
# an empty label list gives the booster nothing to fit, and leaving 'label'
# among the features would leak the answer.
# NOTE(review): assumes the remaining columns of `train` are the intended
# feature set (it still contains 'playlist_id') — confirm.
dtrain = xgb.DMatrix(train.drop(['label'], axis=1), label=train['label'])