### Imports

In [None]:
import os
# Move the working directory one level up, relative to the current one.
# Presumably this lets the project packages imported below (`data`, `inout`)
# resolve from the repository root -- TODO confirm the expected launch directory.
# NOTE: this is not idempotent; every re-run of the cell moves up one more level.
os.chdir(os.getcwd() + '/..')
# Show the directory we ended up in.
print(os.getcwd())

import data.data as data
import inout.importexport as imp
import numpy as np
import pandas as pd
import xgboost as xgb
import random
import math
from pandas.api.types import CategoricalDtype


### Create the complete dataframe

#### Read the recommendations from csv

In [None]:
# Load the recommendation file through the project's CSV importer.
# NOTE(review): the meaning of check_len=-1 is defined by imp.importcsv,
# which is not visible here -- confirm before changing it.
raw_recs = imp.importcsv('submission/13-12-2018/gxboost25recommendations_12-17-23.csv', check_len=-1)

# Peek at the first two entries.
print(raw_recs[0:2])

#### Explode each row into multiple rows (one per interaction)

In [None]:
# Flatten the recommendations into [playlist_id, track_id] pairs: the first
# element of every entry is the playlist id, the remaining ones are its tracks.
recs_tracks = [
    [rec[0], track]
    for rec in raw_recs
    for track in rec[1:]
]
recs_df = pd.DataFrame(recs_tracks, columns=['playlist_id', 'track_id'])

print(recs_df)

#### Append the 'profile_length' column to the recommendation dataframe

In [None]:
# Restrict the first training URM to the rows of the target playlists.
target_ids = data.get_target_playlists()
targetURM = data.get_urm_train_1()[target_ids]

# Row-wise sums of the selected rows, flattened to one value per target playlist.
user_profile_lengths = np.asarray(targetURM.sum(axis=1)).ravel()
profile_lengths_df = pd.DataFrame({
    'playlist_id': target_ids,
    'profile_length': user_profile_lengths,
})

print(profile_lengths_df.head(10))

In [None]:
# Attach 'profile_length' to every (playlist, track) pair. The merge uses the
# default (inner) mode, so pairs whose playlist has no profile row are dropped.
rec_lengths_df = pd.merge(recs_df, profile_lengths_df, on='playlist_id')
print(rec_lengths_df)

#### Popularity feature

In [None]:
# Popularity of a track = number of rows of the playlists dataframe it appears in.
df = data.get_playlists_df()
popularity = (
    df.groupby(['track_id'])
    .size()
    .rename('popularity')
    .reset_index()
)
print(popularity)

In [None]:
# Look up the popularity of each recommended track by track_id.
# DataFrame.join keeps every row (and the index) of rec_lengths_df; tracks
# missing from `popularity` get NaN in the 'popularity' column.
rec_pop_df = rec_lengths_df.join(popularity.set_index('track_id'), on='track_id')
print(rec_pop_df)

#### Append the 'label' column 

In [None]:
# Build the binary 'label' column: 1 when the recommended track has a non-zero
# entry in the playlist's row of the test URM, 0 otherwise. Labels are emitted
# in the row order of recs_df.
urm_test = data.get_urm_test_1()
test_labels = []

# Non-zero columns of the URM row of the playlist currently being processed.
# Kept as a set so the membership test below is O(1) instead of a linear scan.
tracks_ids = set()
last_playlist_id = -1
# Iterate the two columns directly: iterrows() would build a Series per row.
for current_playlist_id, track_id in zip(recs_df['playlist_id'], recs_df['track_id']):
    # Only re-read the URM row when the playlist changes; rows of the same
    # playlist are consecutive because of how recs_df was exploded.
    if current_playlist_id != last_playlist_id:
        tracks_ids = set(urm_test.getrow(current_playlist_id).nonzero()[1].tolist())
        last_playlist_id = current_playlist_id

    test_labels.append(1 if track_id in tracks_ids else 0)

test_labels_df = pd.DataFrame({'label': test_labels})

In [None]:
# pd.concat(axis=1) aligns the two frames on their index, whereas the labels
# were produced positionally, one per row of recs_df. If the feature frame no
# longer has the same index (e.g. the merge on 'playlist_id' dropped rows),
# the labels would be attached to the wrong rows without any error, so stop here.
if not rec_pop_df.index.equals(test_labels_df.index):
    raise ValueError(
        'rec_pop_df and test_labels_df are not row-aligned: '
        '{} feature rows vs {} labels'.format(len(rec_pop_df), len(test_labels_df))
    )
rec_label_df = pd.concat([rec_pop_df, test_labels_df], axis=1)
print(rec_label_df)

#### Append the tracks features (album, artist, duration)

In [None]:
# Attach the per-track columns of the tracks dataframe to every row, looked up
# by track_id. The heading above names them as album, artist and duration;
# the actual columns are whatever data.get_tracks_df() returns.
tdf = data.get_tracks_df()
rec_feature_track_df = rec_label_df.join(tdf.set_index('track_id'), on='track_id')
print(rec_feature_track_df)

### I'm happy with the features gathered

In [None]:
# Freeze the feature set under a stable name. This binds a second name to the
# same DataFrame object; no copy is made.
feature_df = rec_feature_track_df

### Split into train and test dataframes

In [None]:
def func(x):
    """Balance one group of rows: all positives plus equally many random negatives.

    Args:
        x: DataFrame with a 'label' column whose values are 1 (positive) or
            0 (negative).

    Returns:
        DataFrame made of every row labelled 1 followed by a random sample
        (without replacement) of rows labelled 0. The sample has as many rows
        as there are positives, or all the negatives when there are fewer
        negatives than positives.
    """
    ones = x.loc[x['label'] == 1]
    zeros = x.loc[x['label'] == 0]
    # DataFrame.sample raises ValueError when asked for more rows than exist
    # (sampling without replacement), so cap the request at what is available.
    n = min(len(ones), len(zeros))
    return pd.concat([ones, zeros.sample(n)])

In [None]:
# Run the per-group balancing function `func` on every playlist_id group.
# The following cell calls reset_index() on this result and drops the
# 'level_0' / 'level_1' columns, so it depends on the index this apply produces.
full = feature_df.groupby(['playlist_id'], as_index=False).apply(func)

In [None]:
# Turn the index left by the grouped apply into columns, then discard those
# two helper columns so only the feature columns remain.
full = full.reset_index().drop(columns=['level_0', 'level_1'])

In [None]:
# Split by playlist rather than by row: 80% of the target playlists (rounded
# down) are drawn at random for training, the remaining ones form the test set.
tgt = data.get_target_playlists()
train_tgt = random.sample(tgt, math.floor(0.8 * len(tgt)))
test_tgt = list(set(tgt) - set(train_tgt))

# Keep, on each side, only the rows whose playlist belongs to that side.
train = full[full['playlist_id'].isin(train_tgt)]
test = full[full['playlist_id'].isin(test_tgt)]

#### One hot encodings

In [None]:
# Columns that get one-hot encoded.
to_onehot = ['album_id', 'artist_id', 'track_id', 'playlist_id']
# Accumulators for the encoded blocks of the train and test frames
# (two distinct lists).
to_concat_train, to_concat_test = [], []

In [None]:
def onehotize(full, df, str):
    """One-hot encode column `str` of `df` over the categories present in `full`.

    The category set is taken from `full` so that frames encoded separately
    (e.g. a train and a test subset of `full`) end up with the same columns in
    the same order. The number of categories is printed.

    Args:
        full: DataFrame whose column `str` defines the complete category set.
        df: DataFrame to encode; it is not modified.
        str: Name of the column to encode, also used as prefix of the output
            column names. (The name shadows the builtin inside this function;
            it is kept so existing callers keep working.)

    Returns:
        Sparse-backed one-hot frame with one `<str>_<category>` column per
        category of `full[str]`, indexed like `df`.
    """
    exp = full[str].unique()
    print(len(exp))
    # Encode a local categorical Series instead of writing it back into `df`:
    # `df` is typically a slice of another frame, and the caller does not need
    # the converted column.
    categorical = df[str].astype(CategoricalDtype(categories=exp))
    # sparse=True replaces DataFrame.to_sparse, which was removed in pandas 1.0.
    # The dtype is pinned so the result is numeric whatever the pandas default is.
    return pd.get_dummies(categorical, prefix=str, sparse=True, dtype=np.uint8)

In [None]:
# Encode each listed column for both splits (train first, then test), store the
# encoded blocks, and remove the raw column from the working frames.
for column in to_onehot:
    train_block = onehotize(full, train, column)
    to_concat_train.append(train_block)
    test_block = onehotize(full, test, column)
    to_concat_test.append(test_block)
    train = train.drop(columns=column)
    test = test.drop(columns=column)

In [None]:
# Put the encoded blocks side by side, one frame per split.
# NOTE(review): this rebinds train/test to the encoded columns only; whatever
# columns were left in the previous train/test frames (those not listed in
# to_onehot) are not carried over -- confirm this is intended.
train = pd.concat(to_concat_train, axis='columns')
test = pd.concat(to_concat_test, axis='columns')

### Train with XGBoost

In [None]:
# NOTE(review): `oh1` is not defined anywhere in the visible notebook, so this
# line raises NameError as written. The second positional argument (presumably
# DMatrix's `label` -- confirm against the xgboost API) is an empty list.
# This cell looks unfinished.
dtrain = xgb.DMatrix(oh1, [])