<a href="https://colab.research.google.com/github/dhruthick/cse256project/blob/main/recommendation/xgboost_no-mood-features.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# XGBoost - Final Stage of recommendation without mood-based features

## Imports and setup

In [2]:
!pip install lightfm

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting lightfm
  Downloading lightfm-1.17.tar.gz (316 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.4/316.4 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: lightfm
  Building wheel for lightfm (setup.py) ... [?25l[?25hdone
  Created wheel for lightfm: filename=lightfm-1.17-cp310-cp310-linux_x86_64.whl size=879182 sha256=cc545804f673efacaaea186319222972756e100cf41129f1b21341bc3b2c5580
  Stored in directory: /root/.cache/pip/wheels/4f/9b/7e/0b256f2168511d8fa4dae4fae0200fdbd729eb424a912ad636
Successfully built lightfm
Installing collected packages: lightfm
Successfully installed lightfm-1.17


In [8]:
import pandas as pd
import numpy as np
import joblib
import math

import pickle
from tqdm.notebook import tqdm

from lightfm import LightFM

import csv
from collections import defaultdict

import xgboost
from sklearn.model_selection import train_test_split

In [2]:
data_path='/content/drive/MyDrive/cse256/project/data/'
models_path='/content/drive/MyDrive/cse256/project/models/'


## Read data

In [11]:
all_interactions=pd.read_csv(data_path+'all_interactions.csv').drop('pid',axis=1)
interactions_train=pd.read_csv(data_path+'interactions_train.csv').drop('pid',axis=1)
interactions_val=pd.read_csv(data_path+'interactions_val.csv').drop('pid',axis=1)
cs_model=joblib.load(open(models_path+'lightfm_model.pkl', 'rb'))

## Some necessary data structures

In [12]:
num_playlists=len(np.unique(interactions_train.gen_pid))
num_tracks=len(np.unique(all_interactions.tid))

In [13]:
playlistsPerTrack = defaultdict(set) 
tracksPerPlaylist = defaultdict(set) 
trackNames={}
track_id2artist_uri={}
track_id2album_uri={}
artist_count=defaultdict(int)
album_count=defaultdict(int)
for pid,tid,track_name,artist,album_uri,artist_uri in tqdm(interactions_train[['gen_pid','tid','track_name','artist_name','album_uri','artist_uri']].values.tolist()):
    playlistsPerTrack[tid].add(pid)
    tracksPerPlaylist[pid].add(tid)
    trackNames[tid] = (track_name, artist)
    track_id2artist_uri[tid]=artist_uri
    track_id2album_uri[tid]=album_uri
    artist_count[artist_uri]+=1
    album_count[album_uri]+=1

  0%|          | 0/598734 [00:00<?, ?it/s]

In [14]:
playlists_info={}
for pid, ptracks in interactions_train.groupby('gen_pid'):
    track_ids= ptracks.tid.values.tolist()
    album_uris=ptracks.album_uri.tolist()
    artist_uris=ptracks.artist_uri.tolist()
    playlists_info[pid]={
        'tracks':track_ids,
        'albums':album_uris,
        'artists':artist_uris
    }

## Functions for evaluating recommendation

In [15]:
def get_scores(pid,N):
  all_tracks=np.unique(all_interactions.tid)
  ptracks=tracksPerPlaylist[pid]
  pred_tracks=[t for t in all_tracks if t not in ptracks]
  scores=cs_model.predict(user_ids=[pid for i in range(len(pred_tracks))],
                item_ids=pred_tracks)
  scores=[(scores[i],pred_tracks[i]) for i in range(len(pred_tracks))]
  scores.sort(reverse=True)
  return scores[:N]


In [16]:
def evaluate_playlist(pid,scores,N):
  relevantTracks=set(interactions_test[interactions_test['gen_pid']==pid].tid.values)
  scores=scores[:N]
  recommendedTracks=set([t[1] for t in scores])
  rprc=len(recommendedTracks.intersection(relevantTracks))/len(relevantTracks)
  dcg=0
  for i in range(len(scores)):
    if scores[i][1] in relevantTracks:
      dcg+=math.log(2)/math.log(i+2)
  ndcg=dcg/len(relevantTracks)
  rec_click=int(N/10)+1
  for i in range(0,int(N/10)):
    recommendedTracks=set([t[1] for t in scores[i*10:(i*10+10)]])
    if len(recommendedTracks.intersection(relevantTracks))>0:
      rec_click=i+1
      break
  return rprc,ndcg,rec_click

## Fetching the top-1000 candidate tracks from the candidate selction model and constructing relevant features

In [19]:
columns=['pid','tid','score','rank','track_bias','playlist_bias','dot_product','max_coo','min_coo','mean_coo','median_coc',
         'max_ncoo','min_ncoo','mean_ncoo','median_coo','track_artist_count','track_album_count',
         'track_artist_share','track_album_share','global_track_count','global_artist_count','global_album_count',
         'unique_album_count','unique_artist_count','num_tracks_in_playlist','target']
with open(data_path+'xg_data.csv','w') as f:
  write = csv.writer(f)
  write.writerow(columns)
  track_biases, track_embeddings = cs_model.get_item_representations()
  playlist_biases, playlist_embeddings = cs_model.get_user_representations()
  for pid in tqdm(np.unique(interactions_val.gen_pid)):
    candidate_tracks=get_scores(pid,750)
    tracks_in_playlist=tracksPerPlaylist[pid]
    relevant_tracks=set(interactions_val[interactions_val['gen_pid']==pid].tid.values)
    playlist_features=[len(np.unique(playlists_info[pid]['albums'])),
                      len(np.unique(playlists_info[pid]['artists'])),len(tracks_in_playlist)]
    for i in range(len(candidate_tracks)):
      score,tid=candidate_tracks[i]
      sample=[pid,tid,score,i,track_biases[tid],playlist_biases[pid],
              track_embeddings[tid].dot(playlist_embeddings[pid])]
      playlists_with_ctrack=playlistsPerTrack[tid]
      co_occurence=[]
      norm_co_occurence=[]
      for playlist_track in tracks_in_playlist:
        other_playlists=playlistsPerTrack[playlist_track]
        n=len(other_playlists.intersection(playlists_with_ctrack))
        co_occurence.append(n)
        norm_co_occurence.append(n/len(other_playlists))
      sample.extend([max(co_occurence),min(co_occurence),np.mean(co_occurence),np.median(co_occurence)])
      sample.extend([max(norm_co_occurence),min(norm_co_occurence),np.mean(norm_co_occurence),np.median(norm_co_occurence)])
      t_artist_count=playlists_info[pid]['artists'].count(track_id2artist_uri[tid])
      t_album_count=playlists_info[pid]['albums'].count(track_id2album_uri[tid])
      sample.extend([t_artist_count,t_album_count,
                    t_artist_count/len(tracks_in_playlist),
                    t_album_count/len(tracks_in_playlist)])
      sample.extend([len(playlists_with_ctrack),
                    artist_count[track_id2artist_uri[tid]],
                    album_count[track_id2album_uri[tid]]])
      sample.extend(playlist_features)
      sample.append(1 if tid in relevant_tracks else 0)
      write.writerow(sample)


  0%|          | 0/17500 [00:00<?, ?it/s]

In [3]:
xg_data=pd.read_csv(data_path+'xg_data.csv')

## Training XGBoost

In [4]:
train,test=train_test_split(xg_data,test_size=0.20,stratify=xg_data.target)
train.to_csv(data_path+'xg_data_train.csv',index=False)
test.to_csv(data_path+'xg_data_test.csv',index=False)

In [3]:
cols = ['pid', 'tid', 'target']
train=pd.read_csv(data_path+'xg_data_train.csv')
xgtrain = xgboost.DMatrix(train.drop(cols, axis=1), train.target)
del train

In [4]:
test=pd.read_csv(data_path+'xg_data_test.csv')
xgval = xgboost.DMatrix(test.drop(cols, axis=1), test.target)
del test

In [5]:
params = {
    'objective':'binary:logistic', 
    'eta':0.1, 
    'booster':'gbtree',
    'max_depth':7,         
    'nthread':50,  
    'seed':1,    
    'eval_metric':'auc',
}

a = xgboost.train(
    params=list(params.items()),  
    early_stopping_rounds=30, 
    verbose_eval=10, 
    dtrain=xgtrain,
    evals=[(xgtrain, 'train'), (xgval, 'test')],
    num_boost_round=300,
)

[0]	train-auc:0.85906	test-auc:0.85577
[10]	train-auc:0.86902	test-auc:0.86562
[20]	train-auc:0.87114	test-auc:0.86704
[30]	train-auc:0.87704	test-auc:0.87277
[40]	train-auc:0.87965	test-auc:0.87483
[50]	train-auc:0.88194	test-auc:0.87662
[60]	train-auc:0.88427	test-auc:0.87852
[70]	train-auc:0.88604	test-auc:0.87980
[80]	train-auc:0.88751	test-auc:0.88079
[90]	train-auc:0.88870	test-auc:0.88142
[100]	train-auc:0.88955	test-auc:0.88178
[110]	train-auc:0.89015	test-auc:0.88200
[120]	train-auc:0.89078	test-auc:0.88216
[130]	train-auc:0.89136	test-auc:0.88228
[140]	train-auc:0.89199	test-auc:0.88237
[150]	train-auc:0.89247	test-auc:0.88243
[160]	train-auc:0.89292	test-auc:0.88248
[170]	train-auc:0.89338	test-auc:0.88255
[180]	train-auc:0.89378	test-auc:0.88257
[190]	train-auc:0.89422	test-auc:0.88260
[200]	train-auc:0.89467	test-auc:0.88263
[210]	train-auc:0.89505	test-auc:0.88263
[220]	train-auc:0.89545	test-auc:0.88265
[230]	train-auc:0.89578	test-auc:0.88267
[240]	train-auc:0.89623	tes

In [9]:
pickle.dump(a, open(models_path+'xgboost.pkl', "wb"))

## Evaluating XGBoost

In [10]:
a = pickle.load(open(models_path+'xgboost.pkl', "rb"))

In [17]:
interactions_test=pd.read_csv(data_path+'interactions_test.csv')

In [19]:
cols=['score','rank','track_bias','playlist_bias','dot_product','max_coo','min_coo','mean_coo','median_coc',
         'max_ncoo','min_ncoo','mean_ncoo','median_coo','track_artist_count','track_album_count',
         'track_artist_share','track_album_share','global_track_count','global_artist_count','global_album_count',
         'unique_album_count','unique_artist_count','num_tracks_in_playlist']

In [27]:
track_biases, track_embeddings = cs_model.get_item_representations()
playlist_biases, playlist_embeddings = cs_model.get_user_representations()

rprcs,ndcgs,rec_clicks=[],[],[]

for pid in tqdm(np.unique(interactions_test.gen_pid)):
  candidates=[]
  candidate_stats=[]
  candidate_tracks=get_scores(pid,750)
  tracks_in_playlist=tracksPerPlaylist[pid]
  # relevant_tracks=set(interactions_val[interactions_val['gen_pid']==pid].tid.values)
  playlist_features=[len(np.unique(playlists_info[pid]['albums'])),
                     len(np.unique(playlists_info[pid]['artists'])),len(tracks_in_playlist)]
  for i in range(len(candidate_tracks)):
    score,tid=candidate_tracks[i]
    candidates.append(tid)
    sample=[score,i,track_biases[tid],playlist_biases[pid],
            track_embeddings[tid].dot(playlist_embeddings[pid])]
    playlists_with_ctrack=playlistsPerTrack[tid]
    co_occurence=[]
    norm_co_occurence=[]
    for playlist_track in tracks_in_playlist:
      other_playlists=playlistsPerTrack[playlist_track]
      n=len(other_playlists.intersection(playlists_with_ctrack))
      co_occurence.append(n)
      norm_co_occurence.append(n/len(other_playlists))
    sample.extend([max(co_occurence),min(co_occurence),np.mean(co_occurence),np.median(co_occurence)])
    sample.extend([max(norm_co_occurence),min(norm_co_occurence),np.mean(norm_co_occurence),np.median(norm_co_occurence)])
    t_artist_count=playlists_info[pid]['artists'].count(track_id2artist_uri[tid])
    t_album_count=playlists_info[pid]['albums'].count(track_id2album_uri[tid])
    sample.extend([t_artist_count,t_album_count,
                   t_artist_count/len(tracks_in_playlist),
                   t_album_count/len(tracks_in_playlist)])
    sample.extend([len(playlists_with_ctrack),
                   artist_count[track_id2artist_uri[tid]],
                   album_count[track_id2album_uri[tid]]])
    sample.extend(playlist_features)
    # sample.append(1 if tid in relevant_tracks else 0)
    candidate_stats.append(sample)
  xgtest=xgboost.DMatrix(pd.DataFrame(candidate_stats, columns=cols))
  pred_scores=a.predict(xgtest)
  pred_scores=[(pred_scores[i],candidates[i]) for i in range(len(candidates))]
  pred_scores.sort(reverse=True)
  rprc,ndcg,rec_click=evaluate_playlist(pid,pred_scores,500)
  rprcs.append(rprc)
  ndcgs.append(ndcg)
  rec_clicks.append(rec_click)

print(f'\nAverage R-Precision: {np.average(rprcs)}')
print(f'Average NDCG: {np.average(ndcgs)}')
print(f'Average Recommendation Clicks: {np.average(rec_clicks)}')

  0%|          | 0/17500 [00:00<?, ?it/s]


Average R-Precision: 0.5661144419053539
Average NDCG: 0.13695909616709162
Average Recommendation Clicks: 7.3496
