<a href="https://colab.research.google.com/github/dhruthick/cse256project/blob/main/recommendation/lightfm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Setup and Imports

In [None]:
!pip install lightfm

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting lightfm
  Downloading lightfm-1.17.tar.gz (316 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.4/316.4 kB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: lightfm
  Building wheel for lightfm (setup.py) ... [?25l[?25hdone
  Created wheel for lightfm: filename=lightfm-1.17-cp310-cp310-linux_x86_64.whl size=867268 sha256=5d50845611e0e13e9bc54c51601ff513a5c425a2c17037787ea6b15a0547a20f
  Stored in directory: /root/.cache/pip/wheels/4f/9b/7e/0b256f2168511d8fa4dae4fae0200fdbd729eb424a912ad636
Successfully built lightfm
Installing collected packages: lightfm
Successfully installed lightfm-1.17


In [None]:
# Colab-only step: mount Google Drive at /content/drive.
# The data and model paths used later in this notebook live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
import math

import joblib

import scipy.sparse as sp
from tqdm import tqdm

from lightfm import LightFM

from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer

import csv
from collections import defaultdict

## Read data

In [None]:
# Directories on the mounted Drive holding the input CSVs and the saved models.
data_path = '/content/drive/MyDrive/cse256/project/data/'
models_path = '/content/drive/MyDrive/cse256/project/models/'

# Load the three interaction tables the same way; the 'pid' column is dropped from each on load.
all_interactions, interactions_train, interactions_val = (
    pd.read_csv(data_path + csv_name).drop(columns='pid')
    for csv_name in (
        'all_interactions.csv',
        'interactions_train.csv',
        'interactions_val.csv',
    )
)

In [None]:
# Sorted unique playlist ids that occur in the training interactions.
train_pids = np.unique(interactions_train['gen_pid'])

# Playlist metadata, restricted to playlists whose 'pid' is one of the training ids.
# NOTE(review): this compares playlists.csv 'pid' against 'gen_pid' values - confirm both use the same id space.
playlist_info = (
    pd.read_csv(data_path + 'playlists.csv')
    .loc[lambda frame: frame['pid'].isin(train_pids)]
)

## Necessary data structures

In [None]:
# Matrix dimensions: distinct playlists in the training split, distinct tracks across all interactions.
# NOTE(review): these are later used as matrix shapes, which presumably requires ids to be
# contiguous in 0..count-1 - confirm against the preprocessing step.
num_playlists = np.unique(interactions_train['gen_pid']).size
num_tracks = np.unique(all_interactions['tid']).size

In [None]:
# Lookup tables filled in one pass over the training interactions.
playlistsPerTrack, tracksPerPlaylist = defaultdict(set), defaultdict(set)  # tid -> pids, pid -> tids
trackNames, track_id2artist_uri, track_id2album_uri = {}, {}, {}           # per-track metadata
artist_count, album_count = defaultdict(int), defaultdict(int)            # rows seen per artist / album uri

for pid, tid, track_name, artist, album_uri, artist_uri in tqdm(
    interactions_train[
        ['gen_pid', 'tid', 'track_name', 'artist_name', 'album_uri', 'artist_uri']
    ].values.tolist()
):
    # Membership in both directions.
    tracksPerPlaylist[pid].add(tid)
    playlistsPerTrack[tid].add(pid)

    # Track metadata; a later row for the same tid overwrites the earlier entry.
    trackNames[tid] = (track_name, artist)
    track_id2album_uri[tid] = album_uri
    track_id2artist_uri[tid] = artist_uri

    # One increment per interaction row (not per distinct track).
    album_count[album_uri] += 1
    artist_count[artist_uri] += 1

In [None]:
# For every training playlist, collect its track ids, album uris and artist uris
# as parallel lists in the row order of that playlist's group.
playlists_info = {
    pid: {
        'tracks': ptracks['tid'].values.tolist(),
        'albums': ptracks['album_uri'].tolist(),
        'artists': ptracks['artist_uri'].tolist(),
    }
    for pid, ptracks in interactions_train.groupby('gen_pid')
}

## Playlist name as a feature

In [None]:
# Playlist titles indexed by playlist id and aligned to rows 0..num_playlists-1;
# ids with no title (or a missing one) are encoded as the empty string.
playlist_names = (
    playlist_info.set_index('pid')['name']
    .reindex(np.arange(num_playlists))
    .fillna('')
)

# Bag-of-words counts over the titles, vocabulary capped at 20000 terms.
vectorizer = CountVectorizer(max_features=20000)
user_features = vectorizer.fit_transform(playlist_names)

In [None]:
# Final user-feature matrix: one identity column per playlist followed by the title word counts.
# The word counts are recomputed from the already-fitted `vectorizer` instead of reading
# `user_features` back in, so re-running this cell does not stack a second identity block.
user_features = sp.hstack([sp.eye(num_playlists), vectorizer.transform(playlist_names)])

## Setup interaction matrix

In [None]:
# Binary playlist x track interaction matrix in COO form: one entry of 1.0 per training row,
# placed at (gen_pid, tid).
X_train = sp.coo_matrix(
    (
        np.ones(interactions_train.shape[0]),
        (interactions_train['gen_pid'], interactions_train['tid']),
    ),
    shape=(num_playlists, num_tracks),
)

## Functions to evaluate recommendations

In [None]:
def get_scores(pid,N):
  """Return the N best-scoring candidate tracks for one playlist.

  Candidates are every track id in the global `all_interactions` that is not
  already in `tracksPerPlaylist[pid]`. Each candidate is scored with the
  global `model`.

  Args:
    pid: playlist id to score for.
    N: maximum number of results to return.

  Returns:
    A list of at most N `(score, track_id)` tuples, sorted in descending order.
  """
  candidate_pool=np.unique(all_interactions.tid)
  seen=tracksPerPlaylist[pid]
  candidates=[tid for tid in candidate_pool if tid not in seen]
  # One user id per candidate so that both id sequences have the same length.
  predicted=model.predict(user_ids=[pid]*len(candidates),
                          item_ids=candidates)
  ranked=sorted(zip(predicted,candidates),reverse=True)
  return ranked[:N]


In [None]:
def evaluate_playlist(pid,N):
  """Score the top-N recommendations for one playlist against its validation tracks.

  Args:
    pid: playlist id; its held-out tracks are read from the global `interactions_val`.
    N: number of recommendations requested from `get_scores`.

  Returns:
    A tuple `(rprc, ndcg, rec_click)`:
      rprc: fraction of the held-out tracks that appear among the recommendations.
        NOTE(review): this is taken over all N recommendations, not only the
        top len(relevant) - confirm that is the intended R-Precision variant.
      ndcg: DCG of the ranking with binary relevance, divided by the DCG of an
        ideal ranking (all hits placed first), so a perfect ranking scores 1.0.
      rec_click: 1-based index of the first block of 10 recommendations that
        contains a held-out track, or int(N/10)+1 when no block does.

    A playlist without held-out tracks yields `(0.0, 0.0, int(N/10)+1)` instead
    of raising ZeroDivisionError.
  """
  relevantTracks=set(interactions_val[interactions_val['gen_pid']==pid].tid.values)
  max_clicks=int(N/10)+1
  if not relevantTracks:
    # Nothing can be hit, so report the worst value of every metric.
    return 0.0,0.0,max_clicks

  scores=get_scores(pid,N)
  ranked=[t[1] for t in scores]
  rprc=len(set(ranked).intersection(relevantTracks))/len(relevantTracks)

  # Gain at 0-based rank i is 1/log2(i+2), counted only for relevant tracks.
  dcg=0
  for i,tid in enumerate(ranked):
    if tid in relevantTracks:
      dcg+=math.log(2)/math.log(i+2)
  # Ideal DCG: every achievable hit sits at the top of the list.
  ideal_hits=min(len(relevantTracks),len(ranked))
  idcg=sum(math.log(2)/math.log(i+2) for i in range(ideal_hits))
  ndcg=dcg/idcg if idcg>0 else 0.0

  rec_click=max_clicks
  for i in range(0,int(N/10)):
    if relevantTracks.intersection(ranked[i*10:(i*10+10)]):
      rec_click=i+1
      break
  return rprc,ndcg,rec_click

In [None]:
def evaluate(k,N,print=False):
  """Average the per-playlist metrics over the first k validation playlists.

  Args:
    k: number of validation playlists to evaluate (the first k of the sorted
      unique `gen_pid` values in the global `interactions_val`).
    N: number of recommendations requested per playlist.
    print: when true, also print the three averages. The parameter name is
      kept for existing callers even though it shadows the builtin.

  Returns:
    A tuple `(avg_rprc, avg_ndcg, avg_rec_clicks)` of `np.average` results.
  """
  # The `print` argument hides the builtin inside this function, so calling
  # `print(...)` with print=True would call a bool; use the builtin explicitly.
  import builtins

  val_pids=np.unique(interactions_val.gen_pid)
  rprcs,ndcgs,rec_clicks=[],[],[]
  for pid in tqdm(val_pids[:k]):
    rprc,ndcg,rec_click=evaluate_playlist(pid,N=N)
    rprcs.append(rprc)
    ndcgs.append(ndcg)
    rec_clicks.append(rec_click)

  avg_rprc,avg_ndcg,avg_clicks=np.average(rprcs),np.average(ndcgs),np.average(rec_clicks)
  if print:
    builtins.print(f'\nAverage R-Precision: {avg_rprc}')
    builtins.print(f'Average NDCG: {avg_ndcg}')
    builtins.print(f'Average Recommendation Clicks: {avg_clicks}')
  return avg_rprc,avg_ndcg,avg_clicks

## LightFM training

In [None]:
# Train in rounds of 5 epochs (up to 60 rounds). After each round, evaluate on the first
# 25 validation playlists with 500 recommendations each, and checkpoint the model whenever
# the average R-Precision improves on the best value seen so far.
# NOTE(review): `user_features` built earlier in the notebook is not passed to `fit_partial`
# here (nor to `predict` in `get_scores`) - confirm whether the playlist-name features are
# meant to be used.
best_arprc=0
model = LightFM(no_components=200, loss='warp', learning_rate=0.02, max_sampled=400, random_state=1, user_alpha=1e-05)
for i in range(60):
    model.fit_partial(X_train, epochs=5, verbose=True)
    arprc,andcg,arec_clicks=evaluate(25,500,print=False)
    print(f'Average R-Precision: {arprc}')
    if best_arprc<arprc:
      best_arprc=arprc
      print('Saving_model...',end='')
      # Context manager guarantees the checkpoint file is flushed and closed.
      with open(models_path+'lightfm_model.pkl', 'wb') as model_file:
        joblib.dump(model, model_file)
      print('Done')


Epoch: 100%|██████████| 5/5 [03:39<00:00, 43.92s/it]
100%|██████████| 25/25 [00:10<00:00,  2.31it/s]


Average R-Precision: 0.2666961634856371
Saving_model...Done


Epoch: 100%|██████████| 5/5 [05:48<00:00, 69.79s/it]
100%|██████████| 25/25 [00:10<00:00,  2.31it/s]


Average R-Precision: 0.3306712760325629
Saving_model...Done


Epoch: 100%|██████████| 5/5 [06:45<00:00, 81.06s/it]
100%|██████████| 25/25 [00:09<00:00,  2.56it/s]


Average R-Precision: 0.40153215371696993
Saving_model...Done


Epoch: 100%|██████████| 5/5 [07:35<00:00, 91.01s/it]
100%|██████████| 25/25 [00:09<00:00,  2.58it/s]


Average R-Precision: 0.4013037546650415


Epoch: 100%|██████████| 5/5 [08:23<00:00, 100.71s/it]
100%|██████████| 25/25 [00:09<00:00,  2.61it/s]


Average R-Precision: 0.4013841731024394


Epoch: 100%|██████████| 5/5 [09:10<00:00, 110.01s/it]
100%|██████████| 25/25 [00:09<00:00,  2.51it/s]


Average R-Precision: 0.37500077871904497


Epoch: 100%|██████████| 5/5 [09:56<00:00, 119.25s/it]
100%|██████████| 25/25 [00:10<00:00,  2.37it/s]


Average R-Precision: 0.3981995448651795


Epoch: 100%|██████████| 5/5 [10:42<00:00, 128.56s/it]
100%|██████████| 25/25 [00:10<00:00,  2.36it/s]


Average R-Precision: 0.44809252145380823
Saving_model...Done


Epoch: 100%|██████████| 5/5 [11:22<00:00, 136.41s/it]
100%|██████████| 25/25 [00:09<00:00,  2.56it/s]


Average R-Precision: 0.45272744208872895
Saving_model...Done


Epoch: 100%|██████████| 5/5 [12:05<00:00, 145.01s/it]
100%|██████████| 25/25 [00:11<00:00,  2.23it/s]


Average R-Precision: 0.4526322039934908


Epoch: 100%|██████████| 5/5 [12:44<00:00, 152.82s/it]
100%|██████████| 25/25 [00:10<00:00,  2.38it/s]


Average R-Precision: 0.4407274420887289


Epoch: 100%|██████████| 5/5 [13:16<00:00, 159.28s/it]
100%|██████████| 25/25 [00:08<00:00,  2.79it/s]


Average R-Precision: 0.43550733299245886


Epoch: 100%|██████████| 5/5 [13:41<00:00, 164.22s/it]
100%|██████████| 25/25 [00:08<00:00,  2.78it/s]


Average R-Precision: 0.4167237072614647


Epoch: 100%|██████████| 5/5 [14:04<00:00, 168.80s/it]
100%|██████████| 25/25 [00:10<00:00,  2.41it/s]


Average R-Precision: 0.41002217081164444


Epoch: 100%|██████████| 5/5 [14:03<00:00, 168.77s/it]
100%|██████████| 25/25 [00:10<00:00,  2.44it/s]


Average R-Precision: 0.4019635627530364


Epoch: 100%|██████████| 5/5 [14:28<00:00, 173.80s/it]
100%|██████████| 25/25 [00:10<00:00,  2.44it/s]


Average R-Precision: 0.41218334297281667


Epoch: 100%|██████████| 5/5 [14:41<00:00, 176.29s/it]
100%|██████████| 25/25 [00:11<00:00,  2.15it/s]


Average R-Precision: 0.4119635627530364


Epoch: 100%|██████████| 5/5 [14:52<00:00, 178.52s/it]
100%|██████████| 25/25 [00:08<00:00,  2.92it/s]


Average R-Precision: 0.40643975322922693


Epoch: 100%|██████████| 5/5 [15:04<00:00, 180.91s/it]
100%|██████████| 25/25 [00:09<00:00,  2.58it/s]


Average R-Precision: 0.4035826103720841


Epoch: 100%|██████████| 5/5 [15:13<00:00, 182.71s/it]
100%|██████████| 25/25 [00:10<00:00,  2.43it/s]


Average R-Precision: 0.4024297281665703


Epoch: 100%|██████████| 5/5 [15:29<00:00, 185.92s/it]
100%|██████████| 25/25 [00:10<00:00,  2.43it/s]


Average R-Precision: 0.4076119144013881


Epoch: 100%|██████████| 5/5 [15:39<00:00, 187.84s/it]
100%|██████████| 25/25 [00:10<00:00,  2.42it/s]


Average R-Precision: 0.3955066512434934


Epoch: 100%|██████████| 5/5 [15:57<00:00, 191.51s/it]
100%|██████████| 25/25 [00:10<00:00,  2.38it/s]


Average R-Precision: 0.41074474648158854


Epoch: 100%|██████████| 5/5 [16:25<00:00, 197.11s/it]
100%|██████████| 25/25 [00:10<00:00,  2.41it/s]


Average R-Precision: 0.40366782340466556


Epoch: 100%|██████████| 5/5 [16:34<00:00, 198.82s/it]
100%|██████████| 25/25 [00:08<00:00,  2.86it/s]


Average R-Precision: 0.40366782340466556


Epoch: 100%|██████████| 5/5 [16:20<00:00, 196.18s/it]
100%|██████████| 25/25 [00:10<00:00,  2.49it/s]


Average R-Precision: 0.40100115673799885


Epoch: 100%|██████████| 5/5 [16:17<00:00, 195.57s/it]
100%|██████████| 25/25 [00:10<00:00,  2.50it/s]


Average R-Precision: 0.39100115673799885


Epoch: 100%|██████████| 5/5 [16:20<00:00, 196.10s/it]
100%|██████████| 25/25 [00:09<00:00,  2.52it/s]


Average R-Precision: 0.38909639483323694


Epoch:  80%|████████  | 4/5 [13:05<03:16, 196.43s/it]