# LightFM Music Recommender

## Imports

In [1]:
#!pip install lightfm
!pip install fastparquet



In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import sparse as sp
from warnings import filterwarnings
from matplotlib import pyplot as plt

import os

from lightfm import LightFM
from sklearn.metrics import pairwise as pw
from lightfm.evaluation import precision_at_k, recall_at_k, auc_score, reciprocal_rank

sns.set()
filterwarnings('ignore')

%matplotlib inline



In [3]:
from lightfm.data import Dataset

### Small dataset

In [4]:
# Small dataset: read the train and validation splits (in that order) with
# the fastparquet engine.
train, val = (
    pd.read_parquet(parquet_file, engine='fastparquet')
    for parquet_file in ('train.parquet', 'val.parquet')
)

In [5]:
# Keep the columns at positions 2, 1, 0 (in that order) for both splits.
# NOTE(review): presumably this yields (user_id, recording_mbid, weight), the
# order build_interactions consumes below — confirm against the parquet schema.
column_order = [2, 1, 0]
train, val = train.iloc[:, column_order], val.iloc[:, column_order]

In [6]:
# Stack both splits, then discard rows that are exact duplicates.
full = pd.concat([train, val])
full = full.drop_duplicates()

In [7]:
# Create an empty lightfm Dataset; it is fitted on the user/recording ids and
# then used to build the interaction matrices in the following cells.
d = Dataset()

In [8]:
# Fit the Dataset on every user id and recording id present in either split.
user_ids = np.unique(full['user_id'])
recording_ids = full['recording_mbid'].unique()
d.fit(user_ids, recording_ids)

In [9]:
# Turn each split's rows into an (interactions, weights) pair via the fitted Dataset.
train_rows, val_rows = train.values, val.values
train_interactions, train_weights = d.build_interactions(train_rows)
val_interactions, val_weights = d.build_interactions(val_rows)

In [10]:
%%time

model = LightFM(loss='warp-kos', learning_rate=0.05)
model.fit(train_interactions, epochs=10)

CPU times: user 6.37 s, sys: 28.9 ms, total: 6.4 s
Wall time: 6.41 s


<lightfm.lightfm.LightFM at 0x7fa1d4302370>

In [11]:
# Average the precision@100 scores computed on the validation interactions.
val_precision_at_100 = precision_at_k(model, val_interactions, k=100)
val_precision_at_100.mean()

0.028906249

In [12]:
# Average the AUC scores computed on the validation interactions (4 decimals).
val_auc = auc_score(model, val_interactions)
val_auc.mean().round(4)

0.4688

In [13]:
# Average the precision@100 scores computed on the training interactions.
train_precision_at_100 = precision_at_k(model, train_interactions, k=100)
train_precision_at_100.mean()

0.22630036

In [14]:
# Average the AUC scores computed on the training interactions (4 decimals).
train_auc = auc_score(model, train_interactions)
train_auc.mean().round(4)

0.8477

### Full dataset

In [15]:
# Full dataset: each split is stored as a directory of parquet part files.
# NOTE(review): DATA_DIR is a machine-specific absolute path — point it at your
# own copy of the data before running.
DATA_DIR = '/Users/christinegao/Documents/dsga 1004/final-project-group16'
file_path_val = DATA_DIR + '/val_interaction_noncoldstart.parquet/'
file_path_train = DATA_DIR + '/train_interaction_noncoldstart.parquet/'


def load_parquet_dir(directory):
    """Read every ``*.parquet`` file in ``directory`` into a single DataFrame.

    Files are read in sorted name order so the resulting row order does not
    depend on the arbitrary ordering returned by ``os.listdir``.

    Parameters
    ----------
    directory : str
        Path of the directory that holds the parquet part files.

    Returns
    -------
    pandas.DataFrame
        All part files concatenated, with the index reset (``ignore_index=True``).

    Raises
    ------
    FileNotFoundError
        If ``directory`` contains no file ending in ``.parquet``.
    """
    part_files = sorted(
        name for name in os.listdir(directory) if name.endswith('.parquet')
    )
    if not part_files:
        # Fail with an explicit message instead of pd.concat's
        # "No objects to concatenate".
        raise FileNotFoundError(f'no .parquet files found in {directory!r}')
    frames = [pd.read_parquet(os.path.join(directory, name)) for name in part_files]
    return pd.concat(frames, ignore_index=True)


val_big = load_parquet_dir(file_path_val)
train_big = load_parquet_dir(file_path_train)

# The original cell also rebound `val` to the full validation frame; keep that
# name available without concatenating the part files a second time.
val = val_big

In [16]:
# Keep the columns at positions 0, 2, 1 (in that order) for both splits.
# NOTE(review): presumably this yields (user_id, recording_mbid, weight), the
# order build_interactions consumes below — confirm against the parquet schema.
column_order = [0, 2, 1]
val_big, train_big = val_big.iloc[:, column_order], train_big.iloc[:, column_order]

In [17]:
# Stack both full-size splits, then discard rows that are exact duplicates.
full = pd.concat([train_big, val_big])
full = full.drop_duplicates()

In [18]:
# Start from a fresh Dataset and fit it on every user id and recording id
# present in the full-size data.
d = Dataset()
all_users = np.unique(full['user_id'])
all_recordings = full['recording_mbid'].unique()
d.fit(all_users, all_recordings)

In [19]:
# Turn each full-size split's rows into an (interactions, weights) pair via the fitted Dataset.
train_rows, val_rows = train_big.values, val_big.values
train_interactions, train_weights = d.build_interactions(train_rows)
val_interactions, val_weights = d.build_interactions(val_rows)

In [20]:
%%time

model = LightFM(loss='warp-kos', learning_rate=0.05)
model.fit(train_interactions, epochs=10)

CPU times: user 3min 38s, sys: 1.18 s, total: 3min 39s
Wall time: 3min 41s


<lightfm.lightfm.LightFM at 0x7fa1d42e5d60>

In [21]:
# Average the precision@100 scores computed on the validation interactions (4 decimals).
val_precision_at_100 = precision_at_k(model, val_interactions, k=100)
val_precision_at_100.mean().round(4)

0.0006

In [22]:
# Average the AUC scores computed on the validation interactions (4 decimals).
val_auc = auc_score(model, val_interactions)
val_auc.mean().round(4)

0.4078

In [23]:
# Average the precision@100 scores computed on the training interactions (4 decimals).
train_precision_at_100 = precision_at_k(model, train_interactions, k=100)
train_precision_at_100.mean().round(4)

0.1765