## Import Implicit package for ALS

# Alternating Least Squares (ALS) Algorithm

In [1]:
# Imports needed for building the model
import os

import implicit
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.sparse as sparse
import seaborn as sns
from scipy.sparse.linalg import spsolve
from sklearn.model_selection import GridSearchCV


In [2]:
# Directory that holds the Last.fm 1K dataset files.
# Set the LASTFM_DATA_DIR environment variable to read the data from another
# location; when it is unset, the original local path is used unchanged.
general_path = os.environ.get(
    'LASTFM_DATA_DIR',
    '/Users/user/Documents/Đồ án tốt nghiệp/Music Data/lastfm-dataset-1K',
)


In [3]:
# Load the raw listening log from the Last.fm 1K dataset directory.
# The file has no header row, so the column names are supplied explicitly.
tsv_path = f'{general_path}/userid-timestamp-artid-artname-traid-traname.tsv'
column_names = ['user_id', 'time_stamp', 'artist_id', 'artist_name', 'song_id', 'song_name']

try:
    # pandas >= 1.3: malformed lines are skipped with a warning
    # (same behaviour as the older error_bad_lines=False default).
    data = pd.read_table(tsv_path, on_bad_lines='warn', names=column_names)
except TypeError:
    # Older pandas does not know on_bad_lines; fall back to the legacy flag.
    data = pd.read_table(tsv_path, error_bad_lines=False, names=column_names)

data.head(10)


Unnamed: 0,user_id,time_stamp,artist_id,artist_name,song_id,song_name
0,user_000001,2009-05-04T23:08:57Z,f1b1cf71-bd35-4e99-8624-24a6e15f133a,Deep Dish,,Fuck Me Im Famous (Pacha Ibiza)-09-28-2007
1,user_000001,2009-05-04T13:54:10Z,a7f7df4a-77d8-4f12-8acd-5c60c93f4de8,坂本龍一,,Composition 0919 (Live_2009_4_15)
2,user_000001,2009-05-04T13:52:04Z,a7f7df4a-77d8-4f12-8acd-5c60c93f4de8,坂本龍一,,Mc2 (Live_2009_4_15)
3,user_000001,2009-05-04T13:42:52Z,a7f7df4a-77d8-4f12-8acd-5c60c93f4de8,坂本龍一,,Hibari (Live_2009_4_15)
4,user_000001,2009-05-04T13:42:11Z,a7f7df4a-77d8-4f12-8acd-5c60c93f4de8,坂本龍一,,Mc1 (Live_2009_4_15)
5,user_000001,2009-05-04T13:38:31Z,a7f7df4a-77d8-4f12-8acd-5c60c93f4de8,坂本龍一,,To Stanford (Live_2009_4_15)
6,user_000001,2009-05-04T13:33:28Z,a7f7df4a-77d8-4f12-8acd-5c60c93f4de8,坂本龍一,,Improvisation (Live_2009_4_15)
7,user_000001,2009-05-04T13:23:45Z,a7f7df4a-77d8-4f12-8acd-5c60c93f4de8,坂本龍一,,Glacier (Live_2009_4_15)
8,user_000001,2009-05-04T13:19:22Z,a7f7df4a-77d8-4f12-8acd-5c60c93f4de8,坂本龍一,,Parolibre (Live_2009_4_15)
9,user_000001,2009-05-04T13:13:38Z,a7f7df4a-77d8-4f12-8acd-5c60c93f4de8,坂本龍一,,Bibo No Aozora (Live_2009_4_15)


In [4]:
# Derive the working frame from the raw data without touching `data`:
#   1. remove every row that has a missing value in ANY column
#      (this includes the id columns that are discarded in step 2),
#   2. discard the id and timestamp columns, which are not used afterwards.
# The trailing copy() makes `df` an independent frame that is safe to modify.
df = (
    data
    .dropna()
    .drop(columns=['artist_id', 'song_id', 'time_stamp'])
    .copy()
)


In [6]:
# Build one "<song_name> - <artist_name>" key per row.
# str.cat concatenates the two columns in a single vectorised call instead of
# invoking ' - '.join once per row through agg(axis=1).
df['song_artist'] = df['song_name'].str.cat(df['artist_name'], sep=' - ')
df.head(10)


Unnamed: 0,user_id,artist_name,song_name,song_artist
10,user_000001,坂本龍一,The Last Emperor (Theme),The Last Emperor (Theme) - 坂本龍一
12,user_000001,坂本龍一,Tibetan Dance (Version),Tibetan Dance (Version) - 坂本龍一
14,user_000001,Underworld,"Boy, Boy, Boy (Switch Remix)","Boy, Boy, Boy (Switch Remix) - Underworld"
15,user_000001,Underworld,Crocodile (Innervisions Orchestra Mix),Crocodile (Innervisions Orchestra Mix) - Under...
16,user_000001,Ennio Morricone,Ninna Nanna In Blu (Raw Deal Remix),Ninna Nanna In Blu (Raw Deal Remix) - Ennio Mo...
17,user_000001,Minus 8,Elysian Fields,Elysian Fields - Minus 8
18,user_000001,Beanfield,Planetary Deadlock,Planetary Deadlock - Beanfield
19,user_000001,Dj Linus,Good Morning Love Coffee Is Ready,Good Morning Love Coffee Is Ready - Dj Linus
20,user_000001,Alif Tree,Deadly Species,Deadly Species - Alif Tree
21,user_000001,Minus 8,Cold Fusion,Cold Fusion - Minus 8


In [7]:
# song_name and artist_name are now captured by song_artist, so remove them,
# and give every row times = 1 so that plays can be counted per user and song.
df = df.drop(columns=['song_name', 'artist_name']).assign(times=1)


In [8]:
# Normalise the song/artist key to lowercase so that differently-cased
# spellings of the same key compare equal.
lowered_keys = df['song_artist'].str.lower()
df['song_artist'] = lowered_keys


In [9]:
def count_song(df):
    """Count how many times each user played each song.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``user_id``, ``song_artist`` and ``times``.

    Returns
    -------
    pandas.DataFrame
        One row per (user_id, song_artist) pair, with ``times`` holding the
        number of non-null ``times`` entries for that pair. Columns follow the
        order of ``df.columns``. Rows are sorted by user_id and then
        song_artist, and the index is a fresh 0..n-1 range.
    """
    # A single groupby replaces the per-user loop that grew a DataFrame with
    # pd.concat on every iteration (quadratic copying). It also keeps `times`
    # as an integer column instead of risking an object-dtype result.
    counts = (
        df.groupby(['user_id', 'song_artist'], sort=True)['times']
        .count()
        .reset_index()
    )

    # Keep the same column layout as the input frame, as the loop version did.
    return counts.reindex(columns=df.columns.to_list())


In [10]:
# Aggregate the play log into one row per (user, song) with its play count
count_df = count_song(df=df)


In [11]:
# Add an integer code column for each identifier column; the codes are used
# below as row/column positions when building the sparse matrices.
for source_col, code_col in (('user_id', 'user_id_cat'), ('song_artist', 'song_artist_cat')):
    count_df[code_col] = count_df[source_col].astype('category').cat.codes


In [12]:
# Build the interaction matrix in both orientations from the same triplets:
# play counts as values, integer codes as coordinates.
play_counts = count_df['times'].astype(float)
user_codes = count_df['user_id_cat']
song_codes = count_df['song_artist_cat']

# rows = songs, columns = users (the layout passed to model.fit below)
sparse_item_user = sparse.csr_matrix((play_counts, (song_codes, user_codes)))
# rows = users, columns = songs (the layout passed to model.recommend below)
sparse_user_item = sparse.csr_matrix((play_counts, (user_codes, song_codes)))


In [13]:
from implicit.evaluation import precision_at_k, train_test_split, AUC_at_k

# Hyper-parameter sweep: fit one ALS model per (regularization, factors) pair
# on a 75% training split and record precision@10 on the held-out part.
benchmark = []

# Seed NumPy's global RNG before splitting.
# NOTE(review): presumably this makes the split reproducible; confirm that
# train_test_split draws from NumPy's global random state.
np.random.seed(1234)
item_user_train, item_user_test = train_test_split(sparse_item_user, train_percentage=0.75)

# Scale the training play counts by alpha to obtain the confidence matrix
alpha_val = 40
data_conf = (item_user_train * alpha_val).astype("double")

# The user-item views do not change between runs, so build them once
train_user_items = data_conf.T.tocsr()
test_user_items = item_user_test.T.tocsr()

param_grid = [
    (reg, n_factors)
    for reg in [0.1, 0.01, 0.001]
    for n_factors in [20, 50, 100]
]

for reg, n_factors in param_grid:
    model = implicit.als.AlternatingLeastSquares(
        factors=n_factors,
        regularization=reg,
        iterations=20,
        use_cg=True,
    )
    model.fit(data_conf)

    score = precision_at_k(model, train_user_items, test_user_items, K=10)
    benchmark.append(score)
    print("regularization: {}, factors: {}, score: {}".format(reg, n_factors, score))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=20.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=992.0), HTML(value='')))


regularization: 0.1, factors: 20, score: 0.07238114628932098


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=20.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=992.0), HTML(value='')))


regularization: 0.1, factors: 50, score: 0.12206047032474804


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=20.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=992.0), HTML(value='')))


regularization: 0.1, factors: 100, score: 0.1591163595642879


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=20.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=992.0), HTML(value='')))


regularization: 0.01, factors: 20, score: 0.07156673114119923


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=20.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=992.0), HTML(value='')))


regularization: 0.01, factors: 50, score: 0.11880280973226102


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=20.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=992.0), HTML(value='')))


regularization: 0.01, factors: 100, score: 0.1607451898605314


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=20.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=992.0), HTML(value='')))


regularization: 0.001, factors: 20, score: 0.07227934439580576


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=20.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=992.0), HTML(value='')))


regularization: 0.001, factors: 50, score: 0.1266415555329329


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=20.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=992.0), HTML(value='')))


regularization: 0.001, factors: 100, score: 0.16624249211035325


In [15]:
# Train the final model on the full item-user matrix.

# Confidence matrix: play counts scaled by alpha
alpha_val = 40
data_conf = (sparse_item_user * alpha_val).astype('double')

# factors=100 with regularization=0.001 had the highest precision@10
# in the sweep output above
model = implicit.als.AlternatingLeastSquares(
    factors=100,
    regularization=0.001,
    iterations=20,
    use_cg=True,
)

model.fit(data_conf)


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=20.0), HTML(value='')))




In [19]:
def recommend_song(user, number):
    """Return the model's top recommendations for one user as a DataFrame.

    Parameters
    ----------
    user : int
        Value passed to ``model.recommend`` as the user (a ``user_id_cat`` code).
    number : int
        How many recommendations to request.

    Returns
    -------
    pandas.DataFrame
        Columns ``song_artist`` (name looked up in ``count_df``) and ``score``,
        in the order returned by the model.

    Relies on the notebook-level ``model``, ``sparse_user_item`` and
    ``count_df``. Expects ``model.recommend`` to yield (item code, score) pairs.
    """
    picks = model.recommend(user, sparse_user_item, N=number, filter_already_liked_items=True)

    def _name_for(code):
        # First row whose song code matches gives the readable name
        names = count_df['song_artist']
        return names.loc[count_df['song_artist_cat'] == code].iloc[0]

    titles = [_name_for(code) for code, _ in picks]
    strengths = [strength for _, strength in picks]

    return pd.DataFrame({'song_artist': titles, 'score': strengths})


In [20]:
def similar_song(item, number):
    """Return the songs the model considers most similar to a given song.

    Parameters
    ----------
    item : int
        Value passed to ``model.similar_items`` as the item (a
        ``song_artist_cat`` code).
    number : int
        How many similar items to request.

    Returns
    -------
    pandas.DataFrame
        Columns ``song_artist`` (name looked up in ``count_df``) and
        ``scores``, in the order returned by the model.

    Relies on the notebook-level ``model`` and ``count_df``. Expects
    ``model.similar_items`` to yield (item code, score) pairs.
    """
    neighbours = model.similar_items(item, number)

    def _name_for(code):
        # First row whose song code matches gives the readable name
        names = count_df['song_artist']
        return names.loc[count_df['song_artist_cat'] == code].iloc[0]

    titles = [_name_for(code) for code, _ in neighbours]
    closeness = [value for _, value in neighbours]

    return pd.DataFrame({'song_artist': titles, 'scores': closeness})


In [21]:
# Example: request 10 similar songs for the song with code 390598
similar_song(item=390598, number=10)

Unnamed: 0,song_artist,scores
0,you must pay - godheadsilo,0.04439
1,living room - l. voag,0.044389
2,mercurochrome - steel pole bath tub,0.044389
3,wounded bird - the judybats,0.044389
4,empty particles - a frames,0.044389
5,pluie sans nuages - abbc,0.044389
6,bored to death - kustomized,0.044389
7,untitled - cop shoot cop,0.044389
8,pull me back in - run westy run,0.044389
9,too long - m.o.g.,0.044389


In [22]:
# Example: request 10 recommendations for the user with code 19
recommend_song(user=19, number=10)

Unnamed: 0,song_artist,score
0,u boat - kasabian,1.511003
1,what's the word - we are scientists,1.408819
2,worth the wait - we are scientists,1.374809
3,two more years - bloc party,1.360508
4,callbacks - we are scientists,1.336251
5,the sweets - yeah yeah yeahs,1.326645
6,god knows - mando diao,1.324954
7,can't lose - we are scientists,1.313945
8,as sure as the sun - black rebel motorcycle club,1.2998
9,someone says - editors,1.25995


In [23]:
# Write the per-user play counts to CSV, leaving out the DataFrame index
count_df.to_csv(path_or_buf='user-song-plays.csv', index=False)

In [25]:
# Reduce the dataset to each user's 100 most-played songs.
# Sorting by user code and then by descending play count lets
# groupby().head(100) keep the top rows of every user in one pass. This
# replaces the per-user loop built on DataFrame.append, which copied the
# growing frame on every iteration and was removed in pandas 2.0.
# Sorting on several columns is stable, so ties keep their original order.
new_cf_df = (
    count_df
    .sort_values(['user_id_cat', 'times'], ascending=[True, False])
    .groupby('user_id_cat', sort=False)
    .head(100)
    .reset_index(drop=True)
)

In [26]:
# Write the reduced (top songs per user) dataset to CSV, leaving out the index
new_cf_df.to_csv(path_or_buf='reduced_cf_df.csv', index=False)