Name: **Christian Mundwiler**  
Project Title: **Music Recommendation Service**  
Class: **CSCI 4930 Machine Learning**

In [10]:
# import libraries
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.sparse import csr_matrix
from recommender import Recommender

In [11]:
song_plays = pd.read_csv('datasets/train_triplets.csv', delim_whitespace=True)
song_plays.columns = ['User ID', 'Song ID', 'Listen Counts']

In [None]:
unique_tracks = pd.read_csv('datasets/unique_tracks.txt', delimiter="<SEP>")
unique_tracks.columns = ['Track ID', 'Song ID', 'Artist', 'Song Title']

In [13]:
song_plays.head()

Unnamed: 0,User ID,Song ID,Listen Counts
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAPDEY12A81C210A9,1
1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBBMDR12A8C13253B,2
2,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBFNSP12AF72A0E22,1
3,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBFOVM12A58A7D494,1
4,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBNZDC12A6D4FC103,1


In [14]:
unique_tracks.head()

Unnamed: 0,Track ID,Song ID,Artist,Song Title
0,TRMMMKD128F425225D,SOVFVAK12A8C1350D9,Karkkiautomaatti,Tanssi vaan
1,TRMMMRX128F93187D9,SOGTUKN12AB017F4F1,Hudson Mohawke,No One Could Ever
2,TRMMMCH128F425532C,SOBNYVR12A8C13558C,Yerba Brava,Si Vos Querés
3,TRMMMWA128F426B589,SOHSBXH12A8C13B0DF,Der Mystic,Tangle Of Aspens
4,TRMMMXN128F42936A5,SOZVAPQ12A8C13B63C,David Montgomery,"Symphony No. 1 G minor ""Sinfonie Serieuse""/All..."


In [15]:
# get dataset info
print(song_plays.shape[0])
print(unique_tracks.shape[0])

48373585
999999


In [16]:
# remove dups
unique_tracks.drop_duplicates(subset=['Song ID'], inplace=True)

In [17]:
# merge datasets
song_data = pd.merge(song_plays, unique_tracks, on = "Song ID", how = "left")

In [18]:
song_data.drop(columns=['Track ID'], inplace = True)

In [19]:
song_data.head()

Unnamed: 0,User ID,Song ID,Listen Counts,Artist,Song Title
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAPDEY12A81C210A9,1,Billy Preston,Nothing from Nothing
1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBBMDR12A8C13253B,2,Paco De Lucia,Entre Dos Aguas
2,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBFNSP12AF72A0E22,1,Josh Rouse,Under Cold Blue Stars
3,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBFOVM12A58A7D494,1,The Dead 60s,Riot Radio (Soundtrack Version)
4,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBNZDC12A6D4FC103,1,Amset,Sin límites (I)


In [20]:
# verify merge
song_data.shape[0]

48373585

In [21]:
# find NaN values
song_data.isnull().sum(axis = 0)

User ID           0
Song ID           0
Listen Counts     0
Artist            3
Song Title       68
dtype: int64

In [22]:
song_data.dropna(axis = 0, subset = ['Song Title'], inplace = True)

In [23]:
# verify drop
song_data.shape[0]

48373517

In [24]:
user_listens = song_data.groupby('User ID')['Song ID'].count()

In [25]:

print(f"Users listen to an average of {np.around(user_listens.mean(), decimals=4)} songs each.")
print(f"The median for songs listened to by users is {np.median(user_listens)} songs")

Users listen to an average of 47.4567 songs each.
The median for songs listened to by users is 27.0 songs


In [26]:
song_users = song_data.groupby('Song ID')['User ID'].count()

In [27]:
print(f"Songs were listened to by an average of {np.around(song_users.mean(), decimals=4)} users each.")
print(f"The median amount of unique listeners for each song is {np.median(song_users)} listeners.")

Songs were listened to by an average of 125.7955 users each.
The median amount of unique listeners for each song is 13.0 listeners.


In [28]:
print(f"The maximum listeners for one song was {np.max(song_users)}.")
print(f"The minimum listeners for one song was {np.min(song_users)}.")

The maximum listeners for one song was 110479.
The minimum listeners for one song was 1.


In [29]:

#Unique songs
unique_songs = song_data['Song Title'].unique().shape[0]
print(f"There are {unique_songs} unique songs in the dataset")

There are 306720 unique songs in the dataset


In [30]:
#Unique users
unique_users = song_data['User ID'].unique().shape[0]
print(f"There are {unique_users} unique users in the dataset")

There are 1019318 unique users in the dataset


In [31]:
values_matrix = unique_users * unique_songs
zero_values_matrix = values_matrix - song_data.shape[0]
print(values_matrix)
print(zero_values_matrix)

312645216960
312596843443


In [32]:
print(f"The matrix of users * songs has {zero_values_matrix} cells that are zero")


The matrix of users * songs has 312596843443 cells that are zero


In [33]:
# since data is so sparse, only those users who have listened to more than  songs.
user_id_list = user_listens[user_listens > 175].index.to_list()
song_id_list = song_users[song_users > 175].index.to_list()

In [34]:
# Filtered the dataset to keep only those users with more than 175 listened
song_data_ten_listens = song_data[song_data['User ID'].isin(user_id_list)].reset_index(drop=True)

In [35]:
song_data_complete = song_data_ten_listens[song_data_ten_listens['Song ID'].isin(song_id_list)].reset_index(drop=True)

In [36]:
songs_features = song_data_complete.pivot_table(index='Song ID', columns='User ID', values='Listen Counts').fillna(0)

In [37]:
songs_features.head()

User ID,00028f3cff4872bff3e9985cfa32e01a8d54e374,0002e94348b2543c6e6ccf408b0160d14064e46f,0003a64f7a69e5b87a80b09c3772227185c235c7,00043d7bc800ceff4a90459e189eba5d442a1d3d,00063d8046ee046b43709a819211ccfc4b21a6d0,0009dc26c4ae94eb9409a726c26d3fe749f6faa1,000d80cd9b58a8f77b33aa613dcfc5cbf1daf5e8,000dd746b05684caffc8875784cb27c4503632ca,000ebc858861aca26bac9b49f650ed424cf882fc,00106661302d2251d8bb661c91850caa65096457,...,fff4493d530cd62e88bed73849b35687a08b1ee7,fff6c30c773e6ffafcac213c9afd9666afaf6d63,fff759a45a3a68de552740e8285a97d5f65d4e58,fff787a6f818de536687c0402d42c62841ab5d5c,fff83c8596c1519f90fd5c5ed540f2ad93ea7bc5,fff90dcb01432ff9b9b837ed9af3c533d533fa22,fffb0b218640d86e5cb99d41cd3ecad977142da5,fffbab4b8416fc41d05fcbdcf0e6735c4f37cb39,fffc0df75a48d823ad5abfaf2a1ee61eb1e3302c,ffffcfeb0c1b66bd212ea58d918c7dc62fb9c3a5
Song ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
SOAAAGQ12A8C1420C8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
SOAABSB12A8C143E55,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
SOAACGG12A58A7A034,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
SOAACPJ12A81C21360,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
SOAACSG12AB018DC80,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [46]:
# obtain a sparse matrix
matrix_songs_features = csr_matrix(songs_features.values)

In [59]:
song_data = song_data.rename(columns={'Song Title': 'title'})

In [61]:
df_unique_songs = song_data.drop_duplicates(subset=['Song ID']).reset_index(drop=True)[['Song ID', 'title']]


In [63]:
# find song titles from song ids
decode_id_song = {
    song: i for i, song in 
    enumerate(list(df_unique_songs.set_index('Song ID').loc[songs_features.index].title))
}

In [67]:
model = Recommender(metric='cosine', algorithm='brute', k=10, data=matrix_songs_features, decode_id_song=decode_id_song)

In [88]:
song = 'whole lotta love'

In [89]:
new_recommendations = model.make_recommendation(new_song=song, n_recommendations=10)

Starting the recommendation process for whole lotta love ...
... Done


In [91]:
print(f"The recommendations for {song} are:")
for song in new_recommendations:
    print(f"{song}")

The recommendations for Sweet Love Child are:
Faraway Land
No Ordinary Love feat. Nirva Dorsaint-Ready
She'd Rather Be With Me
Superstitious
Da Cor Do Pecado
Do You Believe In Magic?
Simple As That
Borivóknak való
Naturally
Sweet Love Child
