# Unsupervised Machine Learning: SVD

## Importar Librerías

In [66]:
%reset -f

In [67]:
import warnings

import numpy as np
import pandas as pd   

## Cargar Datos

In [68]:
# Load the dataset
df = pd.read_csv('./datasets/kaggle/spotify_cleaned_dataset.csv')
print(df.shape)
df.head()

(20594, 24)


Unnamed: 0,Artist,Track,Album,Album_type,Danceability,Energy,Loudness,Speechiness,Acousticness,Instrumentalness,...,Title,Channel,Views,Likes,Comments,Licensed,official_video,Stream,EnergyLiveness,most_playedon
0,Gorillaz,Feel Good Inc.,Demon Days,album,0.818,0.705,-6.679,0.177,0.00836,0.00233,...,Gorillaz - Feel Good Inc. (Official Video),Gorillaz,693555221.0,6220896.0,169907.0,True,True,1040235000.0,1.150082,Spotify
1,Gorillaz,Rhinestone Eyes,Plastic Beach,album,0.676,0.703,-5.815,0.0302,0.0869,0.000687,...,Gorillaz - Rhinestone Eyes [Storyboard Film] (...,Gorillaz,72011645.0,1079128.0,31003.0,True,True,310083700.0,15.183585,Spotify
2,Gorillaz,New Gold (feat. Tame Impala and Bootie Brown),New Gold (feat. Tame Impala and Bootie Brown),single,0.695,0.923,-3.93,0.0522,0.0425,0.0469,...,Gorillaz - New Gold ft. Tame Impala & Bootie B...,Gorillaz,8435055.0,282142.0,7399.0,True,True,63063470.0,7.956897,Spotify
3,Gorillaz,On Melancholy Hill,Plastic Beach,album,0.689,0.739,-5.81,0.026,1.5e-05,0.509,...,Gorillaz - On Melancholy Hill (Official Video),Gorillaz,211754952.0,1788577.0,55229.0,True,True,434663600.0,11.546875,Spotify
4,Gorillaz,Clint Eastwood,Gorillaz,album,0.663,0.694,-8.627,0.171,0.0253,0.0,...,Gorillaz - Clint Eastwood (Official Video),Gorillaz,618480958.0,6197318.0,155930.0,True,True,617259700.0,9.942693,Youtube


In [69]:
# Load the ratings dataset
df_ratings = pd.read_csv('./datasets/kaggle/ratings.csv')
print(df_ratings.shape)
df_ratings.head()

(168, 3)


Unnamed: 0,track_id,user_id,rating
0,9666,3,5
1,9667,3,5
2,9668,3,5
3,9669,3,5
4,9670,3,5


## Sistema de Recomendación con SVD

In [None]:
# !pip install scikit-surprise

In [70]:
from surprise import Dataset
from surprise import Reader

# Get minimum and maximum rating from the dataset
min_rating = df_ratings.rating.min()
max_rating = df_ratings.rating.max()

# Load dataset in the way Surprise expects 
reader = Reader(rating_scale=(min_rating, max_rating))
data = Dataset.load_from_df(df_ratings[['user_id', 'track_id', 'rating']], reader)

In [61]:
# from surprise.model_selection import train_test_split

# trainset, testset = train_test_split(data, test_size=0.25)

In [71]:
from surprise import SVD

# Modelling
# algo = SVD()

In [86]:
from surprise.model_selection import cross_validate

# Modelling
algo = SVD(n_epochs=10)
results = cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=10, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 10 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Fold 6  Fold 7  Fold 8  Fold 9  Fold 10 Mean    Std     
RMSE (testset)    0.8754  1.0861  0.9501  1.0743  0.9234  1.1414  1.0985  0.8934  1.2292  0.7676  1.0039  0.1357  
MAE (testset)     0.7538  0.9102  0.7536  0.9098  0.8362  0.9809  0.9337  0.7525  1.0351  0.6485  0.8514  0.1159  
Fit time          0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00    
Test time         0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00    


In [87]:
print("Average RMSE: ", np.average(results["test_rmse"]))
print("Average MAE: ", np.average(results["test_mae"]))

Average RMSE:  1.0039371530062875
Average MAE:  0.8514226897038484


In [88]:
# Hyperparameter tuning
from surprise.model_selection import GridSearchCV
 
param_grid = {
  'n_factors': [20, 50, 100],
  'n_epochs': [5, 10, 20]
}
 
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=10)
gs.fit(data)
 
print(gs.best_score['rmse'])
print(gs.best_params['rmse'])

1.0035645852311728
{'n_factors': 100, 'n_epochs': 20}


In [89]:
# best hyperparameters
best_factor = gs.best_params['rmse']['n_factors']
best_epoch = gs.best_params['rmse']['n_epochs']

In [90]:
# Modelling
algo = SVD(n_factors=best_factor, n_epochs=best_epoch)

In [91]:
from surprise.model_selection import train_test_split

trainset, testset = train_test_split(data, test_size=0.25)

In [92]:
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x13b53ad20>

In [93]:
predictions = algo.test(testset)
predictions[:10]

[Prediction(uid=4, iid=12392, r_ui=1.0, est=1.8034596388866642, details={'was_impossible': False}),
 Prediction(uid=4, iid=11963, r_ui=2.0, est=1.7574730021087865, details={'was_impossible': False}),
 Prediction(uid=3, iid=18517, r_ui=4.0, est=2.900658482309226, details={'was_impossible': False}),
 Prediction(uid=3, iid=17199, r_ui=3.0, est=3.153620450210349, details={'was_impossible': False}),
 Prediction(uid=3, iid=11971, r_ui=2.0, est=3.391809151791307, details={'was_impossible': False}),
 Prediction(uid=3, iid=12973, r_ui=2.0, est=3.678380706001884, details={'was_impossible': False}),
 Prediction(uid=3, iid=18519, r_ui=4.0, est=3.394173401138915, details={'was_impossible': False}),
 Prediction(uid=3, iid=12428, r_ui=3.0, est=3.4612716236723777, details={'was_impossible': False}),
 Prediction(uid=4, iid=18521, r_ui=1.0, est=1.9517234495845142, details={'was_impossible': False}),
 Prediction(uid=3, iid=11965, r_ui=2.0, est=3.445405396799664, details={'was_impossible': False})]

In [94]:
# Ref. https://surprise.readthedocs.io/en/stable/predictions_module.html
# Tal y como indica la documentación, el método .predict() toma como parámetros el (raw) user id y el (raw) item id
algo.predict(uid=str(3), iid=str(9666)) # Recomendación para el usuario 3 y la canción 9666 (Complicated - Avril Lavigne)

Prediction(uid='3', iid='9666', r_ui=None, est=2.5873015873015874, details={'was_impossible': False})

In [95]:
# https://surprise.readthedocs.io/en/stable/FAQ.html
from collections import defaultdict
def get_top_n(predictions, n=10):
    """Return the top-N recommendation for each user from a set of predictions.

    Args:
        predictions(list of Prediction objects): The list of predictions, as
            returned by the test method of an algorithm.
        n(int): The number of recommendation to output for each user. Default
            is 10.

    Returns:
    A dict where keys are user (raw) ids and values are lists of tuples:
        [(raw item id, rating estimation), ...] of size n.
    """

    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est, true_r))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n

In [96]:
n = 10
top_n = get_top_n(predictions, n=n)

# Print the recommended items for each user
for uid, user_ratings in top_n.items():    
    print(uid, [iid for (iid, _, _) in user_ratings])
    # print(f'Top {n} item recommendations for user {uid}:')    
    # print([iid for (iid, _, _) in user_ratings],'\n' if uid!=list(top_n)[-1] else '')

4 [18523, 9668, 18521, 12970, 9667, 51, 17200, 12392, 9675, 17201]
3 [641, 12973, 7978, 639, 18514, 12428, 18516, 11965, 645, 18519]


In [97]:
usuario = 3

# Print the recommended items for the selected user
print(f'Top {n} item recommendations for user {usuario}:')    
df_user = pd.DataFrame([[iid, df.loc[iid,'Artist'], df.loc[iid,'Track'], est, true_r] for (iid, est, true_r) in top_n.get(usuario)],
                            columns=['track_id','Artist','Track','estimate_rating',f'user_{usuario}_true_rating'])
df_user

Top 10 item recommendations for user 3:


Unnamed: 0,track_id,Artist,Track,estimate_rating,user_3_true_rating
0,641,Bon Jovi,It's My Life,3.750684,5.0
1,12973,OneRepublic,Counting Stars,3.678381,2.0
2,7978,David Guetta,I'm Good (Blue),3.586489,2.0
3,639,Bon Jovi,Livin' On A Prayer,3.519076,5.0
4,18514,Olivia Rodrigo,good 4 u,3.513555,4.0
5,12428,Taylor Swift,Snow On The Beach (feat. Lana Del Rey),3.461272,3.0
6,18516,Olivia Rodrigo,drivers license,3.451309,4.0
7,11965,Panic! At The Disco,I Write Sins Not Tragedies,3.445405,2.0
8,645,Bon Jovi,Runaway,3.399287,5.0
9,18519,Olivia Rodrigo,happier,3.394173,4.0


In [98]:
usuario = 4

# Print the recommended items for the selected user
print(f'Top {n} item recommendations for user {usuario}:')    
df_user = pd.DataFrame([[iid, df.loc[iid,'Artist'], df.loc[iid,'Track'], est, true_r] for (iid, est, true_r) in top_n.get(usuario)],
                            columns=['track_id','Artist','Track','estimate_rating',f'user_{usuario}_true_rating'])
df_user


Top 10 item recommendations for user 4:


Unnamed: 0,track_id,Artist,Track,estimate_rating,user_4_true_rating
0,18523,Olivia Rodrigo,"All I Want - From ""High School Musical: The Mu...",2.150173,1.0
1,9668,Avril Lavigne,Girlfriend,2.034719,1.0
2,18521,Olivia Rodrigo,brutal,1.951723,1.0
3,12970,Katy Perry,Firework,1.879105,2.0
4,9667,Avril Lavigne,Sk8er Boi,1.859726,1.0
5,51,Daft Punk,One More Time,1.820988,1.0
6,17200,Shawn Mendes,Stitches,1.808208,1.0
7,12392,Miley Cyrus,Flowers,1.80346,1.0
8,9675,Avril Lavigne,Here's to Never Growing Up,1.80346,1.0
9,17201,Shawn Mendes,Summer of Love (Shawn Mendes & Tainy),1.80346,1.0


### 3.3 Evaluación

Para el conjunto de `testset`, evaluamos el error RMSE entre las predicciones y las verdaderas calificaciones que le habían dado a las películas.

In [99]:
from surprise import accuracy

# Calculate RMSE for test dataset
accuracy.rmse(predictions)

RMSE: 0.9585


0.9584796535126707