# Unsupervised Machine Learning: SVD

## Importar Librerías

In [186]:
%reset -f

In [187]:
# ===== Librerías ==========================================
import warnings

import numpy as np
import pandas as pd   

## Cargar Datos

In [188]:
# Load the dataset
df = pd.read_csv('./datasets/kaggle/spotify_cleaned_dataset.csv')
print(df.shape)
df.head()

(20594, 24)


Unnamed: 0,Artist,Track,Album,Album_type,Danceability,Energy,Loudness,Speechiness,Acousticness,Instrumentalness,...,Title,Channel,Views,Likes,Comments,Licensed,official_video,Stream,EnergyLiveness,most_playedon
0,Gorillaz,Feel Good Inc.,Demon Days,album,0.818,0.705,-6.679,0.177,0.00836,0.00233,...,Gorillaz - Feel Good Inc. (Official Video),Gorillaz,693555221.0,6220896.0,169907.0,True,True,1040235000.0,1.150082,Spotify
1,Gorillaz,Rhinestone Eyes,Plastic Beach,album,0.676,0.703,-5.815,0.0302,0.0869,0.000687,...,Gorillaz - Rhinestone Eyes [Storyboard Film] (...,Gorillaz,72011645.0,1079128.0,31003.0,True,True,310083700.0,15.183585,Spotify
2,Gorillaz,New Gold (feat. Tame Impala and Bootie Brown),New Gold (feat. Tame Impala and Bootie Brown),single,0.695,0.923,-3.93,0.0522,0.0425,0.0469,...,Gorillaz - New Gold ft. Tame Impala & Bootie B...,Gorillaz,8435055.0,282142.0,7399.0,True,True,63063470.0,7.956897,Spotify
3,Gorillaz,On Melancholy Hill,Plastic Beach,album,0.689,0.739,-5.81,0.026,1.5e-05,0.509,...,Gorillaz - On Melancholy Hill (Official Video),Gorillaz,211754952.0,1788577.0,55229.0,True,True,434663600.0,11.546875,Spotify
4,Gorillaz,Clint Eastwood,Gorillaz,album,0.663,0.694,-8.627,0.171,0.0253,0.0,...,Gorillaz - Clint Eastwood (Official Video),Gorillaz,618480958.0,6197318.0,155930.0,True,True,617259700.0,9.942693,Youtube


In [189]:
# Load the ratings dataset
df_ratings = pd.read_csv('./datasets/kaggle/ratings.csv')
print(df_ratings.shape)
df_ratings.head()

(168, 3)


Unnamed: 0,track_id,user_id,rating
0,9666,3,5
1,9667,3,5
2,9668,3,5
3,9669,3,5
4,9670,3,5


## Sistema de Recomendación con SVD

In [190]:
import gc #garbage collector

In [191]:
# !pip install scikit-surprise
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import train_test_split

In [195]:
# Modelling
from surprise import SVD
from surprise.model_selection import cross_validate

# Get minimum and maximum rating from the dataset
min_rating = df_ratings.rating.min()
max_rating = df_ratings.rating.max()
 
reader = Reader(rating_scale=(min_rating, max_rating))
data = Dataset.load_from_df(df_ratings[['user_id', 'track_id', 'rating']], reader)

svd = SVD(n_epochs=10)
results = cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=10, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 10 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Fold 6  Fold 7  Fold 8  Fold 9  Fold 10 Mean    Std     
RMSE (testset)    1.2147  1.1573  1.1393  0.9612  0.7214  1.1109  1.1105  0.9464  0.8978  0.6616  0.9921  0.1792  
MAE (testset)     1.0034  1.0132  0.8620  0.8333  0.6163  0.9124  0.9596  0.8260  0.7799  0.6039  0.8410  0.1365  
Fit time          0.00    0.00    0.01    0.00    0.00    0.04    0.00    0.00    0.00    0.01    0.01    0.01    
Test time         0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00    


In [197]:
print("Average RMSE: ", np.average(results["test_rmse"]))
print("Average MAE: ", np.average(results["test_mae"]))

Average RMSE:  0.9921159856970352
Average MAE:  0.8410078038938842


In [198]:
# Hyperparameter tuning
from surprise.model_selection import GridSearchCV
 
param_grid = {
  'n_factors': [20, 50, 100],
  'n_epochs': [5, 10, 20]
}
 
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=10)
gs.fit(data)
 
print(gs.best_score['rmse'])
print(gs.best_params['rmse'])

0.988011702221226
{'n_factors': 50, 'n_epochs': 20}


In [199]:
# best hyperparameters
best_factor = gs.best_params['rmse']['n_factors']
best_epoch = gs.best_params['rmse']['n_epochs']

In [175]:
reader = Reader()

In [176]:
data = Dataset.load_from_df(df_ratings[['user_id','track_id','rating']], reader)

In [200]:
trainset, testset = train_test_split(data, test_size=0.25)

In [204]:
from surprise import SVD
# algo = SVD()
algo = SVD(n_factors=best_factor, n_epochs=best_epoch)

In [205]:
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1494b1940>

In [206]:
# Ref. https://surprise.readthedocs.io/en/stable/predictions_module.html
predictions = algo.test(testset)
predictions[:10]

[Prediction(uid=4, iid=17203, r_ui=1.0, est=1.8165586836991978, details={'was_impossible': False}),
 Prediction(uid=4, iid=14482, r_ui=1.0, est=1.8298271871446, details={'was_impossible': False}),
 Prediction(uid=4, iid=12970, r_ui=2.0, est=1.9348806830614975, details={'was_impossible': False}),
 Prediction(uid=4, iid=51, r_ui=1.0, est=1.845274940066108, details={'was_impossible': False}),
 Prediction(uid=3, iid=12392, r_ui=3.0, est=3.3244024376751473, details={'was_impossible': False}),
 Prediction(uid=4, iid=12962, r_ui=2.0, est=1.845274940066108, details={'was_impossible': False}),
 Prediction(uid=4, iid=17204, r_ui=1.0, est=1.7510806801749346, details={'was_impossible': False}),
 Prediction(uid=3, iid=12383, r_ui=2.0, est=3.563915666156804, details={'was_impossible': False}),
 Prediction(uid=3, iid=9675, r_ui=5.0, est=3.3830741086758795, details={'was_impossible': False}),
 Prediction(uid=4, iid=11970, r_ui=2.0, est=1.6484975827724326, details={'was_impossible': False})]

In [207]:
algo.predict(uid=str(3), iid=str(9666)) # Recomendación para el usuario 3 y la canción 9666 (Complicated - Avril Lavigne)

Prediction(uid='3', iid='9666', r_ui=None, est=2.6031746031746033, details={'was_impossible': False})

In [208]:
# https://surprise.readthedocs.io/en/stable/FAQ.html
from collections import defaultdict
def get_top_n(predictions, n=10):
    """Return the top-N recommendation for each user from a set of predictions.

    Args:
        predictions(list of Prediction objects): The list of predictions, as
            returned by the test method of an algorithm.
        n(int): The number of recommendation to output for each user. Default
            is 10.

    Returns:
    A dict where keys are user (raw) ids and values are lists of tuples:
        [(raw item id, rating estimation), ...] of size n.
    """

    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est, true_r))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n

top_n = get_top_n(predictions, n=10)

# Print the recommended items for each user
for uid, user_ratings in top_n.items():
    print(uid, [iid for (iid, _, _) in user_ratings])
    if uid==3:
        df_3 = [[iid, df.loc[iid,'Artist'], df.loc[iid,'Track'], est, true_r] for (iid, est, true_r) in user_ratings]
    if uid==4:
        df_4 = [[iid, df.loc[iid,'Artist'], df.loc[iid,'Track'], est, true_r] for (iid, est, true_r) in user_ratings]



4 [646, 18514, 9669, 9666, 9674, 12430, 12970, 17198, 18519, 18517]
3 [12429, 648, 12383, 639, 7978, 643, 17197, 12427, 9675, 12962]


In [209]:
df_3 = pd.DataFrame(df_3)
df_3


Unnamed: 0,0,1,2,3,4
0,12429,Taylor Swift,Lover,3.582445,3.0
1,648,Bon Jovi,Thank You For Loving Me,3.567789,5.0
2,12383,Ed Sheeran,Bad Habits,3.563916,2.0
3,639,Bon Jovi,Livin' On A Prayer,3.555042,5.0
4,7978,David Guetta,I'm Good (Blue),3.492022,2.0
5,643,Bon Jovi,Wanted Dead Or Alive,3.461102,5.0
6,17197,Shawn Mendes,Treat You Better,3.456222,3.0
7,12427,Taylor Swift,cardigan,3.4349,3.0
8,9675,Avril Lavigne,Here's to Never Growing Up,3.383074,5.0
9,12962,Katy Perry,Dark Horse,3.383074,4.0


In [210]:
df_4 = pd.DataFrame(df_4)
df_4


Unnamed: 0,0,1,2,3,4
0,646,Bon Jovi,I'll Be There For You,2.062265,3.0
1,18514,Olivia Rodrigo,good 4 u,2.033364,1.0
2,9669,Avril Lavigne,What the Hell,2.001093,1.0
3,9666,Avril Lavigne,Complicated,1.980032,1.0
4,9674,Avril Lavigne,Bite Me,1.950237,1.0
5,12430,Taylor Swift,"You're On Your Own, Kid",1.948674,2.0
6,12970,Katy Perry,Firework,1.934881,2.0
7,17198,Shawn Mendes,Mercy,1.931758,1.0
8,18519,Olivia Rodrigo,happier,1.919472,1.0
9,18517,Olivia Rodrigo,deja vu,1.870244,1.0


In [None]:
# # Ref. https://medium.com/tiket-com/get-to-know-with-surprise-2281dd227c3e
# def generate_recommendation(model, user_id, ratings_df, movies_df, n_items):
#    # Get a list of all movie IDs from dataset
#    movie_ids = ratings_df["track_id"].unique()
 
#    # Get a list of all movie IDs that have been watched by user
#    movie_ids_user = ratings_df.loc[ratings_df["user_id"] == user_id, "track_id"]

#     # Get a list off all movie IDS that that have not been watched by user
#    movie_ids_to_pred = np.setdiff1d(movie_ids, movie_ids_user)
   
#    # Apply a rating of 4 to all interactions (only to match the Surprise dataset format)
#    test_set = [[user_id, movie_id, 4] for movie_id in movie_ids_to_pred]
  
#    # Predict the ratings and generate recommendations
#    predictions = model.test(test_set)
#    pred_ratings = np.array([pred.est for pred in predictions])
#    print("Top {0} item recommendations for user {1}:".format(n_items, user_id))
#    # Rank top-n movies based on the predicted ratings   
#    index_max = (-pred_ratings).argsort()[:n_items]
#    for i in index_max:
#        movie_id = movie_ids_to_pred[i]
#        print(movies_df[movies_df["track_id"]==movie_id]["track"].values[0], pred_ratings[i])

# # define which user ID that we want to give recommendation
# userID = 3
# # define how many top-n movies that we want to recommend
# n_items = 10
# # generate recommendation using the model that we have trained
# generate_recommendation(algo, userID, df_ratings, df, n_items)       

[]
Top 10 item recommendations for user 3:


In [102]:
usuario = 3  # Usuario para el cual queremos la recomendación
rating = 5   # Canciones que ese usuario ha valorado con 4 o 5 estrellas

In [103]:
df_user = df_ratings[(df_ratings['user_id'] == usuario) & (df_ratings['rating'] >= rating)]
df_user = df_user.reset_index(drop=True)
df_user['Track'] = df['Track'].loc[df_user.track_id].values
df_user

Unnamed: 0,track_id,user_id,rating,Track
0,9666,3,5,Complicated
1,9667,3,5,Sk8er Boi
2,9668,3,5,Girlfriend
3,9669,3,5,What the Hell
4,9670,3,5,I'm with You
5,9671,3,5,My Happy Ending
6,9672,3,5,When You're Gone
7,9673,3,5,I’m a Mess (with YUNGBLUD)
8,9674,3,5,Bite Me
9,9675,3,5,Here's to Never Growing Up


In [104]:
recomendaciones_usuario = df[['Artist','Track']].copy()
print(recomendaciones_usuario.shape)
recomendaciones_usuario.head()

(20592, 2)


Unnamed: 0,Artist,Track
0,Gorillaz,Feel Good Inc.
1,Gorillaz,Rhinestone Eyes
2,Gorillaz,New Gold (feat. Tame Impala and Bootie Brown)
3,Gorillaz,On Melancholy Hill
4,Gorillaz,Clint Eastwood


In [105]:
recomendaciones_usuario.index.name = 'track_id'
recomendaciones_usuario

Unnamed: 0_level_0,Artist,Track
track_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,Gorillaz,Feel Good Inc.
1,Gorillaz,Rhinestone Eyes
2,Gorillaz,New Gold (feat. Tame Impala and Bootie Brown)
3,Gorillaz,On Melancholy Hill
4,Gorillaz,Clint Eastwood
...,...,...
20589,SICK LEGEND,JUST DANCE HARDSTYLE
20590,SICK LEGEND,SET FIRE TO THE RAIN HARDSTYLE
20591,SICK LEGEND,OUTSIDE HARDSTYLE SPED UP
20592,SICK LEGEND,ONLY GIRL HARDSTYLE


In [106]:
usuario_vistas = df_ratings[(df_ratings['user_id'] == usuario)]
print(usuario_vistas.shape)
usuario_vistas.head()

(104, 3)


Unnamed: 0,track_id,user_id,rating
0,9666,3,5
1,9667,3,5
2,9668,3,5
3,9669,3,5
4,9670,3,5


In [107]:
recomendaciones_usuario.drop(usuario_vistas['track_id'], inplace = True)
recomendaciones_usuario = recomendaciones_usuario.reset_index()
recomendaciones_usuario

Unnamed: 0,track_id,Artist,Track
0,0,Gorillaz,Feel Good Inc.
1,1,Gorillaz,Rhinestone Eyes
2,2,Gorillaz,New Gold (feat. Tame Impala and Bootie Brown)
3,3,Gorillaz,On Melancholy Hill
4,4,Gorillaz,Clint Eastwood
...,...,...,...
20483,20589,SICK LEGEND,JUST DANCE HARDSTYLE
20484,20590,SICK LEGEND,SET FIRE TO THE RAIN HARDSTYLE
20485,20591,SICK LEGEND,OUTSIDE HARDSTYLE SPED UP
20486,20592,SICK LEGEND,ONLY GIRL HARDSTYLE


In [110]:
recomendaciones_usuario['Estimate_Score'] = recomendaciones_usuario['track_id'].apply(lambda x: algo.predict(str(usuario), str(x)).est)

In [111]:
recomendaciones_usuario = recomendaciones_usuario.sort_values('Estimate_Score', ascending=False)
# print(recomendaciones_usuario.head(10))
recomendaciones_usuario.head(100)

Unnamed: 0,track_id,Artist,Track,Estimate_Score
0,0,Gorillaz,Feel Good Inc.,2.685897
1279,1291,Nelly Furtado,Say It Right - Sped Up Remix,2.685897
3,3,Gorillaz,On Melancholy Hill,2.685897
4,4,Gorillaz,Clint Eastwood,2.685897
5,5,Gorillaz,DARE,2.685897
...,...,...,...,...
95,97,Black Eyed Peas,Meet Me Halfway,2.685897
96,98,Black Eyed Peas,RITMO (Bad Boys For Life),2.685897
97,99,Black Eyed Peas,SIMPLY THE BEST,2.685897
90,92,Black Eyed Peas,Where Is The Love?,2.685897


In [112]:
algo.predict(str(3),str(13661))

Prediction(uid='3', iid='13661', r_ui=None, est=2.6858974358974357, details={'was_impossible': False})

### 3.3 Evaluación

Para el conjunto de `testset`, evaluamos el error RMSE entre las predicciones y las verdaderas calificaciones que le habían dado a las películas. Para eso, buscar en la documentación cómo se hace.

In [185]:
# from surprise import COMPLETAR
from surprise import accuracy

# Calculate RMSE for test dataset
accuracy.rmse(predictions)

RMSE: 0.9699


0.9698944335756162