# Unsupervised Machine Learning: SVD

## Importar Librerías

In [2]:
# ===== Librerías ==========================================
import warnings

import numpy as np
import pandas as pd   

## Cargar Datos

In [14]:
# Load the dataset
df = pd.read_csv('./datasets/kaggle/spotify_cleaned_dataset.csv')
print(df.shape)
df.head()

(20594, 24)


Unnamed: 0,Artist,Track,Album,Album_type,Danceability,Energy,Loudness,Speechiness,Acousticness,Instrumentalness,...,Title,Channel,Views,Likes,Comments,Licensed,official_video,Stream,EnergyLiveness,most_playedon
0,Gorillaz,Feel Good Inc.,Demon Days,album,0.818,0.705,-6.679,0.177,0.00836,0.00233,...,Gorillaz - Feel Good Inc. (Official Video),Gorillaz,693555221.0,6220896.0,169907.0,True,True,1040235000.0,1.150082,Spotify
1,Gorillaz,Rhinestone Eyes,Plastic Beach,album,0.676,0.703,-5.815,0.0302,0.0869,0.000687,...,Gorillaz - Rhinestone Eyes [Storyboard Film] (...,Gorillaz,72011645.0,1079128.0,31003.0,True,True,310083700.0,15.183585,Spotify
2,Gorillaz,New Gold (feat. Tame Impala and Bootie Brown),New Gold (feat. Tame Impala and Bootie Brown),single,0.695,0.923,-3.93,0.0522,0.0425,0.0469,...,Gorillaz - New Gold ft. Tame Impala & Bootie B...,Gorillaz,8435055.0,282142.0,7399.0,True,True,63063470.0,7.956897,Spotify
3,Gorillaz,On Melancholy Hill,Plastic Beach,album,0.689,0.739,-5.81,0.026,1.5e-05,0.509,...,Gorillaz - On Melancholy Hill (Official Video),Gorillaz,211754952.0,1788577.0,55229.0,True,True,434663600.0,11.546875,Spotify
4,Gorillaz,Clint Eastwood,Gorillaz,album,0.663,0.694,-8.627,0.171,0.0253,0.0,...,Gorillaz - Clint Eastwood (Official Video),Gorillaz,618480958.0,6197318.0,155930.0,True,True,617259700.0,9.942693,Youtube


In [16]:
# Load the ratings dataset
df_ratings = pd.read_csv('./datasets/kaggle/ratings.csv')
print(df_ratings.shape)
df_ratings.head()

(168, 3)


Unnamed: 0,track_id,user_id,rating
0,9666,3,5
1,9667,3,5
2,9668,3,5
3,9669,3,5
4,9670,3,5


## Transformaciones y Extracción de Características (_Feature Extraction_)

In [5]:
# Transformaciones
df['Licensed'] = df['Licensed'].replace(df['Licensed'].value_counts().index[2:], 'False')
df['official_video'] = df['official_video'].replace(df['official_video'].value_counts().index[2:], 'False')
df['Licensed'] = df['Licensed'].astype(bool)
df['official_video'] = df['official_video'].astype(bool)

# Tratamiento de nulos
df = df.dropna()

## Preprocesamiento con StandardScaler

In [6]:
numericals = df.describe().columns
numericals

Index(['Danceability', 'Energy', 'Loudness', 'Speechiness', 'Acousticness',
       'Instrumentalness', 'Liveness', 'Valence', 'Tempo', 'Duration_min',
       'Views', 'Likes', 'Comments', 'Stream', 'EnergyLiveness'],
      dtype='object')

In [7]:
df[numericals]

Unnamed: 0,Danceability,Energy,Loudness,Speechiness,Acousticness,Instrumentalness,Liveness,Valence,Tempo,Duration_min,Views,Likes,Comments,Stream,EnergyLiveness
0,0.818,0.705,-6.679,0.1770,0.008360,0.002330,0.6130,0.7720,138.559,3.710667,693555221.0,6220896.0,169907.0,1.040235e+09,1.150082
1,0.676,0.703,-5.815,0.0302,0.086900,0.000687,0.0463,0.8520,92.761,3.336217,72011645.0,1079128.0,31003.0,3.100837e+08,15.183585
2,0.695,0.923,-3.930,0.0522,0.042500,0.046900,0.1160,0.5510,108.014,3.585833,8435055.0,282142.0,7399.0,6.306347e+07,7.956897
3,0.689,0.739,-5.810,0.0260,0.000015,0.509000,0.0640,0.5780,120.423,3.897783,211754952.0,1788577.0,55229.0,4.346636e+08,11.546875
4,0.663,0.694,-8.627,0.1710,0.025300,0.000000,0.0698,0.5250,167.953,5.682000,618480958.0,6197318.0,155930.0,6.172597e+08,9.942693
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20589,0.582,0.926,-6.344,0.0328,0.448000,0.000000,0.0839,0.6580,90.002,1.577783,71678.0,1113.0,0.0,9.227144e+06,11.036949
20590,0.531,0.936,-1.786,0.1370,0.028000,0.000000,0.0923,0.6570,174.869,2.514283,164741.0,2019.0,0.0,1.089818e+07,10.140845
20591,0.443,0.830,-4.679,0.0647,0.024300,0.000000,0.1540,0.4190,168.388,2.280700,35646.0,329.0,0.0,6.226110e+06,5.389610
20592,0.417,0.767,-4.004,0.4190,0.356000,0.018400,0.1080,0.5390,155.378,1.806450,6533.0,88.0,0.0,6.873961e+06,7.101852


In [8]:
X = df[numericals]  # Feature data
#y = None  # Target labels (not used in K-means clustering)

from sklearn.preprocessing import StandardScaler

transformer = StandardScaler().fit(X) 
X_scale = transformer.transform(X) 
X_scale

array([[ 1.19615216,  0.32570564,  0.21556468, ...,  0.748291  ,
         3.74473754, -0.97566723],
       [ 0.33760304,  0.31636805,  0.40181267, ...,  0.02172816,
         0.73211625,  2.43273074],
       [ 0.45247934,  1.34350316,  0.80815232, ..., -0.10173689,
        -0.28709538,  0.67754317],
       ...,
       [-1.07114304,  0.90930514,  0.64669428, ..., -0.14043872,
        -0.5216077 ,  0.05401158],
       [-1.22834217,  0.61517099,  0.79220052, ..., -0.14043872,
        -0.51893465,  0.46987353],
       [-0.73860641,  1.4135351 ,  0.67601109, ..., -0.14043872,
        -0.52379666,  0.42013427]])

## Sistema de Recomendación con SVD

El objetivo inicial es tener un dataframe con las siguiente columnas: track_id, user_id, rating

In [11]:
import gc #garbage collector

In [12]:
# !pip install scikit-surprise
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import train_test_split

In [13]:
reader = Reader()

In [17]:
data = Dataset.load_from_df(df_ratings[['user_id','track_id','rating']], reader)

In [18]:
trainset, testset = train_test_split(data, test_size=0.25)

In [19]:
from surprise import SVD
algo = SVD()

In [20]:
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x15906e300>

In [22]:
predictions = algo.test(testset)
predictions[:5]

[Prediction(uid=3, iid=11964, r_ui=2.0, est=3.597994505216548, details={'was_impossible': False}),
 Prediction(uid=3, iid=13271, r_ui=4.0, est=3.5288608336117275, details={'was_impossible': False}),
 Prediction(uid=3, iid=9668, r_ui=5.0, est=3.417279174683316, details={'was_impossible': False}),
 Prediction(uid=4, iid=12962, r_ui=2.0, est=1.4132224036159762, details={'was_impossible': False}),
 Prediction(uid=4, iid=13275, r_ui=2.0, est=1.6935756380495317, details={'was_impossible': False})]

In [23]:
algo.predict(3,9666)

Prediction(uid=3, iid=9666, r_ui=None, est=3.9502461013918406, details={'was_impossible': False})

In [66]:
usuario = 4
rating = 5   # le pedimos peliculas a las que haya puesto 4 o 5 estrellas

In [67]:
df_user = df_ratings[(df_ratings['user_id'] == usuario) & (df_ratings['rating'] >= rating)]
df_user = df_user.reset_index(drop=True)
df_user['Track'] = df['Track'].loc[df_user.track_id].values
df_user

Unnamed: 0,track_id,user_id,rating,Track


In [68]:
recomendaciones_usuario = df[['Artist','Track']].copy()
print(recomendaciones_usuario.shape)
recomendaciones_usuario.head()

(20594, 2)


Unnamed: 0,Artist,Track
0,Gorillaz,Feel Good Inc.
1,Gorillaz,Rhinestone Eyes
2,Gorillaz,New Gold (feat. Tame Impala and Bootie Brown)
3,Gorillaz,On Melancholy Hill
4,Gorillaz,Clint Eastwood


In [69]:
recomendaciones_usuario.index.name = 'track_id'
recomendaciones_usuario

Unnamed: 0_level_0,Artist,Track
track_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,Gorillaz,Feel Good Inc.
1,Gorillaz,Rhinestone Eyes
2,Gorillaz,New Gold (feat. Tame Impala and Bootie Brown)
3,Gorillaz,On Melancholy Hill
4,Gorillaz,Clint Eastwood
...,...,...
20589,SICK LEGEND,JUST DANCE HARDSTYLE
20590,SICK LEGEND,SET FIRE TO THE RAIN HARDSTYLE
20591,SICK LEGEND,OUTSIDE HARDSTYLE SPED UP
20592,SICK LEGEND,ONLY GIRL HARDSTYLE


In [70]:
usuario_vistas = df_ratings[(df_ratings['user_id'] == usuario)]
print(usuario_vistas.shape)
usuario_vistas.head()

(84, 3)


Unnamed: 0,track_id,user_id,rating
10,9666,4,0
11,9667,4,0
12,9668,4,0
13,9669,4,0
14,9670,4,0


In [71]:
recomendaciones_usuario.drop(usuario_vistas['track_id'], inplace = True)
recomendaciones_usuario = recomendaciones_usuario.reset_index()
recomendaciones_usuario

Unnamed: 0,track_id,Artist,Track
0,0,Gorillaz,Feel Good Inc.
1,1,Gorillaz,Rhinestone Eyes
2,2,Gorillaz,New Gold (feat. Tame Impala and Bootie Brown)
3,3,Gorillaz,On Melancholy Hill
4,4,Gorillaz,Clint Eastwood
...,...,...,...
20505,20589,SICK LEGEND,JUST DANCE HARDSTYLE
20506,20590,SICK LEGEND,SET FIRE TO THE RAIN HARDSTYLE
20507,20591,SICK LEGEND,OUTSIDE HARDSTYLE SPED UP
20508,20592,SICK LEGEND,ONLY GIRL HARDSTYLE


In [72]:
recomendaciones_usuario['Estimate_Score'] = recomendaciones_usuario['track_id'].apply(lambda x: algo.predict(usuario, x).est)

In [73]:
recomendaciones_usuario = recomendaciones_usuario.sort_values('Estimate_Score', ascending=False)
print(recomendaciones_usuario.head(10))

       track_id        Artist                        Track  Estimate_Score
0             0      Gorillaz               Feel Good Inc.        1.502542
13460     13523      Skrillex     HUMBLE. - SKRILLEX REMIX        1.502542
13678     13741    Alan Gomez  CALLEJERO FINO | Mission 10        1.502542
13677     13740    Alan Gomez       LA JOAQUI | Mission 08        1.502542
13676     13739    Alan Gomez                     Butakera        1.502542
13675     13738  RÜFÜS DU SOL      Innerbloom (Radio Edit)        1.502542
13674     13737  RÜFÜS DU SOL          I Don't Wanna Leave        1.502542
13673     13736  RÜFÜS DU SOL                     No Place        1.502542
13672     13735  RÜFÜS DU SOL                     Sundream        1.502542
13671     13734  RÜFÜS DU SOL             Treat You Better        1.502542


### 3.3 Evaluación

Para el conjunto de `testset`, evaluamos el error RMSE entre las predicciones y las verdaderas calificaciones que le habían dado a las películas. Para eso, buscar en la documentación cómo se hace.

In [74]:
# from surprise import COMPLETAR
from surprise import accuracy

# Calculate RMSE for test dataset
accuracy.rmse(predictions)

RMSE: 0.8700


0.870007303671703