In [1]:
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
import pickle
import tqdm

from surprise import KNNBasic, accuracy, Dataset, Reader
from surprise.model_selection import cross_validate, train_test_split, GridSearchCV
from sklearn.metrics import jaccard_score

In [65]:
def generate_intervals(dataframe, intervals):
    """
    Discretize the 'rating' column (user/item interaction frequency) into bins.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame with a numeric 'rating' column. It is modified in place.
    intervals : dict
        Mapping of label -> {'min': lower bound (inclusive),
        'max': upper bound (exclusive) or None for an open-ended interval,
        'valor': value written to every rating that falls in the interval}.

    Returns
    -------
    pandas.DataFrame
        The same ``dataframe`` object, with 'rating' replaced by the bin values.
        Ratings that match no interval are left untouched; if intervals overlap,
        the one that comes last in ``intervals`` wins.
    """
    # Evaluate every interval against the ORIGINAL ratings. Testing the column
    # while it is being rewritten would re-bin values that were already
    # replaced whenever a 'valor' falls inside a later interval.
    original_ratings = dataframe['rating'].copy()

    for bounds in intervals.values():
        lower, upper, value = bounds['min'], bounds['max'], bounds['valor']
        in_interval = original_ratings >= lower
        if upper is not None:
            in_interval &= original_ratings < upper
        dataframe.loc[in_interval, 'rating'] = value

    return dataframe

def initialize(model=None,
               dataset_path='dataset/userid-timestamp-artid-artname-traid-traname.tsv',
               n_samples=80,
               random_state=1):
    """
    Load the listening log, build binary user/artist ratings and score every
    user/artist pair that is absent from the sampled ratings.

    Parameters
    ----------
    model : object, optional
        Object exposing ``test(testset)``; presumably a fitted Surprise
        algorithm (TODO confirm against how 'best_model.pkl' was produced).
        When omitted, the module-level ``model`` variable is used, which is
        what the original zero-argument call relied on.
    dataset_path : str, optional
        Tab-separated, header-less file whose columns are user_id, timestamp,
        artist_id, artist_name, track_id, track_name. Malformed lines are skipped.
    n_samples : int, optional
        Rows drawn per user (with replacement, then de-duplicated), so each
        user keeps at most ``n_samples`` distinct artists.
    random_state : int, optional
        Seed for the per-user sampling.

    Returns
    -------
    tuple
        ``(songs, artists_id, predictions)``: the play log with regenerated
        integer artist/track ids, the artist_name -> artist_id lookup table,
        and the output of ``model.test`` on the anti-testset.

    Raises
    ------
    NameError
        If no ``model`` is passed and none is defined at module level.
    """
    if model is None:
        # Backward compatibility: the original body read the global `model`.
        try:
            model = globals()['model']
        except KeyError:
            raise NameError("name 'model' is not defined") from None

    data_cols = ['user_id', 'timestamp', 'artist_id', 'artist_name', 'track_id', 'track_name']
    songs = pd.read_csv(dataset_path, sep='\t', on_bad_lines='skip', names=data_cols)

    # The ids shipped in the file are discarded; sequential integer ids are
    # assigned by order of first appearance of each name.
    songs_id = pd.DataFrame(songs['track_name'].unique(), columns=['track_name'])
    songs_id['track_id'] = range(1, len(songs_id) + 1)

    artists_id = pd.DataFrame(songs['artist_name'].unique(), columns=['artist_name'])
    artists_id['artist_id'] = range(1, len(artists_id) + 1)

    songs = songs[['user_id', 'artist_name', 'track_name']]
    songs = songs.merge(artists_id, on='artist_name', how='left')
    songs = songs.merge(songs_id, on='track_name', how='left')
    songs = songs[['user_id', 'artist_id', 'artist_name', 'track_id', 'track_name']]

    # Binary rating: 0 for fewer than 10 plays of an artist, 1 for 10 or more.
    intervals = {
        'intervalo 1': {'min': 0, 'max': 10, 'valor': 0},
        'intervalo 2': {'min': 10, 'max': None, 'valor': 1}
    }

    # One row per (user, artist) with the play count as the raw rating.
    ratings_artists = songs[['user_id', 'artist_id']].value_counts().to_frame().reset_index()
    ratings_artists.columns = ['user_id', 'artist_id', 'rating']
    ratings_artists = generate_intervals(ratings_artists, intervals)

    ratings_artists = (
        ratings_artists
        .groupby('user_id')
        .sample(n=n_samples, random_state=random_state, replace=True)
        .drop_duplicates()
    )

    min_rating, max_rating = ratings_artists['rating'].min(), ratings_artists['rating'].max()
    reader = Reader(rating_scale=(min_rating, max_rating))
    ratings_artists_dataset = Dataset.load_from_df(ratings_artists[['user_id', 'artist_id', 'rating']], reader)

    # The anti-testset holds every (user, artist) pair missing from the trainset.
    rating_data = ratings_artists_dataset.build_full_trainset()
    test = rating_data.build_anti_testset()

    predictions = model.test(test)

    return songs, artists_id, predictions

In [74]:
def load_model(model_name):
    """
    Deserialize and return the object pickled in the file at ``model_name``.

    Unpickling can execute arbitrary code, so only load files from a trusted
    source.
    """
    with open(model_name, 'rb') as handle:
        return pickle.load(handle)

def get_K_recommendations(uid, ratings, items, top_k, model, predictions):
    """
    Return the ``top_k`` best-scored artists that user ``uid`` has not listened to.

    Parameters
    ----------
    uid : hashable
        User id, compared against ``ratings['user_id']`` and each prediction's uid.
    ratings : pandas.DataFrame
        Interaction log with at least 'user_id' and 'artist_id' columns.
    items : pandas.DataFrame
        Artist lookup table with 'artist_id' and 'artist_name' columns.
    top_k : int
        Maximum number of recommendations to return.
    model : object
        Unused; kept so existing callers do not break.
    predictions : iterable
        Prediction records exposing ``uid``, ``iid``, ``est`` and a ``details``
        dict with a 'was_impossible' flag.

    Returns
    -------
    pandas.DataFrame
        Columns 'artist_name', 'pred_rating', 'artist_id', sorted by
        'pred_rating' in descending order, with at most ``top_k`` rows.
    """
    # Sets give O(1) membership tests; the list-based checks were quadratic
    # in the size of the artist catalogue.
    seen_items = set(ratings.loc[ratings['user_id'] == uid, 'artist_id'])
    unseen_items = set(items['artist_id']) - seen_items

    user_predictions = [p for p in predictions if p.uid == uid and p.iid in unseen_items]

    top_K_recommendations = [[p.iid, p.est] for p in user_predictions
                             if not p.details['was_impossible']]

    if len(top_K_recommendations) < top_k:
        # Too few usable predictions: pad with the ones flagged as impossible
        # so the caller still gets up to top_k rows.
        fallback = [[p.iid, p.est] for p in user_predictions
                    if p.details['was_impossible']]
        n_missing = top_k - len(top_K_recommendations)
        top_K_recommendations = top_K_recommendations + fallback[:n_missing]

    top_K_recommendations_df = pd.DataFrame(data=top_K_recommendations, columns=['item', 'pred_rating'])
    top_K_recommendations_df = top_K_recommendations_df.sort_values(by='pred_rating', ascending=False).head(top_k)
    top_K_recommendations_df = top_K_recommendations_df.merge(items, left_on='item', right_on='artist_id', how='left')

    return top_K_recommendations_df[['artist_name', 'pred_rating', 'artist_id']].reset_index(drop=True)

In [68]:
# Load the pickled recommender first: initialize() reads the module-level
# `model` when it scores the candidate pairs, so the order of these lines matters.
model = load_model('best_model.pkl')
# initialize() returns the play log, the artist lookup table and the predictions.
ratings, items, predictions = initialize()

In [75]:
# Number of artists to recommend and the user to recommend them to.
top_k = 10
uid = 'user_000010'
recommendations = get_K_recommendations(uid=uid, ratings=ratings, items=items, top_k=top_k, model=model, predictions=predictions)
# Bare last expression so the notebook renders the resulting table.
recommendations

Unnamed: 0,artist_name,pred_rating,artist_id
0,Ray Charles,1.0,881
1,Lamb,1.0,261
2,Interpol,1.0,1056
3,Arcade Fire,1.0,919
4,The New Pornographers,1.0,3761
5,Weezer,1.0,1005
6,Metallica,1.0,3929
7,Radiohead,1.0,80
8,Of Montreal,1.0,1348
9,Babyshambles,1.0,740
